From 5dac32794c296e6c7c4c7018de573017ae898811 Mon Sep 17 00:00:00 2001
From: sakshi-singla-1735 <sakshi.s@dell.com>
Date: Wed, 28 Jan 2026 10:11:26 +0000
Subject: [PATCH 001/172] copy file python module

---
 common/library/modules/parallel_file_copy.py  | 174 ++++++++++++++++++
 .../slurm_config/tasks/create_slurm_dir.yml   | 162 +++++++++++-----
 discovery/roles/slurm_config/vars/main.yml    |  52 +++---
 3 files changed, 315 insertions(+), 73 deletions(-)
 create mode 100644 common/library/modules/parallel_file_copy.py

diff --git a/common/library/modules/parallel_file_copy.py b/common/library/modules/parallel_file_copy.py
new file mode 100644
index 0000000000..4f05f041c3
--- /dev/null
+++ b/common/library/modules/parallel_file_copy.py
@@ -0,0 +1,174 @@
+# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/python
+# pylint: disable=import-error,no-name-in-module,line-too-long
+
+"""
+Ansible module for parallel copying of files.
+
+Supports copying multiple source → destination pairs in parallel,
+with logging, retries, and optional cleanup.
+"""
+
+import os
+import shutil
+import threading
+from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from ansible.module_utils.basic import AnsibleModule
+from ansible.module_utils.local_repo.standard_logger import setup_standard_logger
+
+# ============================================================
+# Default Values
+# ============================================================
+
+DEFAULT_MAX_WORKERS = 4
+DEFAULT_RETRY_COUNT = 2
+DEFAULT_DELETE_EXISTING = True
+
+# ============================================================
+# Copy Worker Function
+# ============================================================
+
+def copy_single_file(src_file, dest_dir, retry_count, delete_existing, slogger, summary):
+    """Copy one file into dest_dir with retries; append src_file to summary copied/skipped/failed."""
+    thread_name = threading.current_thread().name
+    start_time = datetime.now()
+
+    if not os.path.isfile(src_file):
+        slogger.info(f"NOT COPIED - Source file missing: {src_file}")
+        summary["skipped"].append(src_file)
+        return
+
+    dest_file = os.path.join(dest_dir, os.path.basename(src_file))
+    # Always make at least one attempt so retry_count < 1 cannot silently drop the file.
+    for attempt in range(1, max(retry_count, 1) + 1):
+        try:
+            slogger.info(f"[{thread_name}] START {start_time} Copying {src_file} (Attempt {attempt})")
+            # Created inside the try so a failure is retried and recorded instead of escaping the worker.
+            os.makedirs(dest_dir, exist_ok=True)
+            if delete_existing and os.path.exists(dest_file):
+                os.remove(dest_file)
+                slogger.info(f"Deleted existing file: {dest_file}")
+
+            shutil.copy2(src_file, dest_file)
+
+            end_time = datetime.now()
+            duration = (end_time - start_time).total_seconds()
+            slogger.info(f"[{thread_name}] SUCCESS {end_time} Copied {src_file} -> {dest_file} (Duration={duration:.2f}s)")
+
+            summary["copied"].append(src_file)
+            return
+
+        except Exception as err:
+            slogger.error(f"[{thread_name}] ERROR copying {src_file} (Attempt {attempt}) Reason: {err}")
+    # Reached only when every attempt raised.
+    summary["failed"].append(src_file)
+
+# ============================================================
+# Main Parallel Copy Logic
+# ============================================================
+
+def execute_parallel_copy(module, copy_pairs, max_workers, retry_count, delete_existing, slogger):
+    """
+    Submit one copy task per regular file found directly inside each source directory of copy_pairs.
+    Returns a dict with "copied", "skipped" and "failed" lists of paths.
+    """
+    summary = {"copied": [], "skipped": [], "failed": []}
+    futures = []
+
+    slogger.info("===== PARALLEL FILE COPY STARTED =====")
+    slogger.info(f"Copy pairs received: {copy_pairs}")
+    slogger.info(f"Max workers: {max_workers}")
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        for src_dir, dest_dir in copy_pairs:
+            # Each pair unpacks to (source directory, destination directory).
+            if not os.path.isdir(src_dir):
+                slogger.info(f"NOT COPIED - Source directory missing: {src_dir}")
+                summary["skipped"].append(src_dir)
+                continue
+
+            files = [os.path.join(src_dir, f) for f in os.listdir(src_dir) if os.path.isfile(os.path.join(src_dir, f))]
+            if not files:
+                slogger.info(f"NOT COPIED - No files found in directory: {src_dir}")
+                summary["skipped"].append(src_dir)
+                continue
+
+            # Use an Ansible warning to tell the user a copy is in progress for this pair
+            module.warn(f"Copy in progress for {src_dir} -> {dest_dir}. Please wait ...")
+
+            slogger.info(f"Copying {len(files)} files from {src_dir} -> {dest_dir} ...")
+
+            for file_path in files:
+                futures.append(executor.submit(copy_single_file, file_path, dest_dir, retry_count, delete_existing, slogger, summary))
+
+        # Wait for all copies to finish; result() re-raises any exception that escaped a worker
+        for future in as_completed(futures):
+            future.result()
+
+    slogger.info("===== PARALLEL FILE COPY FINISHED =====")
+    return summary
+
+# ============================================================
+# Ansible Module Entry Point
+# ============================================================
+
+def main():
+    """Ansible entry point: parse arguments, copy files in parallel (skipped in check mode), report results."""
+    module_args = dict(
+        copy_pairs=dict(type="list", required=True),
+        max_workers=dict(type="int", required=False, default=DEFAULT_MAX_WORKERS),
+        retry_count=dict(type="int", required=False, default=DEFAULT_RETRY_COUNT),
+        delete_existing=dict(type="bool", required=False, default=DEFAULT_DELETE_EXISTING),
+        slog_file=dict(type="str", required=False, default="/tmp/parallel_copy.log"),
+    )
+
+    module = AnsibleModule(argument_spec=module_args, supports_check_mode=True)
+
+    copy_pairs = module.params["copy_pairs"]
+    max_workers = module.params["max_workers"]
+    retry_count = module.params["retry_count"]
+    delete_existing = module.params["delete_existing"]
+    slog_file = module.params["slog_file"]
+
+    result = dict(changed=False, copied=[], skipped=[], failed=[])
+    # Check mode is advertised above, so it must be side-effect free: no log file, no deletes, no copies.
+    if module.check_mode:
+        module.exit_json(msg="Check mode: no files were copied.", **result)
+
+    slogger = setup_standard_logger(slog_file)
+    try:
+        summary = execute_parallel_copy(module, copy_pairs, max_workers, retry_count, delete_existing, slogger)
+
+        result.update({key: summary[key] for key in ("copied", "skipped", "failed")})
+        if summary["copied"]:
+            result["changed"] = True
+
+        overall_status = "SUCCESS"
+        if summary["failed"] and summary["copied"]:
+            overall_status = "PARTIAL"
+        elif summary["failed"] and not summary["copied"]:
+            overall_status = "FAILURE"
+
+        result["overall_status"] = overall_status
+        module.exit_json(**result)
+
+    except Exception as err:
+        slogger.error(f"Parallel copy execution failed: {err}")
+        module.fail_json(msg=str(err), **result)
+
+if __name__ == "__main__":
+    main()
diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
index 662802274b..18ee917fb7 100644
--- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
+++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
@@ -1,4 +1,4 @@
-# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,34 +18,6 @@
 - name: Include storage vars
   ansible.builtin.include_vars: "{{ input_project_dir }}/storage_config.yml"
 
-- name: Load slurm_custom.json for x86_64
-  ansible.builtin.include_vars:
-    file: "{{ input_project_dir }}/config/x86_64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/slurm_custom.json"
-    name: slurm_custom_x86_64
-  failed_when: false
-
-- name: Load slurm_custom.json for aarch64
-  ansible.builtin.include_vars:
-    file: "{{ input_project_dir }}/config/aarch64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/slurm_custom.json"
-    name: slurm_custom_aarch64
-  failed_when: false
-
-- name: Extract CUDA runfile name for x86_64 from slurm_custom.json
-  ansible.builtin.set_fact:
-    cuda_runfile_x86_64: "{{ (slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}"
-  when:
-    - slurm_custom_x86_64 is defined
-    - slurm_custom_x86_64.slurm_node is defined
-    - slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0
-
-- name: Extract CUDA runfile name for aarch64 from slurm_custom.json
-  ansible.builtin.set_fact:
-    cuda_runfile_aarch64: "{{ (slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}"
-  when:
-    - slurm_custom_aarch64 is defined
-    - slurm_custom_aarch64.slurm_node is defined
-    - slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0
-
 - name: Set facts for slurm
   ansible.builtin.set_fact:
     nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}"
@@ -63,6 +35,10 @@
     slurm_config_path: "{{ share_path }}/{{ slurm_dir_name }}"
     controller_trackfile_path: "{{ share_path }}/ctld_track"
 
+- name: Build parallel copy list for HPC tools
+  ansible.builtin.set_fact:
+    parallel_copy_pairs: []
+
 - name: Configure openldap if supported
   ansible.builtin.include_tasks: openldap_config.yml
   when: hostvars['localhost']['openldap_support']
@@ -131,8 +107,49 @@
     mode: "{{ file_mode }}"
   become: true
 
-- name: Create hpc tools dirs
-  ansible.builtin.include_tasks: hpc_tools.yml
+- name: Create HPC tools directories on share
+  ansible.builtin.file:
+    path: "{{ slurm_config_path }}/hpc_tools/{{ item }}"
+    state: directory
+    owner: root
+    group: root
+    mode: "{{ common_mode }}"
+  loop:
+    - cuda
+    - runfile
+    - scripts
+    - container_images
+    - nvidia_sdk
+    - benchmarks
+
+- name: Set NFS info fact
+  ansible.builtin.set_fact:
+    oim_shared_path: "{{ hostvars['localhost']['oim_shared_path'] }}"
+
+- name: Initialize parallel copy pairs
+  ansible.builtin.set_fact:
+    parallel_copy_pairs: []
+
+- name: Check which parallel copy source directories exist
+  ansible.builtin.stat:
+    path: "{{ item.src }}"
+  loop: "{{ parallel_copy_candidates }}"
+  register: copy_source_checks
+  failed_when: false
+
+- name: Add only valid copy pairs (source exists)
+  ansible.builtin.set_fact:
+    parallel_copy_pairs: >-
+      {{ parallel_copy_pairs +
+         [[ item.item.src, item.item.dest ]] }}
+  loop: "{{ copy_source_checks.results }}"
+  when: item.stat.exists
+
+- name: Parallel copy HPC tool files
+  parallel_file_copy:
+    copy_pairs: "{{ parallel_copy_pairs }}"
+    max_workers: "{{ parallel_copy_max_workers }}"
+  when: parallel_copy_pairs | length > 0
 
 - name: Check if munge key exists top level
   ansible.builtin.stat:
@@ -156,8 +173,71 @@
             (compiler_login_list | default([])) +
             (login_list | default([])) }}"
 
-- name: Conf merge and write using slurm_conf module
-  ansible.builtin.include_tasks: confs.yml
+- name: Slurm path ops
+  ansible.builtin.set_fact:
+    conf_path_items: "{{ conf_path_items | default({}) | combine({item.key: item.value}) }}"
+  when: item.value is string
+  loop: "{{ configs_input | dict2items }}"
+
+- name: Slurm dict ops
+  ansible.builtin.set_fact:
+    conf_dict_items: "{{ conf_dict_items | default({}) | combine({item.key: item.value}) }}"
+  when: item.value is mapping
+  loop: "{{ configs_input | dict2items }}"
+
+- name: Slurm dict ops
+  ansible.builtin.set_fact:
+    apply_config: >-
+      {{ apply_config | default({})
+        | combine({
+            item: (
+              (__default_config[item] | default({}))
+              | combine(conf_dict_items[item] | default({}))
+            )
+          })
+      }}
+  loop: "{{ conf_files }}"
+
+- name: Read NodeName parameters
+  ansible.builtin.include_tasks: read_node_idrac.yml
+  when: cmpt_list
+  loop: "{{ cmpt_list }}"
+
+- name: Copy conf file if provided
+  ansible.builtin.copy:
+    src: "{{ conf_path_items.get(item.1) }}"
+    dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/{{ item.1 }}.conf"
+    mode: "{{ conf_file_mode }}"
+    remote_src: "{{ copy_from_oim }}"
+  when: ctld_list
+  loop: "{{ ctld_list | product(conf_path_items.keys() | default([])) }}"
+
+- name: Add gpu parameters to slurm conf
+  ansible.builtin.set_fact:
+    apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(gpu_slurm_conf))}) }}"
+  when: gpu_params is defined and gpu_params
+
+- name: Verify slurm conf keys only
+  ansible.builtin.assert:
+    that:
+      - (apply_config[item].keys() | list) | difference(__conf_keys[item]) | length == 0
+    fail_msg: "The following {{ item }} config keys are invalid: {{ apply_config[item].keys() | list | difference(__conf_keys[item]) | join(', ') }}"
+  when: apply_config[item] and __conf_keys[item]
+  loop: "{{ conf_files }}"
+
+- name: Slurm dict ops
+  ansible.builtin.set_fact:
+    slurm_conf_dict: "{{ apply_config['slurm'] }}"
+
+- name: Create all .conf for ctld only
+  ansible.builtin.template:
+    src: "{{ item.1 }}.conf.j2"
+    dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/{{ item.1 }}.conf"
+    owner: "{{ root_user }}"
+    group: "{{ root_group }}"
+    mode: "{{ conf_file_mode }}"
+  when: ctld_list
+  loop: "{{ ctld_list | product(conf_files | difference(conf_path_items.keys() | default([]))) }}"
 
 - name: Create mariadb cnf
   ansible.builtin.template:
@@ -215,19 +295,3 @@
   ansible.builtin.set_fact:
     cloud_init_slurm_nfs_path: "{{ nfs_server_ip }}:{{ nfs_server_path }}"
     client_mount_path: "{{ share_path }}"
-
-- name: Ensure SSH key directory exists on Slurm share
-  ansible.builtin.file:
-    path: "{{ slurm_config_path }}/ssh"
-    state: directory
-    owner: root
-    group: root
-    mode: '0700'
-
-- name: Copy OIM private key to Slurm share for node-to-node SSH
-  ansible.builtin.copy:
-    src: "{{ ssh_private_key_path }}"
-    dest: "{{ slurm_config_path }}/ssh/oim_rsa"
-    owner: root
-    group: root
-    mode: '0600'
diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml
index 63bb52fb41..a4717bd662 100644
--- a/discovery/roles/slurm_config/vars/main.yml
+++ b/discovery/roles/slurm_config/vars/main.yml
@@ -1,4 +1,4 @@
-# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -103,27 +103,31 @@ auth_tls_certs_path: "/opt/omnia/auth/tls_certs/ldapserver.crt"
 slurm_installation_type: configless
 pulp_webserver_cert_path: "/opt/omnia/pulp/settings/certs/pulp_webserver.crt"
 controller_empty_msg: "Slurm controller functional group is missing from PXE mapping file. Please update the file and rerun discovery.yml."
-download_container_image_path: "{{ slurm_config_path }}/hpc_tools/scripts/download_container_image.sh"
-container_image_list_path: "{{ slurm_config_path }}/hpc_tools/scripts/container_image.list"
-pulp_mirror: "{{ hostvars['localhost']['admin_nic_ip'] }}:2225"
-packages_base_dir_x86_64: "{{ slurm_config_path }}/packages/x86_64"
-packages_base_dir_aarch64: "{{ slurm_config_path }}/packages/aarch64"
-offline_repo_basepath_x86_64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso"
-offline_repo_basepath_aarch64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso"
-packages_layout_x86_64:
-  - doca-ofed
-  - cuda
-packages_layout_aarch64:
-  - doca-ofed
-  - cuda
-print_copy_msg: "Copying {{ item.name }} from {{ item.source_path }} to {{ item.dest_path }}"
-offline_path_x86_64:
-  - name: doca-ofed
-    source_path: "{{ offline_repo_basepath_x86_64 }}/doca-ofed"
-    dest_path: "{{ packages_base_dir_x86_64 }}/doca-ofed"
-offline_path_aarch64:
-  - name: doca-ofed
-    source_path: "{{ offline_repo_basepath_aarch64 }}/doca-ofed"
-    dest_path: "{{ packages_base_dir_aarch64 }}/doca-ofed"
+# nvidia sdk vars
+nvhpc_package_name: "nvhpc_2025_2511_Linux_x86_64_cuda_13.0"
+nvhpc_tarball_x86_64_relpath: "offline_repo/cluster/x86_64/rhel/10.0/tarball/{{ nvhpc_package_name }}/{{ nvhpc_package_name }}.tar.gz"
+nvhpc_nfs_rel_dir: "hpc_tools/nvidia_sdk"
 
-ssh_private_key_path: /root/.ssh/oim_rsa
+# parallel file copy
+parallel_copy_max_workers: 4
+
+# ------------------------------------------------------------
+# Parallel Copy Candidates (Only path existence matters)
+# ------------------------------------------------------------
+
+parallel_copy_candidates:
+
+  # CUDA Runfile (aarch64 repo path)
+  - name: cuda_runfile_aarch64
+    src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso/cuda-run/"
+    dest: "{{ slurm_config_path }}/hpc_tools/runfile/"
+
+  # CUDA Runfile (x86_64 repo path)
+  - name: cuda_runfile_x86_64
+    src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso/cuda-run/"
+    dest: "{{ slurm_config_path }}/hpc_tools/runfile/"
+
+  # NVIDIA HPC SDK (x86_64 tarball extracted dir)
+  - name: nvhpc_sdk_x86_64
+    src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_x86_64_relpath | dirname }}/"
+    dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/"

From c4a95fa2a42a03a593eeddb7d1864f796f7bad77 Mon Sep 17 00:00:00 2001
From: sakshi-singla-1735 <sakshi.s@dell.com>
Date: Wed, 28 Jan 2026 12:57:20 +0000
Subject: [PATCH 002/172] log path change

Signed-off-by: sakshi-singla-1735 <sakshi.s@dell.com>
---
 common/library/modules/parallel_file_copy.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/common/library/modules/parallel_file_copy.py b/common/library/modules/parallel_file_copy.py
index 4f05f041c3..8f46f5a881 100644
--- a/common/library/modules/parallel_file_copy.py
+++ b/common/library/modules/parallel_file_copy.py
@@ -37,6 +37,7 @@
 DEFAULT_MAX_WORKERS = 4
 DEFAULT_RETRY_COUNT = 2
 DEFAULT_DELETE_EXISTING = True
+PARALLEL_FILE_COPY_LOG = '/opt/omnia/log/core/playbooks/parallel_file_copy.log'  # log FILE path: a trailing slash would name a directory
 
 # ============================================================
 # Copy Worker Function
@@ -133,7 +134,7 @@ def main():
         max_workers=dict(type="int", required=False, default=DEFAULT_MAX_WORKERS),
         retry_count=dict(type="int", required=False, default=DEFAULT_RETRY_COUNT),
         delete_existing=dict(type="bool", required=False, default=DEFAULT_DELETE_EXISTING),
-        slog_file=dict(type="str", required=False, default="/tmp/parallel_copy.log"),
+        slog_file=dict(type="str", required=False, default=PARALLEL_FILE_COPY_LOG),
     )
 
     module = AnsibleModule(argument_spec=module_args, supports_check_mode=True)

From af25939cd7a68d1b94896c1c4c6d31177403590a Mon Sep 17 00:00:00 2001
From: sakshi-singla-1735 <sakshi.s@dell.com>
Date: Wed, 28 Jan 2026 13:01:39 +0000
Subject: [PATCH 003/172] missed code

Signed-off-by: sakshi-singla-1735 <sakshi.s@dell.com>
---
 .../slurm_config/tasks/create_slurm_dir.yml   | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
index 18ee917fb7..45aa87d2d2 100644
--- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
+++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
@@ -18,6 +18,34 @@
 - name: Include storage vars
   ansible.builtin.include_vars: "{{ input_project_dir }}/storage_config.yml"
 
+- name: Load slurm_custom.json for x86_64
+  ansible.builtin.include_vars:
+    file: "{{ input_project_dir }}/config/x86_64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/slurm_custom.json"
+    name: slurm_custom_x86_64
+  failed_when: false
+
+- name: Load slurm_custom.json for aarch64
+  ansible.builtin.include_vars:
+    file: "{{ input_project_dir }}/config/aarch64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/slurm_custom.json"
+    name: slurm_custom_aarch64
+  failed_when: false
+
+- name: Extract CUDA runfile name for x86_64 from slurm_custom.json
+  ansible.builtin.set_fact:
+    cuda_runfile_x86_64: "{{ (slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}"
+  when:
+    - slurm_custom_x86_64 is defined
+    - slurm_custom_x86_64.slurm_node is defined
+    - slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0
+
+- name: Extract CUDA runfile name for aarch64 from slurm_custom.json
+  ansible.builtin.set_fact:
+    cuda_runfile_aarch64: "{{ (slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}"
+  when:
+    - slurm_custom_aarch64 is defined
+    - slurm_custom_aarch64.slurm_node is defined
+    - slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0
+
 - name: Set facts for slurm
   ansible.builtin.set_fact:
     nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}"

From 05ef165790335024e2c14b1ebc79378e75b1c9e6 Mon Sep 17 00:00:00 2001
From: sakshi-singla-1735 <sakshi.s@dell.com>
Date: Wed, 28 Jan 2026 13:27:13 +0000
Subject: [PATCH 004/172] adding code to hpc_tools file

Signed-off-by: sakshi-singla-1735 <sakshi.s@dell.com>
---
 .../slurm_config/tasks/create_slurm_dir.yml   | 382 +++++-------------
 .../roles/slurm_config/tasks/hpc_tools.yml    |  49 +--
 discovery/roles/slurm_config/vars/main.yml    |  30 +-
 3 files changed, 152 insertions(+), 309 deletions(-)

diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
index 45aa87d2d2..c8bdb5d335 100644
--- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
+++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,314 +12,144 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 ---
-- name: Include variable file omnia_config.yml
-  ansible.builtin.include_vars: "{{ input_project_dir }}/omnia_config.yml"
 
-- name: Include storage vars
-  ansible.builtin.include_vars: "{{ input_project_dir }}/storage_config.yml"
-
-- name: Load slurm_custom.json for x86_64
-  ansible.builtin.include_vars:
-    file: "{{ input_project_dir }}/config/x86_64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/slurm_custom.json"
-    name: slurm_custom_x86_64
-  failed_when: false
-
-- name: Load slurm_custom.json for aarch64
-  ansible.builtin.include_vars:
-    file: "{{ input_project_dir }}/config/aarch64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/slurm_custom.json"
-    name: slurm_custom_aarch64
-  failed_when: false
-
-- name: Extract CUDA runfile name for x86_64 from slurm_custom.json
-  ansible.builtin.set_fact:
-    cuda_runfile_x86_64: "{{ (slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}"
-  when:
-    - slurm_custom_x86_64 is defined
-    - slurm_custom_x86_64.slurm_node is defined
-    - slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0
-
-- name: Extract CUDA runfile name for aarch64 from slurm_custom.json
-  ansible.builtin.set_fact:
-    cuda_runfile_aarch64: "{{ (slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}"
-  when:
-    - slurm_custom_aarch64 is defined
-    - slurm_custom_aarch64.slurm_node is defined
-    - slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0
-
-- name: Set facts for slurm
-  ansible.builtin.set_fact:
-    nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}"
-
-- name: Read the slurm mount point
-  ansible.builtin.set_fact:
-    share_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).client_share_path }}"
-    nfs_server_ip: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).server_ip }}"
-    nfs_server_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).server_share_path }}"
-
-- name: Set facts for slurm
-  ansible.builtin.set_fact:
-    cluster_name: "{{ slurm_cluster[0].cluster_name }}"
-    configs_input: "{{ slurm_cluster[0].config_sources | default({}) | dict2items | rejectattr('value', 'falsy') | list | items2dict }}"
-    slurm_config_path: "{{ share_path }}/{{ slurm_dir_name }}"
-    controller_trackfile_path: "{{ share_path }}/ctld_track"
-
-- name: Build parallel copy list for HPC tools
-  ansible.builtin.set_fact:
-    parallel_copy_pairs: []
-
-- name: Configure openldap if supported
-  ansible.builtin.include_tasks: openldap_config.yml
-  when: hostvars['localhost']['openldap_support']
-
-- name: Set facts for slurm
-  ansible.builtin.set_fact:
-    share_prefix: "{{ slurm_config_path }}"
-  when: conf_in_nfs
-
-- name: Clear the share directory
-  ansible.builtin.file:
-    path: "{{ slurm_config_path }}"
-    state: absent
-  when: clear_slurm_files
-
-- name: Create the slurm directory in share
+- name: Create HPC tools directories on share
   ansible.builtin.file:
-    path: "{{ slurm_config_path }}"
+    path: "{{ slurm_config_path }}/hpc_tools/{{ item }}"
     state: directory
     owner: root
     group: root
     mode: "{{ common_mode }}"
+  loop:
+    - cuda
+    - runfile
+    - scripts
+    - container_images
 
-# This directory is created to store the controller track file in NFS
-# The track file is generated only after the Slurm controller has been fully configured in a fresh deployment
-- name: Create directory for controller init track file in share
-  ansible.builtin.file:
-    path: "{{ controller_trackfile_path }}"
-    state: directory
-    owner: root
-    group: root
-    mode: "{{ common_mode }}"
+- name: Deploy download_container_image.sh to NFS share
+  ansible.builtin.template:
+    src: "download_container_image.sh.j2"
+    dest: "{{ download_container_image_path }}"
+    owner: "{{ root_user }}"
+    group: "{{ root_group }}"
+    mode: "0755"
+
+- name: Deploy container_image.list to NFS share
+  ansible.builtin.template:
+    src: "container_image.list.j2"
+    dest: "{{ container_image_list_path }}"
+    owner: "{{ root_user }}"
+    group: "{{ root_group }}"
+    mode: "0644"
+
+- name: Set fact for pulp mirror
+  ansible.builtin.set_fact:
+    pulp_mirror: "{{ hostvars['localhost']['admin_nic_ip'] }}:2225"
 
-- name: Create the slurm ctld directory on share
+- name: Create x86_64 package base directory
   ansible.builtin.file:
-    path: "{{ slurm_config_path }}/{{ item[0] }}{{ item[1] }}"
+    path: "{{ packages_base_dir_x86_64 }}"
     state: directory
-    owner: root
-    group: root
-    mode: "{{ common_mode }}"
-  when: ctld_list
-  loop: "{{ ctld_list | product(ctld_dir) }}"
+    mode: '{{ common_mode }}'
 
-- name: Create the slurm cmpt directory on share
+- name: Create aarch64 package base directory
   ansible.builtin.file:
-    path: "{{ slurm_config_path }}/{{ item[0] }}{{ item[1] }}"
+    path: "{{ packages_base_dir_aarch64 }}"
     state: directory
-    owner: root
-    group: root
-    mode: "{{ common_mode }}"
-  when: cmpt_list or login_list or compiler_login_list
-  loop: "{{ (cmpt_list + login_list + compiler_login_list) | product(cmpt_dir) }}"
+    mode: '{{ common_mode }}'
 
-- name: Create the cert directory on share
+- name: Create x86_64 package layout directories
   ansible.builtin.file:
-    path: "{{ slurm_config_path }}/cert"
+    path: "{{ packages_base_dir_x86_64 }}/{{ item }}"
     state: directory
-    owner: root
-    group: root
-    mode: "{{ common_mode }}"
-
-- name: Copy pulp webserver certificate to client_share_path
-  ansible.builtin.copy:
-    src: "{{ pulp_webserver_cert_path }}"
-    dest: "{{ slurm_config_path }}/cert"
-    mode: "{{ file_mode }}"
-  become: true
+    mode: '{{ common_mode }}'
+  loop: "{{ packages_layout_x86_64 }}"
 
-- name: Create HPC tools directories on share
+- name: Create aarch64 package layout directories
   ansible.builtin.file:
-    path: "{{ slurm_config_path }}/hpc_tools/{{ item }}"
+    path: "{{ packages_base_dir_aarch64 }}/{{ item }}"
     state: directory
-    owner: root
-    group: root
-    mode: "{{ common_mode }}"
-  loop:
-    - cuda
-    - runfile
-    - scripts
-    - container_images
-    - nvidia_sdk
-    - benchmarks
+    mode: '{{ common_mode }}'
+  loop: "{{ packages_layout_aarch64 }}"
 
-- name: Set NFS info fact
-  ansible.builtin.set_fact:
-    oim_shared_path: "{{ hostvars['localhost']['oim_shared_path'] }}"
+- name: Print copy paths for x86_64
+  ansible.builtin.debug:
+    msg: "{{ print_copy_msg }}"
+  loop: "{{ offline_path_x86_64 | default([]) }}"
 
-- name: Initialize parallel copy pairs
-  ansible.builtin.set_fact:
-    parallel_copy_pairs: []
+- name: Print copy paths for aarch64
+  ansible.builtin.debug:
+    msg: "{{ print_copy_msg }}"
+  loop: "{{ offline_path_aarch64 | default([]) }}"
 
-- name: Check which parallel copy source directories exist
+- name: Check x86_64 offline package sources
   ansible.builtin.stat:
-    path: "{{ item.src }}"
-  loop: "{{ parallel_copy_candidates }}"
-  register: copy_source_checks
-  failed_when: false
+    path: "{{ item.source_path }}"
+  loop: "{{ offline_path_x86_64 | default([]) }}"
+  register: x86_64_offline_pkg_sources
 
-- name: Add only valid copy pairs (source exists)
-  ansible.builtin.set_fact:
-    parallel_copy_pairs: >-
-      {{ parallel_copy_pairs +
-         [[ item.item.src, item.item.dest ]] }}
-  loop: "{{ copy_source_checks.results }}"
-  when: item.stat.exists
-
-- name: Parallel copy HPC tool files
-  parallel_file_copy:
-    copy_pairs: "{{ parallel_copy_pairs }}"
-    max_workers: "{{ parallel_copy_max_workers }}"
-  when: parallel_copy_pairs | length > 0
-
-- name: Check if munge key exists top level
+- name: Check aarch64 offline package sources
   ansible.builtin.stat:
-    path: "{{ slurm_config_path }}/munge.key"
-  register: munge_present
-
-- name: Ensure munge key is generated
-  ansible.builtin.shell: "{{ munge_key_cmd }} > {{ slurm_config_path }}/munge.key"
-  when: not munge_present.stat.exists
-  register: munge_gen
-  changed_when: munge_gen.rc == 0
+    path: "{{ item.source_path }}"
+  loop: "{{ offline_path_aarch64 | default([]) }}"
+  register: aarch64_offline_pkg_sources
 
-- name: Distribute the munge key
+- name: Copy x86_64 offline packages
   ansible.builtin.copy:
-    src: "{{ slurm_config_path }}/munge.key"
-    dest: "{{ slurm_config_path }}/{{ item }}/etc/munge/munge.key"
-    mode: "{{ common_mode }}"
+    src: "{{ item.item.source_path }}/"
+    dest: "{{ item.item.dest_path }}/"
     remote_src: true
-  loop: "{{ (ctld_list | default([])) +
-            (cmpt_list | default([])) +
-            (compiler_login_list | default([])) +
-            (login_list | default([])) }}"
-
-- name: Slurm path ops
-  ansible.builtin.set_fact:
-    conf_path_items: "{{ conf_path_items | default({}) | combine({item.key: item.value}) }}"
-  when: item.value is string
-  loop: "{{ configs_input | dict2items }}"
-
-- name: Slurm dict ops
-  ansible.builtin.set_fact:
-    conf_dict_items: "{{ conf_dict_items | default({}) | combine({item.key: item.value}) }}"
-  when: item.value is mapping
-  loop: "{{ configs_input | dict2items }}"
-
-- name: Slurm dict ops
-  ansible.builtin.set_fact:
-    apply_config: >-
-      {{ apply_config | default({})
-        | combine({
-            item: (
-              (__default_config[item] | default({}))
-              | combine(conf_dict_items[item] | default({}))
-            )
-          })
-      }}
-  loop: "{{ conf_files }}"
-
-- name: Read NodeName parameters
-  ansible.builtin.include_tasks: read_node_idrac.yml
-  when: cmpt_list
-  loop: "{{ cmpt_list }}"
+    mode: preserve
+  loop: "{{ x86_64_offline_pkg_sources.results | default([]) }}"
+  when:
+    - item.stat.exists
+    - item.item.source_path | length > 0
+    - item.item.dest_path | length > 0
 
-- name: Copy conf file if provided
+- name: Copy aarch64 offline packages
   ansible.builtin.copy:
-    src: "{{ conf_path_items.get(item.1) }}"
-    dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/{{ item.1 }}.conf"
-    mode: "{{ conf_file_mode }}"
-    remote_src: "{{ copy_from_oim }}"
-  when: ctld_list
-  loop: "{{ ctld_list | product(conf_path_items.keys() | default([])) }}"
-
-- name: Add gpu parameters to slurm conf
-  ansible.builtin.set_fact:
-    apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(gpu_slurm_conf))}) }}"
-  when: gpu_params is defined and gpu_params
-
-- name: Verify slurm conf keys only
-  ansible.builtin.assert:
-    that:
-      - (apply_config[item].keys() | list) | difference(__conf_keys[item]) | length == 0
-    fail_msg: "The following {{ item }} config keys are invalid: {{ apply_config[item].keys() | list | difference(__conf_keys[item]) | join(', ') }}"
-  when: apply_config[item] and __conf_keys[item]
-  loop: "{{ conf_files }}"
-
-- name: Slurm dict ops
-  ansible.builtin.set_fact:
-    slurm_conf_dict: "{{ apply_config['slurm'] }}"
-
-- name: Create all .conf for ctld only
-  ansible.builtin.template:
-    src: "{{ item.1 }}.conf.j2"
-    dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/{{ item.1 }}.conf"
-    owner: "{{ root_user }}"
-    group: "{{ root_group }}"
-    mode: "{{ conf_file_mode }}"
-  when: ctld_list
-  loop: "{{ ctld_list | product(conf_files | difference(conf_path_items.keys() | default([]))) }}"
-
-- name: Create mariadb cnf
-  ansible.builtin.template:
-    src: "mariadb-server.cnf.j2"
-    dest: "{{ slurm_config_path }}/{{ item }}/etc/my.cnf.d/mariadb-server.cnf"
-    owner: "{{ root_user }}"
-    group: "{{ root_group }}"
-    mode: "{{ conf_file_mode }}"
-  when: ctld_list
-  loop: "{{ ctld_list }}"
+    src: "{{ item.item.source_path }}/"
+    dest: "{{ item.item.dest_path }}/"
+    remote_src: true
+    mode: preserve
+  loop: "{{ aarch64_offline_pkg_sources.results | default([]) }}"
+  when:
+    - item.stat.exists
+    - item.item.source_path | length > 0
+    - item.item.dest_path | length > 0
 
-- name: Generate slurmd opts for Configless
+- name: Set NFS info fact
   ansible.builtin.set_fact:
-    conf_server: "--conf-server {{ ctld_list | map('regex_replace', '$', ':' ~ (apply_config['slurm']['SlurmctldPort'] | string)) | join(',') }}"
-
-- name: Create epilog.sh and slurmd.service
-  ansible.builtin.template:
-    src: "{{ item.1 }}.j2"
-    dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/epilog.d/{{ item.1 }}"
-    owner: "{{ root_user }}"
-    group: "{{ root_group }}"
-    mode: "{{ conf_file_mode }}"
-  when: cmpt_list
-  loop: "{{ cmpt_list | product(['logout_user.sh', 'slurmd.service']) }}"
-
-- name: Create slurmd.service in login and login_compiler
-  ansible.builtin.template:
-    src: "{{ item.1 }}.j2"
-    dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/epilog.d/{{ item.1 }}"
-    owner: "{{ root_user }}"
-    group: "{{ root_group }}"
-    mode: "{{ conf_file_mode }}"
-  when: login_list or compiler_login_list
-  loop: "{{ (login_list + compiler_login_list) | product(['slurmd.service']) }}"
-
-- name: Get the slurm NFS path
-  ansible.builtin.debug:
-    msg: "The slurm NFS path is {{ share_path }}/slurm"
+    oim_shared_path: "{{ hostvars['localhost']['oim_shared_path'] }}"
 
-- name: NFS path for cloud init
-  ansible.builtin.set_fact:
-    cloud_init_nfs_path: "{{ nfs_server_ip }}:{{ nfs_server_path }}/slurm"
+- name: Check if source directory exists
+  ansible.builtin.stat:
+    path: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso/cuda-run/"
+  register: src_dir_check_x86_64
 
-- name: NFS path for controller trackfile
-  ansible.builtin.set_fact:
-    trackfile_nfs_path: "{{ nfs_server_ip }}:{{ nfs_server_path }}/ctld_track"
+- name: Check if source directory exists
+  ansible.builtin.stat:
+    path: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso/cuda-run/"
+  register: src_dir_check_aarch64
 
-- name: NFS path for cloud init
-  ansible.builtin.set_fact:
-    cloud_init_nfs_path_openldap: "{{ nfs_server_ip }}:{{ nfs_server_path }}/openldap"
-  when: hostvars['localhost']['openldap_support']
+- name: Copy cuda run file using copy module for aarch64
+  ansible.builtin.copy:
+    src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso/cuda-run/"
+    dest: "{{ slurm_config_path }}/hpc_tools/runfile/"
+    mode: '0755'
+    owner: root
+    group: root
+    directory_mode: '0755'
+    remote_src: true
+  when: src_dir_check_aarch64.stat.exists and src_dir_check_aarch64.stat.isdir
 
-# This will be mounted for ucx, openmpi and ldms configurations on slurm nodes
-- name: NFS path for ucx, openmpi and ldms cloud init
-  ansible.builtin.set_fact:
-    cloud_init_slurm_nfs_path: "{{ nfs_server_ip }}:{{ nfs_server_path }}"
-    client_mount_path: "{{ share_path }}"
+- name: Copy cuda run file using copy module for x86_64
+  ansible.builtin.copy:
+    src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso/cuda-run/"
+    dest: "{{ slurm_config_path }}/hpc_tools/runfile/"
+    mode: '0755'
+    owner: root
+    group: root
+    directory_mode: '0755'
+    remote_src: true
+  when: src_dir_check_x86_64.stat.exists and src_dir_check_x86_64.stat.isdir
diff --git a/discovery/roles/slurm_config/tasks/hpc_tools.yml b/discovery/roles/slurm_config/tasks/hpc_tools.yml
index c8bdb5d335..4eb511f80c 100644
--- a/discovery/roles/slurm_config/tasks/hpc_tools.yml
+++ b/discovery/roles/slurm_config/tasks/hpc_tools.yml
@@ -122,34 +122,27 @@
   ansible.builtin.set_fact:
     oim_shared_path: "{{ hostvars['localhost']['oim_shared_path'] }}"
 
-- name: Check if source directory exists
-  ansible.builtin.stat:
-    path: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso/cuda-run/"
-  register: src_dir_check_x86_64
+- name: Build parallel copy list for HPC tools
+  ansible.builtin.set_fact:
+    parallel_copy_pairs: []
 
-- name: Check if source directory exists
+- name: Check which parallel copy source directories exist
   ansible.builtin.stat:
-    path: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso/cuda-run/"
-  register: src_dir_check_aarch64
-
-- name: Copy cuda run file using copy module for aarch64
-  ansible.builtin.copy:
-    src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso/cuda-run/"
-    dest: "{{ slurm_config_path }}/hpc_tools/runfile/"
-    mode: '0755'
-    owner: root
-    group: root
-    directory_mode: '0755'
-    remote_src: true
-  when: src_dir_check_aarch64.stat.exists and src_dir_check_aarch64.stat.isdir
+    path: "{{ item.src }}"
+  loop: "{{ parallel_copy_candidates }}"
+  register: copy_source_checks
+  failed_when: false
 
-- name: Copy cuda run file using copy module for x86_64
-  ansible.builtin.copy:
-    src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso/cuda-run/"
-    dest: "{{ slurm_config_path }}/hpc_tools/runfile/"
-    mode: '0755'
-    owner: root
-    group: root
-    directory_mode: '0755'
-    remote_src: true
-  when: src_dir_check_x86_64.stat.exists and src_dir_check_x86_64.stat.isdir
+- name: Add only valid copy pairs (source exists)
+  ansible.builtin.set_fact:
+    parallel_copy_pairs: >-
+      {{ parallel_copy_pairs +
+         [[ item.item.src, item.item.dest ]] }}
+  loop: "{{ copy_source_checks.results }}"
+  when: item.stat.exists
+
+- name: Parallel copy HPC tool files
+  parallel_file_copy:
+    copy_pairs: "{{ parallel_copy_pairs }}"
+    max_workers: "{{ parallel_copy_max_workers }}"
+  when: parallel_copy_pairs | length > 0
diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml
index a4717bd662..9b2ea90c89 100644
--- a/discovery/roles/slurm_config/vars/main.yml
+++ b/discovery/roles/slurm_config/vars/main.yml
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -103,10 +103,30 @@ auth_tls_certs_path: "/opt/omnia/auth/tls_certs/ldapserver.crt"
 slurm_installation_type: configless
 pulp_webserver_cert_path: "/opt/omnia/pulp/settings/certs/pulp_webserver.crt"
 controller_empty_msg: "Slurm controller functional group is missing from PXE mapping file. Please update the file and rerun discovery.yml."
-# nvidia sdk vars
-nvhpc_package_name: "nvhpc_2025_2511_Linux_x86_64_cuda_13.0"
-nvhpc_tarball_x86_64_relpath: "offline_repo/cluster/x86_64/rhel/10.0/tarball/{{ nvhpc_package_name }}/{{ nvhpc_package_name }}.tar.gz"
-nvhpc_nfs_rel_dir: "hpc_tools/nvidia_sdk"
+download_container_image_path: "{{ slurm_config_path }}/hpc_tools/scripts/download_container_image.sh"
+container_image_list_path: "{{ slurm_config_path }}/hpc_tools/scripts/container_image.list"
+pulp_mirror: "{{ hostvars['localhost']['admin_nic_ip'] }}:2225"
+packages_base_dir_x86_64: "{{ slurm_config_path }}/packages/x86_64"
+packages_base_dir_aarch64: "{{ slurm_config_path }}/packages/aarch64"
+offline_repo_basepath_x86_64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso"
+offline_repo_basepath_aarch64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso"
+packages_layout_x86_64:
+  - doca-ofed
+  - cuda
+packages_layout_aarch64:
+  - doca-ofed
+  - cuda
+print_copy_msg: "Copying {{ item.name }} from {{ item.source_path }} to {{ item.dest_path }}"
+offline_path_x86_64:
+  - name: doca-ofed
+    source_path: "{{ offline_repo_basepath_x86_64 }}/doca-ofed"
+    dest_path: "{{ packages_base_dir_x86_64 }}/doca-ofed"
+offline_path_aarch64:
+  - name: doca-ofed
+    source_path: "{{ offline_repo_basepath_aarch64 }}/doca-ofed"
+    dest_path: "{{ packages_base_dir_aarch64 }}/doca-ofed"
+
+ssh_private_key_path: /root/.ssh/oim_rsa
 
 # parallel file copy
 parallel_copy_max_workers: 4

From 166cf2990555c6f092dad25f33722d796bf186bf Mon Sep 17 00:00:00 2001
From: sakshi-singla-1735 <sakshi.s@dell.com>
Date: Wed, 28 Jan 2026 15:26:30 +0000
Subject: [PATCH 005/172] updating vars

Signed-off-by: sakshi-singla-1735 <sakshi.s@dell.com>
---
 .../slurm_config/tasks/create_slurm_dir.yml   | 284 +++++++++++-------
 discovery/roles/slurm_config/vars/main.yml    |   6 +-
 2 files changed, 184 insertions(+), 106 deletions(-)

diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
index c8bdb5d335..662802274b 100644
--- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
+++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
@@ -12,144 +12,222 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 ---
+- name: Include variable file omnia_config.yml
+  ansible.builtin.include_vars: "{{ input_project_dir }}/omnia_config.yml"
 
-- name: Create HPC tools directories on share
+- name: Include storage vars
+  ansible.builtin.include_vars: "{{ input_project_dir }}/storage_config.yml"
+
+- name: Load slurm_custom.json for x86_64
+  ansible.builtin.include_vars:
+    file: "{{ input_project_dir }}/config/x86_64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/slurm_custom.json"
+    name: slurm_custom_x86_64
+  failed_when: false
+
+- name: Load slurm_custom.json for aarch64
+  ansible.builtin.include_vars:
+    file: "{{ input_project_dir }}/config/aarch64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/slurm_custom.json"
+    name: slurm_custom_aarch64
+  failed_when: false
+
+- name: Extract CUDA runfile name for x86_64 from slurm_custom.json
+  ansible.builtin.set_fact:
+    cuda_runfile_x86_64: "{{ (slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}"
+  when:
+    - slurm_custom_x86_64 is defined
+    - slurm_custom_x86_64.slurm_node is defined
+    - slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0
+
+- name: Extract CUDA runfile name for aarch64 from slurm_custom.json
+  ansible.builtin.set_fact:
+    cuda_runfile_aarch64: "{{ (slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}"
+  when:
+    - slurm_custom_aarch64 is defined
+    - slurm_custom_aarch64.slurm_node is defined
+    - slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0
+
+- name: Set facts for slurm
+  ansible.builtin.set_fact:
+    nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}"
+
+- name: Read the slurm mount point
+  ansible.builtin.set_fact:
+    share_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).client_share_path }}"
+    nfs_server_ip: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).server_ip }}"
+    nfs_server_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).server_share_path }}"
+
+- name: Set facts for slurm
+  ansible.builtin.set_fact:
+    cluster_name: "{{ slurm_cluster[0].cluster_name }}"
+    configs_input: "{{ slurm_cluster[0].config_sources | default({}) | dict2items | rejectattr('value', 'falsy') | list | items2dict }}"
+    slurm_config_path: "{{ share_path }}/{{ slurm_dir_name }}"
+    controller_trackfile_path: "{{ share_path }}/ctld_track"
+
+- name: Configure openldap if supported
+  ansible.builtin.include_tasks: openldap_config.yml
+  when: hostvars['localhost']['openldap_support']
+
+- name: Set facts for slurm
+  ansible.builtin.set_fact:
+    share_prefix: "{{ slurm_config_path }}"
+  when: conf_in_nfs
+
+- name: Clear the share directory
+  ansible.builtin.file:
+    path: "{{ slurm_config_path }}"
+    state: absent
+  when: clear_slurm_files
+
+- name: Create the slurm directory in share
   ansible.builtin.file:
-    path: "{{ slurm_config_path }}/hpc_tools/{{ item }}"
+    path: "{{ slurm_config_path }}"
     state: directory
     owner: root
     group: root
     mode: "{{ common_mode }}"
-  loop:
-    - cuda
-    - runfile
-    - scripts
-    - container_images
 
-- name: Deploy download_container_image.sh to NFS share
-  ansible.builtin.template:
-    src: "download_container_image.sh.j2"
-    dest: "{{ download_container_image_path }}"
-    owner: "{{ root_user }}"
-    group: "{{ root_group }}"
-    mode: "0755"
-
-- name: Deploy container_image.list to NFS share
-  ansible.builtin.template:
-    src: "container_image.list.j2"
-    dest: "{{ container_image_list_path }}"
-    owner: "{{ root_user }}"
-    group: "{{ root_group }}"
-    mode: "0644"
-
-- name: Set fact for pulp mirror
-  ansible.builtin.set_fact:
-    pulp_mirror: "{{ hostvars['localhost']['admin_nic_ip'] }}:2225"
-
-- name: Create x86_64 package base directory
+# This directory is created to store the controller track file in NFS
+# The track file is generated only after the Slurm controller has been fully configured in a fresh deployment
+- name: Create directory for controller init track file in share
   ansible.builtin.file:
-    path: "{{ packages_base_dir_x86_64 }}"
+    path: "{{ controller_trackfile_path }}"
     state: directory
-    mode: '{{ common_mode }}'
+    owner: root
+    group: root
+    mode: "{{ common_mode }}"
 
-- name: Create aarch64 package base directory
+- name: Create the slurm ctld directory on share
   ansible.builtin.file:
-    path: "{{ packages_base_dir_aarch64 }}"
+    path: "{{ slurm_config_path }}/{{ item[0] }}{{ item[1] }}"
     state: directory
-    mode: '{{ common_mode }}'
+    owner: root
+    group: root
+    mode: "{{ common_mode }}"
+  when: ctld_list
+  loop: "{{ ctld_list | product(ctld_dir) }}"
 
-- name: Create x86_64 package layout directories
+- name: Create the slurm cmpt directory on share
   ansible.builtin.file:
-    path: "{{ packages_base_dir_x86_64 }}/{{ item }}"
+    path: "{{ slurm_config_path }}/{{ item[0] }}{{ item[1] }}"
     state: directory
-    mode: '{{ common_mode }}'
-  loop: "{{ packages_layout_x86_64 }}"
+    owner: root
+    group: root
+    mode: "{{ common_mode }}"
+  when: cmpt_list or login_list or compiler_login_list
+  loop: "{{ (cmpt_list + login_list + compiler_login_list) | product(cmpt_dir) }}"
 
-- name: Create aarch64 package layout directories
+- name: Create the cert directory on share
   ansible.builtin.file:
-    path: "{{ packages_base_dir_aarch64 }}/{{ item }}"
+    path: "{{ slurm_config_path }}/cert"
     state: directory
-    mode: '{{ common_mode }}'
-  loop: "{{ packages_layout_aarch64 }}"
+    owner: root
+    group: root
+    mode: "{{ common_mode }}"
 
-- name: Print copy paths for x86_64
-  ansible.builtin.debug:
-    msg: "{{ print_copy_msg }}"
-  loop: "{{ offline_path_x86_64 | default([]) }}"
+- name: Copy pulp webserver certificate to client_share_path
+  ansible.builtin.copy:
+    src: "{{ pulp_webserver_cert_path }}"
+    dest: "{{ slurm_config_path }}/cert"
+    mode: "{{ file_mode }}"
+  become: true
 
-- name: Print copy paths for aarch64
-  ansible.builtin.debug:
-    msg: "{{ print_copy_msg }}"
-  loop: "{{ offline_path_aarch64 | default([]) }}"
+- name: Create hpc tools dirs
+  ansible.builtin.include_tasks: hpc_tools.yml
 
-- name: Check x86_64 offline package sources
+- name: Check if munge key exists top level
   ansible.builtin.stat:
-    path: "{{ item.source_path }}"
-  loop: "{{ offline_path_x86_64 | default([]) }}"
-  register: x86_64_offline_pkg_sources
+    path: "{{ slurm_config_path }}/munge.key"
+  register: munge_present
 
-- name: Check aarch64 offline package sources
-  ansible.builtin.stat:
-    path: "{{ item.source_path }}"
-  loop: "{{ offline_path_aarch64 | default([]) }}"
-  register: aarch64_offline_pkg_sources
+- name: Ensure munge key is generated
+  ansible.builtin.shell: "{{ munge_key_cmd }} > {{ slurm_config_path }}/munge.key"
+  when: not munge_present.stat.exists
+  register: munge_gen
+  changed_when: munge_gen.rc == 0
 
-- name: Copy x86_64 offline packages
+- name: Distribute the munge key
   ansible.builtin.copy:
-    src: "{{ item.item.source_path }}/"
-    dest: "{{ item.item.dest_path }}/"
+    src: "{{ slurm_config_path }}/munge.key"
+    dest: "{{ slurm_config_path }}/{{ item }}/etc/munge/munge.key"
+    mode: "{{ common_mode }}"
     remote_src: true
-    mode: preserve
-  loop: "{{ x86_64_offline_pkg_sources.results | default([]) }}"
-  when:
-    - item.stat.exists
-    - item.item.source_path | length > 0
-    - item.item.dest_path | length > 0
+  loop: "{{ (ctld_list | default([])) +
+            (cmpt_list | default([])) +
+            (compiler_login_list | default([])) +
+            (login_list | default([])) }}"
 
-- name: Copy aarch64 offline packages
-  ansible.builtin.copy:
-    src: "{{ item.item.source_path }}/"
-    dest: "{{ item.item.dest_path }}/"
-    remote_src: true
-    mode: preserve
-  loop: "{{ aarch64_offline_pkg_sources.results | default([]) }}"
-  when:
-    - item.stat.exists
-    - item.item.source_path | length > 0
-    - item.item.dest_path | length > 0
+- name: Conf merge and write using slurm_conf module
+  ansible.builtin.include_tasks: confs.yml
+
+- name: Create mariadb cnf
+  ansible.builtin.template:
+    src: "mariadb-server.cnf.j2"
+    dest: "{{ slurm_config_path }}/{{ item }}/etc/my.cnf.d/mariadb-server.cnf"
+    owner: "{{ root_user }}"
+    group: "{{ root_group }}"
+    mode: "{{ conf_file_mode }}"
+  when: ctld_list
+  loop: "{{ ctld_list }}"
 
-- name: Set NFS info fact
+- name: Generate slurmd opts for Configless
   ansible.builtin.set_fact:
-    oim_shared_path: "{{ hostvars['localhost']['oim_shared_path'] }}"
+    conf_server: "--conf-server {{ ctld_list | map('regex_replace', '$', ':' ~ (apply_config['slurm']['SlurmctldPort'] | string)) | join(',') }}"
 
-- name: Check if source directory exists
-  ansible.builtin.stat:
-    path: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso/cuda-run/"
-  register: src_dir_check_x86_64
+- name: Create epilog.sh and slurmd.service
+  ansible.builtin.template:
+    src: "{{ item.1 }}.j2"
+    dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/epilog.d/{{ item.1 }}"
+    owner: "{{ root_user }}"
+    group: "{{ root_group }}"
+    mode: "{{ conf_file_mode }}"
+  when: cmpt_list
+  loop: "{{ cmpt_list | product(['logout_user.sh', 'slurmd.service']) }}"
 
-- name: Check if source directory exists
-  ansible.builtin.stat:
-    path: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso/cuda-run/"
-  register: src_dir_check_aarch64
+- name: Create slurmd.service in login and login_compiler
+  ansible.builtin.template:
+    src: "{{ item.1 }}.j2"
+    dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/epilog.d/{{ item.1 }}"
+    owner: "{{ root_user }}"
+    group: "{{ root_group }}"
+    mode: "{{ conf_file_mode }}"
+  when: login_list or compiler_login_list
+  loop: "{{ (login_list + compiler_login_list) | product(['slurmd.service']) }}"
 
-- name: Copy cuda run file using copy module for aarch64
-  ansible.builtin.copy:
-    src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso/cuda-run/"
-    dest: "{{ slurm_config_path }}/hpc_tools/runfile/"
-    mode: '0755'
+- name: Get the slurm NFS path
+  ansible.builtin.debug:
+    msg: "The slurm NFS path is {{ share_path }}/slurm"
+
+- name: NFS path for cloud init
+  ansible.builtin.set_fact:
+    cloud_init_nfs_path: "{{ nfs_server_ip }}:{{ nfs_server_path }}/slurm"
+
+- name: NFS path for controller trackfile
+  ansible.builtin.set_fact:
+    trackfile_nfs_path: "{{ nfs_server_ip }}:{{ nfs_server_path }}/ctld_track"
+
+- name: NFS path for cloud init
+  ansible.builtin.set_fact:
+    cloud_init_nfs_path_openldap: "{{ nfs_server_ip }}:{{ nfs_server_path }}/openldap"
+  when: hostvars['localhost']['openldap_support']
+
+# This will be mounted for ucx, openmpi and ldms configurations on slurm nodes
+- name: NFS path for ucx, openmpi and ldms cloud init
+  ansible.builtin.set_fact:
+    cloud_init_slurm_nfs_path: "{{ nfs_server_ip }}:{{ nfs_server_path }}"
+    client_mount_path: "{{ share_path }}"
+
+- name: Ensure SSH key directory exists on Slurm share
+  ansible.builtin.file:
+    path: "{{ slurm_config_path }}/ssh"
+    state: directory
     owner: root
     group: root
-    directory_mode: '0755'
-    remote_src: true
-  when: src_dir_check_aarch64.stat.exists and src_dir_check_aarch64.stat.isdir
+    mode: '0700'
 
-- name: Copy cuda run file using copy module for x86_64
+- name: Copy OIM private key to Slurm share for node-to-node SSH
   ansible.builtin.copy:
-    src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso/cuda-run/"
-    dest: "{{ slurm_config_path }}/hpc_tools/runfile/"
-    mode: '0755'
+    src: "{{ ssh_private_key_path }}"
+    dest: "{{ slurm_config_path }}/ssh/oim_rsa"
     owner: root
     group: root
-    directory_mode: '0755'
-    remote_src: true
-  when: src_dir_check_x86_64.stat.exists and src_dir_check_x86_64.stat.isdir
+    mode: '0600'
diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml
index 9b2ea90c89..23434e6765 100644
--- a/discovery/roles/slurm_config/vars/main.yml
+++ b/discovery/roles/slurm_config/vars/main.yml
@@ -148,6 +148,6 @@ parallel_copy_candidates:
     dest: "{{ slurm_config_path }}/hpc_tools/runfile/"
 
   # NVIDIA HPC SDK (x86_64 tarball extracted dir)
-  - name: nvhpc_sdk_x86_64
-    src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_x86_64_relpath | dirname }}/"
-    dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/"
+  # - name: nvhpc_sdk_x86_64
+  #  src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_x86_64_relpath | dirname }}/"
+  #  dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/"

From 87a022ab9746bb5ddcd559db83fce1a3f5ab6843 Mon Sep 17 00:00:00 2001
From: mcas <sakshi.s@dell.com>
Date: Thu, 29 Jan 2026 13:58:51 +0530
Subject: [PATCH 006/172] openmpi ucx template approach

---
 .vscode/.checkmarxIgnored                     |  1 +
 ...i-group-login_compiler_node_x86_64.yaml.j2 | 82 ++++++-------------
 .../ci-group-slurm_node_x86_64.yaml.j2        | 12 +++
 .../hpc_tools/configure_ucx_openmpi_env.sh.j2 | 56 +++++++++++++
 .../templates/hpc_tools/install_openmpi.sh.j2 | 73 +++++++++++++++++
 .../templates/hpc_tools/install_ucx.sh.j2     | 55 +++++++++++++
 6 files changed, 222 insertions(+), 57 deletions(-)
 create mode 100644 .vscode/.checkmarxIgnored
 create mode 100644 discovery/roles/configure_ochami/templates/hpc_tools/configure_ucx_openmpi_env.sh.j2
 create mode 100644 discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2
 create mode 100644 discovery/roles/configure_ochami/templates/hpc_tools/install_ucx.sh.j2

diff --git a/.vscode/.checkmarxIgnored b/.vscode/.checkmarxIgnored
new file mode 100644
index 0000000000..9e26dfeeb6
--- /dev/null
+++ b/.vscode/.checkmarxIgnored
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
index 3195fad9e3..cc8586193f 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
@@ -190,6 +190,18 @@
             {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }}
 {% endif %}
 
+        - path: /usr/local/bin/install_openmpi.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/install_openmpi.sh.j2') | indent(12) }}
+
+        - path: /usr/local/bin/install_ucx.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/install_ucx.sh.j2') | indent(12) }}
+
         - path: /etc/hosts
           append: true
           content: |
@@ -299,66 +311,22 @@
 {% endif %}
 
 {% if hostvars['localhost']['ucx_support'] %}
-        # UCX build and install
-        - |
-          UCX_BIN={{ client_mount_path }}/benchmarks/ucx
-          mkdir -p {{ client_mount_path }}/compile/ucx
-          mkdir -p {{ client_mount_path }}/benchmarks/ucx
-          cd {{ client_mount_path }}/compile/ucx
-          wget --no-check-certificate https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/x86_64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/ucx/ucx.tar.gz -O ucx.tar.gz
-          tar xzf ucx.tar.gz
-          cd ucx-*
-          mkdir -p build
-          cd build
-          ../contrib/configure-release --prefix={{ client_mount_path }}/benchmarks/ucx
-          make -j 8
-          make install
+        - echo "===== UCX Setup ====="
+        - echo "UCX support is enabled."
+        - /usr/local/bin/install_ucx.sh
+        # - echo "Build script available at"
+        # - echo "  /usr/local/bin/install_ucx.sh"
+        # - echo "NFS must be mounted at {{ client_mount_path }} before running."
 {% endif %}
 
 {% if hostvars['localhost']['openmpi_support'] %}
-        # OpenMPI build and install with UCX + Slurm detection
-        - |
-          OPENMPI_INSTALL_PREFIX="{{ client_mount_path }}/benchmarks/openmpi"
-          OPENMPI_SRC="{{ client_mount_path }}/compile/openmpi"
-          mkdir -p $OPENMPI_SRC
-          mkdir -p $OPENMPI_INSTALL_PREFIX
-
-          cd $OPENMPI_SRC
-          wget --no-check-certificate https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/x86_64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/openmpi/openmpi.tar.gz -O openmpi.tar.gz
-
-          tar xzf openmpi.tar.gz
-          cd openmpi-*
-          mkdir -p build
-
-          # Check Slurm
-          if sinfo >/dev/null 2>&1; then
-            SLURM_FLAG="--with-slurm=yes --with-munge=/usr"
-          else
-            SLURM_FLAG="--with-slurm=no"
-          fi
-
-          # Check UCX
-          if [ -x "{{ client_mount_path }}/benchmarks/ucx/bin/ucx_info" ]; then
-            {{ client_mount_path }}/benchmarks/ucx/bin/ucx_info -v
-            if [ $? -eq 0 ]; then
-              UCX_FLAG="--with-ucx={{ client_mount_path }}/benchmarks/ucx"
-            else
-              echo "ucx_info failed, disabling UCX"
-              UCX_FLAG=""
-            fi
-          else
-            echo "ucx_info not found, disabling UCX"
-            UCX_FLAG=""
-          fi
-
-          cd build
-          ../configure --prefix=$OPENMPI_INSTALL_PREFIX \
-            --enable-mpi1-compatibility \
-            --enable-prte-prefix-by-default \
-            $SLURM_FLAG $UCX_FLAG 2>&1 | tee config.out
-
-          make -j 8
-          make install
+        - echo "===== OpenMPI Setup ====="
+        - echo "OpenMPI support is enabled."
+        - /usr/local/bin/install_openmpi.sh
+        # - echo "Build script available at"
+        # - echo "  /usr/local/bin/install_openmpi.sh"
+        # - echo "Run UCX installation first if UCX support is enabled."
+        # - echo "NFS must be mounted at {{ client_mount_path }} before running."
 {% endif %}
 
 {% if hostvars['localhost']['ldms_support'] %}
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
index 80347f6854..27eb60456e 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
@@ -455,6 +455,18 @@
         - mkdir -p {{ client_mount_path }}
         - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab
         - mount -a
+        - echo "One or more shared components (UCX / OpenMPI / LDMS) are enabled."
+        - echo "Shared NFS mount is available at: {{ client_mount_path }}"
+        - /usr/local/bin/configure_ucx_openmpi_env.sh
+        # - echo ""
+        # - echo "IMPORTANT:"
+        # - echo "1. Install UCX and/or OpenMPI on the LOGIN / COMPILER node first."
+        # - echo "2. Ensure they are installed under the shared mount:"
+        # - echo "   {{ client_mount_path }}/hpc_tools/benchmarks/"
+        # - echo "3. On this node, run the environment setup script when ready:"
+        # - echo ""
+        # - echo "This step is intentionally NOT run automatically."
+        - echo "=================================================="
 {% endif %}
 
 {% if hostvars['localhost']['ldms_support'] %}
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/configure_ucx_openmpi_env.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/configure_ucx_openmpi_env.sh.j2
new file mode 100644
index 0000000000..4064eddbb1
--- /dev/null
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/configure_ucx_openmpi_env.sh.j2
@@ -0,0 +1,56 @@
+#!/bin/bash
+LOGFILE="/var/log/configure_ucx_openmpi_env.log"
+exec > >(tee -a "$LOGFILE") 2>&1
+
+echo "===== Configuring UCX / OpenMPI environment (Slurm node) ====="
+
+CLIENT_MOUNT="{{ client_mount_path }}"
+UCX_PREFIX="{{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx"
+OPENMPI_PREFIX="{{ client_mount_path }}/slurm/hpc_tools/benchmarks/openmpi"
+
+PROFILE_DIR="/etc/profile.d"
+
+# Ensure client mount exists and is mounted
+if ! mountpoint -q "$CLIENT_MOUNT"; then
+    echo "[WARN] $CLIENT_MOUNT is not mounted. Skipping UCX/OpenMPI env setup."
+    exit 0
+fi
+
+# ---------------- UCX ----------------
+if [ -d "$UCX_PREFIX/bin" ]; then
+    echo "[INFO] UCX detected at $UCX_PREFIX"
+
+    cat > "$PROFILE_DIR/ucx.sh" <<EOF
+# UCX environment
+export UCX_HOME="$UCX_PREFIX"
+export PATH="\$UCX_HOME/bin:\$PATH"
+export LD_LIBRARY_PATH="\$UCX_HOME/lib:\$LD_LIBRARY_PATH"
+EOF
+
+    chmod 644 "$PROFILE_DIR/ucx.sh"
+    echo "[SUCCESS] UCX environment enabled"
+else
+    echo "[INFO] UCX not found at $UCX_PREFIX — skipping"
+    rm -f "$PROFILE_DIR/ucx.sh"
+fi
+
+# ---------------- OpenMPI ----------------
+if [ -d "$OPENMPI_PREFIX/bin" ]; then
+    echo "[INFO] OpenMPI detected at $OPENMPI_PREFIX"
+
+    cat > "$PROFILE_DIR/openmpi.sh" <<EOF
+# OpenMPI environment
+export OPENMPI_HOME="$OPENMPI_PREFIX"
+export PATH="\$OPENMPI_HOME/bin:\$PATH"
+export LD_LIBRARY_PATH="\$OPENMPI_HOME/lib:\$LD_LIBRARY_PATH"
+export MANPATH="\$OPENMPI_HOME/share/man:\$MANPATH"
+EOF
+
+    chmod 644 "$PROFILE_DIR/openmpi.sh"
+    echo "[SUCCESS] OpenMPI environment enabled"
+else
+    echo "[INFO] OpenMPI not found at $OPENMPI_PREFIX — skipping"
+    rm -f "$PROFILE_DIR/openmpi.sh"
+fi
+
+echo "===== UCX / OpenMPI environment configuration complete ====="
\ No newline at end of file
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2
new file mode 100644
index 0000000000..44e1a786b7
--- /dev/null
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2
@@ -0,0 +1,73 @@
+#!/bin/bash
+set -e
+
+CLIENT_MOUNT="{{ client_mount_path }}"
+OPENMPI_PREFIX="{{ client_mount_path }}/slurm/hpc_tools/benchmarks/openmpi"
+OPENMPI_BUILD="{{ client_mount_path }}/slurm/hpc_tools/compile/openmpi"
+
+# Check that NFS is mounted
+if ! mountpoint -q "$CLIENT_MOUNT"; then
+    echo "[ERROR] $CLIENT_MOUNT is not mounted."
+    echo "        Please mount the NFS path before running install_openmpi.sh"
+    exit 1
+fi
+
+echo "===== OpenMPI build started ====="
+
+mkdir -p "$OPENMPI_BUILD" "$OPENMPI_PREFIX"
+cd "$OPENMPI_BUILD"
+
+if [ ! -f openmpi.tar.gz ]; then
+    wget --no-check-certificate \
+      https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/x86_64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/openmpi/openmpi.tar.gz \
+      -O openmpi.tar.gz \
+      >> "$OPENMPI_PREFIX/openmpi_tar_output.log" 2>&1
+else
+    echo "openmpi.tar.gz already exists, skipping download." \
+      >> "$OPENMPI_PREFIX/openmpi_tar_output.log"
+fi
+
+tar xzf openmpi.tar.gz
+cd openmpi-*
+mkdir -p build
+
+# Slurm detection
+if sinfo >/dev/null 2>&1; then
+  SLURM_FLAG="--with-slurm=yes --with-munge=/usr"
+else
+  SLURM_FLAG="--with-slurm=no"
+fi
+
+# UCX detection
+if [ -x "{{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx/bin/ucx_info" ]; then
+  UCX_FLAG="--with-ucx={{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx"
+else
+  UCX_FLAG=""
+fi
+
+cd build
+../configure --prefix="$OPENMPI_PREFIX" \
+  --enable-mpi1-compatibility \
+  --enable-prte-prefix-by-default \
+  $SLURM_FLAG $UCX_FLAG
+
+make -j {{ openmpi_build_threads | default(8) }}
+make install
+
+# Configure OpenMPI environment variables system-wide
+OPENMPI_ENV_FILE="/etc/profile.d/openmpi.sh"
+
+cat > "$OPENMPI_ENV_FILE" <<EOF
+# OpenMPI environment
+export OPENMPI_HOME="{{ client_mount_path }}/slurm/hpc_tools/benchmarks/openmpi"
+export PATH="\$OPENMPI_HOME/bin:\$PATH"
+export LD_LIBRARY_PATH="\$OPENMPI_HOME/lib:\$LD_LIBRARY_PATH"
+export MANPATH="\$OPENMPI_HOME/share/man:\$MANPATH"
+EOF
+
+chmod 644 "$OPENMPI_ENV_FILE"
+
+echo "[INFO] OpenMPI installed under {{ client_mount_path }}/slurm/hpc_tools/benchmarks/openmpi"
+echo "[INFO] OpenMPI environment configured in $OPENMPI_ENV_FILE"
+
+echo "===== OpenMPI build completed ====="
\ No newline at end of file
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_ucx.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_ucx.sh.j2
new file mode 100644
index 0000000000..73d13d82a8
--- /dev/null
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_ucx.sh.j2
@@ -0,0 +1,55 @@
+#!/bin/bash
+set -e
+
+CLIENT_MOUNT="{{ client_mount_path }}"
+UCX_PREFIX="{{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx"
+UCX_BUILD="{{ client_mount_path }}/slurm/hpc_tools/compile/ucx"
+
+# Check that NFS is mounted
+if ! mountpoint -q "$CLIENT_MOUNT"; then
+    echo "[ERROR] $CLIENT_MOUNT is not mounted."
+    echo "        Please mount the NFS path before running install_ucx.sh"
+    exit 1
+fi
+
+echo "===== UCX build started ====="
+
+mkdir -p "$UCX_BUILD" "$UCX_PREFIX"
+cd "$UCX_BUILD"
+
+if [ ! -f ucx.tar.gz ]; then
+    wget --no-check-certificate \
+      https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/x86_64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/ucx/ucx.tar.gz \
+      -O ucx.tar.gz \
+      >> "$UCX_PREFIX/ucx_tar_output.log" 2>&1
+else
+    echo "ucx.tar.gz already exists, skipping download." \
+      >> "$UCX_PREFIX/ucx_tar_output.log"
+fi
+
+tar xzf ucx.tar.gz
+cd ucx-*
+mkdir -p build
+cd build
+
+../contrib/configure-release --prefix="$UCX_PREFIX"
+make -j {{ ucx_build_threads | default(8) }}
+make install
+
+# Configure UCX environment variables system-wide
+UCX_ENV_FILE="/etc/profile.d/ucx.sh"
+
+cat > "$UCX_ENV_FILE" <<EOF
+# UCX environment
+export UCX_HOME="{{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx"
+export PATH="\$UCX_HOME/bin:\$PATH"
+export LD_LIBRARY_PATH="\$UCX_HOME/lib:\$LD_LIBRARY_PATH"
+EOF
+
+chmod 644 "$UCX_ENV_FILE"
+
+echo "[INFO] UCX installed under {{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx"
+echo "[INFO] UCX environment configured in $UCX_ENV_FILE"
+echo "[INFO] Run 'source $UCX_ENV_FILE' or re-login to use ucx_info"
+
+echo "===== UCX build completed ====="

From e9002127fdc4c5542cb1831adac983daa692d8ff Mon Sep 17 00:00:00 2001
From: mcas <sakshi.s@dell.com>
Date: Thu, 29 Jan 2026 14:04:04 +0530
Subject: [PATCH 007/172] adding template call

---
 .../templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
index 27eb60456e..4dfc45e213 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
@@ -408,6 +408,12 @@
           permissions: '0644'
           content: |
             {{ lookup('template', 'templates/nodes/apptainer_mirror.conf.j2') | indent(12) }}
+        
+        - path: /usr/local/bin/configure_ucx_openmpi_env.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/configure_ucx_openmpi_env.sh.j2') | indent(12) }}
 
       runcmd:
         - /usr/local/bin/set-ssh.sh

From 721712b598d66020127edd66d21cb1559a3d0574 Mon Sep 17 00:00:00 2001
From: mcas <sakshi.s@dell.com>
Date: Thu, 29 Jan 2026 14:05:56 +0530
Subject: [PATCH 008/172] deleting .vscode folder

---
 .vscode/.checkmarxIgnored | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 .vscode/.checkmarxIgnored

diff --git a/.vscode/.checkmarxIgnored b/.vscode/.checkmarxIgnored
deleted file mode 100644
index 9e26dfeeb6..0000000000
--- a/.vscode/.checkmarxIgnored
+++ /dev/null
@@ -1 +0,0 @@
-{}
\ No newline at end of file

From 2f59b34af122cc0d07573d91ff6152dffbb7b7bb Mon Sep 17 00:00:00 2001
From: mcas <sakshi.s@dell.com>
Date: Thu, 29 Jan 2026 14:15:03 +0530
Subject: [PATCH 009/172] copyright update

---
 common/library/modules/parallel_file_copy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/library/modules/parallel_file_copy.py b/common/library/modules/parallel_file_copy.py
index 8f46f5a881..a697764683 100644
--- a/common/library/modules/parallel_file_copy.py
+++ b/common/library/modules/parallel_file_copy.py
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From 3e773ece56757ed90b27df51cba9369c384c5bd6 Mon Sep 17 00:00:00 2001
From: mcas <sakshi.s@dell.com>
Date: Thu, 29 Jan 2026 16:23:17 +0530
Subject: [PATCH 010/172] nvidia hpc sdk changes wrt template approach

---
 .../library/module_utils/local_repo/config.py |  2 +-
 ...i-group-login_compiler_node_x86_64.yaml.j2 | 16 ++++
 .../ci-group-slurm_node_x86_64.yaml.j2        | 14 ++++
 .../hpc_tools/configure_nvhpc_env.sh.j2       | 71 ++++++++++++++++++
 .../hpc_tools/export_nvhpc_env.sh.j2          | 73 ++++++++++++++++++
 .../hpc_tools/install_nvhpc_sdk.sh.j2         | 75 +++++++++++++++++++
 .../templates/hpc_tools/setup_nvhpc_sdk.sh.j2 | 71 ++++++++++++++++++
 .../roles/slurm_config/tasks/hpc_tools.yml    |  1 +
 discovery/roles/slurm_config/vars/main.yml    | 11 ++-
 .../config/x86_64/rhel/10.0/slurm_custom.json |  5 ++
 10 files changed, 335 insertions(+), 4 deletions(-)
 create mode 100644 discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2
 create mode 100644 discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2
 create mode 100644 discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2
 create mode 100644 discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2

diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py
index 9c9af639fb..60debc51e3 100644
--- a/common/library/module_utils/local_repo/config.py
+++ b/common/library/module_utils/local_repo/config.py
@@ -81,7 +81,7 @@
 }
 CLI_FILE_PATH = "/root/.config/pulp/cli.toml"
 POST_TIMEOUT = 3600
-TAR_POLL_VAL = 3
+TAR_POLL_VAL = 25
 FILE_POLL_VAL = 1
 ISO_POLL_VAL = 15
 FILE_URI = "/pulp/api/v3/content/file/files/"
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
index cc8586193f..245f2a7fb4 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
@@ -219,6 +219,18 @@
           permissions: '0644'
           content: |
             {{ lookup('template', 'templates/nodes/apptainer_mirror.conf.j2') | indent(12) }}
+        
+        - path: /usr/local/bin/setup_nvhpc_sdk.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/install_nvhpc_sdk.sh.j2') | indent(12) }}
+        
+        - path: /usr/local/bin/setup_nvhpc_sdk.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/configure_nvhpc_env.sh.j2') | indent(12) }}
 
       runcmd:
         - /usr/local/bin/set-ssh.sh
@@ -334,4 +346,8 @@
 
         - /root/ldms_sampler.sh
 {% endif %}
+
+        # nvidia sdk install
+        - /usr/local/bin/install_nvhpc_sdk.sh
+        - /usr/local/bin/configure_nvhpc_env.sh
         - echo "Cloud-Init has completed successfully."
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
index 4dfc45e213..06c4a1d413 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
@@ -415,6 +415,18 @@
           content: |
             {{ lookup('template', 'templates/hpc_tools/configure_ucx_openmpi_env.sh.j2') | indent(12) }}
 
+        - path: /usr/local/bin/setup_nvhpc_sdk.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/setup_nvhpc_sdk.sh.j2') | indent(12) }}
+
+        - path: /usr/local/bin/export_nvhpc_env.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/export_nvhpc_env.sh.j2') | indent(12) }}
+
       runcmd:
         - /usr/local/bin/set-ssh.sh
         - /usr/local/bin/install_nvidia_driver.sh
@@ -480,4 +492,6 @@
 
         - /root/ldms_sampler.sh
 {% endif %}
+        - /usr/local/bin/setup_nvhpc_sdk.sh
+        - /usr/local/bin/export_nvhpc_env.sh
         - echo "Cloud-Init has completed successfully."
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2
new file mode 100644
index 0000000000..3c7efbc88b
--- /dev/null
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2
@@ -0,0 +1,71 @@
+#!/bin/bash
+set -e
+
+LOGFILE="/var/log/nvhpc_env_config.log"
+exec >> "$LOGFILE" 2>&1
+
+echo "===== Configuring NVIDIA HPC SDK environment ====="
+
+# Cloud-init safe defaults
+export HOME=/root
+
+NVCOMPILERS="{{ nvhpc_local_mount | default('/opt/nvidia/nvhpc') }}"
+NVARCH="$(uname -s)_$(uname -m)"
+NVHPC_VERSION="{{ nvhpc_version | default('25.11') }}"
+
+NVHPC_BASE="$NVCOMPILERS/$NVARCH/$NVHPC_VERSION"
+PROFILE_FILE="/etc/profile.d/nvhpc.sh"
+
+if [ ! -d "$NVHPC_BASE/compilers/bin" ]; then
+    echo "[ERROR] NVHPC compilers not found at $NVHPC_BASE"
+    exit 1
+fi
+
+echo "[INFO] NVHPC detected at $NVHPC_BASE"
+echo "[INFO] Writing persistent environment to $PROFILE_FILE"
+
+cat << EOF > "$PROFILE_FILE"
+# NVIDIA HPC SDK environment
+export NVCOMPILERS=$NVCOMPILERS
+export NVARCH=$NVARCH
+export NVHPC_VERSION=$NVHPC_VERSION
+
+export PATH=\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/compilers/bin:\$PATH
+export MANPATH=\${MANPATH:-}:\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/compilers/man
+
+# MPI (optional but recommended)
+export PATH=\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/comm_libs/mpi/bin:\$PATH
+export MANPATH=\${MANPATH:-}:\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/comm_libs/mpi/man
+
+# Modules support (optional)
+export MODULEPATH=\$NVCOMPILERS/modulefiles:\${MODULEPATH:-}
+EOF
+
+chmod 644 "$PROFILE_FILE"
+
+# Source profile for current shell and all future non-login shells
+if [ -f "$PROFILE_FILE" ]; then
+    echo "[INFO] Sourcing NVHPC profile for current shell"
+    source "$PROFILE_FILE"
+    grep -q "nvhpc.sh" /etc/bashrc || echo "source $PROFILE_FILE" >> /etc/bashrc
+fi
+
+# NVHPC marker file path
+MARKER_TARGET="{{ nvhpc_local_mount | default('/shared-nvhpc-sdk/nvhpc') }}/.nvhpc_env_ready"
+
+if ! grep -q "{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk/nvhpc" /etc/fstab; then
+    echo "[ERROR] NVHPC NFS path not found in /etc/fstab"
+    exit 1
+fi
+
+echo "[INFO] NVHPC NFS entry found in /etc/fstab"
+
+if [ ! -d "{{ nvhpc_local_mount | default('/shared-nvhpc-sdk/nvhpc') }}" ]; then
+    echo "[ERROR] Marker directory missing: {{ nvhpc_local_mount | default('/shared-nvhpc-sdk/nvhpc') }}"
+    exit 1
+fi
+
+touch "$MARKER_TARGET"
+echo "[SUCCESS] NVHPC marker created: $MARKER_TARGET"
+
+echo "===== NVHPC environment configuration completed successfully ====="
\ No newline at end of file
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2
new file mode 100644
index 0000000000..20e3bb0e5f
--- /dev/null
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2
@@ -0,0 +1,73 @@
+#!/bin/bash
+set -e
+
+CLIENT_MOUNT="{{ client_mount_path }}"
+
+NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc"
+NVARCH="$(uname -s)_$(uname -m)"
+NVHPC_VERSION="25.11"
+
+NVHPC_BASE="$NVHPC_LOCAL_MOUNT/$NVARCH/$NVHPC_VERSION"
+PROFILE_FILE="/etc/profile.d/nvhpc.sh"
+LOGFILE="/var/log/export_nvhpc_env.log"
+
+# Log everything
+exec > >(tee -a "$LOGFILE") 2>&1
+
+# Check that NFS is mounted
+if ! mountpoint -q "$CLIENT_MOUNT"; then
+    echo "[ERROR] $CLIENT_MOUNT is not mounted."
+    echo "        Please mount the NFS path before running export_nvhpc_env.sh"
+    exit 1
+fi
+
+echo "===== NVHPC environment export started ====="
+
+# Validate compilers directory exists
+if [ ! -d "$NVHPC_BASE/compilers/bin" ]; then
+    echo "[ERROR] NVHPC compilers not found at:"
+    echo "        $NVHPC_BASE/compilers/bin"
+    exit 1
+fi
+
+echo "[INFO] Writing persistent NVHPC profile at $PROFILE_FILE"
+
+# Write environment file system-wide
+cat > "$PROFILE_FILE" <<EOF
+# NVIDIA HPC SDK environment
+
+export NVCOMPILERS=$NVHPC_LOCAL_MOUNT
+export NVARCH=$NVARCH
+export NVHPC_VERSION=$NVHPC_VERSION
+
+# Compilers
+export PATH=\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/compilers/bin:\$PATH
+export MANPATH=\${MANPATH:-}:\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/compilers/man
+
+# MPI support
+export PATH=\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/comm_libs/mpi/bin:\$PATH
+export MANPATH=\${MANPATH:-}:\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/comm_libs/mpi/man
+
+# Modules
+export MODULEPATH=\$NVCOMPILERS/modulefiles:\${MODULEPATH:-}
+EOF
+
+chmod 644 "$PROFILE_FILE"
+
+echo "[INFO] Verifying NVHPC compilers using login shell"
+
+# Verify nvc
+if ! bash -lc "command -v nvc && nvc --version >/dev/null"; then
+    echo "[ERROR] nvc verification failed"
+    exit 1
+fi
+
+# Verify nvfortran
+if ! bash -lc "command -v nvfortran && nvfortran --version >/dev/null"; then
+    echo "[ERROR] nvfortran verification failed"
+    exit 1
+fi
+
+echo "[SUCCESS] NVHPC environment exported successfully"
+echo "[INFO] Environment file configured in $PROFILE_FILE"
+echo "===== NVHPC export completed ====="
\ No newline at end of file
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2
new file mode 100644
index 0000000000..bdf0e263d7
--- /dev/null
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2
@@ -0,0 +1,75 @@
+#!/bin/bash
+set -e
+
+LOGFILE="/var/log/nvhpc_sdk_install.log"
+exec > >(tee -a "$LOGFILE") 2>&1
+
+echo "===== Starting NVIDIA HPC SDK installation ====="
+
+NVHPC_PKG_NAME="{{ nvhpc_pkg_name | default('nvhpc_2025_2511_Linux_x86_64_cuda_13.0') }}"
+NVHPC_EXPORT="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk"
+NVHPC_MOUNT="/shared-nvhpc-sdk"
+NVHPC_TARBALL="${NVHPC_MOUNT}/${NVHPC_PKG_NAME}.tar.gz"  # shell expansion of the script variables set above
+NVHPC_INSTALL_DIR_NFS="${NVHPC_MOUNT}/nvhpc"
+NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc"
+NVHPC_EXTRACT_DIR="${NVHPC_MOUNT}/${NVHPC_PKG_NAME}"
+
+# Skip if already mounted
+if mountpoint -q "$NVHPC_LOCAL_MOUNT"; then
+    echo "[INFO] $NVHPC_LOCAL_MOUNT already mounted. Skipping installation."
+    exit 0
+fi
+
+# Skip if local directory exists
+if [ -d "$NVHPC_LOCAL_MOUNT" ]; then
+    echo "[INFO] $NVHPC_LOCAL_MOUNT exists. Assuming installed. Skipping."
+    exit 0
+fi
+
+mkdir -p "$NVHPC_MOUNT"
+mount -t nfs "$NVHPC_EXPORT" "$NVHPC_MOUNT"
+
+# Check tarball
+echo "[INFO] Checking NVIDIA HPC SDK tarball at $NVHPC_TARBALL..."
+if [ ! -f "$NVHPC_TARBALL" ]; then
+    echo "[ERROR] NVIDIA HPC SDK tarball not found. Skipping installation."
+    exit 0
+fi
+
+# Extract if needed
+EXTRACT_SIZE_GB=$(du -sBG "$NVHPC_EXTRACT_DIR" 2>/dev/null | cut -f1 | tr -d 'G')
+if [ -d "$NVHPC_EXTRACT_DIR" ] && [ "$EXTRACT_SIZE_GB" -ge 13 ] && [ -f "$NVHPC_EXTRACT_DIR/install" ]; then
+    echo "[INFO] NVHPC already extracted. Skipping."
+else
+    echo "[INFO] Extracting NVIDIA HPC SDK tarball..."
+    tar -xzf "$NVHPC_TARBALL" -C "$NVHPC_MOUNT" \
+        --checkpoint=2000 \
+        --checkpoint-action=echo="[INFO] Extracting NVHPC... please wait"
+fi
+
+mkdir -p "$NVHPC_INSTALL_DIR_NFS"
+INSTALL_BIN_DIR="$NVHPC_INSTALL_DIR_NFS/Linux_x86_64/25.11/compilers/bin"
+
+if [ -x "$INSTALL_BIN_DIR/nvc" ]; then
+    echo "[INFO] NVHPC already installed. Skipping installer."
+else
+    echo "[INFO] Running NVIDIA HPC SDK installer..."
+    cd "$NVHPC_EXTRACT_DIR"
+    NVHPC_SILENT=true NVHPC_INSTALL_DIR="$NVHPC_INSTALL_DIR_NFS" NVHPC_INSTALL_TYPE=auto ./install
+fi
+
+echo "[SUCCESS] NVIDIA HPC SDK installation completed."
+
+# Mount NVHPC locally
+mkdir -p "$NVHPC_LOCAL_MOUNT"
+NVHPC_INSTALL_EXPORT="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk/nvhpc"
+FSTAB_ENTRY="$NVHPC_INSTALL_EXPORT $NVHPC_LOCAL_MOUNT nfs defaults,_netdev 0 0"
+
+if ! grep -qE "^[^#].*$NVHPC_INSTALL_EXPORT[[:space:]]+$NVHPC_LOCAL_MOUNT[[:space:]]+nfs" /etc/fstab; then
+    echo "[INFO] Adding NVHPC mount to /etc/fstab"
+    echo "$FSTAB_ENTRY" >> /etc/fstab
+fi
+
+echo "[INFO] Mounting $NVHPC_LOCAL_MOUNT..."
+mount "$NVHPC_LOCAL_MOUNT"
+echo "[INFO] NVHPC successfully mounted at $NVHPC_LOCAL_MOUNT"
\ No newline at end of file
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2
new file mode 100644
index 0000000000..e81049e57c
--- /dev/null
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2
@@ -0,0 +1,71 @@
+#!/bin/bash
+# NVHPC SDK setup: mount the parent NFS export, wait for the readiness marker
+# file under it, then ensure an fstab entry exists for the SDK share and mount
+# it at the local mount point. Exits 1 on mount failure or marker timeout.
+LOGFILE="/var/log/setup_nvhpc_sdk.log"
+exec > >(tee -a "$LOGFILE") 2>&1
+
+echo "===== NVHPC SDK setup (mount + wait) ====="
+
+PARENT_NFS="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk"
+PARENT_MOUNT="/shared-nvhpc-sdk"
+
+NVHPC_NFS_SHARE="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk/nvhpc"
+NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc"
+
+NVHPC_MARKER="$PARENT_MOUNT/nvhpc/.nvhpc_env_ready"
+
+WAIT_TIMEOUT=3600
+SLEEP_INTERVAL=20
+ELAPSED=0
+
+# 1. Mount parent export
+mkdir -p "$PARENT_MOUNT"
+
+if ! mountpoint -q "$PARENT_MOUNT"; then
+    mount -t nfs "$PARENT_NFS" "$PARENT_MOUNT"
+fi
+
+if ! mountpoint -q "$PARENT_MOUNT"; then
+    echo "[ERROR] Failed to mount NVHPC parent export"
+    exit 1
+fi
+
+echo "[INFO] Parent NVHPC export mounted"
+
+# 2. Wait for readiness marker
+echo "[INFO] Waiting for NVHPC readiness marker..."
+
+while [ ! -f "$NVHPC_MARKER" ]; do
+    if [ "$ELAPSED" -ge "$WAIT_TIMEOUT" ]; then
+        echo "[ERROR] Timeout waiting for NVHPC readiness marker"
+        exit 1
+    fi
+    sleep "$SLEEP_INTERVAL"
+    ELAPSED=$((ELAPSED + SLEEP_INTERVAL))
+done
+
+echo "[SUCCESS] NVHPC readiness marker detected"
+
+# 3. Ensure fstab entry exists
+if ! grep -qF "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT" /etc/fstab; then
+    echo "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT nfs defaults,_netdev 0 0" >> /etc/fstab
+    echo "[INFO] NVHPC fstab entry added"
+else
+    echo "[INFO] NVHPC fstab entry already present"
+fi
+
+# 4. Mount NVHPC SDK
+mkdir -p "$NVHPC_LOCAL_MOUNT"
+
+if ! mountpoint -q "$NVHPC_LOCAL_MOUNT"; then
+    mount "$NVHPC_LOCAL_MOUNT"
+fi
+
+if ! mountpoint -q "$NVHPC_LOCAL_MOUNT"; then
+    echo "[ERROR] Failed to mount NVHPC SDK"
+    exit 1
+fi
+
+echo "[SUCCESS] NVHPC SDK mounted at $NVHPC_LOCAL_MOUNT"
+echo "===== NVHPC setup completed ====="
\ No newline at end of file
diff --git a/discovery/roles/slurm_config/tasks/hpc_tools.yml b/discovery/roles/slurm_config/tasks/hpc_tools.yml
index 4eb511f80c..46260da267 100644
--- a/discovery/roles/slurm_config/tasks/hpc_tools.yml
+++ b/discovery/roles/slurm_config/tasks/hpc_tools.yml
@@ -25,6 +25,7 @@
     - runfile
     - scripts
     - container_images
+    - nvidia_sdk
 
 - name: Deploy download_container_image.sh to NFS share
   ansible.builtin.template:
diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml
index 23434e6765..201c98dded 100644
--- a/discovery/roles/slurm_config/vars/main.yml
+++ b/discovery/roles/slurm_config/vars/main.yml
@@ -128,6 +128,11 @@ offline_path_aarch64:
 
 ssh_private_key_path: /root/.ssh/oim_rsa
 
+# nvidia sdk vars
+nvhpc_package_name: "nvhpc_2025_2511_Linux_x86_64_cuda_13.0"
+nvhpc_tarball_x86_64_relpath: "offline_repo/cluster/x86_64/rhel/10.0/tarball/{{ nvhpc_package_name }}/{{ nvhpc_package_name }}.tar.gz"
+nvhpc_nfs_rel_dir: "hpc_tools/nvidia_sdk"
+
 # parallel file copy
 parallel_copy_max_workers: 4
 
@@ -148,6 +153,6 @@ parallel_copy_candidates:
     dest: "{{ slurm_config_path }}/hpc_tools/runfile/"
 
   # NVIDIA HPC SDK (x86_64 tarball extracted dir)
-  # - name: nvhpc_sdk_x86_64
-  #  src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_x86_64_relpath | dirname }}/"
-  #  dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/"
+  - name: nvhpc_sdk_x86_64
+    src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_x86_64_relpath | dirname }}/"
+    dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/"
\ No newline at end of file
diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json
index 9531239fd2..ecf628883b 100644
--- a/input/config/x86_64/rhel/10.0/slurm_custom.json
+++ b/input/config/x86_64/rhel/10.0/slurm_custom.json
@@ -34,6 +34,11 @@
             {"package": "cuda-run",
              "type": "iso",
              "url": "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/cuda_13.0.2_580.95.05_linux.run"
+            },
+            {
+            "package": "nvhpc_2025_2511_Linux_x86_64_cuda_13.0",
+            "type": "tarball",
+            "url": "https://developer.download.nvidia.com/hpc-sdk/25.11/nvhpc_2025_2511_Linux_x86_64_cuda_13.0.tar.gz"
             }
         ]
     },

From 18bffc836819f74038993f897a37f1569c23b5c4 Mon Sep 17 00:00:00 2001
From: mcas <sakshi.s@dell.com>
Date: Thu, 29 Jan 2026 16:24:48 +0530
Subject: [PATCH 011/172] filename change

---
 .../cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
index 245f2a7fb4..08cc8b79dd 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
@@ -220,13 +220,13 @@
           content: |
             {{ lookup('template', 'templates/nodes/apptainer_mirror.conf.j2') | indent(12) }}
         
-        - path: /usr/local/bin/setup_nvhpc_sdk.sh
+        - path: /usr/local/bin/install_nvhpc_sdk.sh
           owner: root:root
           permissions: '{{ file_mode_755 }}'
           content: |
             {{ lookup('template', 'templates/hpc_tools/install_nvhpc_sdk.sh.j2') | indent(12) }}
         
-        - path: /usr/local/bin/setup_nvhpc_sdk.sh
+        - path: /usr/local/bin/configure_nvhpc_env.sh
           owner: root:root
           permissions: '{{ file_mode_755 }}'
           content: |

From ba185055ca7098697e27d3ec08bf60b568d739d1 Mon Sep 17 00:00:00 2001
From: sakshi-singla-1735 <sakshi.s@dell.com>
Date: Thu, 29 Jan 2026 11:00:18 +0000
Subject: [PATCH 012/172] ansible lint

---
 discovery/roles/slurm_config/vars/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml
index 201c98dded..4c0f558acd 100644
--- a/discovery/roles/slurm_config/vars/main.yml
+++ b/discovery/roles/slurm_config/vars/main.yml
@@ -155,4 +155,4 @@ parallel_copy_candidates:
   # NVIDIA HPC SDK (x86_64 tarball extracted dir)
   - name: nvhpc_sdk_x86_64
     src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_x86_64_relpath | dirname }}/"
-    dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/"
\ No newline at end of file
+    dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/"

From 351e4c406b9079e3ebea7bb224df01e3bbecba5d Mon Sep 17 00:00:00 2001
From: mcas <sakshi.s@dell.com>
Date: Tue, 3 Feb 2026 11:25:01 +0530
Subject: [PATCH 013/172] cuda path changes

---
 .../cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2     | 2 +-
 .../cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2      | 2 +-
 .../templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2    | 2 +-
 .../templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2     | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2
index de236ed958..bc3068843a 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2
@@ -98,7 +98,7 @@
             echo "[INFO] Setting up shared CUDA directory..."
             # Create and mount shared directory for compute nodes
             mkdir -p /shared-cuda-toolkit
-            mount -t nfs {{ cloud_init_nfs_path }}/cuda/ /shared-cuda-toolkit
+            mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit
 
             if [ $? -ne 0 ]; then
                 echo "[ERROR] Failed to mount NFS cuda share. Exiting."
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
index 08cc8b79dd..a1f8a55f50 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
@@ -105,7 +105,7 @@
             echo "[INFO] Setting up shared CUDA directory..."
             # Create and mount shared directory for compute nodes
             mkdir -p /shared-cuda-toolkit
-            mount -t nfs {{ cloud_init_nfs_path }}/cuda/ /shared-cuda-toolkit
+            mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit
 
             if [ $? -ne 0 ]; then
                 echo "[ERROR] Failed to mount NFS cuda share. Exiting."
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
index cc784bdd10..9b3ac1a501 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
@@ -127,7 +127,7 @@
             # Create mount point
             mkdir -p /usr/local/cuda
 
-            cuda_nfs_share="{{ cloud_init_nfs_path }}/cuda"
+            cuda_nfs_share="{{ cloud_init_nfs_path }}/hpc_tools/cuda"
 
             echo "[INFO] Mounting CUDA toolkit from NFS: $cuda_nfs_share"
             mount -t nfs "$cuda_nfs_share" /usr/local/cuda
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
index 06c4a1d413..67a300c0f7 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
@@ -135,7 +135,7 @@
             # Create mount point
             mkdir -p /usr/local/cuda
 
-            cuda_nfs_share="{{ cloud_init_nfs_path }}/cuda"
+            cuda_nfs_share="{{ cloud_init_nfs_path }}/hpc_tools/cuda"
 
             echo "[INFO] Mounting CUDA toolkit from NFS: $cuda_nfs_share"
             mount -t nfs "$cuda_nfs_share" /usr/local/cuda

From 2cea76e97a4ce6e892de63b1f51898ad9fdca819 Mon Sep 17 00:00:00 2001
From: mcas <sakshi.s@dell.com>
Date: Tue, 3 Feb 2026 16:49:45 +0530
Subject: [PATCH 014/172]  changing variable call

---
 .../hpc_tools/install_nvhpc_sdk.sh.j2         |   6 +-
 .../templates/hpc_tools/setup_nvhpc_sdk.sh.j2 | 105 +++++++++---------
 2 files changed, 54 insertions(+), 57 deletions(-)

diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2
index bdf0e263d7..26f3fd1775 100644
--- a/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2
@@ -9,10 +9,10 @@ echo "===== Starting NVIDIA HPC SDK installation ====="
 NVHPC_PKG_NAME="{{ nvhpc_pkg_name | default('nvhpc_2025_2511_Linux_x86_64_cuda_13.0') }}"
 NVHPC_EXPORT="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk"
 NVHPC_MOUNT="/shared-nvhpc-sdk"
-NVHPC_TARBALL="{{ NVHPC_MOUNT }}/{{ NVHPC_PKG_NAME }}.tar.gz"
-NVHPC_INSTALL_DIR_NFS="{{ NVHPC_MOUNT }}/nvhpc"
+NVHPC_TARBALL="$NVHPC_MOUNT/${NVHPC_PKG_NAME}.tar.gz"
+NVHPC_INSTALL_DIR_NFS="$NVHPC_MOUNT/nvhpc"
 NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc"
-NVHPC_EXTRACT_DIR="{{ NVHPC_MOUNT }}/{{ NVHPC_PKG_NAME }}"
+NVHPC_EXTRACT_DIR="$NVHPC_MOUNT/${NVHPC_PKG_NAME}"
 
 # Skip if already mounted
 if mountpoint -q "$NVHPC_LOCAL_MOUNT"; then
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2
index e81049e57c..b57061cd08 100644
--- a/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2
@@ -1,71 +1,68 @@
- - path: /usr/local/bin/setup_nvhpc_sdk.sh
-          permissions: '0755'
-          content: |
-            #!/bin/bash
-            LOGFILE="/var/log/setup_nvhpc_sdk.log"
-            exec > >(tee -a "$LOGFILE") 2>&1
+#!/bin/bash
+LOGFILE="/var/log/setup_nvhpc_sdk.log"
+exec > >(tee -a "$LOGFILE") 2>&1
 
-            echo "===== NVHPC SDK setup (mount + wait) ====="
+echo "===== NVHPC SDK setup (mount + wait) ====="
 
-            PARENT_NFS="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk"
-            PARENT_MOUNT="/shared-nvhpc-sdk"
+PARENT_NFS="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk"
+PARENT_MOUNT="/shared-nvhpc-sdk"
 
-            NVHPC_NFS_SHARE="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk/nvhpc"
-            NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc"
+NVHPC_NFS_SHARE="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk/nvhpc"
+NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc"
 
-            NVHPC_MARKER="$PARENT_MOUNT/nvhpc/.nvhpc_env_ready"
+NVHPC_MARKER="$PARENT_MOUNT/nvhpc/.nvhpc_env_ready"
 
-            WAIT_TIMEOUT=3600
-            SLEEP_INTERVAL=20
-            ELAPSED=0
+WAIT_TIMEOUT=3600
+SLEEP_INTERVAL=20
+ELAPSED=0
 
-            # 1. Mount parent export
-            mkdir -p "$PARENT_MOUNT"
+# 1. Mount parent export
+mkdir -p "$PARENT_MOUNT"
 
-            if ! mountpoint -q "$PARENT_MOUNT"; then
-                mount -t nfs "$PARENT_NFS" "$PARENT_MOUNT"
-            fi
+if ! mountpoint -q "$PARENT_MOUNT"; then
+    mount -t nfs "$PARENT_NFS" "$PARENT_MOUNT"
+fi
 
-            if ! mountpoint -q "$PARENT_MOUNT"; then
-                echo "[ERROR] Failed to mount NVHPC parent export"
-                exit 1
-            fi
+if ! mountpoint -q "$PARENT_MOUNT"; then
+    echo "[ERROR] Failed to mount NVHPC parent export"
+    exit 1
+fi
 
-            echo "[INFO] Parent NVHPC export mounted"
+echo "[INFO] Parent NVHPC export mounted"
 
-            # 2. Wait for readiness marker
-            echo "[INFO] Waiting for NVHPC readiness marker..."
+# 2. Wait for readiness marker
+echo "[INFO] Waiting for NVHPC readiness marker..."
 
-            while [ ! -f "$NVHPC_MARKER" ]; do
-                if [ "$ELAPSED" -ge "$WAIT_TIMEOUT" ]; then
-                    echo "[ERROR] Timeout waiting for NVHPC readiness marker"
-                    exit 1
-                fi
-                sleep "$SLEEP_INTERVAL"
-                ELAPSED=$((ELAPSED + SLEEP_INTERVAL))
-            done
+while [ ! -f "$NVHPC_MARKER" ]; do
+    if [ "$ELAPSED" -ge "$WAIT_TIMEOUT" ]; then
+        echo "[ERROR] Timeout waiting for NVHPC readiness marker"
+        exit 1
+    fi
+    sleep "$SLEEP_INTERVAL"
+    ELAPSED=$((ELAPSED + SLEEP_INTERVAL))
+done
 
-            echo "[SUCCESS] NVHPC readiness marker detected"
+echo "[SUCCESS] NVHPC readiness marker detected"
 
-            # 3. Ensure fstab entry exists
-            if ! grep -qF "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT" /etc/fstab; then
-                echo "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT nfs defaults,_netdev 0 0" >> /etc/fstab
-                echo "[INFO] NVHPC fstab entry added"
-            else
-                echo "[INFO] NVHPC fstab entry already present"
-            fi
+# 3. Ensure fstab entry exists
+if ! grep -qF "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT" /etc/fstab; then
+    echo "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT nfs defaults,_netdev 0 0" >> /etc/fstab
+    echo "[INFO] NVHPC fstab entry added"
+else
+    echo "[INFO] NVHPC fstab entry already present"
+fi
 
-            # 4. Mount NVHPC SDK
-            mkdir -p "$NVHPC_LOCAL_MOUNT"
+# 4. Mount NVHPC SDK
+mkdir -p "$NVHPC_LOCAL_MOUNT"
 
-            if ! mountpoint -q "$NVHPC_LOCAL_MOUNT"; then
-                mount "$NVHPC_LOCAL_MOUNT"
-            fi
+if ! mountpoint -q "$NVHPC_LOCAL_MOUNT"; then
+    mount "$NVHPC_LOCAL_MOUNT"
+fi
 
-            if ! mountpoint -q "$NVHPC_LOCAL_MOUNT"; then
-                echo "[ERROR] Failed to mount NVHPC SDK"
-                exit 1
-            fi
+if ! mountpoint -q "$NVHPC_LOCAL_MOUNT"; then
+    echo "[ERROR] Failed to mount NVHPC SDK"
+    exit 1
+fi
 
-            echo "[SUCCESS] NVHPC SDK mounted at $NVHPC_LOCAL_MOUNT"
-            echo "===== NVHPC setup completed ====="
\ No newline at end of file
+echo "[SUCCESS] NVHPC SDK mounted at $NVHPC_LOCAL_MOUNT"
+echo "===== NVHPC setup completed ====="
\ No newline at end of file

From 080f107b0ab0b0058825907dfc004cac46217c57 Mon Sep 17 00:00:00 2001
From: Vrinda_Marwah <vrinda.marwah@dell.com>
Date: Tue, 3 Feb 2026 11:54:32 +0000
Subject: [PATCH 015/172] Code changes for additional container image support

Signed-off-by: Vrinda_Marwah <vrinda.marwah@dell.com>
---
 .../schema/local_repo_config.json             |  65 ++++++-
 .../validation_flows/local_repo_validation.py |  62 +++++-
 .../library/module_utils/local_repo/config.py |  12 +-
 .../module_utils/local_repo/download_image.py | 182 ++++++++++-------
 .../local_repo/process_parallel.py            |  42 ++--
 .../module_utils/local_repo/registry_utils.py | 146 +++++++++-----
 .../module_utils/local_repo/software_utils.py |  96 ++++++++-
 .../local_repo/user_image_utility.py          | 184 ++++++++++++------
 common/library/modules/check_user_registry.py |  70 +++----
 common/library/modules/parallel_tasks.py      |  47 +++--
 input/local_repo_config.yml                   |  21 +-
 .../tasks/execute_parallel_tasks.yml          |   6 +-
 .../roles/parse_and_download/vars/main.yml    |   6 +-
 local_repo/roles/validation/tasks/main.yml    |   6 +-
 local_repo/roles/validation/vars/main.yml     |   8 +-
 15 files changed, 685 insertions(+), 268 deletions(-)

diff --git a/common/library/module_utils/input_validation/schema/local_repo_config.json b/common/library/module_utils/input_validation/schema/local_repo_config.json
index 664b02b20c..63d61f0a31 100644
--- a/common/library/module_utils/input_validation/schema/local_repo_config.json
+++ b/common/library/module_utils/input_validation/schema/local_repo_config.json
@@ -2,6 +2,69 @@
   "$schema": "http://json-schema.org/draft-07/schema#",
   "type": "object",
   "properties": {
+    "user_registry": {
+      "type": [
+        "array",
+        "null"
+      ],
+      "items": {
+        "type": "object",
+        "properties": {
+          "host": {
+            "type": "string",
+            "minLength": 1,
+            "pattern": "^[a-zA-Z0-9.-]+:[0-9]+$"
+          },
+          "cert_path": {
+            "type": "string",
+            "pattern": "^[a-zA-Z0-9/\\._-]*\\.crt$"
+          },
+          "key_path": {
+            "type": "string",
+            "pattern": "^[a-zA-Z0-9/\\._-]*\\.key$"
+          }
+        },
+        "required": [
+          "host",
+          "cert_path",
+          "key_path"
+        ],
+        "allOf": [
+          {
+            "if": {
+              "properties": {
+                "cert_path": {
+                  "minLength": 1
+                }
+              }
+            },
+            "then": {
+              "properties": {
+                "cert_path": {
+                  "pattern": "^[a-zA-Z0-9/\\._-]*\\.crt$"
+                }
+              }
+            }
+          },
+          {
+            "if": {
+              "properties": {
+                "key_path": {
+                  "minLength": 1
+                }
+              }
+            },
+            "then": {
+              "properties": {
+                "key_path": {
+                  "pattern": "^[a-zA-Z0-9/\\._-]*\\.key$"
+                }
+              }
+            }
+          }
+        ]
+      }   
+    },
     "user_repo_url_x86_64": {
       "type": [
         "array",
@@ -1082,4 +1145,4 @@
     "omnia_repo_url_rhel_x86_64"
   ],
   "additionalProperties": false
-}
\ No newline at end of file
+}
diff --git a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py
index ee2dd12a29..efeda63c8a 100644
--- a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py
+++ b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -78,6 +78,22 @@ def validate_local_repo_config(input_file_path, data,
     errors = []
     base_repo_names = []
     local_repo_yml = create_file_path(input_file_path, file_names["local_repo_config"])
+    
+    user_registry = data.get("user_registry")
+    if user_registry:
+        for registry in user_registry:
+            cert_path = registry.get("cert_path")
+            key_path = registry.get("key_path")
+
+            # Validate user_registry certificate and key paths.
+            # Empty or missing paths are skipped rather than reported here.
+            if cert_path and not os.path.exists(cert_path):
+                errors.append(create_error_msg(local_repo_yml, "user_registry",
+                                             f"Certificate file not found: {cert_path}"))
+
+            if key_path and not os.path.exists(key_path):
+                errors.append(create_error_msg(local_repo_yml, "user_registry",
+                                             f"Key file not found: {key_path}"))
     repo_names = {}
     sub_result = check_subscription_status(logger)
     logger.info(f"validate_local_repo_config: Subscription status: {sub_result}")
@@ -113,6 +129,50 @@ def validate_local_repo_config(input_file_path, data,
     software_config_file_path = create_file_path(input_file_path, file_names["software_config"])
     software_config_json = load_json(software_config_file_path)
 
+    # Check if additional_packages is enabled and contains image packages
+    additional_packages_enabled = any(sw.get("name") == "additional_packages" for sw in software_config_json.get("softwares", []))
+    if additional_packages_enabled:
+        # Get arch values from additional_packages entry in software_config.json
+        additional_packages_archs = []
+        for software in software_config_json.get("softwares", []):
+            if software.get("name") == "additional_packages":
+                additional_packages_archs = software.get("arch", [])  # Get all archs
+                break
+
+        # Check each arch specific additional_packages.json. The flag is set once here and
+        # never reset per arch, so a later arch without images cannot hide an earlier hit.
+        has_image_packages = False
+        for additional_packages_arch in additional_packages_archs:
+            additional_packages_path = create_file_path(
+                input_file_path,
+                f"config/{additional_packages_arch}/{software_config_json['cluster_os_type']}/{software_config_json['cluster_os_version']}/additional_packages.json"
+            )
+
+            # Skip archs that have no additional_packages.json
+            if not os.path.exists(additional_packages_path):
+                continue
+            additional_packages_data = load_json(additional_packages_path)
+
+            # Check all sections for image packages
+            for section_data in additional_packages_data.values():
+                if isinstance(section_data, dict) and "cluster" in section_data:
+                    cluster_packages = section_data.get("cluster", [])
+                    if any(package.get("type") == "image" for package in cluster_packages):
+                        has_image_packages = True
+                        break
+
+            # One arch with image packages is enough; stop scanning the rest
+            if has_image_packages:
+                break
+
+        # If any architecture has image packages, user_registry must be defined; only None is rejected here
+        if has_image_packages and user_registry is None:
+            errors.append(create_error_msg(
+                local_repo_yml,
+                "user_registry",
+                "user_registry must be defined when additional_packages.json contains packages of type 'image'"
+            ))
+
     # Extra validation: custom_slurm must have <arch>_slurm_custom in user_repo_url_<arch>
     for sw in software_config_json["softwares"]:
         if sw["name"] == "slurm_custom":
diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py
index 9c9af639fb..5fee956352 100644
--- a/common/library/module_utils/local_repo/config.py
+++ b/common/library/module_utils/local_repo/config.py
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -35,8 +35,8 @@
 DEFAULT_STATUS_FILENAME = "status.csv"
 STATUS_CSV_HEADER = 'name,type,status\n'
 SOFTWARE_CSV_HEADER = "name,status"
-USER_REG_CRED_INPUT = "/opt/omnia/input/project_default/user_registry_credential.yml"
-USER_REG_KEY_PATH = "/opt/omnia/input/project_default/.local_repo_credentials_key"
+# USER_REG_CRED_INPUT = "/opt/omnia/input/project_default/user_registry_credential.yml"
+# USER_REG_KEY_PATH = "/opt/omnia/input/project_default/.local_repo_credentials_key"
 # ----------------------------
 # Software tasklist Defaults
 # Used by prepare_tasklist.py
@@ -110,8 +110,10 @@
 
     "create_container_remote_auth": "pulp container remote create --name %s --url %s --upstream-name %s --policy %s --include-tags '%s' --username %s --password '%s'",
 
-    "update_container_remote_auth": "pulp container remote update --name %s --url %s --upstream-name %s --policy %s --include-tags '%s' --username %s --password '%s'"
-
+    "update_container_remote_auth": "pulp container remote update --name %s --url %s --upstream-name %s --policy %s --include-tags '%s' --username %s --password '%s'",
+    "container_distribution_show": "pulp container distribution show --name %s | jq .repository",
+    "show_repository_version": "pulp container repository show --href %s | jq .latest_version_href",
+    "list_image_tags": "pulp show --href /pulp/api/v3/content/container/tags/?repository_version=%s"
 }
 OMNIA_CREDENTIALS_YAML_PATH = "/opt/omnia/input/project_default/omnia_config_credentials.yml"
 OMNIA_CREDENTIALS_VAULT_PATH = "/opt/omnia/input/project_default/.omnia_config_credentials_key"
diff --git a/common/library/module_utils/local_repo/download_image.py b/common/library/module_utils/local_repo/download_image.py
index c9b3020a7b..ffc5518177 100644
--- a/common/library/module_utils/local_repo/download_image.py
+++ b/common/library/module_utils/local_repo/download_image.py
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -206,22 +206,61 @@ def get_repo_url_and_content(package):
         ValueError: If the package prefix is not supported.
     """
     patterns = {
-         r"^(ghcr\.io)(/.+)": "https://ghcr.io",
-         r"^(docker\.io)(/.+)": "https://registry-1.docker.io",
-         r"^(quay\.io)(/.+)": "https://quay.io",
-         r"^(registry\.k8s\.io)(/.+)": "https://registry.k8s.io",
-         r"^(nvcr\.io)(/.+)": "https://nvcr.io",
-         r"^(public\.ecr\.aws)(/.+)": "https://public.ecr.aws",
-         r"^(gcr\.io)(/.+)": "https://gcr.io"
+        r"^(ghcr\.io)(:\d+)?(/.+)": "https://ghcr.io",
+        r"^(docker\.io)(:\d+)?(/.+)": "https://registry-1.docker.io",
+        r"^(quay\.io)(:\d+)?(/.+)": "https://quay.io",
+        r"^(registry\.k8s\.io)(:\d+)?(/.+)": "https://registry.k8s.io",
+        r"^(nvcr\.io)(:\d+)?(/.+)": "https://nvcr.io",
+        r"^(public\.ecr\.aws)(:\d+)?(/.+)": "https://public.ecr.aws",
+        r"^(gcr\.io)(:\d+)?(/.+)": "https://gcr.io",
     }
     for pattern, repo_url in patterns.items():
         match = re.match(pattern, package)
         if match:
             base_url = repo_url
-            package_content = match.group(2).lstrip("/")  # Remove leading slash
+
+            # If user provided a port, preserve it
+            if match.group(2):
+                base_url = f"{repo_url}{match.group(2)}"
+
+            package_content = match.group(3).lstrip("/")
             return base_url, package_content
 
-    raise ValueError(f"Unsupported package prefix for package: {package}")
+    # fallback for private / IP-based registries
+    match = re.match(r"^(?P<registry>[^/]+)(?P<path>/.*)$", package)
+    if match:
+        return f"https://{match.group('registry')}", match.group("path").lstrip("/")
+
+    raise ValueError(f"Invalid package format: {package}")
+
+
+# def get_repo_url_and_content(package):
+#     """
+#     Get the repository URL and content from a given package.
+#     Parameters:
+#         package (str): The package to extract the URL and content from.
+#     Returns:
+#         tuple: A tuple containing the repository URL and content.
+#     Raises:
+#         ValueError: If the package prefix is not supported.
+#     """
+#     patterns = {
+#          r"^(ghcr\.io)(/.+)": "https://ghcr.io",
+#          r"^(docker\.io)(/.+)": "https://registry-1.docker.io",
+#          r"^(quay\.io)(/.+)": "https://quay.io",
+#          r"^(registry\.k8s\.io)(/.+)": "https://registry.k8s.io",
+#          r"^(nvcr\.io)(/.+)": "https://nvcr.io",
+#          r"^(public\.ecr\.aws)(/.+)": "https://public.ecr.aws",
+#          r"^(gcr\.io)(/.+)": "https://gcr.io"
+#     }
+#     for pattern, repo_url in patterns.items():
+#         match = re.match(pattern, package)
+#         if match:
+#             base_url = repo_url
+#             package_content = match.group(2).lstrip("/")  # Remove leading slash
+#             return base_url, package_content
+
+#     raise ValueError(f"Unsupported package prefix for package: {package}")
 
 def process_image(package, status_file_path, version_variables,
                  user_registries,docker_username, docker_password, logger):
@@ -245,66 +284,79 @@ def process_image(package, status_file_path, version_variables,
     base_url, package_content = get_repo_url_and_content(package['package'])
     package_identifier = None
 
+    # Only check user registries for additional_packages
+    if user_registries and "additional_packages" in status_file_path:
+        result, package_identifier = handle_user_image_registry(
+            package,
+            package_content,
+            version_variables,
+            user_registries,
+            logger
+        )
 
-    if user_registries:
-        result, package_identifier = handle_user_image_registry(package, package_content,
-                                     version_variables, user_registries, logger)
-    # If user registry not found or no user registry given, proceed with public registry
-    if not result:
-        try:
-            repo_name_prefix = "container_repo_"
-            repository_name = f"{repo_name_prefix}{package['package'].replace('/', '_').replace(':', '_')}"
-            remote_name = f"remote_{package['package'].replace('/', '_')}"
-            package_identifier = package['package']
-            # Create container repository
-            with repository_creation_lock:
-                result = create_container_repository(repository_name, logger)
+        if not result:
+            logger.info(f"Image {package['package']} will not be synced to Pulp.")
+            status = "Failed"
+            return status
+        
+        else:
+            logger.info(f"Image {package['package']} synced to Pulp.")
+            status = "Success"
+            return status
+
+    try:
+        repo_name_prefix = "container_repo_"
+        repository_name = f"{repo_name_prefix}{package['package'].replace('/', '_').replace(':', '_')}"
+        remote_name = f"remote_{package['package'].replace('/', '_').replace(':', '_')}"
+        package_identifier = package['package']
+
+        # Create container repository
+        with repository_creation_lock:
+            result = create_container_repository(repository_name, logger)
+        if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0):
+            raise Exception(f"Failed to create repository: {repository_name}")
+
+        # Process digest or tag
+        if "digest" in package:
+            package_identifier += f":{package['digest']}"
+            result = create_container_remote_digest(
+                remote_name, base_url, package_content, policy_type, logger
+            )
             if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0):
-                raise Exception(f"Failed to create repository: {repository_name}")
-            # Process digest or tag
-            if "digest" in package:
-                package_identifier += f":{package['digest']}"
-                result = create_container_remote_digest(remote_name, base_url,
-                         package_content, policy_type, logger)
-                if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0):
-                    raise Exception(f"Failed to create remote digest: {remote_name}")
-
-            elif "tag" in package:
-                tag_template = Template(package['tag'])
-                tag_val = tag_template.render(**version_variables)
-                package_identifier += f":{package['tag']}"
-
-                # Only use auth for docker.io images
-                if package['package'].startswith('docker.io/'):
-
-                    with remote_creation_lock:
-                        if docker_username and docker_password:
-                            result = create_container_remote_with_auth(
-                                remote_name, base_url, package_content, policy_type,
-                                tag_val, logger, docker_username, docker_password
-                            )
-                        else:
-                            result = create_container_remote(
-                                remote_name, base_url, package_content, policy_type, tag_val, logger
-                            )
+                raise Exception(f"Failed to create remote digest: {remote_name}")
+
+        elif "tag" in package:
+            tag_template = Template(package['tag'])
+            tag_val = tag_template.render(**version_variables)
+            package_identifier += f":{package['tag']}"
+
+            with remote_creation_lock:
+                if package['package'].startswith('docker.io/') and docker_username and docker_password:
+                    result = create_container_remote_with_auth(
+                        remote_name, base_url, package_content, policy_type,
+                        tag_val, logger, docker_username, docker_password
+                    )
                 else:
-                    # For non-docker.io registries, use unauthenticated access
-                    with remote_creation_lock:
-                        result = create_container_remote(
-                            remote_name, base_url, package_content, policy_type, tag_val, logger
-                        )
-
-                if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0):
-                    raise Exception(f"Failed to create remote: {remote_name}")
-            # Sync and distribute container repository
-            result = sync_container_repository(repository_name, remote_name, package_content,logger)
+                    result = create_container_remote(
+                        remote_name, base_url, package_content, policy_type, tag_val, logger
+                    )
+
             if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0):
-                raise Exception(f"Failed to sync repository: {repository_name}")
+                raise Exception(f"Failed to create remote: {remote_name}")
 
-        except Exception as e:
-            status = "Failed"
-            logger.error(f"Failed to process image: {package_identifier}. Error: {e}")
+        # Sync and distribute
+        result = sync_container_repository(
+            repository_name, remote_name, package_content, logger
+        )
+        if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0):
+            raise Exception(f"Failed to sync repository: {repository_name}")
+
+    except Exception as e:
+        status = "Failed"
+        logger.error(f"Failed to process image: {package_identifier}. Error: {e}")
 
-    write_status_to_file(status_file_path, package_identifier, package['type'], status, logger, file_lock)
+    write_status_to_file(
+        status_file_path, package_identifier, package['type'], status, logger, file_lock
+    )
     logger.info("#" * 30 + f" {process_image.__name__} end " + "#" * 30)
     return status
diff --git a/common/library/module_utils/local_repo/process_parallel.py b/common/library/module_utils/local_repo/process_parallel.py
index b1c0f0b91b..cfc3beb920 100644
--- a/common/library/module_utils/local_repo/process_parallel.py
+++ b/common/library/module_utils/local_repo/process_parallel.py
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -34,8 +34,8 @@
 from ansible.module_utils.local_repo.config import (
     OMNIA_CREDENTIALS_YAML_PATH,
     OMNIA_CREDENTIALS_VAULT_PATH,
-    USER_REG_CRED_INPUT,
-    USER_REG_KEY_PATH
+    # USER_REG_CRED_INPUT,
+    # USER_REG_KEY_PATH
 )
 # Global lock for logging synchronization
 log_lock = multiprocessing.Lock()
@@ -321,8 +321,8 @@ def execute_parallel(
     arc,
     standard_logger,
     local_repo_config_path,
-    user_reg_cred_input,
-    user_reg_key_path,
+    # user_reg_cred_input,
+    # user_reg_key_path,
     omnia_credentials_yaml_path,
     omnia_credentials_vault_path,
     timeout
@@ -355,22 +355,22 @@ def execute_parallel(
 
     config = load_yaml_file(local_repo_config_path)
     user_registries = config.get("user_registry", [])
-    if user_registries:
-        if is_encrypted(user_reg_cred_input):
-            process_file(user_reg_cred_input, user_reg_key_path, 'decrypt')
-
-        file2_data = load_yaml_file(user_reg_cred_input)
-        cred_lookup = {
-            entry['name']: entry
-            for entry in file2_data.get('user_registry_credential', [])
-        }
-        # Update user_registry entries with credentials if required
-        for registry in user_registries:
-            if registry.get("requires_auth"):
-                creds = cred_lookup.get(registry.get("name"))
-                if creds:
-                    registry["username"] = creds.get("username")
-                    registry["password"] = creds.get("password")
+    # if user_registries:
+    #     if is_encrypted(user_reg_cred_input):
+    #         process_file(user_reg_cred_input, user_reg_key_path, 'decrypt')
+
+    #     file2_data = load_yaml_file(user_reg_cred_input)
+    #     cred_lookup = {
+    #         entry['name']: entry
+    #         for entry in file2_data.get('user_registry_credential', [])
+    #     }
+    #     # Update user_registry entries with credentials if required
+    #     for registry in user_registries:
+    #         if registry.get("requires_auth"):
+    #             creds = cred_lookup.get(registry.get("name"))
+    #             if creds:
+    #                 registry["username"] = creds.get("username")
+    #                 registry["password"] = creds.get("password")
 
 
     try:
diff --git a/common/library/module_utils/local_repo/registry_utils.py b/common/library/module_utils/local_repo/registry_utils.py
index 6974d963cb..2e7da2f659 100644
--- a/common/library/module_utils/local_repo/registry_utils.py
+++ b/common/library/module_utils/local_repo/registry_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,9 +13,29 @@
 # limitations under the License.
 # pylint: disable=import-error,no-name-in-module
 import requests
+import socket
+import ssl
 from requests.auth import HTTPBasicAuth
 from ansible.module_utils.local_repo.common_functions import is_file_exists
 
+def is_https(host, timeout=1):
+    """Return True if ``host`` ("name[:port]", default port 443) completes a TLS handshake."""
+    # Don't verify server cert; just see if TLS works
+    context = ssl.create_default_context()
+    context.check_hostname = False
+    context.verify_mode = ssl.CERT_NONE
+    try:
+        # rpartition never raises on a port-less host, unlike unpacking rsplit();
+        # tcp_ping() applies the same 443 default.
+        name, sep, port = host.rpartition(":")
+        if not sep:
+            name, port = host, 443
+        with socket.create_connection((name, int(port)), timeout=timeout) as sock:
+            with context.wrap_socket(sock, server_hostname=name):
+                return True
+    except Exception:  # best-effort probe: any parse/connect/TLS failure means "not HTTPS"
+        return False
+
 def validate_user_registry(user_registry):
     """
     Validates a list of user registry entries with connectivity and credential check.
@@ -34,64 +54,92 @@ def validate_user_registry(user_registry):
         host = item.get('host')
         if not host:
             return False, f"Missing or empty 'host' in entry at index {idx}: {item}"
+        https = is_https(host)
 
-        requires_auth = item.get('requires_auth', False)
-
-        # Check basic username/password presence
-        if requires_auth:
-            if not item.get('username') or not item.get('password'):
-                return False, (
-                    f"'requires_auth' is true but 'username' or 'password' is missing or empty "
-                    f"in entry for (host: {host})"
-                )
-
-            cert_path = item.get('cert_path')
-            key_path = item.get('key_path')
-
-            if bool(cert_path) != bool(key_path):
-                return False, (
-                    f"If authentication is enabled, both 'cert_path' and 'key_path' must be present "
-                    f"or both omitted in entry for (host: {host})"
-                )
-            try:
-                url = f"https://{host}/api/v2.0/users/current"
-                response = requests.get(
-                    url,
-                    auth=HTTPBasicAuth(item['username'], item['password']),
-                    verify=True  # Set to True if using valid SSL certs
-                )
-
-                if response.status_code == 401:
-                    return False, f"Invalid credentials for host: {host}"
-                elif response.status_code != 200:
-                    return False, f"Unexpected status {response.status_code} while validating host: {host}"
-
-            except requests.exceptions.RequestException as e:
-                return False, f"Failed to connect to {host}: {str(e)}"
+        cert_path = (item.get("cert_path") or "").strip()
+        key_path  = (item.get("key_path")  or "").strip()
+
+        if https and (not cert_path or not key_path):
+            return False, f"{host} is an HTTPS registry and requires cert_path and key_path. Please provide cert_path and key_path in local_repo_config.yml under user_registry section"
 
     return True, ""
 
-def check_reachability(user_registry, timeout):
+        # requires_auth = item.get('requires_auth', False)
+
+        # # Check basic username/password presence
+        # if requires_auth:
+        #     if not item.get('username') or not item.get('password'):
+        #         return False, (
+        #             f"'requires_auth' is true but 'username' or 'password' is missing or empty "
+        #             f"in entry for (host: {host})"
+        #         )
+
+        #     cert_path = item.get('cert_path')
+        #     key_path = item.get('key_path')
+
+    #         if bool(cert_path) != bool(key_path):
+    #             return False, (
+    #                 f"If authentication is enabled, both 'cert_path' and 'key_path' must be present "
+    #                 f"or both omitted in entry for (host: {host})"
+    #             )
+    #         try:
+    #             url = f"https://{host}/api/v2.0/users/current"
+    #             response = requests.get(
+    #                 url,
+    #                 auth=HTTPBasicAuth(item['username'], item['password']),
+    #                 verify=True  # Set to True if using valid SSL certs
+    #             )
+
+    #             if response.status_code == 401:
+    #                 return False, f"Invalid credentials for host: {host}"
+    #             elif response.status_code != 200:
+    #                 return False, f"Unexpected status {response.status_code} while validating host: {host}"
+
+    #         except requests.exceptions.RequestException as e:
+    #             return False, f"Failed to connect to {host}: {str(e)}"
+
+    # return True, ""
+
+def tcp_ping(host, timeout=1):
     """
-    Checks the reachability of hosts in the user registry.
-
+    Check if a host:port is reachable via TCP.
+
     Args:
-        user_registry (list): A list of dictionaries representing user registry entries.
-        timeout (int): The maximum number of seconds to wait for a response.
-
+        host (str): Registry host as "name[:port]"; the port defaults to 443
+        timeout (int): Timeout in seconds
+    Returns:
+        bool: True if reachable, False otherwise
+    """
+    try:
+        # Split on the last colon only, as is_https() does, so a name that itself
+        # contains colons (e.g. an IPv6 literal) does not break the unpacking.
+        hostname, sep, port = host.rpartition(":")
+        if not sep:
+            # No explicit port: fall back to the HTTPS default.
+            hostname, port = host, 443
+
+        with socket.create_connection((hostname, int(port)), timeout=timeout):
+            return True
+    except Exception:
+        return False
+
+def check_reachability(user_registry, timeout=1):
+    """
+    Split the hosts of a user registry list by TCP reachability.
+
+    Args:
+        user_registry (list): Dicts that each carry a 'host' key
+        timeout (int): Per-host TCP connection timeout in seconds
     Returns:
-        tuple: A tuple containing two lists: reachable hosts and unreachable hosts.
+        tuple: (reachable_hosts, unreachable_hosts), both in input order
     """
     reachable, unreachable = [], []
     for item in user_registry:
-        try:
-            resp = requests.get(f"https://{item['host']}", timeout=timeout, verify=True)
-            if resp.status_code == 200:
-                reachable.append(item['host'])
-            else:
-                unreachable.append(item['host'])
-        except Exception:
-            unreachable.append(item['host'])
+        host = item['host']
+        # File the host under whichever list matches the TCP probe result.
+        bucket = reachable if tcp_ping(host, timeout) else unreachable
+        bucket.append(host)
+
     return reachable, unreachable
 
 def find_invalid_cert_paths(user_registry):
diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py
index e64479209b..6c78c51f3f 100644
--- a/common/library/module_utils/local_repo/software_utils.py
+++ b/common/library/module_utils/local_repo/software_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -26,6 +26,7 @@
 import requests
 from ansible.module_utils.local_repo.standard_logger import setup_standard_logger
 from ansible.module_utils.local_repo.common_functions import is_encrypted, process_file, get_arch_from_sw_config
+from ansible.module_utils.local_repo.parse_and_download import execute_command
 # Import default variables from config.py
 from ansible.module_utils.local_repo.config import (
     PACKAGE_TYPES,
@@ -37,7 +38,8 @@
     SOFTWARES_KEY,
     REPO_CONFIG,
     ARCH_SUFFIXES,
-    ADDITIONAL_REPOS_KEY
+    ADDITIONAL_REPOS_KEY,
+    pulp_container_commands
 )
 
 
@@ -513,6 +515,81 @@ def get_failed_software(file_path):
     ]
     return failed_software
 
+def check_additional_image_in_pulp(image_entry, logger):
+    """
+    Checks if image present in additional_packages.json is configured in Pulp.
+
+    Looks up the image's container distribution, then its latest repository
+    version, then that version's tags, running one pulp_container_commands entry per step.
+
+    Args:
+        image_entry (dict): Image item with "package" and optional "tag"/"digest".
+        logger (logging.Logger): Logger used for progress messages.
+
+    Returns:
+        dict: {"type": "image", "package": ..., "tag": ...} when the image has
+        to be downloaded, or {} when it is already present in Pulp.
+    """
+    image_name = image_entry.get("package")
+    image_tag = image_entry.get("tag", None)
+    image_digest = image_entry.get("digest", None)
+    # Entry handed back to the caller whenever the image must be downloaded.
+    missing_entry = {"type": "image", "package": image_name, "tag": image_tag}
+
+    def command_failed(result):
+        """Return True when a command result does not describe a success."""
+        # Other callers test execute_command's result with `if not result`, so a
+        # failure is presumably a non-dict value; .get() on it would raise
+        # AttributeError, hence anything that is not a dict counts as a failure.
+        if not isinstance(result, dict):
+            return True
+        stderr = result.get("stderr")
+        return bool(stderr) and "Error:" in stderr
+
+    logger.info("Checking if %s is present in Pulp", image_name)
+
+    dist_name_prefix = "container_repo_"
+    transformed_dist_name = (f"{dist_name_prefix}{image_name.replace('/', '_').replace(':', '_')}")
+
+    # Step 1: the distribution must exist.
+    show_dist_cmd = (pulp_container_commands["container_distribution_show"] % transformed_dist_name)
+    repo_href_result = execute_command(show_dist_cmd, logger)
+    logger.info("repo_href_result: %s", repo_href_result)
+    if command_failed(repo_href_result):
+        logger.info("Distribution %s not found in Pulp", transformed_dist_name)
+        return missing_entry
+    logger.info("Distribution %s found in Pulp", transformed_dist_name)
+
+    # Step 2: the repository behind the distribution must have a version.
+    show_repo_cmd = (pulp_container_commands["show_repository_version"] % repo_href_result["stdout"])
+    latest_version_href_result = execute_command(show_repo_cmd, logger)
+    logger.info("latest_version_href_result: %s", latest_version_href_result)
+    if command_failed(latest_version_href_result):
+        logger.info("No repository version found. Empty repository")
+        return missing_entry
+    logger.info("Repository version found in Pulp")
+
+    # Step 3: that version must list its tags.
+    show_tags_cmd = (pulp_container_commands["list_image_tags"] % latest_version_href_result["stdout"])
+    tags_output_result = execute_command(show_tags_cmd, logger, type_json=True)
+    logger.info("tags_output_result: %s", tags_output_result)
+    if command_failed(tags_output_result):
+        logger.info("No tags found for %s", image_name)
+        return missing_entry
+    logger.info("Tags found for %s", image_name)
+
+    # Step 4: the requested tag (or digest) must be among the listed names.
+    tag_names = [tag["name"] for tag in tags_output_result.get("stdout", {}).get("results", [])]
+    logger.info("tag_names: %s", tag_names)
+    if image_tag and image_tag not in tag_names:
+        logger.info("Tag %s not found for image %s in Pulp", image_tag, image_name)
+        return missing_entry
+    if image_digest and image_digest not in tag_names:
+        logger.info("Digest %s not found for image %s in Pulp", image_digest, image_name)
+        # The digest is passed back under the "tag" key.
+        return {"type": "image", "package": image_name, "tag": image_digest}
+    logger.info("No download required as image is already present in Pulp")
+    return {}
 
 def parse_json_data(file_path, package_types,logger, failed_list=None, subgroup_list=None):
     """
@@ -538,10 +615,25 @@ def parse_json_data(file_path, package_types,logger, failed_list=None, subgroup_
 
     filtered_list = []
 
+    # Check if file name is additional_packages.json
+    is_additional_packages = file_path.endswith("additional_packages.json")
+    logger.info("additional_packages present: %s", is_additional_packages)
+
     for key, package in data.items():
         if subgroup_list is None or key in subgroup_list:
             for value in package.values():
                 for item in value:
+                    # For every image, check if it is present in Pulp
+                    if is_additional_packages and item.get("type") == "image":
+                            logger.info("Calling function to check %s existence in Pulp", item)
+                            tag_missing_entry = check_additional_image_in_pulp(item, logger)
+                            logger.info("tag_missing_entry: %s", tag_missing_entry)
+                            if tag_missing_entry == {}:
+                                continue
+                            if tag_missing_entry:
+                                filtered_list.append(tag_missing_entry)
+                            continue
+
                     # Get package name
                     pkg_name = item.get("package")
 
diff --git a/common/library/module_utils/local_repo/user_image_utility.py b/common/library/module_utils/local_repo/user_image_utility.py
index 5c818c2f75..2cbe1cba2d 100644
--- a/common/library/module_utils/local_repo/user_image_utility.py
+++ b/common/library/module_utils/local_repo/user_image_utility.py
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -44,30 +44,39 @@ def check_image_in_registry(
     Check if a container image exists in a user registry using Docker Registry HTTP API v2.
 
     Args:
-        host (str): Registry hostname.
-        image (str): Image name (e.g., library/nginx).
-        tag (str): Image tag (e.g., 1.25.2-alpine).
-        cacert (str, optional): Path to the CA certificate file.
-        key (str, optional): Path to the client key file.
-        username (str, optional): Registry username.
-        password (str, optional): Registry password.
-        logger (logging.Logger): Logger instance.
+        host (str): Registry hostname (without protocol)
+        image (str): Image name
+        tag (str): Image tag
+        cacert (str, optional): Path to the CA certificate file for TLS authentication
+        key (str, optional): Path to the client key file for TLS authentication
+        username (str, optional): Registry username for basic authentication
+        password (str, optional): Registry password for basic authentication
+        logger (logging.Logger, optional): Logger instance for logging messages
 
     Returns:
-        bool: True if image exists, False otherwise.
+        bool: True if image exists, False otherwise
     """
-    image_url = f"https://{host}/v2/{image}/manifests/{tag}"
+
+    if not host.startswith(("http://", "https://")):
+        protocol = "https" if (cacert and key) else "http"
+        host = f"{protocol}://{host}"
+    image_url = f"{host}/v2/{image}/manifests/{tag}"
     logger.info(f"Checking image existence at: {image_url}")
 
     try:
         request_args = {
-            "verify": False,  # Consider using 'verify=cacert' if using trusted certs
             "timeout": 10,
+            "verify": False,
+            "headers": {
+                "Accept": (
+                    "application/vnd.oci.image.manifest.v1+json,"
+                    "application/vnd.oci.image.index.v1+json,"
+                    "application/vnd.docker.distribution.manifest.v2+json,"
+                    "application/vnd.docker.distribution.manifest.list.v2+json"
+                )
+            },
         }
 
-        if username and password:
-            request_args["auth"] = HTTPBasicAuth(username, password)
-
         if cacert and key:
             request_args["cert"] = (cacert, key)
 
@@ -77,10 +86,21 @@ def check_image_in_registry(
             logger.info(f"Image '{image}:{tag}' exists in registry '{host}'")
             return True
 
-        logger.warning(
-            f"Image not found (HTTP {response.status_code}) in registry '{host}'"
+        if response.status_code == 404:
+            logger.info(
+                f"Image '{image}:{tag}' does not exist in registry '{host}'"
+            )
+            return False
+
+        logger.error(
+            f"Unexpected HTTP {response.status_code} while checking image "
+            f"'{image}:{tag}' in registry '{host}'"
         )
 
+    except requests.exceptions.SSLError as e:
+        logger.error(
+            f"TLS error while connecting to registry '{host}': {e}"
+        )
     except requests.RequestException as e:
         logger.exception(f"Network error while checking image: {e}")
     except Exception as e:
@@ -115,15 +135,38 @@ def create_user_remote_container(
         bool or dict: True on success, False on failure, or a dict with command result.
     """
     try:
-        ca_cert = f"@{cacert}"
-        client_key = f"@{key}"
-
         if tag_val is None:
             remote_exists = execute_command(
                 pulp_container_commands["show_container_remote"] % remote_name, logger
             )
             if not remote_exists:
-                command = pulp_container_commands["create_user_remote_digest"] % (
+                if cacert and key:
+                    ca_cert = f"@{cacert}"
+                    client_key = f"@{key}"
+                    command = pulp_container_commands["create_user_remote_digest"] % (
+                        remote_name,
+                        base_url,
+                        package_content,
+                        policy_type,
+                        ca_cert,
+                        client_key,
+                    )
+                else:
+                    command = pulp_container_commands["create_container_remote_for_digest"] % (
+                        remote_name,
+                        base_url,
+                        package_content,
+                        policy_type,
+                    )
+                result = execute_command(command, logger)
+                logger.info(f"Remote created successfully: {remote_name}")
+                return result
+
+            logger.info(f"Remote {remote_name} already exists.")
+            if cacert and key:
+                ca_cert = f"@{cacert}"
+                client_key = f"@{key}"
+                command = pulp_container_commands["update_user_remote_digest"] % (
                     remote_name,
                     base_url,
                     package_content,
@@ -131,19 +174,13 @@ def create_user_remote_container(
                     ca_cert,
                     client_key,
                 )
-                result = execute_command(command, logger)
-                logger.info(f"Remote created successfully: {remote_name}")
-                return result
-
-            logger.info(f"Remote {remote_name} already exists.")
-            command = pulp_container_commands["update_user_remote_digest"] % (
-                remote_name,
-                base_url,
-                package_content,
-                policy_type,
-                ca_cert,
-                client_key,
-            )
+            else:
+                command = pulp_container_commands["update_remote_for_digest"] % (
+                    remote_name,
+                    base_url,
+                    package_content,
+                    policy_type,
+                )
             result = execute_command(command, logger)
             logger.info(f"Remote updated successfully: {remote_name}")
             return result
@@ -154,15 +191,26 @@ def create_user_remote_container(
         )
 
         if not remote_exists:
-            command = pulp_container_commands["create_user_remote_tag"] % (
-                remote_name,
-                base_url,
-                package_content,
-                policy_type,
-                tag_val,
-                ca_cert,
-                client_key,
-            )
+            if cacert and key:
+                ca_cert = f"@{cacert}"
+                client_key = f"@{key}"
+                command = pulp_container_commands["create_user_remote_tag"] % (
+                    remote_name,
+                    base_url,
+                    package_content,
+                    policy_type,
+                    tag_val,
+                    ca_cert,
+                    client_key,
+                )
+            else:
+                command = pulp_container_commands["create_container_remote"] % (
+                    remote_name,
+                    base_url,
+                    package_content,
+                    policy_type,
+                    tag_val,
+                )
             result = execute_command(command, logger)
             if result:
                 logger.info(f"Remote '{remote_name}' created successfully.")
@@ -183,15 +231,26 @@ def create_user_remote_container(
         new_tags = existing_tags + [tag_val]
         tags_json = json.dumps(new_tags)
 
-        update_command = pulp_container_commands["update_user_remote_tag"] % (
-            remote_name,
-            base_url,
-            package_content,
-            policy_type,
-            tags_json,
-            ca_cert,
-            client_key,
-        )
+        if cacert and key:
+            ca_cert = f"@{cacert}"
+            client_key = f"@{key}"
+            update_command = pulp_container_commands["update_user_remote_tag"] % (
+                remote_name,
+                base_url,
+                package_content,
+                policy_type,
+                tags_json,
+                ca_cert,
+                client_key,
+            )
+        else:
+            update_command = pulp_container_commands["update_container_remote"] % (
+                remote_name,
+                base_url,
+                package_content,
+                policy_type,
+                tags_json,
+            )
         result = execute_command(update_command, logger)
 
         if result:
@@ -234,10 +293,13 @@ def process_user_registry(
     repository_name = (
         f"{user_reg_prefix}{package['package'].replace('/', '_').replace(':', '_')}"
     )
-    remote_name = f"user_remote_{package['package'].replace('/', '_')}"
+    remote_name = f"user_remote_{package['package'].replace('/', '_').replace(':', '_')}"
     package_identifier = package["package"]
     policy_type = "immediate"
-    base_url = f"https://{host}/"
+    if not host.startswith(("http://", "https://")):
+        protocol = "https" if (cacert and key) else "http"
+        host = f"{protocol}://{host}"
+    base_url = f"{host}/"
 
     logger.info("Creating user container repository")
     with repository_creation_lock:
@@ -314,8 +376,8 @@ def handle_user_image_registry(package, package_content, version_variables, user
             host = registry.get("host")
             cacert = registry.get("cert_path")
             key = registry.get("key_path")
-            username = registry.get("username")
-            password = registry.get("password")
+            # username = registry.get("username")
+            # password = registry.get("password")
 
             logger.info(f"Checking image {image_name}:{tag_val} in registry {host}")
             image_found = check_image_in_registry(
@@ -324,8 +386,8 @@ def handle_user_image_registry(package, package_content, version_variables, user
                 tag=tag_val,
                 cacert=cacert,
                 key=key,
-                username=username,
-                password=password,
+                username=None,
+                password=None,
                 logger=logger
             )
 
@@ -333,6 +395,11 @@ def handle_user_image_registry(package, package_content, version_variables, user
                 logger.info(f"Image '{image_name}:{tag_val}' found in registry '{host}'")
                 result, package_info = process_user_registry(package, host, package_content, version_variables, cacert, key, logger)
                 break
+            
+            else:
+                # No break here: keep iterating so the remaining registries are still checked.
+                logger.info(f"Image '{image_name}:{tag_val}' not found in registry '{host}'")
+                result = False
 
     except Exception as e:
         logger.error(f"Exception in {handle_user_image_registry.__name__}: {e}")
@@ -340,3 +407,4 @@ def handle_user_image_registry(package, package_content, version_variables, user
 
     logger.info("#" * 30 + f" {handle_user_image_registry.__name__} end " + "#" * 30)
     return result, package_info
+
diff --git a/common/library/modules/check_user_registry.py b/common/library/modules/check_user_registry.py
index 8f59c93f68..c2995f17fb 100644
--- a/common/library/modules/check_user_registry.py
+++ b/common/library/modules/check_user_registry.py
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -27,60 +27,60 @@
     check_reachability,
     find_invalid_cert_paths
 )
-from ansible.module_utils.local_repo.config import (
-    USER_REG_CRED_INPUT,
-    USER_REG_KEY_PATH
-)
+# from ansible.module_utils.local_repo.config import (
+#     USER_REG_CRED_INPUT,
+#     USER_REG_KEY_PATH
+# )
 
 def main():
     """
     Ansible module to validate user registry entries.
-
-    This module loads a YAML configuration file, validates the user registry entries,
-    checks their reachability, and verifies the cert paths.
-
-    :return: A dictionary with the results of the validation and reachability checks.
     """
     module = AnsibleModule(
+        # argument_spec=dict(
+        #     timeout=dict(type='int', default=5),
+        #     config_file=dict(type='str', required=True),
+        #     user_reg_cred_input=dict(type='str', required=False, default=USER_REG_CRED_INPUT),
+        #     user_reg_key_path=dict(type='str', required=False, default=USER_REG_KEY_PATH)
+        # ),
         argument_spec=dict(
             timeout=dict(type='int', default=5),
-            config_file=dict(type='str', required=True),
-            user_reg_cred_input=dict(type='str', required=False, default=USER_REG_CRED_INPUT),
-            user_reg_key_path=dict(type='str', required=False, default=USER_REG_KEY_PATH)
+            config_file=dict(type='str', required=True)
         ),
         supports_check_mode=True
     )
 
+    # config_path = module.params['config_file']
+    # timeout = module.params['timeout']
+    # user_reg_cred_input = module.params["user_reg_cred_input"]
+    # user_reg_key_path = module.params["user_reg_key_path"]
+
     config_path = module.params['config_file']
     timeout = module.params['timeout']
-    user_reg_cred_input = module.params["user_reg_cred_input"]
-    user_reg_key_path = module.params["user_reg_key_path"]
-
     try:
         config_data = load_yaml_file(config_path)
     except FileNotFoundError as e:
         module.fail_json(msg=str(e))
 
     user_registry = get_repo_list(config_data, "user_registry")
-
-    if user_registry:
-        # Load credentials
-        if is_encrypted(user_reg_cred_input):
-            process_file(user_reg_cred_input, user_reg_key_path, 'decrypt')
-
-        file2_data = load_yaml_file(user_reg_cred_input)
-        cred_lookup = {
-            entry['name']: entry
-            for entry in file2_data.get('user_registry_credential', [])
-        }
-
-        # Update user_registry entries with credentials if required
-        for registry in user_registry:
-            if registry.get("requires_auth"):
-                creds = cred_lookup.get(registry.get("name"))
-                if creds:
-                    registry["username"] = creds.get("username")
-                    registry["password"] = creds.get("password")
+    # if user_registry:
+    #     # Load credentials
+    #     if is_encrypted(user_reg_cred_input):
+    #         process_file(user_reg_cred_input, user_reg_key_path, 'decrypt')
+
+    #     file2_data = load_yaml_file(user_reg_cred_input)
+    #     cred_lookup = {
+    #         entry['name']: entry
+    #         for entry in file2_data.get('user_registry_credential', [])
+    #     }
+
+    #     # Update user_registry entries with credentials if required
+    #     for registry in user_registry:
+    #         if registry.get("requires_auth"):
+    #             creds = cred_lookup.get(registry.get("name"))
+    #             if creds:
+    #                 registry["username"] = creds.get("username")
+    #                 registry["password"] = creds.get("password")
 
     # Exit early if user_registry is empty
     if not user_registry:
diff --git a/common/library/modules/parallel_tasks.py b/common/library/modules/parallel_tasks.py
index 11b7aa1867..4fd910d027 100644
--- a/common/library/modules/parallel_tasks.py
+++ b/common/library/modules/parallel_tasks.py
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -53,8 +53,6 @@
     SOFTWARE_CSV_HEADER,
     STATUS_CSV_HEADER,
     LOCAL_REPO_CONFIG_PATH_DEFAULT,
-    USER_REG_CRED_INPUT,
-    USER_REG_KEY_PATH,
     OMNIA_CREDENTIALS_YAML_PATH,
     OMNIA_CREDENTIALS_VAULT_PATH
 )
@@ -302,6 +300,27 @@ def main():
     Raises:
         Exception: If an error occurs during execution.
     """
+    # module_args = {
+    #     "tasks": {"type": "list", "required": True},
+    #     "nthreads": {"type": "int", "required": False, "default": DEFAULT_NTHREADS},
+    #     "timeout": {"type": "int", "required": False, "default": DEFAULT_TIMEOUT},
+    #     "log_dir": {"type": "str", "required": False, "default": LOG_DIR_DEFAULT},
+    #     "log_file": {"type": "str", "required": False, "default": DEFAULT_LOG_FILE},
+    #     "slog_file": {"type": "str", "required": False, "default": DEFAULT_SLOG_FILE},
+    #     "csv_file_path": {"type": "str", "required": False, "default": CSV_FILE_PATH_DEFAULT},
+    #     "repo_store_path": {"type": "str", "required": False, "default": DEFAULT_REPO_STORE_PATH},
+    #     "software": {"type": "list", "elements": "str", "required": True},
+    #     "user_json_file": {"type": "str", "required": False, "default": USER_JSON_FILE_DEFAULT},
+    #     "show_softwares_status": {"type": "bool", "required": False, "default": False},
+    #     "overall_status_dict": {"type": "dict","required": True},
+    #     "local_repo_config_path": {"type": "str", "required": False, "default": LOCAL_REPO_CONFIG_PATH_DEFAULT},
+    #     "arch": {"type": "str", "required": False},
+    #     "user_reg_cred_input": {"type": "str", "required": False, "default": USER_REG_CRED_INPUT},
+    #     "user_reg_key_path": {"type": "str", "required": False, "default": USER_REG_KEY_PATH},
+    #     "omnia_credentials_yaml_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_YAML_PATH},
+    #     "omnia_credentials_vault_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_VAULT_PATH}
+    # }
+
     module_args = {
         "tasks": {"type": "list", "required": True},
         "nthreads": {"type": "int", "required": False, "default": DEFAULT_NTHREADS},
@@ -317,8 +336,6 @@ def main():
         "overall_status_dict": {"type": "dict","required": True},
         "local_repo_config_path": {"type": "str", "required": False, "default": LOCAL_REPO_CONFIG_PATH_DEFAULT},
         "arch": {"type": "str", "required": False},
-        "user_reg_cred_input": {"type": "str", "required": False, "default": USER_REG_CRED_INPUT},
-        "user_reg_key_path": {"type": "str", "required": False, "default": USER_REG_KEY_PATH},
         "omnia_credentials_yaml_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_YAML_PATH},
         "omnia_credentials_vault_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_VAULT_PATH}
     }
@@ -337,8 +354,8 @@ def main():
     overall_status_dict = module.params["overall_status_dict"]
     local_repo_config_path = module.params["local_repo_config_path"]
     arc = module.params["arch"]
-    user_reg_cred_input = module.params["user_reg_cred_input"]
-    user_reg_key_path = module.params["user_reg_key_path"]
+    # user_reg_cred_input = module.params["user_reg_cred_input"]
+    # user_reg_key_path = module.params["user_reg_key_path"]
     omnia_credentials_yaml_path = module.params["omnia_credentials_yaml_path"]
     omnia_credentials_vault_path = module.params["omnia_credentials_vault_path"]
 
@@ -370,20 +387,20 @@ def main():
         version_variables = set_version_variables(user_data, software_names, cluster_os_version,slogger)
         slogger.info(f"Cluster OS: {cluster_os_type}")
         slogger.info(f"Version Variables: {version_variables}")
-        gen_result = {}
-        if not os.path.isfile(user_reg_key_path):
-            gen_result = generate_vault_key(user_reg_key_path)
-        if gen_result is None:
-            module.fail_json(msg=f"Unable to generate local_repo key at path: {user_reg_key_path}")
+        # gen_result = {}
+        # if not os.path.isfile(user_reg_key_path):
+        #     gen_result = generate_vault_key(user_reg_key_path)
+        # if gen_result is None:
+        #     module.fail_json(msg=f"Unable to generate local_repo key at path: {user_reg_key_path}")
 
         overall_status, task_results = execute_parallel(
             tasks, determine_function, nthreads, repo_store_path, csv_file_path,
-            log_dir, user_data, version_variables, arc, slogger, local_repo_config_path, user_reg_cred_input, user_reg_key_path,
+            log_dir, user_data, version_variables, arc, slogger, local_repo_config_path,
             omnia_credentials_yaml_path, omnia_credentials_vault_path, timeout
         )
 
-        if not is_encrypted(user_reg_cred_input):
-            process_file(user_reg_cred_input,user_reg_key_path,'encrypt')
+        # if not is_encrypted(user_reg_cred_input):
+        #     process_file(user_reg_cred_input,user_reg_key_path,'encrypt')
 
         end_time = datetime.now()
         formatted_end_time = end_time.strftime("%I:%M:%S %p")
diff --git a/input/local_repo_config.yml b/input/local_repo_config.yml
index 55583e1a07..2f318f1deb 100644
--- a/input/local_repo_config.yml
+++ b/input/local_repo_config.yml
@@ -18,7 +18,20 @@
 # ================================
 # VARIABLE DETAILS
 # ================================
-# 1. user_repo_url_x86_64
+# 1. user_registry
+#--------------------------
+# Configuration for user registry to configure additional images in Pulp
+# Fields:
+#   host       : Registry IP and port in format "IP:port"
+#   cert_path  : Path to SSL certificate file (.crt) - Required only if host is using HTTPS
+#   key_path   : Path to SSL private key file (.key) - Required only if host is using HTTPS
+# Notes:
+#   - If host is HTTPS, cert_path and key_path are required
+#   - If host is HTTP, cert_path and key_path can be left empty
+#   - cert_path should point to .crt files only
+#   - key_path should point to .key files only
+#   - cert and key paths are accessed from within the omnia_core container
+# 2. user_repo_url_x86_64
 #--------------------------
 #    Optional list of user-defined repository URLs for x86_64 architecture.
 #    Each entry can include: url, gpgkey, sslcacert, sslclientkey, sslclientcert, name, policy.
@@ -36,7 +49,7 @@
 #   - Omit SSL fields entirely if SSL is not in use.
 #   - Its a madatory field in case of slurm_custom with name as '<arch>_slurm_custom'
 #
-# 2. user_repo_url_aarch64
+# 3. user_repo_url_aarch64
 #---------------------------
 #    Same as above but for aarch64 architecture.
 #
@@ -106,7 +119,9 @@
 # ================================
 # VARIABLES
 # ================================
-# Example    
+# user_registry:
+#    - { host: "172.16.107.254:4000", cert_path: "/opt/omnia/domain.crt", key_path: "/opt/omnia/domain.key" }
+user_registry:
 # user_repo_url_x86_64:
 #  - { url: "", gpgkey: "", sslcacert: "", sslclientkey: "", sslclientcert: "",  name: "x86_64_slurm_custom" }
 user_repo_url_x86_64:
diff --git a/local_repo/roles/parse_and_download/tasks/execute_parallel_tasks.yml b/local_repo/roles/parse_and_download/tasks/execute_parallel_tasks.yml
index 9df565f229..3f44ccdeb0 100644
--- a/local_repo/roles/parse_and_download/tasks/execute_parallel_tasks.yml
+++ b/local_repo/roles/parse_and_download/tasks/execute_parallel_tasks.yml
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -28,8 +28,8 @@
         local_repo_config_path: "{{ local_repo_config_path }}"
         arch: "{{ item.arch }}"
         overall_status_dict: {}
-        user_reg_cred_input: "{{ user_reg_cred_input }}"
-        user_reg_key_path: "{{ user_reg_key_path }}"
+        # user_reg_cred_input: "{{ user_reg_cred_input }}"
+        # user_reg_key_path: "{{ user_reg_key_path }}"
         omnia_credentials_yaml_path: "{{ omnia_credentials_yaml_path }}"
         omnia_credentials_vault_path: "{{ omnia_credentials_vault_path }}"
         nthreads: "{{ (local_repo_py_module_vars[item.key].nthreads | default(local_repo_py_module_vars.default_vars.nthreads)) }}"
diff --git a/local_repo/roles/parse_and_download/vars/main.yml b/local_repo/roles/parse_and_download/vars/main.yml
index 90141225b6..74b24cd1c2 100644
--- a/local_repo/roles/parse_and_download/vars/main.yml
+++ b/local_repo/roles/parse_and_download/vars/main.yml
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -27,8 +27,8 @@ local_repo_config_path: "{{ project_input_path }}/local_repo_config.yml"
 sw_config_json_path: "{{ project_input_path }}/software_config.json"
 functional_groups_config_path: "{{ nfs_shared_path }}/.data/functional_groups_config.yml"
 user_json_file: "{{ project_input_path }}/software_config.json"
-user_reg_cred_input: "{{ project_input_path }}/user_registry_credential.yml"
-user_reg_key_path: "{{ project_input_path }}/.local_repo_credentials_key"
+# user_reg_cred_input: "{{ project_input_path }}/user_registry_credential.yml"
+# user_reg_key_path: "{{ project_input_path }}/.local_repo_credentials_key"
 omnia_credentials_yaml_path: "{{ project_input_path }}/omnia_config_credentials.yml"
 omnia_credentials_vault_path: "{{ project_input_path }}/.omnia_config_credentials_key"
 clean_rpms: true
diff --git a/local_repo/roles/validation/tasks/main.yml b/local_repo/roles/validation/tasks/main.yml
index 6087ab200b..ea9c61aeb5 100644
--- a/local_repo/roles/validation/tasks/main.yml
+++ b/local_repo/roles/validation/tasks/main.yml
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -36,8 +36,8 @@
 - name: Check user registry reachability
   check_user_registry:
     config_file: "{{ local_repo_config_file }}"
-    user_reg_cred_input: "{{ user_reg_cred_input }}"
-    user_reg_key_path: "{{ user_reg_key_path }}"
+    # user_reg_cred_input: "{{ user_reg_cred_input }}"
+    # user_reg_key_path: "{{ user_reg_key_path }}"
     timeout: "{{ time_out }}"
   register: registry_check_result
 
diff --git a/local_repo/roles/validation/vars/main.yml b/local_repo/roles/validation/vars/main.yml
index 83c0523e47..ec343cb3ef 100644
--- a/local_repo/roles/validation/vars/main.yml
+++ b/local_repo/roles/validation/vars/main.yml
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -44,8 +44,8 @@ softwares_invalid_msg: "Invalid software_name(s) found: {{ softwares_list | diff
 # Usage: main.yml
 nfs_shared_path: "/opt/omnia"
 local_repo_config_file: "{{ project_input_path }}/local_repo_config.yml"
-user_reg_cred_input: "{{ project_input_path }}/user_registry_credential.yml"
-user_reg_key_path: "{{ project_input_path }}/.local_repo_credentials_key"
+# user_reg_cred_input: "{{ project_input_path }}/user_registry_credential.yml"
+# user_reg_key_path: "{{ project_input_path }}/.local_repo_credentials_key"
 var_mount_percentage_limit: 80
 var_mount_overuse_msg: |
   [WARNING] local_repo.yml may fail as /var mount usage has exceeded the limit of {{ var_mount_percentage_limit }}%.
@@ -144,7 +144,7 @@ user_registry_fail_msg: "Failed. Please ensure user_registry is non empty list a
     check if there is any indentation error in {{ project_input_path }}/local_repo_config.yml"
 user_registry_fail_host_cert_path_msg: "Failed. Each item in user_registry should have 'host' and 'cert_path' keys defined"
 time_out: 30
-user_registry_msg: "Above host registries are not reachable. If the user registry is not accessible from the Omnia Infrastructure Manager, Omnia will download all the images for the software listed in software_config.json."   # noqa: yaml[line-length]
+user_registry_msg: "Above user registries is/are not reachable. Please make sure the user registry is accessible from the Omnia Infrastructure Manager."   # noqa: yaml[line-length]
 cert_path_failure_msg: "Certificate file path {{ item.item.cert_path }} does not exist on the Omnia Infrastructure Manager for host {{ item.item.host }}. Please verify that correct cert_path is given in {{ project_input_path }}/local_repo_config.yml"  # noqa: yaml[line-length]
 
 # Usage: validate_user_repo_url.yml

From 64c141daa4f29fba717cdd935476c9a88f6da10d Mon Sep 17 00:00:00 2001
From: pullan1 <sudha.pullalaravu@dell.com>
Date: Wed, 4 Feb 2026 11:30:03 +0530
Subject: [PATCH 016/172] localrepo pulp cleanup

Signed-off-by: pullan1 <sudha.pullalaravu@dell.com>
---
 .../library/module_utils/local_repo/config.py |  73 +-
 .../local_repo/download_common.py             |  14 +-
 .../module_utils/local_repo/download_rpm.py   |  21 +-
 .../local_repo/parse_and_download.py          |  33 +-
 .../module_utils/local_repo/software_utils.py |  15 +-
 common/library/modules/group_package_map.py   |   4 +
 common/library/modules/pulp_cleanup.py        | 837 ++++++++++++++++++
 local_repo/pulp_cleanup.yml                   |  96 ++
 8 files changed, 1068 insertions(+), 25 deletions(-)
 create mode 100644 common/library/modules/pulp_cleanup.py
 create mode 100644 local_repo/pulp_cleanup.yml

diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py
index 9c9af639fb..4bf3ade5dd 100644
--- a/common/library/module_utils/local_repo/config.py
+++ b/common/library/module_utils/local_repo/config.py
@@ -33,7 +33,7 @@
 DEFAULT_REPO_STORE_PATH = "/tmp/offline_repo"
 USER_JSON_FILE_DEFAULT = ""
 DEFAULT_STATUS_FILENAME = "status.csv"
-STATUS_CSV_HEADER = 'name,type,status\n'
+STATUS_CSV_HEADER = 'name,type,repo_name,status\n'
 SOFTWARE_CSV_HEADER = "name,status"
 USER_REG_CRED_INPUT = "/opt/omnia/input/project_default/user_registry_credential.yml"
 USER_REG_KEY_PATH = "/opt/omnia/input/project_default/.local_repo_credentials_key"
@@ -78,7 +78,29 @@
     "show_distribution": "pulp file distribution show --name %s",
     "distribution_create": "pulp file distribution create --name %s --base-path %s --repository %s",
     "distribution_update": "pulp file distribution update --name %s --base-path %s --repository %s",
+
+    # Cleanup commands
+    "delete_repository": "pulp file repository destroy --name %s",
+    "delete_distribution": "pulp file distribution destroy --name %s",
+    "delete_publication": "pulp file publication destroy --href %s",
+    "list_publications": "pulp file publication list --repository %s",
+    "list_repositories": "pulp file repository list",
+    "list_distributions": "pulp file distribution list",
+    "list_content": "pulp file content list --repository-version %s",
+    "show_repository_version": "pulp file repository version show --repository %s",
+    "orphan_cleanup": "pulp orphan cleanup --protection-time 0"
+}
+
+# Pulp Python repository commands (for pip modules)
+pulp_python_commands = {
+    "list_repositories": "pulp python repository list",
+    "show_repository": "pulp python repository show --name %s",
+    "delete_repository": "pulp python repository destroy --name %s",
+    "list_distributions": "pulp python distribution list",
+    "delete_distribution": "pulp python distribution destroy --name %s",
+    "orphan_cleanup": "pulp orphan cleanup --protection-time 0"
 }
+
 CLI_FILE_PATH = "/root/.config/pulp/cli.toml"
 POST_TIMEOUT = 3600
 TAR_POLL_VAL = 3
@@ -107,10 +129,20 @@
     "distribute_container_repository": "pulp container distribution create --name %s --repository %s --base-path %s",
     "update_container_distribution": "pulp container distribution update --name %s --repository %s --base-path %s",
     "list_container_remote_tags": "pulp container remote list --name %s --field include_tags",
-
     "create_container_remote_auth": "pulp container remote create --name %s --url %s --upstream-name %s --policy %s --include-tags '%s' --username %s --password '%s'",
-
-    "update_container_remote_auth": "pulp container remote update --name %s --url %s --upstream-name %s --policy %s --include-tags '%s' --username %s --password '%s'"
+    "update_container_remote_auth": "pulp container remote update --name %s --url %s --upstream-name %s --policy %s --include-tags '%s' --username %s --password '%s'",
+    # Cleanup commands
+    "delete_repository": "pulp container repository destroy --name %s",
+    "delete_remote": "pulp container remote destroy --name %s",
+    "delete_distribution": "pulp container distribution destroy --name %s",
+    "list_repositories": "pulp container repository list",
+    "list_remotes": "pulp container remote list",
+    "list_distributions": "pulp container distribution list",
+    # Tag-specific cleanup commands
+    "get_repo_version": "pulp container repository show --href %s",
+    "list_tags_by_version": "pulp show --href /pulp/api/v3/content/container/tags/?repository_version=%s",
+    "rename_repository": "pulp container repository update --name %s --new-name %s",
+    "orphan_cleanup": "pulp orphan cleanup"
 
 }
 OMNIA_CREDENTIALS_YAML_PATH = "/opt/omnia/input/project_default/omnia_config_credentials.yml"
@@ -145,9 +177,40 @@
     "check_distribution": "pulp rpm distribution show --name %s",
     "check_publication": "pulp rpm publication list --repository %s",
     "delete_publication": "pulp rpm publication destroy --href %s",
-    "get_repo_version": "pulp rpm repository show --name %s"
+    "get_repo_version": "pulp rpm repository show --name %s",
+    "list_repositories": "pulp rpm repository list",
+    "list_remotes": "pulp rpm remote list",
+    "list_distributions": "pulp rpm distribution list",
+    "orphan_cleanup": "pulp orphan cleanup --protection-time 0"
 }
 
+# ----------------------------
+# Pulp Cleanup Configuration
+# Used by pulp_cleanup.py and Ansible modules
+# ----------------------------
+
+# Default paths
+CLEANUP_BASE_PATH_DEFAULT = "/opt/omnia/log/local_repo"
+CLEANUP_STATUS_FILE_PATH_DEFAULT = "/opt/omnia/log/local_repo/cleanup_status.csv"
+CLEANUP_LOG_PATH_DEFAULT = "/opt/omnia/log/local_repo/cleanup.log"
+
+# Default cleanup behavior
+CLEANUP_DELETE_REMOTE_DEFAULT = True
+CLEANUP_DELETE_DISTRIBUTION_DEFAULT = True
+CLEANUP_CLEANUP_ORPHANS_AFTER_DEFAULT = True
+CLEANUP_LIST_ONLY_DEFAULT = False
+CLEANUP_FORCE_DEFAULT = False
+
+# Cleanup status values
+CLEANUP_STATUS_SUCCESS = "Success"
+CLEANUP_STATUS_FAILED = "Failed"
+CLEANUP_STATUS_IN_PROGRESS = "In Progress"
+
+# Cleanup status file settings
+CLEANUP_STATUS_FILENAME = "cleanup_status.csv"
+CLEANUP_STATUS_CSV_HEADER = "artifact_name,artifact_type,status,message,timestamp\n"
+CLEANUP_LOG_FILE_PATH = "/opt/omnia/log/local_repo/cleanup.log"
+
 # ----------------------------
 # Additional Repos Aggregation Settings
 # Used by process_rpm_config.py for aggregated repos feature
diff --git a/common/library/module_utils/local_repo/download_common.py b/common/library/module_utils/local_repo/download_common.py
index c8d8bd1339..f139384b23 100644
--- a/common/library/module_utils/local_repo/download_common.py
+++ b/common/library/module_utils/local_repo/download_common.py
@@ -477,7 +477,7 @@ def process_manifest(file,repo_store_path, status_file_path, cluster_os_type, cl
         manifest_directory = os.path.join(repo_store_path, "offline_repo", "cluster",arc.lower(), cluster_os_type, cluster_os_version, "manifest", package_name)
         # # Determine the manifest file path
         file_path = os.path.join(manifest_directory, f"{package_name}.yaml")
-        repository_name = "manifest" + package_name
+        repository_name = arc.lower() + "_manifest" + package_name
         output_file =  package_name + ".yml"
         relative_path = output_file
         base_path = manifest_directory.strip("/")
@@ -531,7 +531,7 @@ def process_git(file,repo_store_path, status_file_path, cluster_os_type, cluster
         clone_directory = os.path.join(git_modules_directory, package_name)
         clone_directory = shlex.quote(clone_directory).strip("'\"")
         tarball_path = os.path.join(git_modules_directory, f'{package_name}.tar.gz')
-        repository_name = "git" + package_name
+        repository_name = arc.lower() + "_git" + package_name
         output_file = package_name + ".tar.gz"
         relative_path = output_file
         base_path = git_modules_directory.strip("/")
@@ -600,7 +600,7 @@ def process_shell(file,repo_store_path, status_file_path,  cluster_os_type, clus
         os.makedirs(sh_directory, exist_ok=True)  # Ensure the directory exists
 
         sh_path = os.path.join(sh_directory, f"{package_name}.sh")
-        repository_name = "shell" + package_name
+        repository_name = arc.lower() + "_shell" + package_name
         output_file = package_name + ".sh"
         relative_path = output_file
         base_path = sh_directory.strip("/")
@@ -651,7 +651,7 @@ def process_ansible_galaxy_collection(file, repo_store_path, status_file_path, c
         galaxy_collections_directory = shlex.quote(galaxy_collections_directory).strip("'\"")
         os.makedirs(galaxy_collections_directory, exist_ok=True)  # Ensure the directory exists
         collections_tarball_path = os.path.join(galaxy_collections_directory, f'{package_name.replace(".", "-")}-{version}.tar.gz')
-        repository_name = "ansible_galaxy_collection" + package_name
+        repository_name = arc.lower() + "_ansible_galaxy_collection" + package_name
         output_file = f"{file['package'].replace('.', '-')}-{file['version']}.tar.gz"
         relative_path = output_file
         base_path = galaxy_collections_directory.strip("/")
@@ -758,7 +758,7 @@ def process_tarball(package, repo_store_path, status_file_path, version_variable
     tarball_path = os.path.join(tarball_directory, f"{package_name}.tar.gz")
     tarball_path = shlex.quote(tarball_path).strip("'\"")
 
-    repository_name = "tarball" + package_name
+    repository_name = arc.lower() + "_tarball" + package_name
     output_file = package_name + ".tar.gz"
     relative_path = output_file
     base_path = tarball_directory.strip("/")
@@ -844,7 +844,7 @@ def process_iso(package, repo_store_path, status_file_path,
     url_support = True
     package_name = package['package']
     package_type = package['type']
-    repository_name = "iso" + package_name + arc
+    repository_name = arc.lower() + "_iso" + package_name
 
     distribution_name = repository_name
     if 'url' in package:
@@ -941,7 +941,7 @@ def process_pip(package, repo_store_path, status_file_path,  cluster_os_type, cl
         package_name = shlex.quote(package['package']).strip("'\"")
         package_type = package['type']
         version = package.get('version', None)
-        pip_repo = "pip_module" + package_name
+        pip_repo =  arc.lower() + "_pip_module" + package_name
         distribution_name = pip_repo
 
         logger.info(f"Processing Pip Package: {package_name}, Version: {version}")
diff --git a/common/library/module_utils/local_repo/download_rpm.py b/common/library/module_utils/local_repo/download_rpm.py
index 0b7bc2a0e6..95b354dd6b 100644
--- a/common/library/module_utils/local_repo/download_rpm.py
+++ b/common/library/module_utils/local_repo/download_rpm.py
@@ -49,6 +49,9 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type,
     logger.info("#" * 30 + f" {process_rpm.__name__} start " + "#" * 30)
 
     try:
+        # Get repo_mapping for individual RPM repo names
+        repo_mapping = package.get("repo_mapping", {})
+
         if repo_config_value == "always":
             rpm_list = list(set(package["rpm_list"]))
             logger.info(f"{package['package']} - List of rpms is {rpm_list}")
@@ -90,9 +93,11 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type,
 
             # Detect successes/failures from combined run
             for pkg in rpm_list:
+                # Get repo_name for this specific RPM from mapping
+                pkg_repo_name = repo_mapping.get(pkg, "")
                 if any(pkg in line and ".rpm" in line for line in stdout_lines + stderr_lines):
                     downloaded.append(pkg)
-                    write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock)
+                    write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock, pkg_repo_name)
                 else:
                     failed.append(pkg)
 
@@ -102,14 +107,16 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type,
                 for pkg in failed[:]:
                     cmd = DNF_COMMANDS[arch_key] + [f'--destdir={rpm_directory}', pkg]
                     retry_res = subprocess.run(cmd, check=False, capture_output=True, text=True)
+                    # Get repo_name for this specific RPM from mapping
+                    pkg_repo_name = repo_mapping.get(pkg, "")
 
                     if retry_res.returncode == 0 and ".rpm" in retry_res.stdout + retry_res.stderr:
                         downloaded.append(pkg)
                         failed.remove(pkg)
-                        write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock)
+                        write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock, pkg_repo_name)
                         logger.info(f"Package '{pkg}' downloaded successfully on retry.")
                     else:
-                        write_status_to_file(status_file_path, pkg, "rpm", "Failed", logger, file_lock)
+                        write_status_to_file(status_file_path, pkg, "rpm", "Failed", logger, file_lock, pkg_repo_name)
                         logger.error(f"Package '{pkg}' still failed after retry.")
 
             # Determine final status
@@ -124,13 +131,17 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type,
             status = "Success"
             logger.info("RPM won't be downloaded when repo_config is partial or never")
             for pkg in package["rpm_list"]:
-                write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock)
+                # Get repo_name for this specific RPM from mapping
+                pkg_repo_name = repo_mapping.get(pkg, "")
+                write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock, pkg_repo_name)
 
     except Exception as e:
         logger.error(f"Exception occurred: {e}")
         status = "Failed"
         for pkg in package.get("rpm_list", []):
-            write_status_to_file(status_file_path, pkg, "rpm", "Failed", logger, file_lock)
+            # Get repo_name for this specific RPM from mapping
+            pkg_repo_name = repo_mapping.get(pkg, "")
+            write_status_to_file(status_file_path, pkg, "rpm", "Failed", logger, file_lock, pkg_repo_name)
 
     finally:
         logger.info(f"Overall status for {package['package']}: {status}")
diff --git a/common/library/module_utils/local_repo/parse_and_download.py b/common/library/module_utils/local_repo/parse_and_download.py
index 8874621f0c..367f9561f5 100644
--- a/common/library/module_utils/local_repo/parse_and_download.py
+++ b/common/library/module_utils/local_repo/parse_and_download.py
@@ -83,9 +83,17 @@ def execute_command(cmd_string, logger, type_json=False):
     finally:
         logger.info("#" * 30 + f" {execute_command.__name__} end " + "#" * 30)
 
-def write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock: Lock):
+def write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock: Lock, repo_name=None):
     """
     Writes or updates the status of a package in the status file, using a lock to ensure safe access across processes.
+    Args:
+        status_file_path: Path to the status file
+        package_name: Name of the package
+        package_type: Type of the package (rpm, image, etc.)
+        status: Status (Success, Failed, etc.)
+        logger: Logger instance
+        file_lock: Lock for thread safety
+        repo_name: Optional repository name (for RPMs)
     """
     logger.info("#" * 30 + f" {write_status_to_file.__name__} start " + "#" * 30)
 
@@ -97,19 +105,32 @@ def write_status_to_file(status_file_path, package_name, package_type, status, l
 
                 updated = False
                 with open(status_file_path, "w") as f:
-                    for line in lines:
+                    # Re-emit the existing header unchanged (older files may lack the repo_name column)
+                    if lines:
+                        f.write(lines[0])  # Keep existing header
+
+                    # Write data lines
+                    for line in lines[1:]:  # Skip header
                         if line.startswith(f"{package_name},"):
-                            f.write(f"{package_name},{package_type},{status}\n")
+                            # Matching row found: rewrite it in place instead of appending a duplicate.
+                            # Column order is name,type,repo_name,status.
+                            parts = line.strip().split(',')
+                            if len(parts) >= 4:
+                                parts[2] = repo_name if repo_name else ''
+                                parts[3] = status
+                                f.write(','.join(parts) + '\n')
+                            else:
+                                f.write(f"{package_name},{package_type},{repo_name if repo_name else ''},{status}\n")
                             updated = True
                         else:
                             f.write(line)
 
                     if not updated:
-                        f.write(f"{package_name},{package_type},{status}\n")
+                        f.write(f"{package_name},{package_type},{repo_name if repo_name else ''},{status}\n")
             else:
                 with open(status_file_path, "w") as f:
-                    f.write("name,type,status\n")
-                    f.write(f"{package_name},{package_type},{status}\n")
+                    f.write(STATUS_CSV_HEADER)
+                    f.write(f"{package_name},{package_type},{repo_name if repo_name else ''},{status}\n")
 
             logger.info(f"Status written to {status_file_path} for {package_name}.")
     except Exception as e:
diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py
index e64479209b..6290e78538 100644
--- a/common/library/module_utils/local_repo/software_utils.py
+++ b/common/library/module_utils/local_repo/software_utils.py
@@ -174,21 +174,32 @@ def transform_package_dict(data, arch_val,logger):
     for sw_name, items in data.items():
         transformed_items = []
         rpm_packages = []
+        repo_mapping = {}
 
         for item in items:
             if item.get("type") == "rpm":
                 rpm_packages.append(item["package"])
+                # Preserve repo_name if available
+                if "repo_name" in item:
+                    repo_mapping[item["package"]] = item["repo_name"]
             elif item.get("type") == "rpm_list":
                 rpm_packages.extend(item["package_list"])
+                # Preserve repo_mapping if available
+                if "repo_mapping" in item:
+                    repo_mapping.update(item["repo_mapping"])
             else:
                 transformed_items.append(item)
 
         if rpm_packages:
-            transformed_items.append({
+            rpm_task = {
                 "package": RPM_LABEL_TEMPLATE.format(key=sw_name),
                 "rpm_list": rpm_packages,
                 "type": "rpm"
-            })
+            }
+            # Add repo_mapping if we have any
+            if repo_mapping:
+                rpm_task["repo_mapping"] = repo_mapping
+            transformed_items.append(rpm_task)
 
         result[arch_val][sw_name] = transformed_items
         logger.info(f"Finished processing %s. Result: %s", sw_name, transformed_items)
diff --git a/common/library/modules/group_package_map.py b/common/library/modules/group_package_map.py
index e5d29289e1..6076970f6d 100644
--- a/common/library/modules/group_package_map.py
+++ b/common/library/modules/group_package_map.py
@@ -145,6 +145,10 @@ def get_type_dict(clust_list):
                 # Add package to rpm key
             type_dict[pkgtype] = type_dict.get(
                 pkgtype, []) + [pkg_dict.get('package')]
+            # Also track repo_name mapping for RPMs
+            if 'repo_mapping' not in type_dict:
+                type_dict['repo_mapping'] = {}
+            type_dict['repo_mapping'][pkg_dict.get('package')] = pkg_dict.get('repo_name', '')
 
         # Update reboot required values
         reboot_val = pkg_dict.get(REBOOT_KEY, False)
diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py
new file mode 100644
index 0000000000..10c43ca0e9
--- /dev/null
+++ b/common/library/modules/pulp_cleanup.py
@@ -0,0 +1,837 @@
+# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Unified Pulp Cleanup Module
+
+Architecture:
+    Input → Type Detection → Processing → Status Updates → Return Results
+
+Handles:
+    - Repository cleanup (RPM)
+    - Container cleanup
+    - File cleanup (git, tarball, pip_module)
+"""
+
+import os
+import csv
+import glob
+import json
+import subprocess
+import time
+from datetime import datetime
+from typing import Dict, List, Any, Tuple
+
+from ansible.module_utils.basic import AnsibleModule
+from ansible.module_utils.local_repo.standard_logger import setup_standard_logger
+from ansible.module_utils.local_repo.config import (
+    CLEANUP_BASE_PATH_DEFAULT,
+    CLEANUP_STATUS_FILE_PATH_DEFAULT,
+    pulp_rpm_commands,
+    pulp_container_commands,
+    pulp_file_commands,
+    pulp_python_commands,
+    ARCH_SUFFIXES
+)
+
+
+# =============================================================================
+# PRETTY TABLE FORMATTING
+# =============================================================================
+
+# ANSI escape sequences for colored terminal output; RESET restores the default style
+GREEN = '\033[92m'
+RED = '\033[91m'
+YELLOW = '\033[93m'
+RESET = '\033[0m'
+
+def format_pretty_table(results: List[Dict[str, Any]]) -> str:
+    """Render cleanup results as an ASCII table with Name, Type, Status and Message columns."""
+    if not results:
+        return "No cleanup results to display"
+
+    headers = ["Name", "Type", "Status", "Message"]
+    
+    # Each column is as wide as its header or widest cell; the Message column is capped at 40 characters
+    widths = [len(h) for h in headers]
+    for r in results:
+        widths[0] = max(widths[0], len(str(r.get('name', ''))))
+        widths[1] = max(widths[1], len(str(r.get('type', ''))))
+        widths[2] = max(widths[2], len(str(r.get('status', ''))))
+        widths[3] = max(widths[3], min(len(str(r.get('message', ''))), 40))
+
+    # Build the border and the header row; every cell gets one space of padding on each side
+    border = "+" + "+".join("-" * (w + 2) for w in widths) + "+"
+    header_row = "|" + "|".join(f" {h.ljust(w)} " for h, w in zip(headers, widths)) + "|"
+    
+    lines = [border, header_row, border]
+    
+    for r in results:
+        msg = str(r.get('message', ''))[:40]  # truncate to the Message column cap
+        row = "|" + "|".join([
+            f" {str(r.get('name', '')).ljust(widths[0])} ",
+            f" {str(r.get('type', '')).ljust(widths[1])} ",
+            f" {str(r.get('status', '')).ljust(widths[2])} ",
+            #f" {colored_status}{status_padding} ",
+            f" {msg.ljust(widths[3])} "
+        ]) + "|"
+        lines.append(row)
+    
+    lines.append(border)
+    return "\n".join(lines)
+
+
+# =============================================================================
+# COMMAND EXECUTION
+# =============================================================================
+
+def run_cmd(cmd: str, logger) -> Dict[str, Any]:
+    """Run cmd through the shell (300 s timeout) and return a dict with rc, stdout and stderr."""
+    try:
+        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=300)
+        return {"rc": result.returncode, "stdout": result.stdout, "stderr": result.stderr}
+    except Exception as e:
+        logger.error(f"Command failed: {cmd} - {e}")  # any failure (incl. timeout) is reported as rc=1
+        return {"rc": 1, "stdout": "", "stderr": str(e)}
+
+
+def safe_json_parse(data: str, default: Any = None) -> Any:
+    """Parse the first JSON value in data; return default (or [] when default is None) on failure.
+    
+    Uses json.JSONDecoder.raw_decode instead of json.loads to avoid Checkmarx vulnerabilities.
+    """
+    if not data or not isinstance(data, str):
+        return default if default is not None else []
+    
+    try:
+        decoder = json.JSONDecoder()
+        parsed, _ = decoder.raw_decode(data.strip())  # the end index is discarded
+        return parsed
+    except (ValueError, TypeError):
+        return default if default is not None else []
+
+
+# =============================================================================
+# CONTAINER IMAGE VALIDATION & CONVERSION
+# =============================================================================
+
+def validate_container_format(image_name: str) -> Tuple[bool, str]:
+    """Validate that a container image name has the form registry/image.
+    
+    Example of an accepted name: registry.k8s.io/pause
+    
+    Returns:
+        Tuple of (is_valid, error_message); error_message is "" when valid
+    """
+    if not image_name:
+        return False, "Container image name cannot be empty"
+    
+    # Must contain at least one '/' to indicate registry/image format
+    if '/' not in image_name:
+        return False, f"Invalid format '{image_name}'. Must include registry (e.g., registry.k8s.io/pause, docker.io/library/busybox)"
+    
+    # The registry is everything before the first '/'
+    parts = image_name.split('/')
+    registry = parts[0]
+    
+    # Accept a registry that contains a dot, contains ':' (host:port), or is exactly 'localhost'
+    if '.' not in registry and registry != 'localhost' and ':' not in registry:
+        return False, f"Invalid registry '{registry}' in '{image_name}'. Registry must be a domain (e.g., docker.io, registry.k8s.io)"
+    
+    return True, ""
+
+
+def convert_to_pulp_container_name(image_name: str) -> str:
+    """Build the Pulp container repository name for a user-provided image name.
+    
+    Examples:
+        registry.k8s.io/pause -> container_repo_registry.k8s.io_pause
+        docker.io/library/busybox -> container_repo_docker.io_library_busybox
+        ghcr.io/kube-vip/kube-vip -> container_repo_ghcr.io_kube-vip_kube-vip
+    """
+    # Every '/' becomes '_' and the result is prefixed with 'container_repo_'
+    normalized = image_name.replace('/', '_')
+    return f"container_repo_{normalized}"
+
+
+# =============================================================================
+# TYPE DETECTION
+# =============================================================================
+
+def detect_file_type(name: str) -> str:
+    """Guess the artifact type from substrings of name; the first matching rule wins, default is 'file'."""
+    # Pip module: contains == (e.g., cffi==1.17.1)
+    if '==' in name:
+        return "pip_module"
+    # Ansible Galaxy collection: has '.', no '/', and mentions ansible/community/galaxy (e.g., community.general)
+    if '.' in name and '/' not in name and '==' not in name and any(x in name.lower() for x in ['ansible', 'community', 'galaxy']):
+        return "ansible_galaxy_collection"
+    if name.startswith('ansible_galaxy_collection'):
+        return "ansible_galaxy_collection"
+    if any(x in name.lower() for x in ['chart', 'tar', 'tgz', 'helm', 'bundle']):
+        return "tarball"
+    if any(x in name.lower() for x in ['git', 'repo', 'source', 'scm']):
+        return "git"
+    if any(x in name.lower() for x in ['manifest', 'calico', 'yml', 'yaml']):
+        return "manifest"
+    return "file"
+
+
+# =============================================================================
+# EXISTENCE CHECKS
+# =============================================================================
+
+def repo_exists(name: str, logger) -> bool:
+    """Return True when the pulp_rpm_commands['show_repository'] command for name exits with rc 0."""
+    cmd = pulp_rpm_commands["show_repository"] % name
+    result = run_cmd(cmd, logger)
+    return result["rc"] == 0
+
+
+def container_exists(name: str, logger) -> bool:
+    """Return True when the pulp_container_commands['show_container_repo'] command for name exits with rc 0."""
+    cmd = pulp_container_commands["show_container_repo"] % name
+    result = run_cmd(cmd, logger)
+    return result["rc"] == 0
+
+
+def file_exists_in_status(name: str, base_path: str, logger) -> bool:
+    """Return True if name occurs as a substring in any <arch>/*/status.csv, for every arch in ARCH_SUFFIXES."""
+    try:
+        for status_file in (p for a in ARCH_SUFFIXES for p in glob.glob(f"{base_path}/{a}/*/status.csv")):
+            with open(status_file, 'r') as f:
+                if name in f.read():
+                    return True
+        return False
+    except Exception:
+        return False  # best-effort probe: an unreadable status file counts as "not found"
+
+
+# =============================================================================
+# CLEANUP FUNCTIONS
+# =============================================================================
+
+def cleanup_repository(name: str, base_path: str, logger) -> Dict[str, Any]:
+    """Delete an RPM repository with its distributions, publications and remote; return a result dict."""
+    result = {"name": name, "type": "repository", "status": "Failed", "message": ""}
+    
+    # Nothing to do when Pulp does not know the repository
+    if not repo_exists(name, logger):
+        result["message"] = "Repository not found"
+        return result
+    
+    try:
+        # Delete every distribution whose name equals or contains the repository name
+        dist_list = run_cmd(pulp_rpm_commands["list_distributions"], logger)
+        if dist_list["rc"] == 0:
+            dists = safe_json_parse(dist_list["stdout"])
+            for d in dists:
+                if d.get('name', '') == name or name in d.get('name', ''):
+                    run_cmd(pulp_rpm_commands["delete_distribution"] % d.get('name', ''), logger)
+        
+        # Delete publications (identified by their pulp_href)
+        pub_list = run_cmd(pulp_rpm_commands["list_publications"] % name, logger)
+        if pub_list["rc"] == 0:
+            pubs = safe_json_parse(pub_list["stdout"])
+            for p in pubs:
+                run_cmd(pulp_rpm_commands["delete_publication"] % p.get('pulp_href', ''), logger)
+        
+        # Delete remote (its return code is not checked)
+        run_cmd(pulp_rpm_commands["delete_remote"] % name, logger)
+        
+        # Delete repository; only this return code decides Success/Failed
+        del_result = run_cmd(pulp_rpm_commands["delete_repository"] % name, logger)
+        
+        if del_result["rc"] == 0:
+            result["status"] = "Success"
+            result["message"] = "Repository deleted"
+            # Update status files - remove RPM entries from this repo and mark software as partial
+            affected = remove_rpms_from_repository(name, base_path, logger)
+            logger.info(f" mark affected softwares as partial {affected}")
+            mark_software_partial(affected, base_path, logger)
+        else:
+            result["message"] = f"Delete failed: {del_result['stderr']}"
+            
+    except Exception as e:
+        result["message"] = f"Error: {str(e)}"
+    
+    return result
+
+
+def cleanup_container(user_input: str, base_path: str, logger) -> Dict[str, Any]:
+    """Delete a container repository and its distribution from Pulp; return a result dict.
+    
+    Args:
+        user_input: User-provided image name in registry/image form (e.g., registry.k8s.io/pause)
+    """
+    result = {"name": user_input, "type": "container", "status": "Failed", "message": ""}
+    
+    # Reject names that are not in registry/image form
+    is_valid, error_msg = validate_container_format(user_input)
+    if not is_valid:
+        result["message"] = error_msg
+        return result
+    
+    # Convert to Pulp naming convention
+    pulp_name = convert_to_pulp_container_name(user_input)
+    
+    # Check existence
+    if not container_exists(pulp_name, logger):
+        result["message"] = f"Container not found in Pulp (looked for: {pulp_name})"
+        return result
+    
+    try:
+        # Delete the distribution whose name is exactly the Pulp repository name
+        dist_list = run_cmd(pulp_container_commands["list_distributions"], logger)
+        if dist_list["rc"] == 0:
+            dists = safe_json_parse(dist_list["stdout"])
+            for d in dists:
+                if d.get('name', '') == pulp_name:
+                    run_cmd(pulp_container_commands["delete_distribution"] % d.get('name', ''), logger)
+        
+        # Delete repository; only this return code decides Success/Failed
+        del_result = run_cmd(pulp_container_commands["delete_repository"] % pulp_name, logger)
+        
+        if del_result["rc"] == 0:
+            result["status"] = "Success"
+            result["message"] = "Container deleted"
+            # Update status files - remove 'image' rows named exactly user_input and mark software as partial
+            affected = remove_from_status_files(user_input, 'image', base_path, logger)
+            mark_software_partial(affected, base_path, logger)
+        else:
+            result["message"] = f"Delete failed: {del_result['stderr']}"
+            
+    except Exception as e:
+        result["message"] = f"Error: {str(e)}"
+    
+    return result
+
+
+def file_exists_in_pulp(name: str, logger) -> Tuple[bool, str, str]:
+    """Search all Pulp file repositories for content whose relative path is name.
+    
+    Returns:
+        Tuple of (exists, repo_name, content_href); (False, "", "") when not found or on any error
+    """
+    try:
+        # List file repositories and search for the content
+        repo_list = run_cmd(pulp_file_commands["list_repositories"], logger)
+        if repo_list["rc"] != 0:
+            return False, "", ""
+        
+        repos = safe_json_parse(repo_list["stdout"])
+        for repo in repos:
+            repo_name = repo.get('name', '')
+            # Ask Pulp for content in this repository with the given relative path
+            content_list = run_cmd(
+                f"pulp file content list --repository {repo_name} --relative-path '{name}'",
+                logger
+            )
+            if content_list["rc"] == 0:
+                contents = safe_json_parse(content_list["stdout"])
+                if contents:
+                    return True, repo_name, contents[0].get('pulp_href', '')  # first hit wins
+        
+        return False, "", ""
+    except Exception:
+        return False, "", ""
+
+
+def delete_file_from_pulp(name: str, repo_name: str, content_href: str, logger) -> Tuple[bool, str]:
+    """Remove file content from a Pulp repository, then delete matching distributions and the repository.
+    
+    Returns:
+        Tuple of (success, message); success is False only when an exception is raised
+    """
+    try:
+        messages = []
+        
+        # 1. Remove content from repository
+        if content_href:
+            remove_result = run_cmd(
+                f"pulp file repository content remove --repository {repo_name} --href {content_href}",
+                logger
+            )
+            if remove_result["rc"] == 0:
+                messages.append("Content removed from repository")
+            else:
+                # Try alternative: modify repository to remove content (result is not checked)
+                run_cmd(
+                    f"pulp file repository content modify --repository {repo_name} --remove-content '[{{\"pulp_href\": \"{content_href}\"}}]'",
+                    logger
+                )
+        
+        # 2. Delete every distribution whose name equals or contains name
+        dist_result = run_cmd(pulp_file_commands["list_distributions"], logger)
+        if dist_result["rc"] == 0:
+            dists = safe_json_parse(dist_result["stdout"])
+            for d in dists:
+                if d.get('name', '') == name or name in d.get('name', ''):
+                    run_cmd(pulp_file_commands["delete_distribution"] % d.get('name', ''), logger)
+                    messages.append("Distribution deleted")
+        
+        # 3. Try to delete the file repository if it's named after the artifact
+        repo_del = run_cmd(pulp_file_commands["delete_repository"] % name, logger)
+        if repo_del["rc"] == 0:
+            messages.append("Repository deleted")
+        
+        return True, "; ".join(messages) if messages else "Removed from Pulp"
+        
+    except Exception as e:
+        return False, f"Pulp deletion error: {str(e)}"
+
+
+def cleanup_pip_module(name: str, base_path: str, logger) -> Dict[str, Any]:
+    """Delete a pip module's Pulp Python repository and distribution, then update status files.
+    
+    The repository is named pip_module<package_name>==<version> (e.g., pip_modulecffi==1.17.1);
+    if that exact name is not found, the first repository whose name contains `name` is used.
+    Orphan cleanup runs whenever a repository was deleted; status is Success only in that case.
+    """
+    result = {"name": name, "type": "pip_module", "status": "Failed", "message": ""}
+    messages = []
+    pulp_deleted = False
+    
+    try:
+        # Pulp Python repo name format: pip_module<name>
+        # User input could be "cffi==1.17.1" or "pip_modulecffi==1.17.1"
+        if name.startswith("pip_module"):
+            pulp_repo_name = name
+        else:
+            pulp_repo_name = f"pip_module{name}"
+        
+        logger.info(f"Looking for Python repository: {pulp_repo_name}")
+        
+        # Check if repository exists
+        repo_check = run_cmd(pulp_python_commands["show_repository"] % pulp_repo_name, logger)
+        
+        if repo_check["rc"] == 0:
+            # Delete distribution first
+            dist_del = run_cmd(pulp_python_commands["delete_distribution"] % pulp_repo_name, logger)
+            if dist_del["rc"] == 0:
+                messages.append("Distribution deleted")
+            
+            # Delete repository
+            repo_del = run_cmd(pulp_python_commands["delete_repository"] % pulp_repo_name, logger)
+            if repo_del["rc"] == 0:
+                pulp_deleted = True
+                messages.append("Repository deleted")
+        else:
+            # Try listing repos to find partial match
+            repo_list = run_cmd(pulp_python_commands["list_repositories"], logger)
+            if repo_list["rc"] == 0:
+                repos = safe_json_parse(repo_list["stdout"])
+                for repo in repos:
+                    repo_name = repo.get('name', '')
+                    if name in repo_name or repo_name == pulp_repo_name:
+                        logger.info(f"Found matching Python repository: {repo_name}")
+                        
+                        dist_del = run_cmd(pulp_python_commands["delete_distribution"] % repo_name, logger)
+                        if dist_del["rc"] == 0:
+                            messages.append("Distribution deleted")
+                        
+                        repo_del = run_cmd(pulp_python_commands["delete_repository"] % repo_name, logger)
+                        if repo_del["rc"] == 0:
+                            pulp_deleted = True
+                            messages.append("Repository deleted")
+                        break
+        
+        # Run orphan cleanup after either branch so a fallback deletion does not leave orphaned content
+        if pulp_deleted:
+            logger.info("Running orphan cleanup...")
+            orphan_result = run_cmd(pulp_python_commands["orphan_cleanup"], logger)
+            if orphan_result["rc"] == 0:
+                messages.append("Orphan cleanup completed")
+        
+        # Update status files
+        if file_exists_in_status(name, base_path, logger):
+            affected = remove_from_status_files(name, 'pip_module', base_path, logger)
+            if affected:
+                messages.append("Status files updated")
+                mark_software_partial(affected, base_path, logger)
+        
+        if pulp_deleted:
+            result["status"] = "Success"
+            result["message"] = "; ".join(messages) if messages else "Cleaned up"
+        else:
+            result["message"] = f"pip_module '{name}' not found in Pulp"
+            
+    except Exception as e:
+        result["message"] = f"Error: {str(e)}"
+    
+    return result
+
+
+def get_pulp_file_repo_name(name: str, file_type: str) -> str:
+    """Return the Pulp File repository name for an artifact of the given type.
+    
+    Naming conventions:
+    - ansible_galaxy_collection: ansible_galaxy_collection<package> (prefix added only if missing)
+    - any other file_type: <name> (as-is)
+    """
+    if file_type == "ansible_galaxy_collection":
+        if name.startswith("ansible_galaxy_collection"):
+            return name
+        return f"ansible_galaxy_collection{name}"
+    return name
+
+
+def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) -> Dict[str, Any]:
+    """Delete an artifact's Pulp File repository and distribution and drop it from status files.
+    
+    Handles: tarball, git, manifest, ansible_galaxy_collection (repository name from get_pulp_file_repo_name).
+    Reports Success when a repository was deleted or a status.csv row was removed.
+    """
+    result = {"name": name, "type": file_type, "status": "Failed", "message": ""}
+    messages = []
+    pulp_deleted = False
+    status_removed = False
+    
+    try:
+        # Get the expected Pulp repository name
+        pulp_repo_name = get_pulp_file_repo_name(name, file_type)
+        logger.info(f"Looking for {file_type} repository: {pulp_repo_name}")
+        
+        # Check if repository exists directly
+        repo_check = run_cmd(pulp_file_commands["show_repository"] % pulp_repo_name, logger)
+        
+        if repo_check["rc"] == 0:
+            # Found exact match - delete distribution and repository
+            dist_del = run_cmd(pulp_file_commands["delete_distribution"] % pulp_repo_name, logger)
+            if dist_del["rc"] == 0:
+                messages.append("Distribution deleted")
+            
+            repo_del = run_cmd(pulp_file_commands["delete_repository"] % pulp_repo_name, logger)
+            if repo_del["rc"] == 0:
+                pulp_deleted = True
+                messages.append("Repository deleted")
+        else:
+            # Fallback: use the first listed repository whose name contains name
+            repo_list = run_cmd(pulp_file_commands["list_repositories"], logger)
+            if repo_list["rc"] == 0:
+                repos = safe_json_parse(repo_list["stdout"])
+                for repo in repos:
+                    repo_name = repo.get('name', '')
+                    if name in repo_name or repo_name == pulp_repo_name:
+                        logger.info(f"Found matching repository: {repo_name}")
+                        
+                        dist_del = run_cmd(pulp_file_commands["delete_distribution"] % repo_name, logger)
+                        if dist_del["rc"] == 0:
+                            messages.append("Distribution deleted")
+                        
+                        repo_del = run_cmd(pulp_file_commands["delete_repository"] % repo_name, logger)
+                        if repo_del["rc"] == 0:
+                            pulp_deleted = True
+                            messages.append("Repository deleted")
+                        break
+        
+        # Run orphan cleanup to remove actual content files
+        if pulp_deleted:
+            logger.info("Running orphan cleanup to remove content files...")
+            orphan_result = run_cmd(pulp_file_commands["orphan_cleanup"], logger)
+            if orphan_result["rc"] == 0:
+                messages.append("Orphan cleanup completed")
+            else:
+                logger.warning(f"Orphan cleanup warning: {orphan_result['stderr']}")
+        
+        # Update status files (runs even when nothing was deleted in Pulp)
+        if file_exists_in_status(name, base_path, logger):
+            affected = remove_from_status_files(name, file_type, base_path, logger)
+            if affected:
+                status_removed = True
+                messages.append("Status files updated")
+                mark_software_partial(affected, base_path, logger)
+        
+        # Determine overall result
+        if pulp_deleted or status_removed:
+            result["status"] = "Success"
+            result["message"] = "; ".join(messages) if messages else "Cleaned up"
+        else:
+            result["message"] = f"{file_type} '{name}' not found in Pulp or status files"
+            
+    except Exception as e:
+        result["message"] = f"Error: {str(e)}"
+    
+    return result
+
+
+def cleanup_file(name: str, base_path: str, logger) -> Dict[str, Any]:
+    """Cleanup a file artifact, choosing the handler from detect_file_type(name).
+    
+    Routes to appropriate handler:
+    - pip_module: cleanup_pip_module (Pulp Python repository)
+    - every other detected type: cleanup_file_repository (Pulp File repository)
+    """
+    file_type = detect_file_type(name)
+    
+    # Handle pip modules separately - they use Python repositories
+    if file_type == "pip_module":
+        return cleanup_pip_module(name, base_path, logger)
+    
+    # All other file types use Pulp File repository
+    return cleanup_file_repository(name, file_type, base_path, logger)
+
+
+# =============================================================================
+# STATUS FILE UPDATES
+# =============================================================================
+
+def remove_rpms_from_repository(repo_name: str, base_path: str, logger) -> List[str]:
+    """Remove RPMs that belong to a specific repository from status files.
+    
+    Rows of type 'rpm' whose repo_name column equals repo_name are dropped; files without that column are left unchanged.
+    
+    Args:
+        repo_name: Repository name (e.g., 'x86_64_appstream')
+        base_path: Directory holding <arch>/<software>/status.csv for each arch in ARCH_SUFFIXES
+        logger: Logger instance
+        
+    Returns:
+        List of software names (status.csv parent directory names) that were affected; [] on any error
+    """
+    affected_software = []
+    logger.info(f"Removing RPMs from status.csv for repository: {repo_name}")
+    try:
+        for arch in ARCH_SUFFIXES:
+            for status_file in glob.glob(f"{base_path}/{arch}/*/status.csv"):
+                rows = []
+                removed = False
+                has_repo_column = False
+                
+                # First pass: look for a repo_name column in the (lowercased) header line
+                with open(status_file, 'r') as f:
+                    header = f.readline().strip().lower()
+                    has_repo_column = "repo_name" in header
+                
+                with open(status_file, 'r') as f:
+                    reader = csv.DictReader(f)
+                    fieldnames = reader.fieldnames
+                    for row in reader:
+                        name = row.get('name', '')
+                        row_type = row.get('type', '')
+                        rpm_repo = row.get('repo_name', '')
+                        
+                        logger.info(f"Processing row: {row}")
+                        # For RPMs, check if they belong to the deleted repository
+                        if row_type == 'rpm':
+                            if has_repo_column and rpm_repo == repo_name:
+                                removed = True
+                                logger.info(f"Removing RPM '{name}' from {status_file} (repo {repo_name} deleted)")
+                            else:
+                                rows.append(row)
+                        else:
+                            rows.append(row)
+                
+                if removed and fieldnames:  # rewrite the file only when at least one row was dropped
+                    with open(status_file, 'w', newline='') as f:
+                        writer = csv.DictWriter(f, fieldnames=fieldnames)
+                        writer.writeheader()
+                        writer.writerows(rows)
+                    
+                    # Track affected software
+                    software_name = os.path.basename(os.path.dirname(status_file))
+                    if software_name not in affected_software:
+                        affected_software.append(software_name)
+                    
+        return affected_software
+    except Exception as e:
+        logger.error(f"Failed to remove RPMs from repository {repo_name}: {e}")
+        return []
+
+def remove_from_status_files(artifact_name: str, artifact_type: str, base_path: str, logger) -> List[str]:
+    """Remove artifact from status.csv files and return affected software names.
+    
+    Args:
+        artifact_name: Exact value of the row's name column to remove
+        artifact_type: Exact value of the row's type column (e.g., git, tarball, pip_module, image)
+        base_path: Directory holding <arch>/<software>/status.csv for each arch in ARCH_SUFFIXES
+        logger: Logger instance
+        
+    Returns:
+        List of software names (status.csv parent directory names) that were affected; [] on any error
+    """
+    affected_software = []
+    try:
+        for arch in ARCH_SUFFIXES:
+            for status_file in glob.glob(f"{base_path}/{arch}/*/status.csv"):
+                rows = []
+                removed = False
+                with open(status_file, 'r') as f:
+                    reader = csv.DictReader(f)
+                    fieldnames = reader.fieldnames
+                    for row in reader:
+                        name = row.get('name', '')
+                        row_type = row.get('type', '')
+
+                        if name == artifact_name and row_type == artifact_type:
+                            removed = True
+                            logger.info(f"Removing {artifact_type} '{name}' from {status_file}")
+                        else:
+                            rows.append(row)
+                        
+                        # # Match logic based on type
+                        # should_remove = False
+                        # if artifact_type == 'image':
+                        #     # Container images: match with or without tag
+                        #     should_remove = (name == artifact_name or name.startswith(f"{artifact_name}:"))
+                        # else:
+                        #     # Other types: exact match
+                        #     should_remove = (name == artifact_name)
+                        
+                        # if should_remove:
+                        #     removed = True
+                        #     logger.info(f"Removing '{name}' from {status_file}")
+                        # else:
+                        #     rows.append(row)
+                
+                if removed and fieldnames:  # rewrite the file only when at least one row was dropped
+                    with open(status_file, 'w', newline='') as f:
+                        writer = csv.DictWriter(f, fieldnames=fieldnames)
+                        writer.writeheader()
+                        writer.writerows(rows)
+                    
+                    # Track affected software
+                    software_name = os.path.basename(os.path.dirname(status_file))
+                    if software_name not in affected_software:
+                        affected_software.append(software_name)
+                    
+        return affected_software
+    except Exception as e:
+        logger.error(f"Failed to remove from status files: {e}")
+        return []
+
+
+def mark_software_partial(software_names: List[str], base_path: str, logger):
+    """Set status to 'partial' for the given software in every <arch>/software.csv.
+    
+    Args:
+        software_names: List of software names to mark as partial (no-op when empty)
+        base_path: Directory holding <arch>/software.csv for each arch in ARCH_SUFFIXES
+        logger: Logger instance; failures are logged, not raised
+    """
+    if not software_names:
+        return
+        
+    try:
+        for arch in ARCH_SUFFIXES:
+            software_file = f"{base_path}/{arch}/software.csv"
+            if not os.path.exists(software_file):
+                continue
+            
+            rows = []
+            with open(software_file, 'r') as f:
+                reader = csv.DictReader(f)
+                fieldnames = reader.fieldnames
+                for row in reader:
+                    if row.get('name') in software_names:
+                        row['status'] = 'partial'
+                        logger.info(f"Marked '{row.get('name')}' as {GREEN}partial{RESET} in software.csv")
+                    rows.append(row)
+            
+            if fieldnames and rows:  # the file is rewritten even when no row matched
+                with open(software_file, 'w', newline='') as f:
+                    writer = csv.DictWriter(f, fieldnames=fieldnames)
+                    writer.writeheader()
+                    writer.writerows(rows)
+    except Exception as e:
+        logger.error(f"Failed to update software.csv: {e}")
+
+
+def write_cleanup_status(results: List[Dict], base_path: str):
+    """Write results to <base_path>/cleanup_status.csv, replacing any existing file, and return that path."""
+    status_file = f"{base_path}/cleanup_status.csv"
+    os.makedirs(os.path.dirname(status_file), exist_ok=True)
+    
+    with open(status_file, 'w', newline='') as f:
+        writer = csv.DictWriter(f, fieldnames=['name', 'type', 'status', 'message'])
+        writer.writeheader()
+        writer.writerows(results)
+    
+    return status_file
+
+
+# =============================================================================
+# MAIN MODULE
+# =============================================================================
+
+def run_module():
+    """Parse module arguments, run every requested cleanup, write cleanup_status.csv and exit with a summary."""
+    module = AnsibleModule(
+        argument_spec=dict(
+            cleanup_repos=dict(type='list', elements='str', default=[]),
+            cleanup_containers=dict(type='list', elements='str', default=[]),
+            cleanup_files=dict(type='list', elements='str', default=[]),
+            base_path=dict(type='str', default=CLEANUP_BASE_PATH_DEFAULT)
+        ),
+        supports_check_mode=False  # the deletions below are never gated on module.check_mode
+    )
+
+    cleanup_repos = module.params['cleanup_repos']
+    cleanup_containers = module.params['cleanup_containers']
+    cleanup_files = module.params['cleanup_files']
+    base_path = module.params['base_path']
+
+    # Setup logger - setup_standard_logger expects a directory, creates standard.log inside
+    log_dir = os.path.join(base_path, "cleanup")
+    os.makedirs(log_dir, exist_ok=True)  # also creates base_path; guarantees the log directory exists
+    logger = setup_standard_logger(log_dir)
+    
+    logger.info(f"Starting cleanup - repos: {cleanup_repos}, containers: {cleanup_containers}, files: {cleanup_files}")
+
+    all_results = []
+
+    # Process repositories
+    for repo in cleanup_repos:
+        result = cleanup_repository(repo, base_path, logger)
+        all_results.append(result)
+        logger.info(f"Repository {repo}: {result['status']} - {result['message']}")
+
+    # Process containers
+    for container in cleanup_containers:
+        result = cleanup_container(container, base_path, logger)
+        all_results.append(result)
+        logger.info(f"Container {container}: {result['status']} - {result['message']}")
+
+    # Process files
+    for file in cleanup_files:
+        result = cleanup_file(file, base_path, logger)
+        all_results.append(result)
+        logger.info(f"File {file}: {result['status']} - {result['message']}")
+
+    # Write status file
+    status_file = write_cleanup_status(all_results, base_path)
+
+    # Calculate summary
+    total = len(all_results)
+    success = len([r for r in all_results if r['status'] == 'Success'])
+    failed = len([r for r in all_results if r['status'] == 'Failed'])
+
+    # Generate pretty table
+    pretty_table = format_pretty_table(all_results)
+
+    logger.info(f"Cleanup completed - Total: {total}, Success: {success}, Failed: {failed}")
+
+    module.exit_json(
+        changed=success > 0,
+        results=all_results,
+        total=total,
+        success_count=success,
+        failed_count=failed,
+        summary=f"Total: {total}, Success: {success}, Failed: {failed}",
+        pretty_table=pretty_table,
+        pretty_table_lines=pretty_table.split('\n'),
+        status_file=status_file
+    )
+
+
+if __name__ == '__main__':
+    run_module()  # entry point when the file is executed as a script
diff --git a/local_repo/pulp_cleanup.yml b/local_repo/pulp_cleanup.yml
new file mode 100644
index 0000000000..c07a8ef7b0
--- /dev/null
+++ b/local_repo/pulp_cleanup.yml
@@ -0,0 +1,96 @@
+# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Pulp Cleanup Playbook - Clean Architecture
+#
+# Usage:
+#   ansible-playbook pulp_cleanup.yml -e '{"cleanup_repos": ["epel", "baseos"]}'
+#   ansible-playbook pulp_cleanup.yml -e '{"cleanup_containers": ["nginx", "redis"]}'
+#   ansible-playbook pulp_cleanup.yml -e '{"cleanup_files": ["git", "chart-0.48.0"]}'
+#   ansible-playbook pulp_cleanup.yml -e '{"cleanup_repos": ["epel"], "cleanup_containers": ["nginx"]}' -e force=true
+
+- name: Pulp Cleanup
+  hosts: localhost
+  connection: local
+  gather_facts: false
+
+  pre_tasks:
+    # Step 1: Input Validation
+    - name: Validate input - at least one cleanup type must be specified
+      ansible.builtin.assert:
+        that:
+          - (cleanup_repos | default([]) | length > 0) or (cleanup_containers | default([]) | length > 0) or (cleanup_files | default([]) | length > 0)
+        fail_msg: |
+          No cleanup items specified. Please provide at least one of:
+            cleanup_repos: ['repo1', 'repo2']
+            cleanup_containers: ['container1', 'container2']
+            cleanup_files: ['file1', 'file2']
+
+    # Step 2: User Confirmation
+    - name: Parse cleanup lists
+      ansible.builtin.set_fact:
+        repo_list: "{{ ((cleanup_repos.split(',') if cleanup_repos != 'all' else []) if cleanup_repos is string else cleanup_repos) if cleanup_repos is defined else [] }}"
+        container_list: "{{ (cleanup_containers.split(',') if cleanup_containers is string else cleanup_containers) | default([]) }}"
+        file_list: "{{ (cleanup_files.split(',') if cleanup_files is string else cleanup_files) | default([]) }}"
+
+    - name: Display cleanup summary
+      ansible.builtin.debug:
+        msg:
+          - "========== CLEANUP SUMMARY =========="
+          #- "Repositories : {{ (cleanup_repos | default([]) | join(', ')) if cleanup_repos | default([]) | length > 0 else 'None' }}"
+          - "Repositories : {{ (repo_list | default([]) | join(', ')) if repo_list | default([]) | length > 0 else 'None' }}"
+          - "Containers   : {{ (container_list | default([]) | join(', ')) if cleanup_containers | default([]) | length > 0 else 'None' }}"
+          - "Files        : {{ (file_list | default([]) | join(', ')) if cleanup_files | default([]) | length > 0 else 'None' }}"
+          - "====================================="
+    - name: Get user confirmation
+      ansible.builtin.pause:
+        prompt: |
+
+          ⚠️  WARNING: This will permanently delete the specified artifacts.
+          This action cannot be undone.
+
+          Type 'yes' to continue or press Ctrl+C to abort
+      register: user_input
+      when: not (force | default(false)) | bool
+
+    - name: Abort if not confirmed
+      ansible.builtin.fail:
+        msg: "Cleanup cancelled by user"
+      when:
+        - not (force | default(false)) | bool
+        - user_input.user_input | default('') | lower != 'yes'
+
+  tasks:
+    # Step 3: Call Python Module
+    - name: Execute cleanup
+      pulp_cleanup:
+        cleanup_repos: "{{ repo_list | default([]) }}"
+        cleanup_containers: "{{ container_list | default([]) }}"
+        cleanup_files: "{{ file_list | default([]) }}"
+      register: cleanup_result
+
+  post_tasks:
+    # Step 4: Display Results
+    - name: Display cleanup results
+      ansible.builtin.debug:
+         msg: "{{ cleanup_result.pretty_table_lines }}"
+
+    - name: Display summary
+      ansible.builtin.debug:
+        msg:
+          - "========== CLEANUP COMPLETED =========="
+          - "Total: {{ cleanup_result.total }}, Success: {{ cleanup_result.success_count }}, Failed: {{ cleanup_result.failed_count }}"
+          - "Status file: {{ cleanup_result.status_file }}"
+          - "========================================"
+          
\ No newline at end of file

From c3874566676cc9fa781008c4d4c1c47f442b5ba2 Mon Sep 17 00:00:00 2001
From: pullan1 <sudha.pullalaravu@dell.com>
Date: Wed, 4 Feb 2026 14:25:59 +0530
Subject: [PATCH 017/172] ansible lint fixes

Signed-off-by: pullan1 <sudha.pullalaravu@dell.com>
---
 local_repo/pulp_cleanup.yml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/local_repo/pulp_cleanup.yml b/local_repo/pulp_cleanup.yml
index c07a8ef7b0..123b3a481f 100644
--- a/local_repo/pulp_cleanup.yml
+++ b/local_repo/pulp_cleanup.yml
@@ -48,7 +48,6 @@
       ansible.builtin.debug:
         msg:
           - "========== CLEANUP SUMMARY =========="
-          #- "Repositories : {{ (cleanup_repos | default([]) | join(', ')) if cleanup_repos | default([]) | length > 0 else 'None' }}"
           - "Repositories : {{ (repo_list | default([]) | join(', ')) if repo_list | default([]) | length > 0 else 'None' }}"
           - "Containers   : {{ (container_list | default([]) | join(', ')) if cleanup_containers | default([]) | length > 0 else 'None' }}"
           - "Files        : {{ (file_list | default([]) | join(', ')) if cleanup_files | default([]) | length > 0 else 'None' }}"
@@ -84,7 +83,7 @@
     # Step 4: Display Results
     - name: Display cleanup results
       ansible.builtin.debug:
-         msg: "{{ cleanup_result.pretty_table_lines }}"
+        msg: "{{ cleanup_result.pretty_table_lines }}"
 
     - name: Display summary
       ansible.builtin.debug:
@@ -93,4 +92,3 @@
           - "Total: {{ cleanup_result.total }}, Success: {{ cleanup_result.success_count }}, Failed: {{ cleanup_result.failed_count }}"
           - "Status file: {{ cleanup_result.status_file }}"
           - "========================================"
-          
\ No newline at end of file

From 675af6eff9caf49f0d3d38697a90d912d0200475 Mon Sep 17 00:00:00 2001
From: Vrinda_Marwah <vrinda.marwah@dell.com>
Date: Wed, 4 Feb 2026 12:47:51 +0000
Subject: [PATCH 018/172] fix for multiple user registries

Signed-off-by: Vrinda_Marwah <vrinda.marwah@dell.com>
---
 .../input_validation/schema/local_repo_config.json     |  8 +++-----
 .../module_utils/local_repo/user_image_utility.py      | 10 +++++-----
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/common/library/module_utils/input_validation/schema/local_repo_config.json b/common/library/module_utils/input_validation/schema/local_repo_config.json
index 63d61f0a31..e44cf44df7 100644
--- a/common/library/module_utils/input_validation/schema/local_repo_config.json
+++ b/common/library/module_utils/input_validation/schema/local_repo_config.json
@@ -17,17 +17,15 @@
           },
           "cert_path": {
             "type": "string",
-            "pattern": "^[a-zA-Z0-9/\\._-]*\\.crt$"
+            "pattern": "^$|^[a-zA-Z0-9/\\._-]*\\.crt$"
           },
           "key_path": {
             "type": "string",
-            "pattern": "^[a-zA-Z0-9/\\._-]*\\.key$"
+            "pattern": "^$|^[a-zA-Z0-9/\\._-]*\\.key$"
           }
         },
         "required": [
-          "host",
-          "cert_path",
-          "key_path"
+          "host"
         ],
         "allOf": [
           {
diff --git a/common/library/module_utils/local_repo/user_image_utility.py b/common/library/module_utils/local_repo/user_image_utility.py
index 2cbe1cba2d..e97e9411dd 100644
--- a/common/library/module_utils/local_repo/user_image_utility.py
+++ b/common/library/module_utils/local_repo/user_image_utility.py
@@ -395,11 +395,11 @@ def handle_user_image_registry(package, package_content, version_variables, user
                 logger.info(f"Image '{image_name}:{tag_val}' found in registry '{host}'")
                 result, package_info = process_user_registry(package, host, package_content, version_variables, cacert, key, logger)
                 break
-            
-            elif not image_found:
-                logger.info(f"Image '{image_name}:{tag_val}' not found in registry '{host}'")
-                result = False
-                break
+            else:
+                logger.info(f"Image '{image_name}:{tag_val}' not found in registry '{host}', checking next registry...")
+        else:
+            logger.info(f"Image '{image_name}:{tag_val}' not found in any user registry")
+            result = False
 
     except Exception as e:
         logger.error(f"Exception in {handle_user_image_registry.__name__}: {e}")

From e11eebd8ce6501c487094db88ef5e8100a33ae02 Mon Sep 17 00:00:00 2001
From: pullan1 <sudha.pullalaravu@dell.com>
Date: Wed, 4 Feb 2026 20:11:32 +0530
Subject: [PATCH 019/172] status file update after cleanup

Signed-off-by: pullan1 <sudha.pullalaravu@dell.com>
---
 common/library/modules/pulp_cleanup.py | 109 +++++++++++++------------
 1 file changed, 59 insertions(+), 50 deletions(-)

diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py
index 10c43ca0e9..91b863144a 100644
--- a/common/library/modules/pulp_cleanup.py
+++ b/common/library/modules/pulp_cleanup.py
@@ -258,7 +258,7 @@ def cleanup_repository(name: str, base_path: str, logger) -> Dict[str, Any]:
             # Update status files - remove RPM entries from this repo and mark software as partial
             affected = remove_rpms_from_repository(name, base_path, logger)
             logger.info(f" mark affected softwares as partial {affected}")
-            mark_software_partial(affected, base_path, logger)
+            mark_software_partial(affected, base_path, logger, 'repository')
         else:
             result["message"] = f"Delete failed: {del_result['stderr']}"
             
@@ -307,7 +307,7 @@ def cleanup_container(user_input: str, base_path: str, logger) -> Dict[str, Any]
             result["message"] = "Container deleted"
             # Update status files - remove image entries and mark software as partial
             affected = remove_from_status_files(user_input, 'image', base_path, logger)
-            mark_software_partial(affected, base_path, logger)
+            mark_software_partial(affected, base_path, logger, 'image')
         else:
             result["message"] = f"Delete failed: {del_result['stderr']}"
             
@@ -457,7 +457,7 @@ def cleanup_pip_module(name: str, base_path: str, logger) -> Dict[str, Any]:
             affected = remove_from_status_files(name, 'pip_module', base_path, logger)
             if affected:
                 messages.append("Status files updated")
-                mark_software_partial(affected, base_path, logger)
+                mark_software_partial(affected, base_path, logger, 'pip_module')
         
         if pulp_deleted:
             result["status"] = "Success"
@@ -549,7 +549,7 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) -
             if affected:
                 status_removed = True
                 messages.append("Status files updated")
-                mark_software_partial(affected, base_path, logger)
+                mark_software_partial(affected, base_path, logger, file_type)
         
         # Determine overall result
         if pulp_deleted or status_removed:
@@ -647,8 +647,8 @@ def remove_rpms_from_repository(repo_name: str, base_path: str, logger) -> List[
         logger.error(f"Failed to remove RPMs from repository {repo_name}: {e}")
         return []
 
-def remove_from_status_files(artifact_name: str, artifact_type: str, base_path: str, logger) -> List[str]:
-    """Remove artifact from status.csv files and return affected software names.
+def remove_from_status_files(artifact_name: str, artifact_type: str, base_path: str, logger) -> Dict[str, List[str]]:
+    """Remove artifact from status.csv files and return affected software names by architecture.
     
     Args:
         artifact_name: Name of the artifact to remove
@@ -657,11 +657,12 @@ def remove_from_status_files(artifact_name: str, artifact_type: str, base_path:
         logger: Logger instance
         
     Returns:
-        List of software names that were affected
+        Dict mapping architecture to list of affected software names
     """
-    affected_software = []
+    affected_software = {}
     try:
         for arch in ARCH_SUFFIXES:
+            arch_affected = []
             for status_file in glob.glob(f"{base_path}/{arch}/*/status.csv"):
                 rows = []
                 removed = False
@@ -671,27 +672,20 @@ def remove_from_status_files(artifact_name: str, artifact_type: str, base_path:
                     for row in reader:
                         name = row.get('name', '')
                         row_type = row.get('type', '')
+                        # Match logic based on type
+                        should_remove = False
+                        if artifact_type == 'image':
+                            # Container images: match with or without tag
+                            should_remove = (name == artifact_name or name.startswith(f"{artifact_name}:"))
+                        else:
+                            # Other types: exact match
+                            should_remove = (name == artifact_name)
 
-                        if name == artifact_name and row_type == artifact_type:
+                        if should_remove:
                             removed = True
-                            logger.info(f"Removing {artifact_type} '{name}' from {status_file}")
+                            logger.info(f"Removing '{name}' from {status_file}")
                         else:
                             rows.append(row)
-                        
-                        # # Match logic based on type
-                        # should_remove = False
-                        # if artifact_type == 'image':
-                        #     # Container images: match with or without tag
-                        #     should_remove = (name == artifact_name or name.startswith(f"{artifact_name}:"))
-                        # else:
-                        #     # Other types: exact match
-                        #     should_remove = (name == artifact_name)
-                        
-                        # if should_remove:
-                        #     removed = True
-                        #     logger.info(f"Removing '{name}' from {status_file}")
-                        # else:
-                        #     rows.append(row)
                 
                 if removed and fieldnames:
                     with open(status_file, 'w', newline='') as f:
@@ -701,47 +695,62 @@ def remove_from_status_files(artifact_name: str, artifact_type: str, base_path:
                     
                     # Track affected software
                     software_name = os.path.basename(os.path.dirname(status_file))
-                    if software_name not in affected_software:
-                        affected_software.append(software_name)
+                    if software_name not in arch_affected:
+                        arch_affected.append(software_name)
+
+            if arch_affected:
+                affected_software[arch] = arch_affected
                     
+        logger.info(f"remove_from_status_files returning: {affected_software}")        
         return affected_software
     except Exception as e:
         logger.error(f"Failed to remove from status files: {e}")
-        return []
+        return {}
 
 
-def mark_software_partial(software_names: List[str], base_path: str, logger):
+def mark_software_partial(affected_software: Dict[str, List[str]], base_path: str, logger, artifact_type: str = None):
     """Mark software entries as partial in software.csv.
     
     Args:
-        software_names: List of software names to mark as partial
+        affected_software: Dict mapping architecture to affected software names (a flat list is applied to every architecture)
         base_path: Base path for software.csv
         logger: Logger instance
+        artifact_type: Type of artifact being removed (for logging purposes)
     """
-    if not software_names:
+    logger.info(f"mark_software_partial called with affected_software: {affected_software}")
+    if not affected_software:
+        logger.info("No affected software to mark as partial")
         return
         
     try:
-        for arch in ARCH_SUFFIXES:
-            software_file = f"{base_path}/{arch}/software.csv"
-            if not os.path.exists(software_file):
+        per_arch = affected_software if isinstance(affected_software, dict) else dict.fromkeys(ARCH_SUFFIXES, list(affected_software))
+        for arch, software_names in per_arch.items():  # dict input: only architectures where artifacts were removed
+            logger.info(f"Processing arch: {arch}, software_names: {software_names}")
+            if not software_names:
                 continue
-            
-            rows = []
-            with open(software_file, 'r') as f:
-                reader = csv.DictReader(f)
-                fieldnames = reader.fieldnames
-                for row in reader:
-                    if row.get('name') in software_names:
-                        row['status'] = 'partial'
-                        logger.info(f"Marked '{row.get('name')}' as {GREEN}partial{RESET} in software.csv")
-                    rows.append(row)
-            
-            if fieldnames and rows:
-                with open(software_file, 'w', newline='') as f:
-                    writer = csv.DictWriter(f, fieldnames=fieldnames)
-                    writer.writeheader()
-                    writer.writerows(rows)
+
+            software_file = f"{base_path}/{arch}/software.csv"
+            logger.info(f"Looking for software file: {software_file}")
+            if os.path.exists(software_file):
+                rows = []
+                updated = False
+                with open(software_file, 'r') as f:
+                    reader = csv.DictReader(f)
+                    fieldnames = reader.fieldnames
+                    for row in reader:
+                        logger.info(f"Checking row: {row}")
+                        if row.get('name') in software_names:
+                            row['status'] = 'partial'
+                            updated = True
+                            logger.info(f"Marked '{row.get('name')}' as {GREEN}partial{RESET} in {arch}/software.csv ({artifact_type} cleanup)")
+                        rows.append(row)
+
+                if updated and fieldnames:  # rewrite only when a row was actually changed
+                    with open(software_file, 'w', newline='') as f:
+                        writer = csv.DictWriter(f, fieldnames=fieldnames)
+                        writer.writeheader()
+                        writer.writerows(rows)
+                    logger.info(f"Successfully wrote updated software.csv for {arch}")
     except Exception as e:
         logger.error(f"Failed to update software.csv: {e}")
 

From 329737f52107174ff791f25eb759eaf0d31492fd Mon Sep 17 00:00:00 2001
From: balajikumaran-c-s <balajikumaran.cs@dellteam.com>
Date: Wed, 4 Feb 2026 15:01:33 +0000
Subject: [PATCH 020/172] Fix aarch64 base image package fact name

---
 .../roles/fetch_packages/tasks/fetch_packages.yml             | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/build_image_aarch64/roles/fetch_packages/tasks/fetch_packages.yml b/build_image_aarch64/roles/fetch_packages/tasks/fetch_packages.yml
index e5bc523294..40c6b1092c 100644
--- a/build_image_aarch64/roles/fetch_packages/tasks/fetch_packages.yml
+++ b/build_image_aarch64/roles/fetch_packages/tasks/fetch_packages.yml
@@ -24,9 +24,9 @@
         software_config_path: "{{ software_config_file_path }}"
       register: base_image_output
 
-    - name: Set x86_64_base_image_packages
+    - name: Set aarch64_base_image_packages
       ansible.builtin.set_fact:
-        x86_64_base_image_packages: "{{ base_image_output.base_image_packages }}"
+        aarch64_base_image_packages: "{{ base_image_output.base_image_packages }}"
 
     - name: Debug package aarch64_base_image_packages
       ansible.builtin.debug:

From c9172e9a431d09122ae507629c017156786c4950 Mon Sep 17 00:00:00 2001
From: Abhishek S A <abhishek.sa3@dell.com>
Date: Wed, 4 Feb 2026 20:47:47 +0530
Subject: [PATCH 021/172] victoria connect details

---
 utils/external_victoria_connect_details.yml   |  21 +++
 .../tasks/main.yml                            | 165 ++++++++++++++++++
 .../vars/main.yml                             |  18 ++
 3 files changed, 204 insertions(+)
 create mode 100644 utils/external_victoria_connect_details.yml
 create mode 100644 utils/roles/external_victoria_connect_details/tasks/main.yml
 create mode 100644 utils/roles/external_victoria_connect_details/vars/main.yml

diff --git a/utils/external_victoria_connect_details.yml b/utils/external_victoria_connect_details.yml
new file mode 100644
index 0000000000..3d29b4f720
--- /dev/null
+++ b/utils/external_victoria_connect_details.yml
@@ -0,0 +1,21 @@
+#  Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Fetch external Victoria connection details
+  hosts: service_kube_control_plane
+  connection: ssh
+  gather_facts: false
+  roles:
+    - external_victoria_connect_details
diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml
new file mode 100644
index 0000000000..9230879781
--- /dev/null
+++ b/utils/roles/external_victoria_connect_details/tasks/main.yml
@@ -0,0 +1,165 @@
+#  Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Check kubectl presence
+  ansible.builtin.command: kubectl version --client=true
+  register: kubectl_check
+  changed_when: false
+  failed_when: kubectl_check.rc != 0
+
+- name: Get Victoria pods status
+  ansible.builtin.command: >-
+    kubectl get pods -n {{ victoria_namespace }}
+    -l app in (vminsert,vmselect,vmstorage,victoriametrics)
+    -o wide
+  register: victoria_pods
+  changed_when: false
+  failed_when: victoria_pods.rc != 0
+
+- name: Get vminsert service LoadBalancer IP
+  ansible.builtin.command: >-
+    kubectl get svc vminsert -n {{ victoria_namespace }}
+    -o jsonpath='{.status.loadBalancer.ingress[0].ip}'
+  register: vminsert_lb_ip
+  changed_when: false
+  failed_when: false
+
+- name: Get vminsert service LoadBalancer hostname
+  ansible.builtin.command: >-
+    kubectl get svc vminsert -n {{ victoria_namespace }}
+    -o jsonpath='{.status.loadBalancer.ingress[0].hostname}'
+  register: vminsert_lb_hostname
+  changed_when: false
+  failed_when: false
+
+- name: Get vminsert service external port
+  ansible.builtin.command: >-
+    kubectl get svc vminsert -n {{ victoria_namespace }}
+    -o jsonpath='{.spec.ports[0].port}'
+  register: vminsert_lb_port
+  changed_when: false
+  failed_when: false
+
+- name: Get vmselect service LoadBalancer IP
+  ansible.builtin.command: >-
+    kubectl get svc vmselect -n {{ victoria_namespace }}
+    -o jsonpath='{.status.loadBalancer.ingress[0].ip}'
+  register: vmselect_lb_ip
+  changed_when: false
+  failed_when: false
+
+- name: Get vmselect service LoadBalancer hostname
+  ansible.builtin.command: >-
+    kubectl get svc vmselect -n {{ victoria_namespace }}
+    -o jsonpath='{.status.loadBalancer.ingress[0].hostname}'
+  register: vmselect_lb_hostname
+  changed_when: false
+  failed_when: false
+
+- name: Get vmselect service external port
+  ansible.builtin.command: >-
+    kubectl get svc vmselect -n {{ victoria_namespace }}
+    -o jsonpath='{.spec.ports[0].port}'
+  register: vmselect_lb_port
+  changed_when: false
+  failed_when: false
+
+- name: Set endpoint facts
+  ansible.builtin.set_fact:
+    vminsert_host: >-
+      {{
+        (vminsert_lb_ip.stdout | trim)
+        if (vminsert_lb_ip.stdout | trim | length) > 0
+        else (vminsert_lb_hostname.stdout | trim)
+      }}
+    vmselect_host: >-
+      {{
+        (vmselect_lb_ip.stdout | trim)
+        if (vmselect_lb_ip.stdout | trim | length) > 0
+        else (vmselect_lb_hostname.stdout | trim)
+      }}
+    vminsert_port: "{{ (vminsert_lb_port.stdout | trim) | default('') }}"
+    vmselect_port: "{{ (vmselect_lb_port.stdout | trim) | default('') }}"
+    victoria_tls_ca: "{{ victoria_tls_cert_dir }}/ca.crt"
+    victoria_tls_cert: "{{ victoria_tls_cert_dir }}/server.crt"
+    victoria_tls_key: "{{ victoria_tls_cert_dir }}/server.key"
+
+- name: Fail when LoadBalancer IPs are not available
+  ansible.builtin.fail:
+    msg: >-
+      Failed to fetch Victoria LoadBalancer IP(s). Ensure services 'vminsert' and 'vmselect'
+      exist in namespace '{{ victoria_namespace }}' and have external IPs assigned.
+  when:
+    - vminsert_host | trim | length == 0 or vmselect_host | trim | length == 0
+
+- name: Set Victoria external port fallbacks
+  ansible.builtin.set_fact:
+    vminsert_port: "{{ vminsert_port | trim | default('8480', true) }}"
+    vmselect_port: "{{ vmselect_port | trim | default('8481', true) }}"
+  when:
+    - vminsert_port | trim | length == 0 or vmselect_port | trim | length == 0
+
+- name: Build connection details
+  ansible.builtin.set_fact:
+    victoria_connect_details:
+      victoria:
+        namespace: "{{ victoria_namespace }}"
+        pod_status: "{{ victoria_pods.stdout }}"
+        base_url: "https://{{ vminsert_host }}:{{ vminsert_port }}"
+        endpoints:
+          vminsert:
+            host: "{{ vminsert_host }}"
+            port: "{{ vminsert_port | int }}"
+            write_endpoint: "https://{{ vminsert_host }}:{{ vminsert_port }}/insert/0/prometheus/api/v1/write"
+          vmselect:
+            host: "{{ vmselect_host }}"
+            port: "{{ vmselect_port | int }}"
+            query_endpoint: "https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/prometheus/api/v1/query"
+        tls:
+          ca_crt: "{{ victoria_tls_ca }}"
+          server_crt: "{{ victoria_tls_cert }}"
+          server_key: "{{ victoria_tls_key }}"
+
+- name: Ensure output directory exists
+  ansible.builtin.file:
+    path: "{{ victoria_output_file | dirname }}"
+    state: directory
+    mode: "0755"
+  delegate_to: localhost
+  connection: local
+  run_once: true
+
+- name: Write connection details to file
+  ansible.builtin.copy:
+    content: "{{ victoria_connect_details | to_nice_yaml }}"
+    dest: "{{ victoria_output_file }}"
+    mode: "0644"
+  delegate_to: localhost
+  connection: local
+  run_once: true
+
+- name: Display Victoria connection details
+  ansible.builtin.debug:
+    msg:
+      - "Victoria connection details written to: {{ victoria_output_file }}"
+      - "vminsert: https://{{ vminsert_host }}:{{ vminsert_port }}/insert/0/prometheus/api/v1/write"
+      - "vmselect: https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/prometheus/api/v1/query"
+      - "TLS CA: {{ victoria_tls_ca }}"
+      - "TLS cert: {{ victoria_tls_cert }}"
+      - "TLS key: {{ victoria_tls_key }}"
+      - "Pods:\n{{ victoria_pods.stdout }}"
+  delegate_to: localhost
+  connection: local
+  run_once: true
diff --git a/utils/roles/external_victoria_connect_details/vars/main.yml b/utils/roles/external_victoria_connect_details/vars/main.yml
new file mode 100644
index 0000000000..29db9136f2
--- /dev/null
+++ b/utils/roles/external_victoria_connect_details/vars/main.yml
@@ -0,0 +1,18 @@
+#  Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+victoria_namespace: "telemetry"
+victoria_output_file: "/opt/omnia/telemetry/external_victoria_connect_details.yml"
+victoria_tls_cert_dir: "/opt/omnia/telemetry/victoria-certs"

From d4ee4628bd1e6bc95544b551d4ce33d1fd7441f4 Mon Sep 17 00:00:00 2001
From: Abhishek S A <abhishek.sa3@dell.com>
Date: Wed, 4 Feb 2026 20:53:38 +0530
Subject: [PATCH 022/172] Update main.yml

---
 utils/roles/external_victoria_connect_details/tasks/main.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml
index 9230879781..9e194436b7 100644
--- a/utils/roles/external_victoria_connect_details/tasks/main.yml
+++ b/utils/roles/external_victoria_connect_details/tasks/main.yml
@@ -20,9 +20,9 @@
   failed_when: kubectl_check.rc != 0
 
 - name: Get Victoria pods status
-  ansible.builtin.command: >-
+  ansible.builtin.shell: >-
     kubectl get pods -n {{ victoria_namespace }}
-    -l app in (vminsert,vmselect,vmstorage,victoriametrics)
+    -l "app in (vminsert,vmselect,vmstorage,victoriametrics)"
     -o wide
   register: victoria_pods
   changed_when: false

From aabcbb1d47aad173edef09216378b8c98bdab43a Mon Sep 17 00:00:00 2001
From: Abhishek S A <abhishek.sa3@dell.com>
Date: Wed, 4 Feb 2026 21:07:59 +0530
Subject: [PATCH 023/172] Update main.yml

---
 .../tasks/main.yml                            | 24 +++++++++++--------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml
index 9e194436b7..bdcb15ca0b 100644
--- a/utils/roles/external_victoria_connect_details/tasks/main.yml
+++ b/utils/roles/external_victoria_connect_details/tasks/main.yml
@@ -127,10 +127,9 @@
             host: "{{ vmselect_host }}"
             port: "{{ vmselect_port | int }}"
             query_endpoint: "https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/prometheus/api/v1/query"
+            ui_url: "https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/vmui"
         tls:
-          ca_crt: "{{ victoria_tls_ca }}"
           server_crt: "{{ victoria_tls_cert }}"
-          server_key: "{{ victoria_tls_key }}"
 
 - name: Ensure output directory exists
   ansible.builtin.file:
@@ -152,14 +151,19 @@
 
 - name: Display Victoria connection details
   ansible.builtin.debug:
-    msg:
-      - "Victoria connection details written to: {{ victoria_output_file }}"
-      - "vminsert: https://{{ vminsert_host }}:{{ vminsert_port }}/insert/0/prometheus/api/v1/write"
-      - "vmselect: https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/prometheus/api/v1/query"
-      - "TLS CA: {{ victoria_tls_ca }}"
-      - "TLS cert: {{ victoria_tls_cert }}"
-      - "TLS key: {{ victoria_tls_key }}"
-      - "Pods:\n{{ victoria_pods.stdout }}"
+    msg: |
+      Victoria connection details written to: {{ victoria_output_file }}
+
+      Endpoints:
+        vminsert write: https://{{ vminsert_host }}:{{ vminsert_port }}/insert/0/prometheus/api/v1/write
+        vmselect query: https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/prometheus/api/v1/query
+        vmselect UI:    https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/vmui
+
+      TLS:
+        server.crt: {{ victoria_tls_cert }}
+
+      Pods:
+      {{ victoria_pods.stdout }}
   delegate_to: localhost
   connection: local
   run_once: true

From 9bd5250b26e756c1c659aae85313ab7387013a4a Mon Sep 17 00:00:00 2001
From: Abhishek S A <abhishek.sa3@dell.com>
Date: Wed, 4 Feb 2026 21:11:22 +0530
Subject: [PATCH 024/172] Update external_victoria_connect_details.yml

---
 utils/external_victoria_connect_details.yml | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/utils/external_victoria_connect_details.yml b/utils/external_victoria_connect_details.yml
index 3d29b4f720..ad4ed542df 100644
--- a/utils/external_victoria_connect_details.yml
+++ b/utils/external_victoria_connect_details.yml
@@ -13,6 +13,19 @@
 #  limitations under the License.
 ---
 
+- name: Preflight - validate inventory
+  hosts: localhost
+  connection: local
+  gather_facts: false
+  tasks:
+    - name: Fail if service_kube_control_plane group is missing or empty
+      ansible.builtin.fail:
+        msg: >-
+          Inventory must define a non-empty 'service_kube_control_plane' group.
+          Run with '-i <inventory>' and ensure at least one host is in that group.
+      when:
+        - groups['service_kube_control_plane'] is not defined or (groups['service_kube_control_plane'] | length) == 0
+
 - name: Fetch external Victoria connection details
   hosts: service_kube_control_plane
   connection: ssh

From 01c213ba7b9ce758fc41d7bb388b1a8da357cd63 Mon Sep 17 00:00:00 2001
From: Abhishek S A <abhishek.sa3@dell.com>
Date: Wed, 4 Feb 2026 21:26:25 +0530
Subject: [PATCH 025/172] Update main.yml

---
 .../tasks/main.yml                            | 29 ++++++++++---------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml
index bdcb15ca0b..d1b0286d76 100644
--- a/utils/roles/external_victoria_connect_details/tasks/main.yml
+++ b/utils/roles/external_victoria_connect_details/tasks/main.yml
@@ -151,19 +151,22 @@
 
 - name: Display Victoria connection details
   ansible.builtin.debug:
-    msg: |
-      Victoria connection details written to: {{ victoria_output_file }}
-
-      Endpoints:
-        vminsert write: https://{{ vminsert_host }}:{{ vminsert_port }}/insert/0/prometheus/api/v1/write
-        vmselect query: https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/prometheus/api/v1/query
-        vmselect UI:    https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/vmui
-
-      TLS:
-        server.crt: {{ victoria_tls_cert }}
-
-      Pods:
-      {{ victoria_pods.stdout }}
+    msg: >-
+      {{
+        [
+          'Victoria connection details written to: ' ~ victoria_output_file,
+          '',
+          'Endpoints:',
+          '  vminsert write: https://' ~ vminsert_host ~ ':' ~ vminsert_port ~ '/insert/0/prometheus/api/v1/write',
+          '  vmselect query: https://' ~ vmselect_host ~ ':' ~ vmselect_port ~ '/select/0/prometheus/api/v1/query',
+          '  vmselect UI:    https://' ~ vmselect_host ~ ':' ~ vmselect_port ~ '/select/0/vmui',
+          '',
+          'TLS:',
+          '  server.crt: ' ~ victoria_tls_cert,
+          '',
+          'Pods:'
+        ] + (victoria_pods.stdout_lines | default([]))
+      }}
   delegate_to: localhost
   connection: local
   run_once: true

From e7887b800ed44bfe3e5b6a9d4110c1a3a7afe39c Mon Sep 17 00:00:00 2001
From: Abhishek S A <abhishek.sa3@dell.com>
Date: Wed, 4 Feb 2026 21:32:39 +0530
Subject: [PATCH 026/172] Update main.yml

---
 .../tasks/main.yml                            | 105 +++++++++++++++---
 1 file changed, 92 insertions(+), 13 deletions(-)

diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml
index d1b0286d76..851528e9eb 100644
--- a/utils/roles/external_victoria_connect_details/tasks/main.yml
+++ b/utils/roles/external_victoria_connect_details/tasks/main.yml
@@ -19,14 +19,92 @@
   changed_when: false
   failed_when: kubectl_check.rc != 0
 
+- name: Check for Victoria cluster services
+  ansible.builtin.command: >-
+    kubectl get svc {{ item }} -n {{ victoria_namespace }} -o name
+  loop:
+    - vminsert
+    - vmselect
+  register: victoria_cluster_svcs
+  changed_when: false
+  failed_when: false
+
+- name: Check for Victoria single-node service
+  ansible.builtin.command: >-
+    kubectl get svc victoria-loadbalancer -n {{ victoria_namespace }} -o name
+  register: victoria_single_svc
+  changed_when: false
+  failed_when: false
+
+- name: Set Victoria deployment mode
+  ansible.builtin.set_fact:
+    victoria_deployment_mode: >-
+      {{
+        'cluster'
+        if (victoria_cluster_svcs.results | selectattr('rc', 'equalto', 0) | list | length) == 2
+        else ('single-node' if victoria_single_svc.rc == 0 else 'unknown')
+      }}
+
+- name: Fail if Victoria cluster mode is not deployed
+  ansible.builtin.fail:
+    msg: >-
+      Victoria deployment mode detected: {{ victoria_deployment_mode }}.
+      External integration is supported only for Victoria cluster mode (vminsert/vmselect/vmstorage).
+      Single-node Victoria (victoria-loadbalancer) is not supported for external integration.
+  when: victoria_deployment_mode != 'cluster'
+
 - name: Get Victoria pods status
   ansible.builtin.shell: >-
     kubectl get pods -n {{ victoria_namespace }}
     -l "app in (vminsert,vmselect,vmstorage,victoriametrics)"
     -o wide
-  register: victoria_pods
+  register: victoria_pods_wide
+  changed_when: false
+  failed_when: victoria_pods_wide.rc != 0
+
+- name: Get Victoria pods status (json)
+  ansible.builtin.shell: >-
+    kubectl get pods -n {{ victoria_namespace }}
+    -l "app in (vminsert,vmselect,vmstorage,victoriametrics)"
+    -o json
+  register: victoria_pods_json
   changed_when: false
-  failed_when: victoria_pods.rc != 0
+  failed_when: victoria_pods_json.rc != 0
+
+- name: Parse Victoria pods
+  ansible.builtin.set_fact:
+    victoria_pods_parsed: "{{ victoria_pods_json.stdout | from_json }}"
+
+- name: Fail if no Victoria pods found
+  ansible.builtin.fail:
+    msg: "No Victoria pods found in namespace '{{ victoria_namespace }}'."
+  when: (victoria_pods_parsed.items | default([]) | length) == 0
+
+- name: Fail if Victoria pods are not Running
+  ansible.builtin.fail:
+    msg: "One or more Victoria pods are not in Running state."
+  when: >-
+    {{
+      (victoria_pods_parsed.items | default([])
+      | selectattr('status.phase', 'ne', 'Running')
+      | list
+      | length) > 0
+    }}
+
+- name: Fail if Victoria pods are not Ready
+  ansible.builtin.fail:
+    msg: "One or more Victoria pods are not Ready."
+  when: >-
+    {{
+      (victoria_pods_parsed.items | default([])
+      | selectattr('status.containerStatuses', 'defined')
+      | map(attribute='status.containerStatuses')
+      | list
+      | flatten
+      | selectattr('ready', 'equalto', false)
+      | list
+      | length) > 0
+    }}
 
 - name: Get vminsert service LoadBalancer IP
   ansible.builtin.command: >-
@@ -34,7 +112,7 @@
     -o jsonpath='{.status.loadBalancer.ingress[0].ip}'
   register: vminsert_lb_ip
   changed_when: false
-  failed_when: false
+  failed_when: vminsert_lb_ip.rc != 0
 
 - name: Get vminsert service LoadBalancer hostname
   ansible.builtin.command: >-
@@ -42,7 +120,7 @@
     -o jsonpath='{.status.loadBalancer.ingress[0].hostname}'
   register: vminsert_lb_hostname
   changed_when: false
-  failed_when: false
+  failed_when: vminsert_lb_hostname.rc != 0
 
 - name: Get vminsert service external port
   ansible.builtin.command: >-
@@ -50,7 +128,7 @@
     -o jsonpath='{.spec.ports[0].port}'
   register: vminsert_lb_port
   changed_when: false
-  failed_when: false
+  failed_when: vminsert_lb_port.rc != 0
 
 - name: Get vmselect service LoadBalancer IP
   ansible.builtin.command: >-
@@ -58,7 +136,7 @@
     -o jsonpath='{.status.loadBalancer.ingress[0].ip}'
   register: vmselect_lb_ip
   changed_when: false
-  failed_when: false
+  failed_when: vmselect_lb_ip.rc != 0
 
 - name: Get vmselect service LoadBalancer hostname
   ansible.builtin.command: >-
@@ -66,7 +144,7 @@
     -o jsonpath='{.status.loadBalancer.ingress[0].hostname}'
   register: vmselect_lb_hostname
   changed_when: false
-  failed_when: false
+  failed_when: vmselect_lb_hostname.rc != 0
 
 - name: Get vmselect service external port
   ansible.builtin.command: >-
@@ -74,7 +152,7 @@
     -o jsonpath='{.spec.ports[0].port}'
   register: vmselect_lb_port
   changed_when: false
-  failed_when: false
+  failed_when: vmselect_lb_port.rc != 0
 
 - name: Set endpoint facts
   ansible.builtin.set_fact:
@@ -116,7 +194,8 @@
     victoria_connect_details:
       victoria:
         namespace: "{{ victoria_namespace }}"
-        pod_status: "{{ victoria_pods.stdout }}"
+        deployment_mode: "{{ victoria_deployment_mode }}"
+        pod_status: "{{ victoria_pods_wide.stdout }}"
         base_url: "https://{{ vminsert_host }}:{{ vminsert_port }}"
         endpoints:
           vminsert:
@@ -156,16 +235,16 @@
         [
           'Victoria connection details written to: ' ~ victoria_output_file,
           '',
+          'Mode: ' ~ victoria_deployment_mode,
+          '',
           'Endpoints:',
           '  vminsert write: https://' ~ vminsert_host ~ ':' ~ vminsert_port ~ '/insert/0/prometheus/api/v1/write',
           '  vmselect query: https://' ~ vmselect_host ~ ':' ~ vmselect_port ~ '/select/0/prometheus/api/v1/query',
           '  vmselect UI:    https://' ~ vmselect_host ~ ':' ~ vmselect_port ~ '/select/0/vmui',
           '',
           'TLS:',
-          '  server.crt: ' ~ victoria_tls_cert,
-          '',
-          'Pods:'
-        ] + (victoria_pods.stdout_lines | default([]))
+          '  server.crt: ' ~ victoria_tls_cert
+        ]
       }}
   delegate_to: localhost
   connection: local

From ed8ef8fbb21c5165c7acb36856ad1f93bfabee38 Mon Sep 17 00:00:00 2001
From: Abhishek S A <abhishek.sa3@dell.com>
Date: Wed, 4 Feb 2026 21:35:51 +0530
Subject: [PATCH 027/172] Update main.yml

---
 .../roles/external_victoria_connect_details/tasks/main.yml  | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml
index 851528e9eb..758719043c 100644
--- a/utils/roles/external_victoria_connect_details/tasks/main.yml
+++ b/utils/roles/external_victoria_connect_details/tasks/main.yml
@@ -78,14 +78,14 @@
 - name: Fail if no Victoria pods found
   ansible.builtin.fail:
     msg: "No Victoria pods found in namespace '{{ victoria_namespace }}'."
-  when: (victoria_pods_parsed.items | default([]) | length) == 0
+  when: (victoria_pods_parsed.get('items', []) | length) == 0
 
 - name: Fail if Victoria pods are not Running
   ansible.builtin.fail:
     msg: "One or more Victoria pods are not in Running state."
   when: >-
     {{
-      (victoria_pods_parsed.items | default([])
+      (victoria_pods_parsed.get('items', [])
       | selectattr('status.phase', 'ne', 'Running')
       | list
       | length) > 0
@@ -96,7 +96,7 @@
     msg: "One or more Victoria pods are not Ready."
   when: >-
     {{
-      (victoria_pods_parsed.items | default([])
+      (victoria_pods_parsed.get('items', [])
       | selectattr('status.containerStatuses', 'defined')
       | map(attribute='status.containerStatuses')
       | list

From b8e806ecc665764533d41116558060e79b04e325 Mon Sep 17 00:00:00 2001
From: Abhishek S A <abhishek.sa3@dell.com>
Date: Wed, 4 Feb 2026 21:38:54 +0530
Subject: [PATCH 028/172] Update main.yml

---
 .../external_victoria_connect_details/tasks/main.yml | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml
index 758719043c..4ec7f0c901 100644
--- a/utils/roles/external_victoria_connect_details/tasks/main.yml
+++ b/utils/roles/external_victoria_connect_details/tasks/main.yml
@@ -83,20 +83,17 @@
 - name: Fail if Victoria pods are not Running
   ansible.builtin.fail:
     msg: "One or more Victoria pods are not in Running state."
-  when: >-
-    {{
-      (victoria_pods_parsed.get('items', [])
+  when:
+    - (victoria_pods_parsed.get('items', [])
       | selectattr('status.phase', 'ne', 'Running')
       | list
       | length) > 0
-    }}
 
 - name: Fail if Victoria pods are not Ready
   ansible.builtin.fail:
     msg: "One or more Victoria pods are not Ready."
-  when: >-
-    {{
-      (victoria_pods_parsed.get('items', [])
+  when:
+    - (victoria_pods_parsed.get('items', [])
       | selectattr('status.containerStatuses', 'defined')
       | map(attribute='status.containerStatuses')
       | list
@@ -104,7 +101,6 @@
       | selectattr('ready', 'equalto', false)
       | list
       | length) > 0
-    }}
 
 - name: Get vminsert service LoadBalancer IP
   ansible.builtin.command: >-

From 0262dd365f21b052b116eab7006859f46ad60e84 Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Wed, 4 Feb 2026 21:40:53 +0530
Subject: [PATCH 029/172] The input for the custom confs now exists on the core
 container

---
 discovery/roles/slurm_config/tasks/confs.yml | 44 +++++++++++++++++---
 input/omnia_config.yml                       |  4 +-
 2 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml
index f3228fa460..e1b4e2d3ea 100644
--- a/discovery/roles/slurm_config/tasks/confs.yml
+++ b/discovery/roles/slurm_config/tasks/confs.yml
@@ -27,7 +27,8 @@
 - name: Slurm dbd opts
   ansible.builtin.set_fact:
     apply_config: "{{ apply_config | default({})
-     | combine({'slurmdbd': (apply_config['slurmdbd'] | combine({'DbdHost': ctld_list[0], 'StorageHost': ctld_list[0]}))}) }}"
+     | combine({'slurmdbd': (apply_config['slurmdbd']
+     | combine({'DbdHost': ctld_list[0], 'StorageHost': ctld_list[0]}))}) }}"
   when: ctld_list
 
 - name: Check .conf files existence
@@ -37,21 +38,52 @@
   loop: "{{ ctld_list | product(conf_files | default([])) }}"
   register: ctld_conf_files
 
+- name: Parse configs_input files from localhost (if they are paths)
+  slurm_conf:
+    op: parse
+    conf_name: "{{ item.key }}"
+    path: "{{ item.value }}"
+  delegate_to: localhost
+  loop: "{{ configs_input | default({}) | dict2items }}"
+  register: parsed_configs_input_results
+  when:
+    - configs_input is defined
+    - configs_input
+    - item.value is string and item.value is abs  # 'abs' is a path test; guard so mapping values are skipped
+
+- name: Build parsed_configs_input dictionary from parsed files
+  ansible.builtin.set_fact:
+    parsed_configs_input: "{{ parsed_configs_input | default({}) | combine({item.item.key: item.conf_dict}) }}"
+  loop: "{{ parsed_configs_input_results.results }}"
+  when:
+    - parsed_configs_input_results is defined
+    - not (item.skipped | default(false))  # per-item check: skipped loop results carry no conf_dict
+
+- name: Add configs_input dicts that are already parsed
+  ansible.builtin.set_fact:
+    parsed_configs_input: "{{ parsed_configs_input | default({}) | combine({item.key: item.value}) }}"
+  loop: "{{ configs_input | default({}) | dict2items }}"
+  when:
+    - configs_input is defined
+    - configs_input
+    - item.value is mapping
+
 - name: Create lists for conf_merge
   ansible.builtin.set_fact:
     conf_merge_dict: "{{
         conf_merge_dict | default({})
         | combine({
-            conf_set.item.1: (
-              [apply_config[conf_set.item.1]]
-              + ([conf_set.stat.path] if conf_set.stat.exists else [])
-              + ([configs_input.get(conf_set.item.1)] if configs_input.get(conf_set.item.1) else [])
+            existing_conf_set.item.1: (
+              [apply_config[existing_conf_set.item.1]]
+              + ([existing_conf_set.stat.path] if existing_conf_set.stat.exists else [])
+              + ([parsed_configs_input.get(existing_conf_set.item.1)]
+               if parsed_configs_input is defined and parsed_configs_input.get(existing_conf_set.item.1) else [])
             )
           })
       }}"
   loop: "{{ ctld_conf_files.results }}"
   loop_control:
-    loop_var: conf_set
+    loop_var: existing_conf_set
   register: prepared_conf_lists
 
 - name: Prepend ClusterName and SlurmctldHost to slurm conf sources
diff --git a/input/omnia_config.yml b/input/omnia_config.yml
index 3c4b3dbc35..032fa77ce0 100644
--- a/input/omnia_config.yml
+++ b/input/omnia_config.yml
@@ -32,13 +32,12 @@
 # <conf name>: 
 #    <mapping> or <filepath>
 # <mapping> Supply the configuration values directly as a key–value map
-# <filepath> Supply the absolute path to a custom configuration file on the OIM server
+# <filepath> Supply the absolute path to a custom configuration file
 # The conf files supported by slurm are
 # slurm
 # cgroup
 # slurmdbd
 # gres
-# mpi
 # Thes files will be written into the slurm_config directory with .conf suffix
 
 slurm_cluster:
@@ -62,7 +61,6 @@ slurm_cluster:
     #   cgroup: /path/to/custom_cgroup.conf
     #   slurmdbd: /path/to/custom_slurmdbd.conf
     #   gres: /path/to/custom_gres.conf
-    #   mpi: /path/to/custom_mpi.conf
   
 # ----------------------------SERVICE K8S------------------------------------------------------
 # For service k8s cluster below parameters are required,(List)

From a2dc5e132fd3f78e841a5a1088bce86d5ef4db9d Mon Sep 17 00:00:00 2001
From: Abhishek S A <abhishek.sa3@dell.com>
Date: Wed, 4 Feb 2026 21:45:24 +0530
Subject: [PATCH 030/172] kafka update

---
 utils/external_kafka_connect_details.yml      |  34 +++
 .../tasks/main.yml                            | 215 ++++++++++++++++++
 .../vars/main.yml                             |  22 ++
 3 files changed, 271 insertions(+)
 create mode 100644 utils/external_kafka_connect_details.yml
 create mode 100644 utils/roles/external_kafka_connect_details/tasks/main.yml
 create mode 100644 utils/roles/external_kafka_connect_details/vars/main.yml

diff --git a/utils/external_kafka_connect_details.yml b/utils/external_kafka_connect_details.yml
new file mode 100644
index 0000000000..a51a75aa3f
--- /dev/null
+++ b/utils/external_kafka_connect_details.yml
@@ -0,0 +1,34 @@
+#  Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Preflight - validate inventory
+  hosts: localhost
+  connection: local
+  gather_facts: false
+  tasks:
+    - name: Fail if service_kube_control_plane group is missing or empty
+      ansible.builtin.fail:
+        msg: >-
+          Inventory must define a non-empty 'service_kube_control_plane' group.
+          Run with '-i <inventory>' and ensure at least one host is in that group.
+      when:
+        - groups['service_kube_control_plane'] is not defined or (groups['service_kube_control_plane'] | length) == 0
+
+- name: Fetch external Kafka connection details
+  hosts: service_kube_control_plane
+  connection: ssh
+  gather_facts: false
+  roles:
+    - external_kafka_connect_details
diff --git a/utils/roles/external_kafka_connect_details/tasks/main.yml b/utils/roles/external_kafka_connect_details/tasks/main.yml
new file mode 100644
index 0000000000..169c83bad3
--- /dev/null
+++ b/utils/roles/external_kafka_connect_details/tasks/main.yml
@@ -0,0 +1,215 @@
+#  Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Check kubectl presence
+  ansible.builtin.command: kubectl version --client=true
+  register: kubectl_check
+  changed_when: false
+  failed_when: kubectl_check.rc != 0
+
+- name: Get Kafka pod status
+  ansible.builtin.command: >-
+    kubectl get pods -n {{ kafka_namespace }}
+    -l app.kubernetes.io/name=kafka
+    -o wide
+  register: kafka_pods
+  changed_when: false
+  failed_when: false
+
+- name: Get Kafka pod status (json)
+  ansible.builtin.command: >-
+    kubectl get pods -n {{ kafka_namespace }}
+    -l app.kubernetes.io/name=kafka
+    -o json
+  register: kafka_pods_json
+  changed_when: false
+  failed_when: kafka_pods_json.rc != 0
+
+- name: Parse Kafka pods
+  ansible.builtin.set_fact:
+    kafka_pods_parsed: "{{ kafka_pods_json.stdout | from_json }}"
+
+- name: Fail if no Kafka pods found
+  ansible.builtin.fail:
+    msg: "No Kafka pods found in namespace '{{ kafka_namespace }}'."
+  when: (kafka_pods_parsed.get('items', []) | length) == 0
+
+- name: Fail if Kafka pods are not Running
+  ansible.builtin.fail:
+    msg: "One or more Kafka pods are not in Running state."
+  when:
+    - (kafka_pods_parsed.get('items', [])
+      | selectattr('status.phase', 'ne', 'Running')
+      | list
+      | length) > 0
+
+- name: Fail if Kafka pods are not Ready
+  ansible.builtin.fail:
+    msg: "One or more Kafka pods are not Ready."
+  when:
+    - (kafka_pods_parsed.get('items', [])
+      | selectattr('status.containerStatuses', 'defined')
+      | map(attribute='status.containerStatuses')
+      | list
+      | flatten
+      | selectattr('ready', 'equalto', false)
+      | list
+      | length) > 0
+
+- name: Get Kafka LoadBalancer IP
+  ansible.builtin.command: >-
+    kubectl get svc {{ kafka_lb_service_name }} -n {{ kafka_namespace }}
+    -o jsonpath='{.status.loadBalancer.ingress[0].ip}'
+  register: kafka_lb_ip
+  changed_when: false
+  failed_when: kafka_lb_ip.rc != 0
+
+- name: Get Kafka LoadBalancer hostname
+  ansible.builtin.command: >-
+    kubectl get svc {{ kafka_lb_service_name }} -n {{ kafka_namespace }}
+    -o jsonpath='{.status.loadBalancer.ingress[0].hostname}'
+  register: kafka_lb_hostname
+  changed_when: false
+  failed_when: kafka_lb_hostname.rc != 0
+
+- name: Get Kafka LoadBalancer external port
+  ansible.builtin.command: >-
+    kubectl get svc {{ kafka_lb_service_name }} -n {{ kafka_namespace }}
+    -o jsonpath='{.spec.ports[0].port}'
+  register: kafka_lb_port
+  changed_when: false
+  failed_when: kafka_lb_port.rc != 0
+
+- name: Set Kafka external endpoint
+  ansible.builtin.set_fact:
+    kafka_external_host: >-
+      {{
+        (kafka_lb_ip.stdout | trim)
+        if (kafka_lb_ip.stdout | trim | length) > 0
+        else (kafka_lb_hostname.stdout | trim)
+      }}
+    kafka_external_port: "{{ (kafka_lb_port.stdout | trim) | default('') }}"
+
+- name: Fail when Kafka external endpoint is not available
+  ansible.builtin.fail:
+    msg: >-
+      Failed to fetch Kafka LoadBalancer endpoint. Ensure service '{{ kafka_lb_service_name }}'
+      exists in namespace '{{ kafka_namespace }}' and has an external IP/hostname assigned.
+  when: kafka_external_host | trim | length == 0
+
+- name: Set Kafka external port fallback
+  ansible.builtin.set_fact:
+    kafka_external_port: "{{ kafka_bootstrap_port | string }}"
+  when: kafka_external_port | trim | length == 0
+
+- name: Ensure output directory exists
+  ansible.builtin.file:
+    path: "{{ kafka_output_dir }}"
+    state: directory
+    mode: "0755"
+  delegate_to: localhost
+  connection: local
+  run_once: true
+
+- name: Read Kafka cluster CA cert from secret
+  ansible.builtin.command: >-
+    kubectl get secret {{ kafka_cluster_ca_secret }} -n {{ kafka_namespace }}
+    -o jsonpath='{.data.ca\.crt}'
+  register: kafka_ca_crt_b64
+  changed_when: false
+  failed_when: kafka_ca_crt_b64.rc != 0 or (kafka_ca_crt_b64.stdout | trim | length == 0)
+
+- name: Read Kafka client cert from secret
+  ansible.builtin.command: >-
+    kubectl get secret {{ kafka_client_secret }} -n {{ kafka_namespace }}
+    -o jsonpath='{.data.user\.crt}'
+  register: kafka_user_crt_b64
+  changed_when: false
+  failed_when: kafka_user_crt_b64.rc != 0 or (kafka_user_crt_b64.stdout | trim | length == 0)
+
+- name: Read Kafka client key from secret
+  ansible.builtin.command: >-
+    kubectl get secret {{ kafka_client_secret }} -n {{ kafka_namespace }}
+    -o jsonpath='{.data.user\.key}'
+  register: kafka_user_key_b64
+  changed_when: false
+  failed_when: kafka_user_key_b64.rc != 0 or (kafka_user_key_b64.stdout | trim | length == 0)
+
+- name: Write Kafka CA/cert/key files
+  ansible.builtin.copy:
+    content: "{{ item.content }}"
+    dest: "{{ item.dest }}"
+    mode: "0600"
+  loop:
+    - {dest: "{{ kafka_output_dir }}/ca.crt", content: "{{ kafka_ca_crt_b64.stdout | b64decode }}"}
+    - {dest: "{{ kafka_output_dir }}/user.crt", content: "{{ kafka_user_crt_b64.stdout | b64decode }}"}
+    - {dest: "{{ kafka_output_dir }}/user.key", content: "{{ kafka_user_key_b64.stdout | b64decode }}"}
+  # Each loop item embeds decoded certificate/private-key material, and Ansible echoes
+  # loop items in task output; no_log keeps the client key out of the console and logs.
+  no_log: true
+  delegate_to: localhost
+  connection: local
+  run_once: true
+
+- name: Build Kafka connection details
+  ansible.builtin.set_fact:
+    kafka_connect_details:
+      kafka:
+        namespace: "{{ kafka_namespace }}"
+        loadbalancer_service: "{{ kafka_lb_service_name }}"
+        pod_status: "{{ kafka_pods.stdout | default('') }}"
+        bootstrap_server: "{{ kafka_external_host }}:{{ kafka_external_port }}"
+        tls:
+          ca_crt: "{{ kafka_output_dir }}/ca.crt"
+          client_crt: "{{ kafka_output_dir }}/user.crt"
+          client_key: "{{ kafka_output_dir }}/user.key"
+
+- name: Ensure output file directory exists
+  ansible.builtin.file:
+    path: "{{ kafka_output_file | dirname }}"
+    state: directory
+    mode: "0755"
+  delegate_to: localhost
+  connection: local
+  run_once: true
+
+- name: Write Kafka connection details to file
+  ansible.builtin.copy:
+    content: "{{ kafka_connect_details | to_nice_yaml }}"
+    dest: "{{ kafka_output_file }}"
+    mode: "0644"
+  delegate_to: localhost
+  connection: local
+  run_once: true
+
+- name: Display Kafka connection details
+  ansible.builtin.debug:
+    msg: >-
+      {{
+        [
+          'Kafka connection details written to: ' ~ kafka_output_file,
+          '',
+          'Bootstrap: ' ~ kafka_external_host ~ ':' ~ kafka_external_port,
+          '',
+          'TLS:',
+          '  CA: ' ~ kafka_output_dir ~ '/ca.crt',
+          '  client cert: ' ~ kafka_output_dir ~ '/user.crt',
+          '  client key: ' ~ kafka_output_dir ~ '/user.key',
+          ''
+        ]
+      }}
+  delegate_to: localhost
+  connection: local
+  run_once: true
diff --git a/utils/roles/external_kafka_connect_details/vars/main.yml b/utils/roles/external_kafka_connect_details/vars/main.yml
new file mode 100644
index 0000000000..fd2455b550
--- /dev/null
+++ b/utils/roles/external_kafka_connect_details/vars/main.yml
@@ -0,0 +1,22 @@
+#  Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+kafka_namespace: "telemetry"
+kafka_lb_service_name: "kafka-kafka-external-bootstrap"
+kafka_bootstrap_port: 9094
+kafka_cluster_ca_secret: "kafka-cluster-ca-cert"
+kafka_client_secret: "kafkapump"
+kafka_output_dir: "/opt/omnia/telemetry/external_kafka"
+kafka_output_file: "/opt/omnia/telemetry/external_kafka_connect_details.yml"

From bc4f61db34ab7c4da5a546c5a4b6820fbba2ed4b Mon Sep 17 00:00:00 2001
From: Abhishek S A <abhishek.sa3@dell.com>
Date: Thu, 5 Feb 2026 11:46:52 +0530
Subject: [PATCH 031/172] kafka and victoria update

---
 .../tasks/main.yml                            | 50 +++++--------------
 .../vars/main.yml                             |  8 +++
 .../tasks/main.yml                            | 15 ++----
 .../vars/main.yml                             | 13 +++++
 4 files changed, 39 insertions(+), 47 deletions(-)

diff --git a/utils/roles/external_kafka_connect_details/tasks/main.yml b/utils/roles/external_kafka_connect_details/tasks/main.yml
index 169c83bad3..61d0811815 100644
--- a/utils/roles/external_kafka_connect_details/tasks/main.yml
+++ b/utils/roles/external_kafka_connect_details/tasks/main.yml
@@ -43,12 +43,12 @@
 
 - name: Fail if no Kafka pods found
   ansible.builtin.fail:
-    msg: "No Kafka pods found in namespace '{{ kafka_namespace }}'."
+    msg: "{{ kafka_err_no_pods_found }}"
   when: (kafka_pods_parsed.get('items', []) | length) == 0
 
 - name: Fail if Kafka pods are not Running
   ansible.builtin.fail:
-    msg: "One or more Kafka pods are not in Running state."
+    msg: "{{ kafka_err_pods_not_running }}"
   when:
     - (kafka_pods_parsed.get('items', [])
       | selectattr('status.phase', 'ne', 'Running')
@@ -57,7 +57,7 @@
 
 - name: Fail if Kafka pods are not Ready
   ansible.builtin.fail:
-    msg: "One or more Kafka pods are not Ready."
+    msg: "{{ kafka_err_pods_not_ready }}"
   when:
     - (kafka_pods_parsed.get('items', [])
       | selectattr('status.containerStatuses', 'defined')
@@ -76,43 +76,15 @@
   changed_when: false
   failed_when: kafka_lb_ip.rc != 0
 
-- name: Get Kafka LoadBalancer hostname
-  ansible.builtin.command: >-
-    kubectl get svc {{ kafka_lb_service_name }} -n {{ kafka_namespace }}
-    -o jsonpath='{.status.loadBalancer.ingress[0].hostname}'
-  register: kafka_lb_hostname
-  changed_when: false
-  failed_when: kafka_lb_hostname.rc != 0
-
-- name: Get Kafka LoadBalancer external port
-  ansible.builtin.command: >-
-    kubectl get svc {{ kafka_lb_service_name }} -n {{ kafka_namespace }}
-    -o jsonpath='{.spec.ports[0].port}'
-  register: kafka_lb_port
-  changed_when: false
-  failed_when: kafka_lb_port.rc != 0
-
 - name: Set Kafka external endpoint
   ansible.builtin.set_fact:
-    kafka_external_host: >-
-      {{
-        (kafka_lb_ip.stdout | trim)
-        if (kafka_lb_ip.stdout | trim | length) > 0
-        else (kafka_lb_hostname.stdout | trim)
-      }}
-    kafka_external_port: "{{ (kafka_lb_port.stdout | trim) | default('') }}"
+    kafka_external_ip: "{{ kafka_lb_ip.stdout | trim }}"
+    kafka_external_port: "{{ kafka_bootstrap_port | string }}"
 
 - name: Fail when Kafka external endpoint is not available
   ansible.builtin.fail:
-    msg: >-
-      Failed to fetch Kafka LoadBalancer endpoint. Ensure service '{{ kafka_lb_service_name }}'
-      exists in namespace '{{ kafka_namespace }}' and has an external IP/hostname assigned.
-  when: kafka_external_host | trim | length == 0
-
-- name: Set Kafka external port fallback
-  ansible.builtin.set_fact:
-    kafka_external_port: "{{ kafka_bootstrap_port | string }}"
-  when: kafka_external_port | trim | length == 0
+    msg: "{{ kafka_err_external_ip_missing }}"
+  when: kafka_external_ip | trim | length == 0
 
 - name: Ensure output directory exists
   ansible.builtin.file:
@@ -170,7 +142,7 @@
         namespace: "{{ kafka_namespace }}"
         loadbalancer_service: "{{ kafka_lb_service_name }}"
         pod_status: "{{ kafka_pods.stdout | default('') }}"
-        bootstrap_server: "{{ kafka_external_host }}:{{ kafka_external_port }}"
+        bootstrap_server: "{{ kafka_external_ip }}:{{ kafka_external_port }}"
         tls:
           ca_crt: "{{ kafka_output_dir }}/ca.crt"
           client_crt: "{{ kafka_output_dir }}/user.crt"
@@ -201,12 +173,16 @@
         [
           'Kafka connection details written to: ' ~ kafka_output_file,
           '',
-          'Bootstrap: ' ~ kafka_external_host ~ ':' ~ kafka_external_port,
+          'Bootstrap: ' ~ kafka_external_ip ~ ':' ~ kafka_external_port,
           '',
           'TLS:',
           '  CA: ' ~ kafka_output_dir ~ '/ca.crt',
           '  client cert: ' ~ kafka_output_dir ~ '/user.crt',
           '  client key: ' ~ kafka_output_dir ~ '/user.key',
+          '',
+          'OME note (client cert):',
+          '  Create a certificate in .pfx format (provide a passphrase when prompted):',
+          '  openssl pkcs12 -export -out user.pfx -inkey user.key -in user.crt',
           ''
         ]
       }}
diff --git a/utils/roles/external_kafka_connect_details/vars/main.yml b/utils/roles/external_kafka_connect_details/vars/main.yml
index fd2455b550..d0bd070d47 100644
--- a/utils/roles/external_kafka_connect_details/vars/main.yml
+++ b/utils/roles/external_kafka_connect_details/vars/main.yml
@@ -20,3 +20,11 @@ kafka_cluster_ca_secret: "kafka-cluster-ca-cert"
 kafka_client_secret: "kafkapump"
 kafka_output_dir: "/opt/omnia/telemetry/external_kafka"
 kafka_output_file: "/opt/omnia/telemetry/external_kafka_connect_details.yml"
+
+kafka_err_no_pods_found: "No Kafka pods found in namespace '{{ kafka_namespace }}'."
+kafka_err_pods_not_running: "One or more Kafka pods are not in Running state."
+kafka_err_pods_not_ready: "One or more Kafka pods are not Ready."
+
+kafka_err_external_ip_missing: >-
+  Failed to fetch Kafka LoadBalancer external IP. Ensure service '{{ kafka_lb_service_name }}'
+  exists in namespace '{{ kafka_namespace }}' and has an external IP assigned.
diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml
index 4ec7f0c901..90dac54cca 100644
--- a/utils/roles/external_victoria_connect_details/tasks/main.yml
+++ b/utils/roles/external_victoria_connect_details/tasks/main.yml
@@ -47,10 +47,7 @@
 
 - name: Fail if Victoria cluster mode is not deployed
   ansible.builtin.fail:
-    msg: >-
-      Victoria deployment mode detected: {{ victoria_deployment_mode }}.
-      External integration is supported only for Victoria cluster mode (vminsert/vmselect/vmstorage).
-      Single-node Victoria (victoria-loadbalancer) is not supported for external integration.
+    msg: "{{ victoria_err_mode_not_supported }}"
   when: victoria_deployment_mode != 'cluster'
 
 - name: Get Victoria pods status
@@ -77,12 +74,12 @@
 
 - name: Fail if no Victoria pods found
   ansible.builtin.fail:
-    msg: "No Victoria pods found in namespace '{{ victoria_namespace }}'."
+    msg: "{{ victoria_err_no_pods_found }}"
   when: (victoria_pods_parsed.get('items', []) | length) == 0
 
 - name: Fail if Victoria pods are not Running
   ansible.builtin.fail:
-    msg: "One or more Victoria pods are not in Running state."
+    msg: "{{ victoria_err_pods_not_running }}"
   when:
     - (victoria_pods_parsed.get('items', [])
       | selectattr('status.phase', 'ne', 'Running')
@@ -91,7 +88,7 @@
 
 - name: Fail if Victoria pods are not Ready
   ansible.builtin.fail:
-    msg: "One or more Victoria pods are not Ready."
+    msg: "{{ victoria_err_pods_not_ready }}"
   when:
     - (victoria_pods_parsed.get('items', [])
       | selectattr('status.containerStatuses', 'defined')
@@ -172,9 +169,7 @@
 
 - name: Fail when LoadBalancer IPs are not available
   ansible.builtin.fail:
-    msg: >-
-      Failed to fetch Victoria LoadBalancer IP(s). Ensure services 'vminsert' and 'vmselect'
-      exist in namespace '{{ victoria_namespace }}' and have external IPs assigned.
+    msg: "{{ victoria_err_lb_missing }}"
   when:
     - vminsert_host | trim | length == 0 or vmselect_host | trim | length == 0
 
diff --git a/utils/roles/external_victoria_connect_details/vars/main.yml b/utils/roles/external_victoria_connect_details/vars/main.yml
index 29db9136f2..ea1c083deb 100644
--- a/utils/roles/external_victoria_connect_details/vars/main.yml
+++ b/utils/roles/external_victoria_connect_details/vars/main.yml
@@ -16,3 +16,16 @@
 victoria_namespace: "telemetry"
 victoria_output_file: "/opt/omnia/telemetry/external_victoria_connect_details.yml"
 victoria_tls_cert_dir: "/opt/omnia/telemetry/victoria-certs"
+
+victoria_err_mode_not_supported: >-
+  Victoria deployment mode detected: {{ victoria_deployment_mode }}.
+  External integration is supported only for Victoria cluster mode (vminsert/vmselect/vmstorage).
+  Single-node Victoria (victoria-loadbalancer) is not supported for external integration.
+
+victoria_err_no_pods_found: "No Victoria pods found in namespace '{{ victoria_namespace }}'."
+victoria_err_pods_not_running: "One or more Victoria pods are not in Running state."
+victoria_err_pods_not_ready: "One or more Victoria pods are not Ready."
+
+victoria_err_lb_missing: >-
+  Failed to fetch Victoria LoadBalancer IP(s). Ensure services 'vminsert' and 'vmselect'
+  exist in namespace '{{ victoria_namespace }}' and have external IPs assigned.

From d3f6f7e9cef0cae0579d8ea1cf2e49805afcf3f2 Mon Sep 17 00:00:00 2001
From: Abhishek S A <abhishek.sa3@dell.com>
Date: Thu, 5 Feb 2026 12:29:16 +0530
Subject: [PATCH 032/172] update sfm and ome

---
 .../tasks/main.yml                            | 10 ++++++----
 .../tasks/main.yml                            | 20 ++++++++++++++++++-
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/utils/roles/external_kafka_connect_details/tasks/main.yml b/utils/roles/external_kafka_connect_details/tasks/main.yml
index 61d0811815..0c4d525a82 100644
--- a/utils/roles/external_kafka_connect_details/tasks/main.yml
+++ b/utils/roles/external_kafka_connect_details/tasks/main.yml
@@ -173,16 +173,18 @@
         [
           'Kafka connection details written to: ' ~ kafka_output_file,
           '',
-          'Bootstrap: ' ~ kafka_external_ip ~ ':' ~ kafka_external_port,
+          'Kafka external endpoint: ' ~ kafka_external_ip ~ ':' ~ kafka_external_port,
           '',
           'TLS:',
           '  CA: ' ~ kafka_output_dir ~ '/ca.crt',
           '  client cert: ' ~ kafka_output_dir ~ '/user.crt',
           '  client key: ' ~ kafka_output_dir ~ '/user.key',
           '',
-          'OME note (client cert):',
-          '  Create a certificate in .pfx format (provide a passphrase when prompted):',
-          '  openssl pkcs12 -export -out user.pfx -inkey user.key -in user.crt',
+          'OME note (mTLS):',
+          '  Use ca.crt as the server certificate in OME.',
+          '  Create a client certificate in .pfx format (provide a passphrase when prompted):',
+          '    openssl pkcs12 -export -out user.pfx -inkey user.key -in user.crt',
+          '  Use user.pfx as the client certificate in OME.',
           ''
         ]
       }}
diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml
index 90dac54cca..c44c145921 100644
--- a/utils/roles/external_victoria_connect_details/tasks/main.yml
+++ b/utils/roles/external_victoria_connect_details/tasks/main.yml
@@ -173,6 +173,15 @@
   when:
     - vminsert_host | trim | length == 0 or vmselect_host | trim | length == 0
 
+- name: Build SFM hosts entry
+  ansible.builtin.set_fact:
+    victoria_sfm_hosts_entry: >-
+      {{
+        'echo "' ~ (vminsert_lb_ip.stdout | trim) ~ ' vminsert.' ~ victoria_namespace ~ '.svc.cluster.local" >> /etc/hosts'
+        if (vminsert_lb_ip.stdout | trim | length) > 0
+        else ''
+      }}
+
 - name: Set Victoria external port fallbacks
   ansible.builtin.set_fact:
     vminsert_port: "8480"
@@ -200,6 +209,10 @@
             ui_url: "https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/vmui"
         tls:
           server_crt: "{{ victoria_tls_cert }}"
+        notes:
+          sfm:
+            vminsert_write_url: "https://{{ vminsert_host }}:{{ vminsert_port }}/insert/0/prometheus/api/v1/write"
+            hosts_entry: "{{ victoria_sfm_hosts_entry }}"
 
 - name: Ensure output directory exists
   ansible.builtin.file:
@@ -234,7 +247,12 @@
           '  vmselect UI:    https://' ~ vmselect_host ~ ':' ~ vmselect_port ~ '/select/0/vmui',
           '',
           'TLS:',
-          '  server.crt: ' ~ victoria_tls_cert
+          '  server.crt: ' ~ victoria_tls_cert,
+          '',
+          'SFM note:',
+          '  Use vminsert write URL for SFM: https://' ~ vminsert_host ~ ':' ~ vminsert_port ~ '/insert/0/prometheus/api/v1/write',
+          '  Add this entry to /etc/hosts on the SFM server:',
+          '    ' ~ (victoria_sfm_hosts_entry if (victoria_sfm_hosts_entry | length) > 0 else 'LoadBalancer IP not available; cannot generate /etc/hosts entry.')
         ]
       }}
   delegate_to: localhost

From 4a6a7d7fbca7bbaf4c1cadf2b9ec440e0f94d895 Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Thu, 5 Feb 2026 13:19:53 +0530
Subject: [PATCH 033/172] When mix of path and map were provided, was causing
 issues

---
 discovery/roles/slurm_config/tasks/confs.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml
index e1b4e2d3ea..33315709cc 100644
--- a/discovery/roles/slurm_config/tasks/confs.yml
+++ b/discovery/roles/slurm_config/tasks/confs.yml
@@ -49,7 +49,7 @@
   when:
     - configs_input is defined
     - configs_input
-    - item.value is abs
+    - item.value is string
 
 - name: Build parsed_configs_input dictionary from parsed files
   ansible.builtin.set_fact:
@@ -57,7 +57,7 @@
   loop: "{{ parsed_configs_input_results.results }}"
   when:
     - parsed_configs_input_results is defined
-    - not parsed_configs_input_results.skipped | default(false)
+    - not item.skipped | default(false)
 
 - name: Add configs_input dicts that are already parsed
   ansible.builtin.set_fact:

From 07758fa988e2c91768bff545ce4ed7aad53c01ff Mon Sep 17 00:00:00 2001
From: Kratika_Patidar <Kratika.Patidar@dell.com>
Date: Thu, 5 Feb 2026 11:01:56 +0000
Subject: [PATCH 034/172] Updating additional_packages.json for aarch64 with
 service_k8s functional groups

---
 .../aarch64/rhel/10.0/additional_packages.json    | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/input/config/aarch64/rhel/10.0/additional_packages.json b/input/config/aarch64/rhel/10.0/additional_packages.json
index b01c3f78b5..0d6d9a0452 100644
--- a/input/config/aarch64/rhel/10.0/additional_packages.json
+++ b/input/config/aarch64/rhel/10.0/additional_packages.json
@@ -4,6 +4,21 @@
 
         ]
     },
+    "service_kube_control_plane_first": {
+        "cluster": [
+
+        ]
+    },
+    "service_kube_control_plane": {
+        "cluster": [
+
+        ]
+    },
+    "service_kube_node": {
+        "cluster": [
+
+        ]
+    },
     "slurm_control_node": {
         "cluster": [
 

From 51349058819406ee0a9a2e2cac8594ea127c45a3 Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Thu, 5 Feb 2026 17:14:58 +0530
Subject: [PATCH 035/172] Upgrade of network_spec.yml, software_config.json and
 pxe_mapping_file.csv

---
 .../import_input_parameters/tasks/main.yml    |   9 +
 .../tasks/restore_pxe_mapping_file.yml        |  49 +++++
 .../tasks/restore_software_config.yml         |  60 ++++++
 .../tasks/transform_network_spec.yml          | 192 ++++++++++++++++++
 .../templates/network_spec.j2                 |  61 +++++-
 .../import_input_parameters/vars/main.yml     |   4 +
 6 files changed, 365 insertions(+), 10 deletions(-)
 create mode 100644 upgrade/roles/import_input_parameters/tasks/restore_pxe_mapping_file.yml
 create mode 100644 upgrade/roles/import_input_parameters/tasks/restore_software_config.yml
 create mode 100644 upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml

diff --git a/upgrade/roles/import_input_parameters/tasks/main.yml b/upgrade/roles/import_input_parameters/tasks/main.yml
index f4c5b1b7cb..af45a1de1b 100644
--- a/upgrade/roles/import_input_parameters/tasks/main.yml
+++ b/upgrade/roles/import_input_parameters/tasks/main.yml
@@ -12,3 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 ---
+
+- name: Transform network_spec.yml from Omnia 2.0 to 2.1
+  ansible.builtin.include_tasks: transform_network_spec.yml
+
+- name: Restore software_config.json from backup
+  ansible.builtin.include_tasks: restore_software_config.yml
+
+- name: Restore pxe_mapping_file.csv from backup
+  ansible.builtin.include_tasks: restore_pxe_mapping_file.yml
diff --git a/upgrade/roles/import_input_parameters/tasks/restore_pxe_mapping_file.yml b/upgrade/roles/import_input_parameters/tasks/restore_pxe_mapping_file.yml
new file mode 100644
index 0000000000..f468359305
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/restore_pxe_mapping_file.yml
@@ -0,0 +1,49 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Validate backup_location is provided
+  ansible.builtin.fail:
+    msg: "backup_location must be provided to restore pxe_mapping_file.csv"
+  when: backup_location is not defined or (backup_location | string | trim) == ""
+
+- name: Ensure backup directory exists
+  ansible.builtin.file:
+    path: "{{ backup_location }}"
+    state: directory
+    mode: '0755'
+
+- name: Check if backup pxe_mapping_file.csv exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/pxe_mapping_file.csv"
+  register: backup_pxe_mapping_stat
+
+- name: Fail if backup pxe_mapping_file.csv is not present
+  ansible.builtin.fail:
+    msg: "Backup pxe_mapping_file.csv is not present at {{ backup_location }}/pxe_mapping_file.csv"
+  when: not backup_pxe_mapping_stat.stat.exists
+
+- name: Overwrite pxe_mapping_file.csv in input directory from backup
+  ansible.builtin.copy:
+    src: "{{ backup_location }}/pxe_mapping_file.csv"
+    dest: "{{ omnia_input_dir }}/pxe_mapping_file.csv"
+    mode: '0644'
+    remote_src: true
+
+- name: Display pxe_mapping_file.csv restore summary
+  ansible.builtin.debug:
+    msg: |
+      pxe_mapping_file.csv restored from backup.
+      Backup preserved at: {{ backup_location }}/pxe_mapping_file.csv
+      Restored to: {{ omnia_input_dir }}/pxe_mapping_file.csv
diff --git a/upgrade/roles/import_input_parameters/tasks/restore_software_config.yml b/upgrade/roles/import_input_parameters/tasks/restore_software_config.yml
new file mode 100644
index 0000000000..9891023702
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/restore_software_config.yml
@@ -0,0 +1,60 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Validate backup_location is provided
+  ansible.builtin.fail:
+    msg: "backup_location must be provided to restore software_config.json"
+  when: backup_location is not defined or (backup_location | string | trim) == ""
+
+- name: Ensure backup directory exists
+  ansible.builtin.file:
+    path: "{{ backup_location }}"
+    state: directory
+    mode: '0755'
+
+- name: Check if backup software_config.json exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/software_config.json"
+  register: backup_software_config_stat
+
+- name: Fail if backup software_config.json is not present
+  ansible.builtin.fail:
+    msg: "Backup software_config.json is not present at {{ backup_location }}/software_config.json"
+  when: not backup_software_config_stat.stat.exists
+
+- name: Overwrite software_config.json in input directory from backup
+  ansible.builtin.copy:
+    src: "{{ backup_location }}/software_config.json"
+    dest: "{{ omnia_input_dir }}/software_config.json"
+    mode: '0644'
+    remote_src: true
+
+- name: Validate JSON syntax of software_config.json
+  ansible.builtin.command:
+    cmd: python3 -m json.tool "{{ omnia_input_dir }}/software_config.json"
+  register: software_config_json_validation
+  changed_when: false
+
+- name: Fail if software_config.json JSON validation fails
+  ansible.builtin.fail:
+    msg: "JSON validation failed after restoring software_config.json"
+  when: software_config_json_validation.rc != 0
+
+- name: Display software_config.json restore summary
+  ansible.builtin.debug:
+    msg: |
+      software_config.json restored from backup.
+      Backup preserved at: {{ backup_location }}/software_config.json
+      Restored to: {{ omnia_input_dir }}/software_config.json
diff --git a/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml b/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml
new file mode 100644
index 0000000000..051bbfb13c
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml
@@ -0,0 +1,192 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Validate backup_location is provided
+  ansible.builtin.fail:
+    msg: "backup_location must be provided to run network_spec.yml upgrade"
+  when: backup_location is not defined or (backup_location | string | trim) == ""
+
+- name: Ensure backup directory exists
+  ansible.builtin.file:
+    path: "{{ backup_location }}"
+    state: directory
+    mode: '0755'
+
+- name: Check if backup network_spec.yml exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/network_spec.yml"
+  register: backup_network_spec_stat
+
+- name: Fail if backup network_spec.yml is not present
+  ansible.builtin.fail:
+    msg: "Backup network_spec.yml is not present at {{ backup_location }}/network_spec.yml"
+  when: not backup_network_spec_stat.stat.exists
+
+- name: Check if network_spec.yml exists
+  ansible.builtin.stat:
+    path: "{{ omnia_input_dir }}/network_spec.yml"
+  register: network_spec_stat
+
+- name: Fail if network_spec.yml is not present
+  ansible.builtin.fail:
+    msg: "network_spec.yml is not present at {{ omnia_input_dir }}/network_spec.yml"
+  when: not network_spec_stat.stat.exists
+
+- name: Read existing network_spec.yml
+  ansible.builtin.slurp:
+    src: "{{ omnia_input_dir }}/network_spec.yml"
+  register: network_spec_slurp
+  when: network_spec_stat.stat.exists
+
+- name: Parse existing network_spec.yml
+  ansible.builtin.set_fact:
+    network_spec_existing: "{{ network_spec_slurp.content | b64decode | from_yaml }}"
+  when: network_spec_stat.stat.exists
+
+- name: Check if network_spec.yml is already in Omnia 2.1 format
+  ansible.builtin.set_fact:
+    network_spec_already_21: >-
+      {{
+        (network_spec_existing.schema_version | default('') | string) == '2.1'
+        and (network_spec_existing.Networks is defined)
+        and ((network_spec_existing.Networks | select('mapping') | selectattr('ib_network', 'defined') | list | length) > 0)
+      }}
+  when: network_spec_stat.stat.exists
+
+- name: Skip transformation when network_spec.yml is already in 2.1 format
+  ansible.builtin.debug:
+    msg: "network_spec.yml is already in Omnia 2.1 format. Skipping transformation."
+  when: network_spec_already_21 | default(false) | bool
+
+- name: Read backup network_spec.yml (Omnia 2.0 source)
+  ansible.builtin.slurp:
+    src: "{{ backup_location }}/network_spec.yml"
+  register: backup_network_spec_slurp
+  when: not (network_spec_already_21 | default(false) | bool)
+
+- name: Parse backup network_spec.yml
+  ansible.builtin.set_fact:
+    backup_network_spec: "{{ backup_network_spec_slurp.content | b64decode | from_yaml }}"
+  when: not (network_spec_already_21 | default(false) | bool)
+
+- name: Extract admin_network and ib_network from backup file
+  ansible.builtin.set_fact:
+    admin_network: >-
+      {{
+        (backup_network_spec.admin_network
+          if (backup_network_spec is mapping and backup_network_spec.admin_network is defined)
+          else
+            (
+              (backup_network_spec.Networks | default([])
+                | select('mapping')
+                | selectattr('admin_network', 'defined')
+                | map(attribute='admin_network')
+                | first
+              ) | default({})
+            )
+        )
+      }}
+    ib_network: >-
+      {{
+        (backup_network_spec.ib_network
+          if (backup_network_spec is mapping and backup_network_spec.ib_network is defined)
+          else
+            (
+              (backup_network_spec.Networks | default([])
+                | select('mapping')
+                | selectattr('ib_network', 'defined')
+                | map(attribute='ib_network')
+                | first
+              ) | default({})
+            )
+        )
+      }}
+  when:
+    - not (network_spec_already_21 | default(false) | bool)
+
+- name: Render network_spec.yml in Omnia 2.1 format
+  ansible.builtin.template:
+    src: network_spec.j2
+    dest: "{{ omnia_input_dir }}/network_spec.yml"
+    mode: '0644'
+  vars:
+    admin_network_netmask_bits: "{{ admin_network.netmask_bits | default('24') }}"
+  when: not (network_spec_already_21 | default(false) | bool)
+
+- name: Read transformed network_spec.yml
+  ansible.builtin.slurp:
+    src: "{{ omnia_input_dir }}/network_spec.yml"
+  register: network_spec_21_slurp
+  when: not (network_spec_already_21 | default(false) | bool)
+
+- name: Parse transformed network_spec.yml
+  ansible.builtin.set_fact:
+    network_spec_21: "{{ network_spec_21_slurp.content | b64decode | from_yaml }}"
+  when: not (network_spec_already_21 | default(false) | bool)
+
+- name: Validate YAML syntax of transformed network_spec.yml
+  ansible.builtin.command:
+    cmd: python3 -c "import yaml; yaml.safe_load(open('{{ omnia_input_dir }}/network_spec.yml','r'))"
+  register: network_spec_yaml_validation
+  changed_when: false
+  when: not (network_spec_already_21 | default(false) | bool)
+
+- name: Fail if YAML validation fails
+  ansible.builtin.fail:
+    msg: "YAML validation failed after transforming network_spec.yml"
+  when:
+    - not (network_spec_already_21 | default(false) | bool)
+    - network_spec_yaml_validation.rc != 0
+
+- name: Ensure ib_network.netmask_bits matches admin_network.netmask_bits
+  ansible.builtin.fail:
+    msg: "ib_network.netmask_bits must match admin_network.netmask_bits in Omnia 2.1"
+  when:
+    - not (network_spec_already_21 | default(false) | bool)
+    - >-
+      (ib_network.netmask_bits | default(admin_network.netmask_bits | default('24')) | string)
+      != (admin_network.netmask_bits | default('24') | string)
+
+- name: Display backup path (no-op when skipped)
+  ansible.builtin.debug:
+    msg: "Using backup as input source: {{ backup_location }}/network_spec.yml (backup is not modified)"
+  when: not (network_spec_already_21 | default(false) | bool)
+
+- name: Validate mandatory ib_network is present in transformed output
+  ansible.builtin.fail:
+    msg: "ib_network is mandatory in Omnia 2.1 network_spec.yml"
+  when:
+    - not (network_spec_already_21 | default(false) | bool)
+    - >-
+      (network_spec_21.Networks is not defined)
+      or ((network_spec_21.Networks | select('mapping') | selectattr('ib_network', 'defined') | list | length) == 0)
+
+- name: Validate mandatory ib_network.subnet is present in transformed output
+  ansible.builtin.fail:
+    msg: "ib_network.subnet is mandatory in Omnia 2.1 network_spec.yml"
+  when:
+    - not (network_spec_already_21 | default(false) | bool)
+    - >-
+      ((network_spec_21.Networks | select('mapping') | selectattr('ib_network', 'defined') | map(attribute='ib_network') | first | default({})).subnet | default('') | string | trim) == ''
+
+- name: Display transformation summary
+  ansible.builtin.debug:
+    msg: |
+      network_spec.yml upgraded to Omnia 2.1 format.
+      Backup preserved at: {{ backup_location }}/network_spec.yml
+      Key changes:
+      - Added mandatory ib_network section
+      - primary_oim_bmc_ip treated as optional
+      - ib_network.netmask_bits aligned with admin_network.netmask_bits
diff --git a/upgrade/roles/import_input_parameters/templates/network_spec.j2 b/upgrade/roles/import_input_parameters/templates/network_spec.j2
index 98d3073c0f..773a11446c 100644
--- a/upgrade/roles/import_input_parameters/templates/network_spec.j2
+++ b/upgrade/roles/import_input_parameters/templates/network_spec.j2
@@ -1,14 +1,55 @@
-# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#      http://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
 
+# This file is used to specify the network configuration.
+#
+# 'admin_network' is a mandatory field, essential for PXE boot and host communication.
+#
+# The 'admin_network' section contains the following variables:
+# - 'oim_nic_name': The name of the interface on the OIM server associated with the admin network.
+# - 'netmask_bits': The number of bits in the subnet mask.
+# - 'primary_oim_admin_ip': The admin IP address of the OIM server which is configured.
+# - 'primary_oim_bmc_ip': The iDRAC IP address of the OIM server.
+#     Mandatory only if idrac_telemetry is set to true and telemetry data needs to be collected from the OIM server.
+#     Optional — can be omitted if iDRAC telemetry for the OIM server is not required.
+# - 'dynamic_range': The range of dynamic IP addresses available on the admin network.
+# - 'dns': The list of external DNS server IP addresses for the admin network.
+# - 'ntp_servers': The list of NTP servers for the admin network. Each NTP server entry should include:
+#     - 'address': The IP address or hostname of the NTP server.
+#     - 'type': The type of NTP entry, either 'server' or 'pool'.
+#     Example:
+#     ntp_servers:
+#       - { address: "172.16.10.80", type: "server" }
+
+# 'ib_network' is a mandatory field, essential for IB network configuration.
+# The 'ib_network' section contains the following variables:
+# - 'subnet': The subnet of the IB network.
+# - 'netmask_bits': The number of bits in the subnet mask. This value must be the same as the admin_network netmask_bits.
+
+Networks:
+- admin_network:
+    oim_nic_name: "{{ admin_network.oim_nic_name | default('') }}"
+    netmask_bits: "{{ admin_network.netmask_bits | default('24') }}"
+    primary_oim_admin_ip: "{{ admin_network.primary_oim_admin_ip | default('') }}"
+{% if (admin_network.primary_oim_bmc_ip is defined) and ((admin_network.primary_oim_bmc_ip | string | trim) != '') %}
+    primary_oim_bmc_ip: "{{ admin_network.primary_oim_bmc_ip }}"
+{% endif %}
+    dynamic_range: "{{ admin_network.dynamic_range | default('') }}"
+    dns: {{ admin_network.dns | default([]) }}
+    ntp_servers: {{ admin_network.ntp_servers | default([]) }}
+
+- ib_network:
+    subnet: "{{ ib_network.subnet | default('192.168.0.0') }}"
+    netmask_bits: "{{ ib_network.netmask_bits | default(admin_network_netmask_bits | default('24')) }}"
diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml
index f4c5b1b7cb..3c44a98130 100644
--- a/upgrade/roles/import_input_parameters/vars/main.yml
+++ b/upgrade/roles/import_input_parameters/vars/main.yml
@@ -12,3 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 ---
+
+omnia_input_dir: /opt/omnia/input/project_default
+
+backup_location: /opt/omnia/backup/upgrade
\ No newline at end of file

From f656eb75aaa9e0d9373cd209a6296c1a380a239e Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Thu, 5 Feb 2026 17:49:53 +0530
Subject: [PATCH 036/172] Update main.yml

---
 upgrade/roles/import_input_parameters/vars/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml
index 3c44a98130..c44a5bbb87 100644
--- a/upgrade/roles/import_input_parameters/vars/main.yml
+++ b/upgrade/roles/import_input_parameters/vars/main.yml
@@ -15,4 +15,4 @@
 
 omnia_input_dir: /opt/omnia/input/project_default
 
-backup_location: /opt/omnia/backup/upgrade
\ No newline at end of file
+backup_location: /opt/omnia/backups/upgrade
\ No newline at end of file

From 12eed55a3eb56bdd6276947f34fc90cfac6dbb30 Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Thu, 5 Feb 2026 18:10:07 +0530
Subject: [PATCH 037/172] validation for keys of confs

---
 .../common_utils/slurm_conf_utils.py          | 47 +++++++++++++++-
 .../validation_flows/common_validation.py     | 38 +++++++++----
 common/library/modules/slurm_conf.py          | 55 +++----------------
 3 files changed, 82 insertions(+), 58 deletions(-)

diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
index 0e59272815..8deb85febb 100644
--- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
+++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
@@ -14,8 +14,9 @@
 
 # These are the slurm options for version - 25.11
 import re
+import os
 from enum import Enum
-
+from collections import OrderedDict
 
 class SlurmParserEnum(str, Enum):
     """Enumeration of Slurm configuration parameter types for parsing and validation."""
@@ -545,6 +546,50 @@ class SlurmParserEnum(str, Enum):
 _HOSTLIST_RE = re.compile(
     r'^(?P<prefix>[^\[\]]*)\[(?P<inner>[^\[\]]+)\](?P<suffix>.*)$')
 
+def get_invalid_keys(conf_dict, conf_name):
+    """Return the keys of conf_dict, in input order, that are not known options of conf_name."""
+    current_conf = all_confs.get(conf_name, {})
+    # Iterating the mapping (instead of a set difference) keeps the result order
+    # deterministic for error messages; an empty/None conf_dict yields [].
+    return [key for key in (conf_dict or {}) if key not in current_conf]
+
+def parse_slurm_conf(file_path, conf_name, validate):
+    """Parse a Slurm conf file into an OrderedDict; raises ValueError on a malformed or (if validate) unknown key."""
+    current_conf = all_confs.get(conf_name, {})
+    slurm_dict = OrderedDict()
+
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"{file_path} not found.")
+
+    with open(file_path, 'r', encoding='utf-8') as f:
+        for line in f:
+            # handles any comment after the data
+            line = line.split('#')[0].strip()
+            if not line:
+                continue
+            # Every whitespace-separated token must be Key=Value; the first key names the entry
+            tmp_dict = OrderedDict()
+            for item in line.split():
+                # partition splits on the first '=' only, so '=' may appear inside the value
+                key, sep, value = item.partition('=')
+                if not sep:
+                    raise ValueError(f"Invalid entry while parsing {file_path}: {item}")
+                tmp_dict[key.strip()] = value.strip()
+            skey = next(iter(tmp_dict))
+            if validate and skey not in current_conf:
+                raise ValueError(f"Invalid key while parsing {file_path}: {skey}")
+            if current_conf.get(skey) == SlurmParserEnum.S_P_ARRAY:
+                slurm_dict[skey] = list(slurm_dict.get(skey, [])) + [tmp_dict]
+            elif current_conf.get(skey) == SlurmParserEnum.S_P_CSV:
+                existing_values = [v.strip() for v in slurm_dict.get(skey, "").split(',') if v.strip()]
+                new_values = [v.strip() for v in tmp_dict[skey].split(',') if v.strip()]
+                slurm_dict[skey] = ",".join(list(dict.fromkeys(existing_values + new_values)))
+            elif current_conf.get(skey) == SlurmParserEnum.S_P_LIST:
+                slurm_dict[skey] = list(slurm_dict.get(skey, [])) + list(tmp_dict.values())
+            else:
+                slurm_dict.update(tmp_dict)
+
+    return slurm_dict
 
 def expand_hostlist(expr):
     """
diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py
index 06f33be0e4..52fea1ced5 100644
--- a/common/library/module_utils/input_validation/validation_flows/common_validation.py
+++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py
@@ -36,11 +36,14 @@
 
 from ansible.module_utils.local_repo.software_utils import (
     load_json,
-    load_yaml,
     get_subgroup_dict,
     get_software_names,
     get_json_file_path
 )
+from ansible.module_utils.input_validation.common_utils.slurm_conf_utils import (
+    parse_slurm_conf,
+    get_invalid_keys
+)
 
 file_names = config.files
 create_error_msg = validation_utils.create_error_msg
@@ -1058,16 +1061,29 @@ def validate_omnia_config(
                     "slurm NFS not provided",
                     f"NFS name {', '.join(diff_set)} required for slurm is not defined in {storage_config}"
                     ))
-        # config_paths_list = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')]
-        # for cfg_path_dict in config_paths_list:
-        #     for k,v in cfg_path_dict.items():
-        #         if isinstance(v, str) and not os.path.exists(v):
-        #             errors.append(
-        #                 create_error_msg(
-        #                     input_file_path,
-        #                     "slurm config_paths",
-        #                     f"config_path for {k} - {v} does not exist"
-        #                     ))
+        cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')]
+        for cfg_path_dict in cnfg_src:
+            for k,v in cfg_path_dict.items():
+                if isinstance(v, str):
+                    if not os.path.exists(v):
+                        errors.append(
+                            create_error_msg(input_file_path, "slurm_cluster config_sources",
+                                f"provided conf path for {k} - {v} does not exist"))
+                    else: # path and also exists
+                        conf_dict = parse_slurm_conf(v, k, False)
+                        # validate=False: gather all unknown keys below rather than raising on the first
+                        invalid_keys = get_invalid_keys(conf_dict, k)
+                        if invalid_keys:
+                            errors.append(
+                                create_error_msg(input_file_path, "slurm_cluster config_sources",
+                                    f"invalid keys found in {k} - {invalid_keys}"))
+                else:
+                    invalid_keys = get_invalid_keys(v, k)
+                    if invalid_keys:
+                        errors.append(
+                            create_error_msg(input_file_path, "slurm_cluster config_sources",
+                                f"invalid keys found in {k} - {invalid_keys}"))
+
 
     return errors
 
diff --git a/common/library/modules/slurm_conf.py b/common/library/modules/slurm_conf.py
index 9b9441e493..a782cb1f79 100644
--- a/common/library/modules/slurm_conf.py
+++ b/common/library/modules/slurm_conf.py
@@ -12,6 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
+from collections import OrderedDict
+from ansible.module_utils.basic import AnsibleModule
+from ansible.module_utils.input_validation.common_utils.slurm_conf_utils import (
+    SlurmParserEnum,
+    all_confs,
+    parse_slurm_conf
+)
+
 DOCUMENTATION = r'''
 ---
 module: slurm_conf
@@ -134,12 +143,6 @@
 #   - Hostlist expressions, split and merge computations
 
 
-from collections import OrderedDict
-from ansible.module_utils.basic import AnsibleModule
-from ansible.module_utils.input_validation.common_utils.slurm_conf_utils import SlurmParserEnum, all_confs
-import os
-
-
 def read_dict2ini(conf_dict):
     """Convert a configuration dictionary to INI-style lines for slurm.conf."""
     data = []
@@ -147,7 +150,6 @@ def read_dict2ini(conf_dict):
         if isinstance(v, list):
             for dct_item in v:
                 if isinstance(dct_item, dict):
-                    # TODO: Ordered dict, move the key to the top
                     od = OrderedDict(dct_item)
                     od.move_to_end(k, last=False)  # Move k to the beginning
                     data.append(
@@ -159,45 +161,6 @@ def read_dict2ini(conf_dict):
     return data
 
 
-def parse_slurm_conf(file_path, conf_name, validate):
-    """Parses the slurm.conf file and returns it as a dictionary."""
-    current_conf = all_confs.get(conf_name, {})
-    slurm_dict = OrderedDict()
-
-    if not os.path.exists(file_path):
-        raise FileNotFoundError(f"{file_path} not found.")
-
-    with open(file_path, 'r', encoding='utf-8') as f:
-        for line in f:
-            # handles any comment after the data
-            line = line.split('#')[0].strip()
-            if not line:
-                continue
-            # Split the line by one or more spaces
-            items = line.split()
-            tmp_dict = OrderedDict()
-            for item in items:
-                # Split only on the first '=' to allow '=' inside the value
-                key, value = item.split('=', 1)
-                tmp_dict[key.strip()] = value.strip()
-            skey = list(tmp_dict.keys())[0]
-            if validate and skey not in current_conf:
-                raise ValueError(f"Invalid key while parsing {file_path}: {skey}")
-            if current_conf.get(skey) == SlurmParserEnum.S_P_ARRAY:
-                slurm_dict[list(tmp_dict.keys())[0]] = list(
-                    slurm_dict.get(list(tmp_dict.keys())[0], [])) + [tmp_dict]
-            elif current_conf.get(skey) == SlurmParserEnum.S_P_CSV:
-                existing_values = [v.strip() for v in slurm_dict.get(skey, "").split(',') if v.strip()]
-                new_values = [v.strip() for v in tmp_dict[skey].split(',') if v.strip()]
-                slurm_dict[skey] = ",".join(list(dict.fromkeys(existing_values + new_values)))
-            elif current_conf.get(skey) == SlurmParserEnum.S_P_LIST:
-                slurm_dict[skey] = list(slurm_dict.get(skey, [])) + list(tmp_dict.values())
-            else:
-                slurm_dict.update(tmp_dict)
-
-    return slurm_dict
-
-
 def slurm_conf_dict_merge(conf_dict_list, conf_name):
     """Merge multiple Slurm configuration dictionaries into a single dictionary."""
     merged_dict = OrderedDict()

From 299fdaf8e4973c2e2bf889546bb5200d4d427c0e Mon Sep 17 00:00:00 2001
From: Katakam-Rakesh <katakam.rakesh@dell.com>
Date: Thu, 5 Feb 2026 20:49:48 +0530
Subject: [PATCH 038/172] added code to fail when requested image version
 doesn't exist

Signed-off-by: Katakam-Rakesh <katakam.rakesh@dell.com>
---
 .../local_repo/container_repo_utils.py        | 40 +++++++++++++++++--
 .../local_repo/process_parallel.py            | 14 +++++--
 .../module_utils/local_repo/software_utils.py | 21 ++++++----
 3 files changed, 61 insertions(+), 14 deletions(-)

diff --git a/common/library/module_utils/local_repo/container_repo_utils.py b/common/library/module_utils/local_repo/container_repo_utils.py
index d8d97465d8..914d7bff56 100644
--- a/common/library/module_utils/local_repo/container_repo_utils.py
+++ b/common/library/module_utils/local_repo/container_repo_utils.py
@@ -109,13 +109,47 @@ def sync_container_repository(repo_name, remote_name, package_content, logger):
         bool: True if the synchronization is successful, False otherwise.
     """
     try:
+        logger.info(f"Getting repository version before sync for {repo_name}")
+        verify_command = pulp_container_commands["show_container_repo"] % repo_name
+        verify_result_before = execute_command(verify_command, logger, type_json=True)
+        
+        version_before = None
+        if verify_result_before and isinstance(verify_result_before, dict) and "stdout" in verify_result_before:
+            repo_data_before = verify_result_before["stdout"]
+            if isinstance(repo_data_before, dict):
+                version_before = repo_data_before.get("latest_version_href")
+                logger.info(f"Repository version before sync: {version_before}")
+        
         command = pulp_container_commands["sync_container_repository"] % (repo_name, remote_name)
         result = execute_command(command,logger)
         if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0):
+            logger.error(f"Sync command failed for repository {repo_name}")
             return False
-        else:
-            result = create_container_distribution(repo_name,package_content,logger)
-            return result
+        
+        logger.info(f"Validating sync result for repository {repo_name}")
+        verify_result_after = execute_command(verify_command, logger, type_json=True)
+        
+        if verify_result_after and isinstance(verify_result_after, dict) and "stdout" in verify_result_after:
+            repo_data_after = verify_result_after["stdout"]
+            if isinstance(repo_data_after, dict):
+                version_after = repo_data_after.get("latest_version_href")
+                logger.info(f"Repository version after sync: {version_after}")
+                
+                if not version_after or version_after.endswith("/versions/0/"):
+                    logger.error(f"Sync completed but no content was downloaded for {repo_name}. "
+                               f"The specified image tag likely does not exist in the upstream registry.")
+                    return False
+                
+                if version_before and version_after and version_before == version_after:
+                    logger.error(f"Sync completed but repository version did not change for {repo_name}. "
+                               f"Version remained at {version_after}. "
+                               f"The specified image tag likely does not exist in the remote registry.")
+                    return False
+                
+                logger.info(f"Sync validation successful: repository {repo_name} version changed from {version_before} to {version_after}")
+        
+        result = create_container_distribution(repo_name,package_content,logger)
+        return result
     except Exception as e:
         logger.error(f"Failed to synchronize repository {repo_name} with remote {remote_name}. Error: {e}")
         return False
diff --git a/common/library/module_utils/local_repo/process_parallel.py b/common/library/module_utils/local_repo/process_parallel.py
index cfc3beb920..74a24504b7 100644
--- a/common/library/module_utils/local_repo/process_parallel.py
+++ b/common/library/module_utils/local_repo/process_parallel.py
@@ -201,6 +201,13 @@ def execute_task(task, determine_function, user_data, version_variables, arc,
         with log_lock:
             logger.info(f"### {execute_task.__name__} start ###")  # Log task start
 
+        # Build package display name with tag for images
+        package_display = task.get("package", "")
+        if task.get("type") == "image" and "tag" in task:
+            package_display = f"{package_display}:{task['tag']}"
+        elif task.get("type") == "image" and "digest" in task:
+            package_display = f"{package_display}:{task['digest']}"
+
         # Determine the function and its arguments using the provided `determine_function`
         function, args = determine_function(task, repo_store_path, csv_file_path, user_data,
                          version_variables, arc, user_registries, docker_username, docker_password)
@@ -217,7 +224,7 @@ def execute_task(task, determine_function, user_data, version_variables, arc,
                     )
                 return {
                     "task": task,
-                    "package": task.get("package", ""),  # Extract package name if available
+                    "package": package_display,
                     "status": "TIMEOUT",
                     "output": "",
                     "error": f"Timeout reached after {elapsed_time:.2f}s"
@@ -240,7 +247,7 @@ def execute_task(task, determine_function, user_data, version_variables, arc,
 
         return {
             "task": task,
-            "package": task.get("package", ""),  
+            "package": package_display,
             "status": result.upper(),  
             "output": result,
             "error": ""
@@ -251,12 +258,11 @@ def execute_task(task, determine_function, user_data, version_variables, arc,
             logger.error(f"Task failed: {str(e)}")
         return {
             "task": task,
-            "package": task.get("package", ""),  
+            "package": package_display,
             "status": "FAILED",  
             "output": "",
             "error": str(e)  # Include the error message
         }
-
 def worker_process(task, determine_function, user_data, version_variables, arc, repo_store_path,
                   csv_file_path, log_dir, result_queue, user_registries,
                   docker_username, docker_password, timeout):
diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py
index 6c78c51f3f..f1840be158 100644
--- a/common/library/module_utils/local_repo/software_utils.py
+++ b/common/library/module_utils/local_repo/software_utils.py
@@ -712,16 +712,21 @@ def get_new_packages_not_in_status(json_path, csv_path, subgroup_list,logger):
         raise
    
     for pkg in all_packages:
-
         if pkg["type"] == "image":
-           pkg_prefix = pkg.get("package", "").strip()
-           prefix_found = any(name.startswith(f"{pkg_prefix}:") for name in names)
-           if not prefix_found:
-               new_packages.append(pkg)
+            # Check exact package:tag or package:digest combination
+            pkg_base = pkg.get("package", "").strip()
+            pkg_identifier = pkg_base
+            
+            if "tag" in pkg:
+                pkg_identifier += f":{pkg['tag']}"
+            elif "digest" in pkg:
+                pkg_identifier += f":{pkg['digest']}"
+            
+            if pkg_identifier not in names:
+                new_packages.append(pkg)
         else:
             if pkg.get("package") not in names:
                 new_packages.append(pkg)
-
     logger.info("New packages list: %s", new_packages)
 
     logger.info("Finished get_new_packages_not_in_status()")
@@ -828,7 +833,9 @@ def remove_duplicates_from_trans(trans):
                 type_ = item.get("type")
 
                 if type_ == "image":
-                    key = (item.get("package"), item.get("tag"))
+                    # Use digest if present, otherwise use tag
+                    identifier = item.get("digest") or item.get("tag")
+                    key = (item.get("package"), identifier)
 
                 elif type_ == "pip_module":
                     key = item.get("package")

From f3e4050eaf03b058d4582f25533ea406808886d1 Mon Sep 17 00:00:00 2001
From: SOWJANYAJAGADISH123 <Sowjanya.Jagadish@dell.com>
Date: Fri, 6 Feb 2026 08:22:33 +0530
Subject: [PATCH 039/172] updated omnia.sh for upgrade

---
 omnia.sh | 248 ++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 202 insertions(+), 46 deletions(-)

diff --git a/omnia.sh b/omnia.sh
index c997d2ff97..358cde2162 100755
--- a/omnia.sh
+++ b/omnia.sh
@@ -52,6 +52,36 @@ is_local_ip() {
     fi
 }
 
+OMNIA_BASE_DIR="/opt/omnia"
+OMNIA_INPUT_DIR="/opt/omnia/input"
+OMNIA_BACKUPS_DIR="/opt/omnia/backups"
+OMNIA_METADATA_DIR="/opt/omnia/.data"
+OMNIA_METADATA_FILE="/opt/omnia/.data/oim_metadata.yml"
+
+update_metadata_upgrade_backup_dir() {
+    local backup_dir="$1"
+    [ -n "$backup_dir" ] || { echo "[ERROR] [ORCHESTRATOR] Backup directory not specified"; return 1; }
+    if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
+        echo "[ERROR] [ORCHESTRATOR] omnia_core container is not running"
+        return 1
+    fi
+    # Replace or append the upgrade_backup_dir entry; the host expands the variables into the script text below.
+    podman exec -u root omnia_core bash -c "
+        set -e
+        if [ ! -f '$OMNIA_METADATA_FILE' ]; then
+            echo '[ERROR] Metadata file not found inside container: $OMNIA_METADATA_FILE' >&2
+            exit 1
+        fi
+        if grep -q '^upgrade_backup_dir:' '$OMNIA_METADATA_FILE'; then
+            sed -i 's|^upgrade_backup_dir:.*|upgrade_backup_dir: ${backup_dir}|' '$OMNIA_METADATA_FILE'
+        else
+            echo 'upgrade_backup_dir: ${backup_dir}' >> '$OMNIA_METADATA_FILE'
+        fi
+    "
+}
+
+
+
 check_internal_nfs_export() {
     nfs_server_ip=$1
     nfs_server_share_path=$2
@@ -757,9 +787,9 @@ EOF
     # Create the .data directory if it does not exist.
     # This is where the oim_metadata.yml file is stored.
     echo -e "${GREEN} Creating the .data directory if it does not exist.${NC}"
-    mkdir -p "$omnia_path/omnia/.data"
+    mkdir -p "$OMNIA_METADATA_DIR"
 
-    oim_metadata_file="$omnia_path/omnia/.data/oim_metadata.yml"
+    oim_metadata_file="$OMNIA_METADATA_FILE"
 
     if [ ! -f "$oim_metadata_file" ]; then
         echo -e "${GREEN} Creating oim_metadata file${NC}"
@@ -811,7 +841,7 @@ EOF
 
     if ! podman ps --format '{{.Names}}' | grep -qw "$container_name"; then
         echo -e "${RED}Error: $container_name container failed to start.${NC}"
-        rm -rf "$omnia_path/omnia/.data/oim_metadata.yml"
+        rm -rf "$OMNIA_METADATA_FILE"
         exit 1
     fi
 
@@ -832,17 +862,17 @@ post_setup_config() {
     chmod 757 "$omnia_path/omnia/tmp/.ansible/tmp"
     # Create the input directory if it does not exist.
     echo -e "${GREEN} Creating the input directory if it does not exist.${NC}"
-    mkdir -p "$omnia_path/omnia/input/"
+    mkdir -p "$OMNIA_INPUT_DIR/"
 
     # Create the default.yml file if it does not exist.
     # This file contains the name of the project.
-    if [ ! -f "$omnia_path/omnia/input/default.yml" ]; then
+    if [ ! -f "$OMNIA_INPUT_DIR/default.yml" ]; then
         echo -e "${BLUE} Creating default.yml file.${NC}"
         {
             echo "# This file defines the project name."
             echo "# The name of the project should be set in a directory under input."
             echo "project_name: project_default"
-        } >> "$omnia_path/omnia/input/default.yml"
+        } >> "$OMNIA_INPUT_DIR/default.yml"
     fi
 
     # Copy input files from /omnia to /opt/omnia/project_default/ inside omnia_core container
@@ -925,16 +955,17 @@ start_container_session() {
 }
 
 show_help() {
-    echo "Usage: $0 [--install | --uninstall | --version | --help]"
+    echo "Usage: $0 [--install | --uninstall | --upgrade | --version | --help]"
     echo "  -i, --install     Install and start the Omnia core container"
     echo "  -u, --uninstall   Uninstall the Omnia core container and clean up configuration"
+    echo "      --upgrade     Upgrade the Omnia core container from image tag 1.0 to 1.1"
     echo "  -v, --version     Display Omnia version information"
     echo "  -h, --help        More information about usage"
 }
 
 install_omnia_core() {
     local omnia_core_tag="1.1"
-    local omnia_core_registry="docker.io/dellhpcomniaaisolution"
+    local omnia_core_registry=""
     
     # Check if local omnia_core:1.1 exists
     if podman inspect omnia_core:${omnia_core_tag} >/dev/null 2>&1; then
@@ -945,44 +976,20 @@ install_omnia_core() {
         # Tag it as 1.1 for consistency
         podman tag omnia_core:latest omnia_core:${omnia_core_tag}
     else
-        # Try pulling from Docker Hub with retry logic
-        echo -e "${BLUE}Omnia core image not found locally. Attempting to pull from Docker Hub...${NC}"
-        pull_success=false
-        max_retries=3
-        retry_count=0
-        
-        while [ $retry_count -lt $max_retries ]; do
-            retry_count=$((retry_count + 1))
-            echo -e "${BLUE}Attempt $retry_count of $max_retries...${NC}"
-            
-            if podman pull ${omnia_core_registry}/omnia_core:${omnia_core_tag} 2>/dev/null; then
-                echo -e "${GREEN}✓ Successfully pulled omnia_core:${omnia_core_tag} from Docker Hub.${NC}"
-                # Tag it without registry prefix for local use
-                podman tag ${omnia_core_registry}/omnia_core:${omnia_core_tag} omnia_core:${omnia_core_tag}
-                pull_success=true
-                break
-            else
-                if [ $retry_count -lt $max_retries ]; then
-                    echo -e "${YELLOW}Pull failed. Retrying in 5 seconds...${NC}"
-                    sleep 5
-                fi
-            fi
-        done
-        
-        if [ "$pull_success" = false ]; then
-            echo -e "${RED}ERROR: Failed to pull omnia_core image after $max_retries attempts.${NC}"
-            echo ""
-            echo -e "${YELLOW}To resolve this, please follow these steps:${NC}"
-            echo -e "1. Clone the Omnia Artifactory repository:"
-            echo -e "   git clone https://github.com/dell/omnia-artifactory -b omnia-container"
-            echo -e "2. Navigate to the repository directory:"
-            echo -e "   cd omnia-artifactory"
-            echo -e "3. Build the core image locally:"
-            echo -e "   ./build_images.sh core omnia_branch=<version/branch_name>"
-            echo -e "4. After building the image, re-run this script:"
-            echo -e "   ./omnia.sh --install"
-            exit 1
-        fi
+        echo -e "${RED}ERROR: Omnia core image (omnia_core:${omnia_core_tag}) not found locally.${NC}"
+        echo -e "${YELLOW}Omnia no longer pulls images from Docker Hub. Build/load the image locally and retry.${NC}"
+        echo ""
+        echo -e "${YELLOW}One way to build the image locally:${NC}"
+        echo -e "1. Clone the Omnia Artifactory repository:"
+        echo -e "   git clone https://github.com/dell/omnia-artifactory -b omnia-container"
+        echo -e "2. Navigate to the repository directory:"
+        echo -e "   cd omnia-artifactory"
+        echo -e "3. Build the core image locally (loads into local Podman by default):"
+        echo -e "   ./build_images.sh core omnia_branch=<version/branch_name>"
+        echo ""
+        echo -e "${YELLOW}Then re-run:${NC}"
+        echo -e "   ./omnia.sh --install"
+        exit 1
     fi
 
     # Check if any other containers with 'omnia' in their name are running
@@ -1139,6 +1146,152 @@ display_version() {
     exit 0
 }
 
+phase1_validate() {
+    local current_image
+    local core_config
+    local previous_omnia_version
+    local shared_path
+    # Pre-upgrade checks; on success sets omnia_path from the metadata and returns 0, otherwise returns 1.
+    echo "[INFO] [ORCHESTRATOR] Phase 1: Pre-Upgrade Validation"
+
+    if [ "$(id -u)" -ne 0 ]; then
+        if ! sudo -n true >/dev/null 2>&1; then
+            echo "[ERROR] [ORCHESTRATOR] Prerequisite failed: run as root or configure passwordless sudo"
+            return 1
+        fi
+    fi
+
+    if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
+        echo "[ERROR] [ORCHESTRATOR] Prerequisite failed: omnia_core container is not running"
+        return 1
+    fi
+    # Read the metadata through OMNIA_METADATA_FILE so the path is defined in one place only.
+    core_config=$(podman exec omnia_core /bin/bash -c "cat '$OMNIA_METADATA_FILE'" 2>/dev/null)
+    if [ -z "$core_config" ]; then
+        echo "[ERROR] [ORCHESTRATOR] Unable to read oim_metadata.yml from omnia_core container"
+        return 1
+    fi
+
+    previous_omnia_version=$(echo "$core_config" | grep "^omnia_version:" | cut -d':' -f2 | tr -d ' \t\n\r')
+    if [ -z "$previous_omnia_version" ]; then
+        echo "[ERROR] [ORCHESTRATOR] omnia_version not found in oim_metadata.yml"
+        return 1
+    fi
+
+    if [ "$previous_omnia_version" != "2.0.0.0" ]; then
+        echo "[ERROR] [ORCHESTRATOR] Previous Omnia version mismatch: expected 2.0.0.0, got: $previous_omnia_version"
+        return 1
+    fi
+
+    shared_path=$(echo "$core_config" | grep "^oim_shared_path:" | cut -d':' -f2- | tr -d ' \t\n\r')
+    if [ -z "$shared_path" ]; then
+        echo "[ERROR] [ORCHESTRATOR] oim_shared_path not found in oim_metadata.yml"
+        return 1
+    fi
+    # omnia_path is not declared local, so it stays set for the rest of the script.
+    omnia_path="$shared_path"
+
+    if [ ! -d "$omnia_path" ]; then
+        echo "[ERROR] [ORCHESTRATOR] Shared path from metadata does not exist on host: $omnia_path"
+        return 1
+    fi
+
+    if [ ! -w "$omnia_path" ]; then
+        echo "[ERROR] [ORCHESTRATOR] Permission denied: no write permission on shared path: $omnia_path"
+        return 1
+    fi
+
+    current_image=$(podman inspect omnia_core --format '{{.ImageName}}' 2>/dev/null)
+    if [ -z "$current_image" ]; then
+        echo "[ERROR] [ORCHESTRATOR] Unable to inspect omnia_core container image"
+        return 1
+    fi
+    # The tag must end right after 1.0 (or be followed by a digest) so 1.0.1 or 1.0-rc are not accepted as 1.0.
+    if ! echo "$current_image" | grep -qE '(:|@)1\.0(@|$)'; then
+        echo "[ERROR] [ORCHESTRATOR] Container version mismatch: expected 1.0, got: $current_image"
+        return 1
+    fi
+
+    echo "[INFO] [ORCHESTRATOR] Container version validated: 1.0 (Omnia 2.0.0.0)"
+
+    # The target image must already be present locally; this function never pulls it.
+    if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then
+        echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1"
+        echo "[ERROR] [ORCHESTRATOR] Omnia does not pull from Docker Hub. Build/load the image locally and retry."
+        return 1
+    fi
+
+    echo "[INFO] [ORCHESTRATOR] Phase 1: Validation passed"
+    return 0
+}
+
+phase2_approval() {
+    local backup_base default_backup_dir confirm
+    # Show the upgrade summary, ask for confirmation, then record the backup dir; returns 1 on cancel or failure.
+    echo "[INFO] [ORCHESTRATOR] Phase 2: Approval Gate"
+    echo "============================================"
+    echo "OMNIA UPGRADE SUMMARY"
+    echo "============================================"
+    echo "Current Container Tag: 1.0"
+    echo "Target Container Tag:  1.1"
+    echo "Current Omnia Release: 2.0.0.0"
+    echo "Target Omnia Release:  2.1.0.0"
+    echo "New Features:"
+    echo "  - Add and remove node for slurm cluster"
+    echo "  - Additional Package Installation"
+    echo "============================================"
+
+    default_backup_dir="$OMNIA_BACKUPS_DIR/upgrade"
+    backup_base="$default_backup_dir"
+
+    echo "[INFO] [ORCHESTRATOR] Backup destination: $backup_base"
+
+    read -r -p "Proceed with upgrade? (y/N): " confirm
+    if [ "$confirm" != "y" ] && [ "$confirm" != "Y" ]; then
+        echo "[INFO] [ORCHESTRATOR] Upgrade cancelled by user"
+        return 1
+    fi
+    # Write to the metadata only after approval, so a cancelled run leaves the metadata file untouched.
+    if ! update_metadata_upgrade_backup_dir "$backup_base"; then
+        echo "[ERROR] [ORCHESTRATOR] Failed to update upgrade backup directory in metadata"
+        return 1
+    fi
+
+    OMNIA_UPGRADE_BACKUP_PATH="$backup_base"
+    export OMNIA_UPGRADE_BACKUP_PATH
+
+    echo "[INFO] [ORCHESTRATOR] Phase 2: Approval granted"
+    return 0
+}
+
+upgrade_omnia_core() {
+    local lock_file="/var/lock/omnia_core_upgrade.lock"
+    # Run the upgrade phases under a lock file so that only one upgrade runs at a time.
+    if [ -e "$lock_file" ]; then
+        echo -e "${RED}ERROR: Upgrade lock exists at $lock_file. Another upgrade may be running.${NC}"
+        exit 1
+    fi
+    # noclobber makes the redirect fail if the lock appeared after the check above (atomic create).
+    mkdir -p "$(dirname "$lock_file")" 2>/dev/null || true
+    ( set -o noclobber; echo "$$" > "$lock_file" ) 2>/dev/null || {
+        echo -e "${RED}ERROR: Failed to create lock file: $lock_file${NC}"
+        exit 1
+    }
+    trap "rm -f '$lock_file'" EXIT  # path expanded now; lock_file is local to this function
+
+    if ! phase1_validate; then
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 1"
+        exit 1
+    fi
+
+    if ! phase2_approval; then
+        exit 0
+    fi
+
+    echo "[INFO] [ORCHESTRATOR] Upgrade tasks for backup and container swap are deferred to a follow-up PR"
+    exit 0
+}
+
 # Main function to check if omnia_core container is already running.
 # If yes, ask the user if they want to enter the container or reinstall.
 # If no, set it up.
@@ -1150,6 +1303,9 @@ main() {
         --uninstall|-u)
             cleanup_omnia_core
             ;;
+        --upgrade)
+            upgrade_omnia_core
+            ;;
         --version|-v)
             display_version
             ;;

From 9bb967a19cc7dec8a879a269461b74b5b015569d Mon Sep 17 00:00:00 2001
From: Nagachandan-P <Nagachandan.p@dell.com>
Date: Fri, 6 Feb 2026 05:02:47 +0000
Subject: [PATCH 040/172] login-nodes directory creation

---
 discovery/roles/slurm_config/tasks/create_slurm_dir.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
index 9ce43dcd6a..35fe7910b0 100644
--- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
+++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
@@ -114,7 +114,7 @@
     - "{{ (ctld_list + cmpt_list + login_list + compiler_login_list) | product(common_dir) }}"
     - "{{ ctld_list | product(ctld_dir) }}"
     - "{{ dbd_list | product(db_dir) }}"
-    - "{{ cmpt_list | product(cmpt_dir) }}"
+    - "{{   ( cmpt_list + login_list + compiler_login_list) | product(cmpt_dir) }}"
   loop_control:
     loop_var: product
 

From c519d6e81a9c40ef008e516c143f25e032cbe90d Mon Sep 17 00:00:00 2001
From: balajikumaran-c-s <balajikumaran.cs@dellteam.com>
Date: Fri, 6 Feb 2026 08:19:00 +0000
Subject: [PATCH 041/172] Add pulp and openchami image pull prereqs

---
 prepare_oim/prepare_oim.yml                   | 11 ++++++
 .../openchami/tasks/deployment_prereq.yml     | 30 ++++++++++++++++
 .../deploy_containers/openchami/vars/main.yml | 36 +++++++++++++++++++
 .../pulp/tasks/deployment_prereq.yml          | 14 ++++++++
 .../deploy_containers/pulp/vars/main.yml      |  4 +++
 5 files changed, 95 insertions(+)
 create mode 100644 prepare_oim/roles/deploy_containers/openchami/tasks/deployment_prereq.yml

diff --git a/prepare_oim/prepare_oim.yml b/prepare_oim/prepare_oim.yml
index 49bead531f..a78d21e8d9 100644
--- a/prepare_oim/prepare_oim.yml
+++ b/prepare_oim/prepare_oim.yml
@@ -97,6 +97,17 @@
         name: deploy_containers/openchami  # noqa:role-name[path]
         tasks_from: verify_openchami.yml
 
+- name: OpenCHAMI deployment prereq
+  hosts: oim
+  connection: ssh
+  gather_facts: false
+  tags: openchami
+  tasks:
+    - name: Pull OpenCHAMI images
+      ansible.builtin.include_role:
+        name: deploy_containers/openchami  # noqa:role-name[path]
+        tasks_from: deployment_prereq.yml
+
 - name: Deploy the openchami container
   hosts: localhost
   connection: local
diff --git a/prepare_oim/roles/deploy_containers/openchami/tasks/deployment_prereq.yml b/prepare_oim/roles/deploy_containers/openchami/tasks/deployment_prereq.yml
new file mode 100644
index 0000000000..109bc725f3
--- /dev/null
+++ b/prepare_oim/roles/deploy_containers/openchami/tasks/deployment_prereq.yml
@@ -0,0 +1,35 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Pull each image in openchami_images with Podman, retrying failed pulls.
+# Errors are ignored here so the next task can fail with a per-image message;
+# without ignore_errors the play would stop here once retries are exhausted.
+- name: Pull OpenCHAMI images using Podman
+  ansible.builtin.command:
+    cmd: "podman pull {{ item }}"
+  loop: "{{ openchami_images }}"
+  register: pull_result
+  retries: "{{ pull_image_retries }}"
+  delay: "{{ pull_image_delay }}"
+  until: pull_result.rc == 0
+  changed_when: false
+  ignore_errors: true
+
+# Stop the play on the first image whose final pull attempt returned non-zero.
+- name: Fail if any OpenCHAMI image pull failed
+  ansible.builtin.fail:
+    msg: "Failed to pull OpenCHAMI image: {{ item.item }}. Error: {{ item.stderr }}"
+  loop: "{{ pull_result.results }}"
+  when: item.rc != 0
diff --git a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml
index 6d0848e0af..2d7db2ca85 100644
--- a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml
+++ b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml
@@ -36,5 +36,41 @@ data_oci_dir: "{{ oim_shared_path }}/omnia/openchami/s3/data/oci"
 data_s3_dir: "{{ oim_shared_path }}/omnia/openchami/s3/data/s3"
 s3_work_dir: "{{ oim_shared_path }}/omnia/openchami/s3"
 
+# Usage: deployment_prereq.yml - pull openchami images
+pull_image_retries: 5
+pull_image_delay: 10
+
+# OpenCHAMI image tags
+openchami_local_ca_tag: "v0.2.2"
+openchami_opaal_tag: "v0.3.10"
+openchami_smd_tag: "v2.18.0"
+openchami_bss_tag: "v1.32.0"
+openchami_cloud_init_tag: "v1.2.3"
+openchami_coredhcp_tag: "v0.3.0"
+# Third-party image tags for OpenCHAMI
+minio_tag: "latest"
+postgres_tag: "11.5-alpine"
+hydra_tag: "v2.3"
+haproxy_tag: "latest"
+registry_tag: "latest"
+curl_tag: "latest"
+acme_tag: "3.1.1"
+
+# OpenCHAMI images list for podman pull on OIM
+openchami_images:
+  - "ghcr.io/openchami/local-ca:{{ openchami_local_ca_tag }}"
+  - "ghcr.io/openchami/opaal:{{ openchami_opaal_tag }}"
+  - "ghcr.io/openchami/smd:{{ openchami_smd_tag }}"
+  - "ghcr.io/openchami/bss:{{ openchami_bss_tag }}"
+  - "ghcr.io/openchami/cloud-init:{{ openchami_cloud_init_tag }}"
+  - "ghcr.io/openchami/coredhcp:{{ openchami_coredhcp_tag }}"
+  - "docker.io/minio/minio:{{ minio_tag }}"
+  - "docker.io/library/postgres:{{ postgres_tag }}"
+  - "docker.io/oryd/hydra:{{ hydra_tag }}"
+  - "cgr.dev/chainguard/haproxy:{{ haproxy_tag }}"
+  - "docker.io/library/registry:{{ registry_tag }}"
+  - "cgr.dev/chainguard/curl:{{ curl_tag }}"
+  - "docker.io/neilpang/acme.sh:{{ acme_tag }}"
+
 # Usage: verify_openchami.yml
 cluster_env_key: "{{ oim_node_name | upper }}_ACCESS_TOKEN"
diff --git a/prepare_oim/roles/deploy_containers/pulp/tasks/deployment_prereq.yml b/prepare_oim/roles/deploy_containers/pulp/tasks/deployment_prereq.yml
index 4ae77823a0..09ec52e6a4 100644
--- a/prepare_oim/roles/deploy_containers/pulp/tasks/deployment_prereq.yml
+++ b/prepare_oim/roles/deploy_containers/pulp/tasks/deployment_prereq.yml
@@ -38,6 +38,23 @@
   when: hostname_enabled
   no_log: true
 
+# Pull the Pulp image before deployment, retrying failed pulls. Errors are
+# ignored so the next task can report the failure with the image name.
+- name: Pull Pulp image using Podman
+  ansible.builtin.command:
+    cmd: "podman pull {{ pulp_image }}"
+  register: pulp_pull_result
+  retries: "{{ pull_image_retries }}"
+  delay: "{{ pull_image_delay }}"
+  until: pulp_pull_result is not failed
+  changed_when: false
+  ignore_errors: true
+
+- name: Fail if Pulp image pull failed
+  ansible.builtin.fail:
+    msg: "Failed to pull Pulp image: {{ pulp_image }}. Error: {{ pulp_pull_result.stderr }}"
+  when: pulp_pull_result.rc != 0
+
 - name: Invoke Pulp Container Deployment Tasks for HTTP
   ansible.builtin.include_tasks: deploy_pulp_container_http.yml
   when: not pulp_protocol_https
diff --git a/prepare_oim/roles/deploy_containers/pulp/vars/main.yml b/prepare_oim/roles/deploy_containers/pulp/vars/main.yml
index 5613c13055..26dbec2dae 100644
--- a/prepare_oim/roles/deploy_containers/pulp/vars/main.yml
+++ b/prepare_oim/roles/deploy_containers/pulp/vars/main.yml
@@ -27,6 +27,10 @@ pulp_protocol_https: true
 # Tag is fixed for the Pulp container image as of 10-06-2025
 pulp_image: "docker.io/pulp/pulp:3.80"
 
+# Usage: deployment_prereq.yml - pull image retries
+pull_image_retries: 5
+pull_image_delay: 10
+
 arg_list:
   - "-e PULP_WORKERS=10"
   - "-e PULP_API_WORKERS=10"

From 799439c2ee673a769928c7ac7acad1f7cba46373 Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Fri, 6 Feb 2026 14:25:38 +0530
Subject: [PATCH 042/172] Fix issue when slurm cluster not active

---
 .../slurm_config/tasks/check_ctld_running.yml | 70 ++++++++++---------
 discovery/roles/slurm_config/tasks/confs.yml  | 13 +++-
 .../slurm_config/tasks/create_slurm_dir.yml   |  2 +-
 discovery/roles/slurm_config/vars/main.yml    |  1 -
 4 files changed, 49 insertions(+), 37 deletions(-)

diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml
index 5f89e051b8..dacd879bf7 100644
--- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml
+++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml
@@ -12,10 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 ---
-- name: Initialize ctld_state dict
-  ansible.builtin.set_fact:
-    ctld_state: "{{ ctld_state | default({}) | combine({item: false}) }}"
-
 - name: Check if remote host is reachable via SSH
   ansible.builtin.wait_for:
     host: "{{ item }}"
@@ -24,38 +20,44 @@
     state: started
   delegate_to: localhost
   register: ssh_check
+  ignore_errors: true
 
-- name: Check if slurmctld is running on remote host
-  ansible.builtin.service_facts:
-  delegate_to: "{{ item }}"
-  register: service_facts
+- name: Block when ssh_check is success
   when: ssh_check is success
+  block:
+    - name: Initialize ctld_state dict
+      ansible.builtin.set_fact:
+        ctld_state: "{{ ctld_state | default({}) | combine({item: false}) }}"
 
-- name: Update ctld_state if slurmctld is running
-  ansible.builtin.set_fact:
-    ctld_state: "{{ ctld_state | combine({item: true}) }}"
-  when:
-    - ssh_check is success
-    - service_facts is success
-    - ansible_facts.services['slurmctld.service'] is defined
-    - ansible_facts.services['slurmctld.service'].state == 'running'
+    - name: Check if slurmctld is running on remote host
+      ansible.builtin.service_facts:
+      delegate_to: "{{ item }}"
+      register: service_facts
 
-- name: Update /etc/hosts with controller hostname and IP
-  ansible.builtin.lineinfile:
-    path: /etc/hosts
-    regexp: '^{{ ip.value }}\s+{{ ip.key }}'
-    line: "{{ ip.value }} {{ ip.key }}"
-    state: present
-  loop: "{{ ip_name_map | dict2items }}"
-  loop_control:
-    loop_var: ip
-  delegate_to: "{{ item }}"
-  when: ssh_check is success
+    - name: Update ctld_state if slurmctld is running
+      ansible.builtin.set_fact:
+        ctld_state: "{{ ctld_state | combine({item: true}) }}"
+      when:
+        - service_facts is success
+        - ansible_facts.services['slurmctld.service'] is defined
+        - ansible_facts.services['slurmctld.service'].state == 'running'
+
+    - name: Update /etc/hosts with controller hostname and IP
+      ansible.builtin.lineinfile:
+        path: /etc/hosts
+        regexp: '^{{ ip.value }}\s+{{ ip.key }}'
+        line: "{{ ip.value }} {{ ip.key }}"
+        state: present
+      loop: "{{ ip_name_map | dict2items }}"
+      loop_control:
+        loop_var: ip
+      delegate_to: "{{ item }}"
 
-- name: Trigger the scontrol reconfigure
-  ansible.builtin.command: scontrol reconfigure
-  changed_when: scontrol_reconfig.rc == 0
-  failed_when: false
-  register: scontrol_reconfig
-  delegate_to: "{{ item }}"
-  when: ctld_state[item] is true
+    - name: Trigger the scontrol reconfigure
+      ansible.builtin.command: scontrol reconfigure
+      changed_when: scontrol_reconfig.rc == 0
+      failed_when: false
+      register: scontrol_reconfig
+      delegate_to: "{{ item }}"
+      when:
+        - ctld_state[item] is true
diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml
index 33315709cc..1ff30acf34 100644
--- a/discovery/roles/slurm_config/tasks/confs.yml
+++ b/discovery/roles/slurm_config/tasks/confs.yml
@@ -86,12 +86,19 @@
     loop_var: existing_conf_set
   register: prepared_conf_lists
 
+# All the updates to the confs follow after this point before merge
 - name: Prepend ClusterName and SlurmctldHost to slurm conf sources
   ansible.builtin.set_fact: # TODO: Change order if needed
     conf_merge_dict: "{{ conf_merge_dict
-     | combine({'slurm': [{'ClusterName': cluster_name, 'SlurmctldHost': ctld_list}] + conf_merge_dict['slurm']}) }}"
+     | combine({'slurm': [{'ClusterName': cluster_name, 'AccountingStorageHost': dbd_list[0], 'SlurmctldHost': ctld_list}] + conf_merge_dict['slurm']}) }}"
   when: "'slurm' in conf_merge_dict"
 
+- name: Slurm dbd - DbdHost and StorageHost
+  ansible.builtin.set_fact: # NOTE(review): hosts come from ctld_list[0], but AccountingStorageHost above uses dbd_list[0] - confirm intended
+    conf_merge_dict: "{{ conf_merge_dict
+     | combine({'slurmdbd': [{'DbdHost': ctld_list[0], 'StorageHost': ctld_list[0]}] + conf_merge_dict['slurmdbd']}) }}"
+  when: "'slurmdbd' in conf_merge_dict"
+
 - name: Merge the confs
   slurm_conf:
     op: merge
@@ -141,6 +148,10 @@
   loop_control:
     loop_var: product
 
+- name: Generate slurmd opts for Configless # TODO: Move to $SLURMD_OPTIONS
+  ansible.builtin.set_fact: # builds "--conf-server host1:port,host2:port" from ctld_list; port is SlurmctldPort from slurm_conf_dict, else 6817
+    conf_server: "--conf-server {{ ctld_list | map('regex_replace', '$', ':' ~ (slurm_conf_dict.get('SlurmctldPort', '6817') | string)) | join(',') }}"
+
 - name: Create backup directory with timestamp
   ansible.builtin.file:
     path: "{{ backup_dir }}"
diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
index 35fe7910b0..81a08adfca 100644
--- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
+++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
@@ -114,7 +114,7 @@
     - "{{ (ctld_list + cmpt_list + login_list + compiler_login_list) | product(common_dir) }}"
     - "{{ ctld_list | product(ctld_dir) }}"
     - "{{ dbd_list | product(db_dir) }}"
-    - "{{   ( cmpt_list + login_list + compiler_login_list) | product(cmpt_dir) }}"
+    - "{{ (cmpt_list + login_list + compiler_login_list) | product(cmpt_dir) }}"
   loop_control:
     loop_var: product
 
diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml
index 3a8c43ad93..9722725a88 100644
--- a/discovery/roles/slurm_config/vars/main.yml
+++ b/discovery/roles/slurm_config/vars/main.yml
@@ -90,7 +90,6 @@ common_mode: "0755"
 slurm_dbd_mode: "0600"
 slurm_db_cnf_mode: "0600"
 dbd_slurm_conf:
-  AccountingStorageHost: "{{ dbd_list[0] }}"
   AccountingStoragePort: "{{ slurm_dbd_port }}"
   AccountingStorageType: accounting_storage/slurmdbd
 partition_params:

From 61dea066b0fec79c1a7dcd91c9805cf0cfc993dc Mon Sep 17 00:00:00 2001
From: Katakam-Rakesh <katakam.rakesh@dell.com>
Date: Fri, 6 Feb 2026 14:57:17 +0530
Subject: [PATCH 043/172] improve container images validation

Signed-off-by: Katakam-Rakesh <katakam.rakesh@dell.com>
---
 .../local_repo/container_repo_utils.py        | 78 +++++++++++++++++--
 .../module_utils/local_repo/download_image.py |  4 +-
 2 files changed, 76 insertions(+), 6 deletions(-)

diff --git a/common/library/module_utils/local_repo/container_repo_utils.py b/common/library/module_utils/local_repo/container_repo_utils.py
index 914d7bff56..3b8eb29662 100644
--- a/common/library/module_utils/local_repo/container_repo_utils.py
+++ b/common/library/module_utils/local_repo/container_repo_utils.py
@@ -98,13 +98,15 @@ def create_container_distribution(repo_name,package_content,logger):
         logger.error(f"Error creating distribution {repo_name}: {e}")
         return False
 
-def sync_container_repository(repo_name, remote_name, package_content, logger):
+def sync_container_repository(repo_name, remote_name, package_content, logger, tag=None):
     """
     Synchronizes and distribute container repository with a remote.
     Args:
         repo_name (str): The name of the repository.
         remote_name (str): The name of the remote.
         package_content (str): Upstream name.
+        logger: Logger instance.
+        tag (str, optional): The tag to validate in repository content.
     Returns:
         bool: True if the synchronization is successful, False otherwise.
     """
@@ -141,10 +143,76 @@ def sync_container_repository(repo_name, remote_name, package_content, logger):
                     return False
                 
                 if version_before and version_after and version_before == version_after:
-                    logger.error(f"Sync completed but repository version did not change for {repo_name}. "
-                               f"Version remained at {version_after}. "
-                               f"The specified image tag likely does not exist in the remote registry.")
-                    return False
+                    # Check if tag actually exists using precise Pulp commands
+                    try:
+                        # Step 1: Get distribution to find repository href
+                        dist_command = f"pulp container distribution show --name {repo_name}"
+                        dist_result = execute_command(dist_command, logger, type_json=True)
+                        
+                        if not dist_result or not isinstance(dist_result, dict) or "stdout" not in dist_result:
+                            logger.error(f"Failed to get distribution info for {repo_name}. Assuming tag doesn't exist.")
+                            return False
+                        
+                        dist_data = dist_result["stdout"]
+                        if not isinstance(dist_data, dict) or "repository" not in dist_data:
+                            logger.error(f"Invalid distribution data for {repo_name}. Assuming tag doesn't exist.")
+                            return False
+                        
+                        repo_href = dist_data["repository"]
+                        logger.info(f"Found repository href: {repo_href}")
+                        
+                        # Step 2: Get repository version href
+                        repo_command = f"pulp container repository show --href {repo_href}"
+                        repo_result = execute_command(repo_command, logger, type_json=True)
+                        
+                        if not repo_result or not isinstance(repo_result, dict) or "stdout" not in repo_result:
+                            logger.error(f"Failed to get repository info for {repo_href}. Assuming tag doesn't exist.")
+                            return False
+                        
+                        repo_data = repo_result["stdout"]
+                        if not isinstance(repo_data, dict) or "latest_version_href" not in repo_data:
+                            logger.error(f"Invalid repository data for {repo_href}. Assuming tag doesn't exist.")
+                            return False
+                        
+                        repo_ver_href = repo_data["latest_version_href"]
+                        logger.info(f"Found repository version href: {repo_ver_href}")
+                        
+                        # Step 3: Check if tag exists in content
+                        tags_command = f"pulp show --href '/pulp/api/v3/content/container/tags/?repository_version={repo_ver_href}'"
+                        tags_result = execute_command(tags_command, logger, type_json=True)
+                        
+                        if not tags_result or not isinstance(tags_result, dict) or "stdout" not in tags_result:
+                            logger.error(f"Failed to get content tags for {repo_ver_href}. Assuming tag doesn't exist.")
+                            return False
+                        
+                        tags_data = tags_result["stdout"]
+                        if not isinstance(tags_data, dict) or "results" not in tags_data:
+                            logger.error(f"Invalid tags data for {repo_ver_href}. Assuming tag doesn't exist.")
+                            return False
+                        
+                        tags = tags_data["results"]
+                        tag_exists = False
+                        
+                        # Use the tag parameter if provided, otherwise fall back to checking package_content
+                        tag_to_check = tag if tag else package_content
+                        
+                        for tag_item in tags:
+                            if isinstance(tag_item, dict) and "name" in tag_item and tag_item["name"] == tag_to_check:
+                                tag_exists = True
+                                break
+                        
+                        if tag_exists:
+                            logger.info(f"Tag '{tag_to_check}' already exists in Pulp repository {repo_name}. No sync needed - image is already available.")
+                        else:
+                            logger.error(f"Sync completed but repository version did not change for {repo_name}. "
+                                       f"Version remained at {version_after}. "
+                                       f"Tag '{tag_to_check}' does not exist in Pulp repository content. "
+                                       f"This indicates the tag likely does not exist in the upstream registry.")
+                            return False
+                            
+                    except Exception as e:
+                        logger.error(f"Error checking repository tag existence: {e}. Assuming tag doesn't exist.")
+                        return False
                 
                 logger.info(f"Sync validation successful: repository {repo_name} version changed from {version_before} to {version_after}")
         
diff --git a/common/library/module_utils/local_repo/download_image.py b/common/library/module_utils/local_repo/download_image.py
index ffc5518177..98a1cb5b66 100644
--- a/common/library/module_utils/local_repo/download_image.py
+++ b/common/library/module_utils/local_repo/download_image.py
@@ -345,8 +345,10 @@ def process_image(package, status_file_path, version_variables,
                 raise Exception(f"Failed to create remote: {remote_name}")
 
         # Sync and distribute
+        # Pass tag_val if it exists (for tag-based images), otherwise None (for digest-based images)
+        tag_to_pass = tag_val if "tag" in package else None
         result = sync_container_repository(
-            repository_name, remote_name, package_content, logger
+            repository_name, remote_name, package_content, logger, tag=tag_to_pass
         )
         if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0):
             raise Exception(f"Failed to sync repository: {repository_name}")

From 24acd8ce980222ef66ad1a660a5db48875c5fc6b Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Fri, 6 Feb 2026 15:29:29 +0530
Subject: [PATCH 044/172] Upgrade of template handling logic and
 high_availability_config.yml

Addition of upgrade logic for high_availability_config.yml and template handling logic
---
 .../import_input_parameters/tasks/main.yml    |  11 +-
 .../tasks/precheck_backup_location.yml        |  25 ++++
 .../tasks/restore_input_files.yml             |  25 ++++
 .../tasks/restore_pxe_mapping_file.yml        |  49 --------
 .../tasks/restore_single_input_file.yml       |  54 +++++++++
 .../tasks/restore_software_config.yml         |  60 ---------
 .../transform_high_availability_config.yml    | 114 ++++++++++++++++++
 .../tasks/transform_network_spec.yml          |  89 +++-----------
 .../templates/high_availability_config.j2     |  27 +++++
 .../import_input_parameters/vars/main.yml     |  52 +++++++-
 upgrade/upgrade_oim.yml                       |   1 +
 11 files changed, 323 insertions(+), 184 deletions(-)
 create mode 100644 upgrade/roles/import_input_parameters/tasks/precheck_backup_location.yml
 create mode 100644 upgrade/roles/import_input_parameters/tasks/restore_input_files.yml
 delete mode 100644 upgrade/roles/import_input_parameters/tasks/restore_pxe_mapping_file.yml
 create mode 100644 upgrade/roles/import_input_parameters/tasks/restore_single_input_file.yml
 delete mode 100644 upgrade/roles/import_input_parameters/tasks/restore_software_config.yml
 create mode 100644 upgrade/roles/import_input_parameters/tasks/transform_high_availability_config.yml
 create mode 100644 upgrade/roles/import_input_parameters/templates/high_availability_config.j2

diff --git a/upgrade/roles/import_input_parameters/tasks/main.yml b/upgrade/roles/import_input_parameters/tasks/main.yml
index af45a1de1b..7687f852bb 100644
--- a/upgrade/roles/import_input_parameters/tasks/main.yml
+++ b/upgrade/roles/import_input_parameters/tasks/main.yml
@@ -13,11 +13,14 @@
 # limitations under the License.
 ---
 
+- name: Validate backup location for upgrade input processing
+  ansible.builtin.include_tasks: precheck_backup_location.yml
+
 - name: Transform network_spec.yml from Omnia 2.0 to 2.1
   ansible.builtin.include_tasks: transform_network_spec.yml
 
-- name: Restore software_config.json from backup
-  ansible.builtin.include_tasks: restore_software_config.yml
+- name: Transform high_availability_config.yml from Omnia 2.0 to 2.1
+  ansible.builtin.include_tasks: transform_high_availability_config.yml
 
-- name: Restore pxe_mapping_file.csv from backup
-  ansible.builtin.include_tasks: restore_pxe_mapping_file.yml
+- name: Restore input files from backup
+  ansible.builtin.include_tasks: restore_input_files.yml
diff --git a/upgrade/roles/import_input_parameters/tasks/precheck_backup_location.yml b/upgrade/roles/import_input_parameters/tasks/precheck_backup_location.yml
new file mode 100644
index 0000000000..fe058f83a9
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/precheck_backup_location.yml
@@ -0,0 +1,25 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Validate backup_location is provided
+  ansible.builtin.fail:
+    msg: "{{ msg_backup_location_missing }}"
+  when: backup_location is not defined or (backup_location | string | trim) == ""
+
+- name: Ensure backup directory exists
+  ansible.builtin.file:
+    path: "{{ backup_location }}"
+    state: directory
+    mode: "{{ backup_dir_mode }}"
diff --git a/upgrade/roles/import_input_parameters/tasks/restore_input_files.yml b/upgrade/roles/import_input_parameters/tasks/restore_input_files.yml
new file mode 100644
index 0000000000..3dd6d45206
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/restore_input_files.yml
@@ -0,0 +1,25 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Default restore_input_files to an empty list when undefined
+  ansible.builtin.set_fact:
+    restore_input_files_effective: "{{ restore_input_files | default([]) }}"
+
+- name: Restore input files from backup (overwrite target)
+  ansible.builtin.include_tasks: restore_single_input_file.yml
+  loop: "{{ restore_input_files_effective }}"
+  loop_control:
+    loop_var: restore_item
+  when: (restore_input_files_effective | length) > 0
diff --git a/upgrade/roles/import_input_parameters/tasks/restore_pxe_mapping_file.yml b/upgrade/roles/import_input_parameters/tasks/restore_pxe_mapping_file.yml
deleted file mode 100644
index f468359305..0000000000
--- a/upgrade/roles/import_input_parameters/tasks/restore_pxe_mapping_file.yml
+++ /dev/null
@@ -1,49 +0,0 @@
-# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-
-- name: Validate backup_location is provided
-  ansible.builtin.fail:
-    msg: "backup_location must be provided to restore pxe_mapping_file.csv"
-  when: backup_location is not defined or (backup_location | string | trim) == ""
-
-- name: Ensure backup directory exists
-  ansible.builtin.file:
-    path: "{{ backup_location }}"
-    state: directory
-    mode: '0755'
-
-- name: Check if backup pxe_mapping_file.csv exists
-  ansible.builtin.stat:
-    path: "{{ backup_location }}/pxe_mapping_file.csv"
-  register: backup_pxe_mapping_stat
-
-- name: Fail if backup pxe_mapping_file.csv is not present
-  ansible.builtin.fail:
-    msg: "Backup pxe_mapping_file.csv is not present at {{ backup_location }}/pxe_mapping_file.csv"
-  when: not backup_pxe_mapping_stat.stat.exists
-
-- name: Overwrite pxe_mapping_file.csv in input directory from backup
-  ansible.builtin.copy:
-    src: "{{ backup_location }}/pxe_mapping_file.csv"
-    dest: "{{ omnia_input_dir }}/pxe_mapping_file.csv"
-    mode: '0644'
-    remote_src: true
-
-- name: Display pxe_mapping_file.csv restore summary
-  ansible.builtin.debug:
-    msg: |
-      pxe_mapping_file.csv restored from backup.
-      Backup preserved at: {{ backup_location }}/pxe_mapping_file.csv
-      Restored to: {{ omnia_input_dir }}/pxe_mapping_file.csv
diff --git a/upgrade/roles/import_input_parameters/tasks/restore_single_input_file.yml b/upgrade/roles/import_input_parameters/tasks/restore_single_input_file.yml
new file mode 100644
index 0000000000..f55d14bd3e
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/restore_single_input_file.yml
@@ -0,0 +1,59 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Restore one file (restore_item.name) from backup_location into input_project_dir,
+# then run restore_item.validate_cmd when one is given.
+- name: Validate restore item fields
+  ansible.builtin.fail:
+    msg: "{{ msg_restore_item_name_missing }}"
+  when: restore_item.name is not defined or (restore_item.name | string | trim) == ""
+
+- name: Check if backup file exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/{{ restore_item.name }}"
+  register: restore_backup_stat
+
+- name: Fail if backup file is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_backup_file_missing }}"
+  when: not restore_backup_stat.stat.exists
+
+- name: Overwrite input file from backup
+  ansible.builtin.copy:
+    src: "{{ backup_location }}/{{ restore_item.name }}"
+    dest: "{{ input_project_dir }}/{{ restore_item.name }}"
+    mode: "{{ restore_item.mode | default(default_file_mode) }}"
+    remote_src: true
+
+# A non-zero exit is tolerated here so the next task can report it with
+# msg_validation_failed; otherwise this task would fail first and hide it.
+- name: Validate restored file (optional)
+  ansible.builtin.command:
+    cmd: "{{ restore_item.validate_cmd }}"
+  register: restore_validate
+  changed_when: false
+  failed_when: false
+  when: restore_item.validate_cmd is defined and (restore_item.validate_cmd | string | trim) != ""
+
+- name: Fail if restored file validation fails
+  ansible.builtin.fail:
+    msg: "{{ msg_validation_failed }}"
+  when:
+    - restore_item.validate_cmd is defined and (restore_item.validate_cmd | string | trim) != ""
+    - restore_validate.rc != 0
+
+- name: Display restore summary
+  ansible.builtin.debug:
+    msg: "{{ msg_restore_summary }}"
diff --git a/upgrade/roles/import_input_parameters/tasks/restore_software_config.yml b/upgrade/roles/import_input_parameters/tasks/restore_software_config.yml
deleted file mode 100644
index 9891023702..0000000000
--- a/upgrade/roles/import_input_parameters/tasks/restore_software_config.yml
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-
-- name: Validate backup_location is provided
-  ansible.builtin.fail:
-    msg: "backup_location must be provided to restore software_config.json"
-  when: backup_location is not defined or (backup_location | string | trim) == ""
-
-- name: Ensure backup directory exists
-  ansible.builtin.file:
-    path: "{{ backup_location }}"
-    state: directory
-    mode: '0755'
-
-- name: Check if backup software_config.json exists
-  ansible.builtin.stat:
-    path: "{{ backup_location }}/software_config.json"
-  register: backup_software_config_stat
-
-- name: Fail if backup software_config.json is not present
-  ansible.builtin.fail:
-    msg: "Backup software_config.json is not present at {{ backup_location }}/software_config.json"
-  when: not backup_software_config_stat.stat.exists
-
-- name: Overwrite software_config.json in input directory from backup
-  ansible.builtin.copy:
-    src: "{{ backup_location }}/software_config.json"
-    dest: "{{ omnia_input_dir }}/software_config.json"
-    mode: '0644'
-    remote_src: true
-
-- name: Validate JSON syntax of software_config.json
-  ansible.builtin.command:
-    cmd: python3 -m json.tool "{{ omnia_input_dir }}/software_config.json"
-  register: software_config_json_validation
-  changed_when: false
-
-- name: Fail if software_config.json JSON validation fails
-  ansible.builtin.fail:
-    msg: "JSON validation failed after restoring software_config.json"
-  when: software_config_json_validation.rc != 0
-
-- name: Display software_config.json restore summary
-  ansible.builtin.debug:
-    msg: |
-      software_config.json restored from backup.
-      Backup preserved at: {{ backup_location }}/software_config.json
-      Restored to: {{ omnia_input_dir }}/software_config.json
diff --git a/upgrade/roles/import_input_parameters/tasks/transform_high_availability_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_high_availability_config.yml
new file mode 100644
index 0000000000..494dfda41a
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/transform_high_availability_config.yml
@@ -0,0 +1,114 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Check if backup high_availability_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/high_availability_config.yml"
+  register: backup_ha_config_stat
+
+- name: Fail if backup high_availability_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_backup_ha_config_missing }}"
+  when: not backup_ha_config_stat.stat.exists
+
+- name: Check if high_availability_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ input_project_dir }}/high_availability_config.yml"
+  register: ha_config_stat
+
+- name: Fail if high_availability_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_ha_config_missing }}"
+  when: not ha_config_stat.stat.exists
+
+- name: Read backup high_availability_config.yml (source of truth)
+  ansible.builtin.slurp:
+    src: "{{ backup_location }}/high_availability_config.yml"
+  register: backup_ha_config_slurp
+
+- name: Parse backup high_availability_config.yml
+  ansible.builtin.set_fact:
+    backup_ha_config: "{{ backup_ha_config_slurp.content | b64decode | from_yaml }}"
+
+- name: Normalize service_k8s_cluster_ha to a list
+  ansible.builtin.set_fact:
+    ha_service_k8s_cluster_ha: >-
+      {{
+        (
+          [backup_ha_config.service_k8s_cluster_ha]
+          if (backup_ha_config.service_k8s_cluster_ha is mapping)
+          else (backup_ha_config.service_k8s_cluster_ha | default([]))
+        )
+      }}
+
+- name: Collect HA entries missing virtual_ip_address
+  ansible.builtin.set_fact:
+    ha_entries_missing_vip: >-
+      {{
+        (ha_service_k8s_cluster_ha | default([]))
+        | select('mapping')
+        | selectattr('virtual_ip_address', 'undefined')
+        | map(attribute='cluster_name', default='<unnamed>')
+        | list
+      }}
+
+- name: Collect HA entries with empty virtual_ip_address
+  ansible.builtin.set_fact:
+    ha_entries_empty_vip: >-
+      {{
+        (ha_service_k8s_cluster_ha | default([]) | select('mapping')
+          | selectattr('virtual_ip_address', 'defined') | rejectattr('virtual_ip_address', 'string')
+          | map(attribute='cluster_name', default='<unnamed>') | list)
+        + (ha_service_k8s_cluster_ha | default([]) | select('mapping')
+          | selectattr('virtual_ip_address', 'string') | selectattr('virtual_ip_address', 'match', '^\\s*$')
+          | map(attribute='cluster_name', default='<unnamed>') | list)
+      }}
+
+- name: Fail if virtual_ip_address is missing
+  ansible.builtin.fail:
+    msg: "{{ msg_ha_virtual_ip_missing }}"
+  when:
+    - (ha_service_k8s_cluster_ha | default([]) | length) == 0
+      or ((ha_entries_missing_vip | default([]) | length) > 0)
+      or ((ha_entries_empty_vip | default([]) | length) > 0)
+
+- name: Write high_availability_config.yml in Omnia 2.1 format
+  ansible.builtin.template:
+    src: high_availability_config.j2
+    dest: "{{ input_project_dir }}/high_availability_config.yml"
+    mode: "{{ default_file_mode }}"
+  vars:
+    ha_service_k8s_cluster_ha: "{{ ha_service_k8s_cluster_ha }}"
+
+- name: Validate YAML syntax of transformed high_availability_config.yml
+  ansible.builtin.command:
+    cmd: python3 -c "import sys, yaml; yaml.safe_load(open(sys.argv[1], 'r'))" "{{ input_project_dir }}/high_availability_config.yml"
+  register: ha_yaml_validation
+  changed_when: false
+  failed_when: false
+
+- name: Fail if YAML validation fails
+  ansible.builtin.fail:
+    msg: "{{ msg_yaml_validation_failed }}"
+  when: ha_yaml_validation.rc != 0
+
+- name: Display backup path (no-op when skipped)
+  ansible.builtin.debug:
+    msg: "{{ msg_using_backup_ha_config }}"
+  when: true
+
+- name: Display transformation summary
+  ansible.builtin.debug:
+    msg: "{{ msg_ha_config_transform_summary }}"
diff --git a/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml b/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml
index 051bbfb13c..d4b3a92e29 100644
--- a/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml
+++ b/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml
@@ -13,17 +13,6 @@
 # limitations under the License.
 ---
 
-- name: Validate backup_location is provided
-  ansible.builtin.fail:
-    msg: "backup_location must be provided to run network_spec.yml upgrade"
-  when: backup_location is not defined or (backup_location | string | trim) == ""
-
-- name: Ensure backup directory exists
-  ansible.builtin.file:
-    path: "{{ backup_location }}"
-    state: directory
-    mode: '0755'
-
 - name: Check if backup network_spec.yml exists
   ansible.builtin.stat:
     path: "{{ backup_location }}/network_spec.yml"
@@ -31,55 +20,27 @@
 
 - name: Fail if backup network_spec.yml is not present
   ansible.builtin.fail:
-    msg: "Backup network_spec.yml is not present at {{ backup_location }}/network_spec.yml"
+    msg: "{{ msg_backup_network_spec_missing }}"
   when: not backup_network_spec_stat.stat.exists
 
 - name: Check if network_spec.yml exists
   ansible.builtin.stat:
-    path: "{{ omnia_input_dir }}/network_spec.yml"
+    path: "{{ input_project_dir }}/network_spec.yml"
   register: network_spec_stat
 
 - name: Fail if network_spec.yml is not present
   ansible.builtin.fail:
-    msg: "network_spec.yml is not present at {{ omnia_input_dir }}/network_spec.yml"
+    msg: "{{ msg_network_spec_missing }}"
   when: not network_spec_stat.stat.exists
 
-- name: Read existing network_spec.yml
-  ansible.builtin.slurp:
-    src: "{{ omnia_input_dir }}/network_spec.yml"
-  register: network_spec_slurp
-  when: network_spec_stat.stat.exists
-
-- name: Parse existing network_spec.yml
-  ansible.builtin.set_fact:
-    network_spec_existing: "{{ network_spec_slurp.content | b64decode | from_yaml }}"
-  when: network_spec_stat.stat.exists
-
-- name: Check if network_spec.yml is already in Omnia 2.1 format
-  ansible.builtin.set_fact:
-    network_spec_already_21: >-
-      {{
-        (network_spec_existing.schema_version | default('') | string) == '2.1'
-        and (network_spec_existing.Networks is defined)
-        and ((network_spec_existing.Networks | select('mapping') | selectattr('ib_network', 'defined') | list | length) > 0)
-      }}
-  when: network_spec_stat.stat.exists
-
-- name: Skip transformation when network_spec.yml is already in 2.1 format
-  ansible.builtin.debug:
-    msg: "network_spec.yml is already in Omnia 2.1 format. Skipping transformation."
-  when: network_spec_already_21 | default(false) | bool
-
-- name: Read backup network_spec.yml (Omnia 2.0 source)
+- name: Read backup network_spec.yml (source of truth)
   ansible.builtin.slurp:
     src: "{{ backup_location }}/network_spec.yml"
   register: backup_network_spec_slurp
-  when: not (network_spec_already_21 | default(false) | bool)
 
 - name: Parse backup network_spec.yml
   ansible.builtin.set_fact:
     backup_network_spec: "{{ backup_network_spec_slurp.content | b64decode | from_yaml }}"
-  when: not (network_spec_already_21 | default(false) | bool)
 
 - name: Extract admin_network and ib_network from backup file
   ansible.builtin.set_fact:
@@ -114,79 +75,69 @@
         )
       }}
   when:
-    - not (network_spec_already_21 | default(false) | bool)
+    - true
 
 - name: Render network_spec.yml in Omnia 2.1 format
   ansible.builtin.template:
     src: network_spec.j2
-    dest: "{{ omnia_input_dir }}/network_spec.yml"
-    mode: '0644'
+    dest: "{{ input_project_dir }}/network_spec.yml"
+    mode: "{{ default_file_mode }}"
   vars:
     admin_network_netmask_bits: "{{ admin_network.netmask_bits | default('24') }}"
-  when: not (network_spec_already_21 | default(false) | bool)
+  when: true
 
 - name: Read transformed network_spec.yml
   ansible.builtin.slurp:
-    src: "{{ omnia_input_dir }}/network_spec.yml"
+    src: "{{ input_project_dir }}/network_spec.yml"
   register: network_spec_21_slurp
-  when: not (network_spec_already_21 | default(false) | bool)
+  when: true
 
 - name: Parse transformed network_spec.yml
   ansible.builtin.set_fact:
     network_spec_21: "{{ network_spec_21_slurp.content | b64decode | from_yaml }}"
-  when: not (network_spec_already_21 | default(false) | bool)
+  when: true
 
 - name: Validate YAML syntax of transformed network_spec.yml
   ansible.builtin.command:
-    cmd: python3 -c "import yaml; yaml.safe_load(open('{{ omnia_input_dir }}/network_spec.yml','r'))"
+    cmd: python3 -c "import yaml; yaml.safe_load(open('{{ input_project_dir }}/network_spec.yml','r'))"
   register: network_spec_yaml_validation
   changed_when: false
-  when: not (network_spec_already_21 | default(false) | bool)
+  when: true
 
 - name: Fail if YAML validation fails
   ansible.builtin.fail:
-    msg: "YAML validation failed after transforming network_spec.yml"
+    msg: "{{ msg_yaml_validation_failed }}"
   when:
-    - not (network_spec_already_21 | default(false) | bool)
     - network_spec_yaml_validation.rc != 0
 
 - name: Ensure ib_network.netmask_bits matches admin_network.netmask_bits
   ansible.builtin.fail:
-    msg: "ib_network.netmask_bits must match admin_network.netmask_bits in Omnia 2.1"
+    msg: "{{ msg_ib_netmask_mismatch }}"
   when:
-    - not (network_spec_already_21 | default(false) | bool)
     - >-
       (ib_network.netmask_bits | default(admin_network.netmask_bits | default('24')) | string)
       != (admin_network.netmask_bits | default('24') | string)
 
 - name: Display backup path (no-op when skipped)
   ansible.builtin.debug:
-    msg: "Using backup as input source: {{ backup_location }}/network_spec.yml (backup is not modified)"
-  when: not (network_spec_already_21 | default(false) | bool)
+    msg: "{{ msg_using_backup_network_spec }}"
+  when: true
 
 - name: Validate mandatory ib_network is present in transformed output
   ansible.builtin.fail:
-    msg: "ib_network is mandatory in Omnia 2.1 network_spec.yml"
+    msg: "{{ msg_ib_network_missing }}"
   when:
-    - not (network_spec_already_21 | default(false) | bool)
     - >-
       (network_spec_21.Networks is not defined)
       or ((network_spec_21.Networks | select('mapping') | selectattr('ib_network', 'defined') | list | length) == 0)
 
 - name: Validate mandatory ib_network.subnet is present in transformed output
   ansible.builtin.fail:
-    msg: "ib_network.subnet is mandatory in Omnia 2.1 network_spec.yml"
+    msg: "{{ msg_ib_subnet_missing }}"
   when:
-    - not (network_spec_already_21 | default(false) | bool)
     - >-
       ((network_spec_21.Networks | select('mapping') | selectattr('ib_network', 'defined') | map(attribute='ib_network') | first | default({})).subnet | default('') | string | trim) == ''
 
 - name: Display transformation summary
   ansible.builtin.debug:
-    msg: |
-      network_spec.yml upgraded to Omnia 2.1 format.
-      Backup preserved at: {{ backup_location }}/network_spec.yml
-      Key changes:
-      - Added mandatory ib_network section
-      - primary_oim_bmc_ip treated as optional
-      - ib_network.netmask_bits aligned with admin_network.netmask_bits
+    msg: "{{ msg_network_spec_transform_summary }}"
diff --git a/upgrade/roles/import_input_parameters/templates/high_availability_config.j2 b/upgrade/roles/import_input_parameters/templates/high_availability_config.j2
new file mode 100644
index 0000000000..b116d962fe
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/templates/high_availability_config.j2
@@ -0,0 +1,27 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+# ***********************************************************************
+# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE.
+# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE.
+# ***********************************************************************
+
+# ***********************************************************************
+# High Availability (HA) Configuration for Kubernetes (K8s) Service Node(List)
+# - cluster_name: <Mandatory> Must match one of the values defined in omnia_config.yml where deployment is set to true.
+# - enable_k8s_ha: <Mandatory> Indicates whether to enable HA for the Kubernetes (K8s) service node. Set to 'true' to enable, 'false' to disable.
+# - virtual_ip_address: <Mandatory> The virtual IP address for the K8s service node setup.
+# ***********************************************************************
+
+{{ {'service_k8s_cluster_ha': ha_service_k8s_cluster_ha} | to_nice_yaml(indent=2) }}
diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml
index c44a5bbb87..d7281bdc0b 100644
--- a/upgrade/roles/import_input_parameters/vars/main.yml
+++ b/upgrade/roles/import_input_parameters/vars/main.yml
@@ -13,6 +13,54 @@
 # limitations under the License.
 ---
 
-omnia_input_dir: /opt/omnia/input/project_default
+backup_location: /opt/omnia/backups/upgrade
 
-backup_location: /opt/omnia/backups/upgrade
\ No newline at end of file
+backup_dir_mode: '0755'
+default_file_mode: '0644'
+
+msg_backup_location_missing: "backup_location must be provided"
+msg_restore_item_name_missing: "restore_item must define 'name'"
+msg_validation_failed: "Validation failed for {{ restore_item.name }}"
+msg_backup_file_missing: "Backup file missing: {{ restore_item.name }}"
+msg_backup_network_spec_missing: "Backup network_spec.yml missing"
+msg_network_spec_missing: "network_spec.yml missing"
+msg_network_spec_already_21: "network_spec.yml already in 2.1 format - overwriting"
+msg_backup_ha_config_missing: "Backup high_availability_config.yml missing"
+msg_ha_config_missing: "high_availability_config.yml missing"
+msg_ha_config_already_21: "high_availability_config.yml already in 2.1 format - overwriting"
+msg_ha_virtual_ip_missing: "service_k8s_cluster_ha.virtual_ip_address is mandatory"
+msg_yaml_validation_failed: "YAML validation failed"
+
+msg_ib_netmask_mismatch: "ib_network.netmask_bits must match admin_network.netmask_bits"
+msg_ib_network_missing: "ib_network is mandatory"
+msg_ib_subnet_missing: "ib_network.subnet is mandatory"
+msg_using_backup_network_spec: "Using backup network_spec.yml (backup not modified)"
+msg_using_backup_ha_config: "Using backup high_availability_config.yml (backup not modified)"
+
+msg_restore_summary: |
+  {{ restore_item.name }} restored from backup.
+  Backup: {{ backup_location }}/{{ restore_item.name }}
+  Target: {{ input_project_dir }}/{{ restore_item.name }}
+
+msg_network_spec_transform_summary: |
+  network_spec.yml upgraded to 2.1 format.
+  Backup preserved at: {{ backup_location }}/network_spec.yml
+  Changes:
+  - Added mandatory ib_network
+  - Made primary_oim_bmc_ip optional
+  - Aligned ib_network.netmask_bits with admin_network.netmask_bits
+
+msg_ha_config_transform_summary: |
+  high_availability_config.yml upgraded to 2.1 format.
+  Backup preserved at: {{ backup_location }}/high_availability_config.yml
+  Changes:
+  - Ensured service_k8s_cluster_ha is a list
+  - Ensured virtual_ip_address is present
+
+restore_input_files:
+  - name: software_config.json
+    mode: '0644'
+    validate_cmd: "python3 -m json.tool '{{ input_project_dir }}/software_config.json'"
+  - name: pxe_mapping_file.csv
+    mode: '0644'
+    validate_cmd: ""
\ No newline at end of file
diff --git a/upgrade/upgrade_oim.yml b/upgrade/upgrade_oim.yml
index 3e91f1a479..aa6e6fb5fc 100644
--- a/upgrade/upgrade_oim.yml
+++ b/upgrade/upgrade_oim.yml
@@ -17,4 +17,5 @@
   hosts: localhost
   connection: local
   roles:
+    - role: ../utils/roles/include_input_dir
     - role: upgrade_oim

From c6866538c83789353087abdce749c19d188c3076 Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Fri, 6 Feb 2026 15:45:11 +0530
Subject: [PATCH 045/172] Update main.yml

---
 .../import_input_parameters/vars/main.yml     | 32 ++++++++++++++++---
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml
index d7281bdc0b..93b328e279 100644
--- a/upgrade/roles/import_input_parameters/vars/main.yml
+++ b/upgrade/roles/import_input_parameters/vars/main.yml
@@ -18,30 +18,38 @@ backup_location: /opt/omnia/backups/upgrade
 backup_dir_mode: '0755'
 default_file_mode: '0644'
 
+# Precheck backup location messages
 msg_backup_location_missing: "backup_location must be provided"
+
+# Restore input files messages
 msg_restore_item_name_missing: "restore_item must define 'name'"
 msg_validation_failed: "Validation failed for {{ restore_item.name }}"
 msg_backup_file_missing: "Backup file missing: {{ restore_item.name }}"
+
+# Network spec transformation messages
 msg_backup_network_spec_missing: "Backup network_spec.yml missing"
 msg_network_spec_missing: "network_spec.yml missing"
 msg_network_spec_already_21: "network_spec.yml already in 2.1 format - overwriting"
-msg_backup_ha_config_missing: "Backup high_availability_config.yml missing"
-msg_ha_config_missing: "high_availability_config.yml missing"
-msg_ha_config_already_21: "high_availability_config.yml already in 2.1 format - overwriting"
-msg_ha_virtual_ip_missing: "service_k8s_cluster_ha.virtual_ip_address is mandatory"
 msg_yaml_validation_failed: "YAML validation failed"
-
 msg_ib_netmask_mismatch: "ib_network.netmask_bits must match admin_network.netmask_bits"
 msg_ib_network_missing: "ib_network is mandatory"
 msg_ib_subnet_missing: "ib_network.subnet is mandatory"
 msg_using_backup_network_spec: "Using backup network_spec.yml (backup not modified)"
+
+# High availability config transformation messages
+msg_backup_ha_config_missing: "Backup high_availability_config.yml missing"
+msg_ha_config_missing: "high_availability_config.yml missing"
+msg_ha_config_already_21: "high_availability_config.yml already in 2.1 format - overwriting"
+msg_ha_virtual_ip_missing: "service_k8s_cluster_ha.virtual_ip_address is mandatory"
 msg_using_backup_ha_config: "Using backup high_availability_config.yml (backup not modified)"
 
+### Restore summary messages
 msg_restore_summary: |
   {{ restore_item.name }} restored from backup.
   Backup: {{ backup_location }}/{{ restore_item.name }}
   Target: {{ input_project_dir }}/{{ restore_item.name }}
 
+# Restore summary message for network spec transformation
 msg_network_spec_transform_summary: |
   network_spec.yml upgraded to 2.1 format.
   Backup preserved at: {{ backup_location }}/network_spec.yml
@@ -50,6 +58,7 @@ msg_network_spec_transform_summary: |
   - Made primary_oim_bmc_ip optional
   - Aligned ib_network.netmask_bits with admin_network.netmask_bits
 
+# Restore summary message for high availability config transformation
 msg_ha_config_transform_summary: |
   high_availability_config.yml upgraded to 2.1 format.
   Backup preserved at: {{ backup_location }}/high_availability_config.yml
@@ -57,6 +66,19 @@ msg_ha_config_transform_summary: |
   - Ensured service_k8s_cluster_ha is a list
   - Ensured virtual_ip_address is present
 
+# === Input files to restore from backup ===
+# Add input files here that should be copied from backup_location to input_project_dir
+# Each entry should have:
+# - name: filename (required)
+# - mode: file permissions (optional, defaults to default_file_mode)
+# - validate_cmd: validation command (optional, runs after restore)
+#
+# Examples of files to add:
+# - Static configuration files that don't need transformation
+# - Files that are the same format in 2.0 and 2.1
+# - Files where you want to preserve the backup values exactly
+#
+# DO NOT add files that require transformation (network_spec.yml, high_availability_config.yml)
 restore_input_files:
   - name: software_config.json
     mode: '0644'

From 2e3f4b4f377b167deb2e5409a3320ddfd8b617a8 Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Fri, 6 Feb 2026 16:14:59 +0530
Subject: [PATCH 046/172] Update network_spec.j2

---
 upgrade/roles/import_input_parameters/templates/network_spec.j2 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/upgrade/roles/import_input_parameters/templates/network_spec.j2 b/upgrade/roles/import_input_parameters/templates/network_spec.j2
index 773a11446c..d9e41ba469 100644
--- a/upgrade/roles/import_input_parameters/templates/network_spec.j2
+++ b/upgrade/roles/import_input_parameters/templates/network_spec.j2
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.

From 3c4c28636497d0dcc4250cd0f924ff6976538291 Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Fri, 6 Feb 2026 16:24:13 +0530
Subject: [PATCH 047/172] Update main.yml

---
 upgrade/roles/import_input_parameters/vars/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml
index 93b328e279..a87c855751 100644
--- a/upgrade/roles/import_input_parameters/vars/main.yml
+++ b/upgrade/roles/import_input_parameters/vars/main.yml
@@ -13,7 +13,7 @@
 # limitations under the License.
 ---
 
-backup_location: /opt/omnia/backups/upgrade
+backup_location: /opt/omnia/backups/upgrade/input
 
 backup_dir_mode: '0755'
 default_file_mode: '0644'

From e13132034aeb18eb55a27794cd93ffd7124171d0 Mon Sep 17 00:00:00 2001
From: Nagachandan-P <Nagachandan.p@dell.com>
Date: Fri, 6 Feb 2026 11:07:32 +0000
Subject: [PATCH 048/172] slurm backup and rollback feature

---
 utils/roles/slurm_cleanup/defaults/main.yml   |   5 +
 utils/roles/slurm_cleanup/tasks/main.yml      |  73 +++
 .../slurm_config_backup/defaults/main.yml     |   4 +
 .../roles/slurm_config_backup/tasks/main.yml  | 116 +++++
 .../slurm_config_rollback/defaults/main.yml   |   5 +
 .../slurm_config_rollback/tasks/main.yml      | 427 ++++++++++++++++++
 utils/slurm_config_util.yml                   |  26 ++
 7 files changed, 656 insertions(+)
 create mode 100644 utils/roles/slurm_cleanup/defaults/main.yml
 create mode 100644 utils/roles/slurm_cleanup/tasks/main.yml
 create mode 100644 utils/roles/slurm_config_backup/defaults/main.yml
 create mode 100644 utils/roles/slurm_config_backup/tasks/main.yml
 create mode 100644 utils/roles/slurm_config_rollback/defaults/main.yml
 create mode 100644 utils/roles/slurm_config_rollback/tasks/main.yml
 create mode 100644 utils/slurm_config_util.yml

diff --git a/utils/roles/slurm_cleanup/defaults/main.yml b/utils/roles/slurm_cleanup/defaults/main.yml
new file mode 100644
index 0000000000..f54396449f
--- /dev/null
+++ b/utils/roles/slurm_cleanup/defaults/main.yml
@@ -0,0 +1,5 @@
+---
+
+slurm_share_dir_name: slurm
+slurm_cleanup_pre_backup_default: 'y'
+slurm_cleanup_confirm_token: 'YES'
diff --git a/utils/roles/slurm_cleanup/tasks/main.yml b/utils/roles/slurm_cleanup/tasks/main.yml
new file mode 100644
index 0000000000..5c59cae2d0
--- /dev/null
+++ b/utils/roles/slurm_cleanup/tasks/main.yml
@@ -0,0 +1,73 @@
+---
+
+- name: Include variable file omnia_config.yml
+  ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml"
+  tags: slurm_cleanup
+
+- name: Include storage vars
+  ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/storage_config.yml"
+  tags: slurm_cleanup
+
+- name: Set facts for slurm
+  ansible.builtin.set_fact:
+    nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}"
+  tags: slurm_cleanup
+
+- name: Read the slurm mount point
+  ansible.builtin.set_fact:
+    share_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).client_share_path }}"
+  tags: slurm_cleanup
+
+- name: Set slurm_config_path
+  ansible.builtin.set_fact:
+    slurm_config_path: "{{ share_path }}/{{ slurm_share_dir_name }}"
+  tags: slurm_cleanup
+
+- name: Prompt for pre-cleanup backup
+  ansible.builtin.pause:
+    prompt: "Before cleanup, take a config backup? (y/n)"
+  register: pre_cleanup_backup
+  tags: slurm_cleanup
+
+- name: Set pre-cleanup backup choice
+  ansible.builtin.set_fact:
+    pre_cleanup_backup_choice: "{{ pre_cleanup_backup.user_input | default('') | trim | lower }}"
+  tags: slurm_cleanup
+
+- name: Fail if pre-cleanup backup choice is empty
+  ansible.builtin.fail:
+    msg: "No input provided for pre-cleanup backup prompt. Cleanup aborted."
+  when: pre_cleanup_backup_choice | length == 0
+  tags: slurm_cleanup
+
+- name: Validate pre-cleanup backup choice
+  ansible.builtin.fail:
+    msg: "Invalid input '{{ pre_cleanup_backup.user_input | default('') }}'. Enter 'y' or 'n'."
+  when: pre_cleanup_backup_choice not in ['y', 'yes', 'n', 'no']
+  tags: slurm_cleanup
+
+- name: Run config backup before cleanup
+  ansible.builtin.include_role:
+    name: slurm_config_backup
+    apply:
+      tags: slurm_cleanup
+  when: pre_cleanup_backup_choice in ['y', 'yes']
+  tags: slurm_cleanup
+
+- name: Confirm cleanup
+  ansible.builtin.pause:
+    prompt: "This will delete {{ slurm_config_path }}. Type {{ slurm_cleanup_confirm_token }} to continue"
+  register: cleanup_confirm
+  tags: slurm_cleanup
+
+- name: Fail if cleanup not confirmed
+  ansible.builtin.fail:
+    msg: "Cleanup aborted"
+  when: cleanup_confirm.user_input != slurm_cleanup_confirm_token
+  tags: slurm_cleanup
+
+- name: Delete slurm share directory
+  ansible.builtin.file:
+    path: "{{ slurm_config_path }}"
+    state: absent
+  tags: slurm_cleanup
diff --git a/utils/roles/slurm_config_backup/defaults/main.yml b/utils/roles/slurm_config_backup/defaults/main.yml
new file mode 100644
index 0000000000..b631a205d0
--- /dev/null
+++ b/utils/roles/slurm_config_backup/defaults/main.yml
@@ -0,0 +1,4 @@
+---
+
+slurm_share_dir_name: slurm
+slurm_backups_dir_name: slurm_backups
diff --git a/utils/roles/slurm_config_backup/tasks/main.yml b/utils/roles/slurm_config_backup/tasks/main.yml
new file mode 100644
index 0000000000..4871ab705b
--- /dev/null
+++ b/utils/roles/slurm_config_backup/tasks/main.yml
@@ -0,0 +1,116 @@
+---
+
+- name: Include variable file omnia_config.yml
+  ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml"
+
+- name: Include storage vars
+  ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/storage_config.yml"
+
+- name: Set facts for slurm
+  ansible.builtin.set_fact:
+    nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}"
+
+- name: Read the slurm mount point
+  ansible.builtin.set_fact:
+    share_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).client_share_path }}"
+
+- name: Display resolved slurm share path
+  ansible.builtin.debug:
+    msg: "Resolved share_path={{ share_path }} (nfs_storage_name={{ nfs_storage_name }})"
+
+- name: Slurp remote YAML file
+  ansible.builtin.slurp:
+    src: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir/nodes/nodes.yaml"
+  register: slurped_yaml
+
+- name: Parse YAML into vars
+  ansible.builtin.set_fact:
+    node_yaml: "{{ slurped_yaml.content | b64decode | from_yaml }}"
+
+- name: Read the node name group
+  ansible.builtin.set_fact:
+    name_group_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='group') }}"
+
+- name: Group the functional_groups
+  ansible.builtin.set_fact:
+    tmp_grouped_nodes: "{{ name_group_map | dict2items | groupby('value') }}"
+
+- name: Re-organize the groups
+  ansible.builtin.set_fact:
+    grouped_nodes: "{{ grouped_nodes | default({}) | combine({item[0]: ((item[1] | items2dict).keys() | list)}) }}"
+  loop: "{{ tmp_grouped_nodes }}"
+
+- name: Assign slurm lists
+  ansible.builtin.set_fact:
+    ctld_list: "{{ grouped_nodes | dict2items
+                   | selectattr('key', 'match', '^' ~ 'slurm_control_node_')
+                   | map(attribute='value') | list | flatten }}"
+
+- name: Fail if Slurm controller list is empty
+  ansible.builtin.fail:
+    msg: "Slurm controller functional group is missing from PXE mapping file. Please update the file and rerun."
+  when: ctld_list | length == 0
+
+- name: Set slurm_config_path
+  ansible.builtin.set_fact:
+    slurm_config_path: "{{ share_path }}/{{ slurm_share_dir_name }}"
+
+- name: Display resolved slurm config path
+  ansible.builtin.debug:
+    msg: "Resolved slurm_config_path={{ slurm_config_path }}"
+
+- name: Prompt for backup base name
+  ansible.builtin.pause:
+    prompt: "Enter backup base name (leave empty for timestamp-only)"
+  register: backup_base_name_input
+
+- name: Set backup id
+  ansible.builtin.set_fact:
+    backup_timestamp: "{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}"
+    backup_base_name: "{{ backup_base_name_input.user_input | default('') | trim | regex_replace('[^A-Za-z0-9._-]+', '_') }}"
+
+- name: Set backup directory
+  ansible.builtin.set_fact:
+    slurm_backups_root: "{{ share_path }}/{{ slurm_backups_dir_name }}"
+    backup_id: "{{ (backup_base_name | length > 0) | ternary(backup_base_name ~ '_' ~ backup_timestamp, backup_timestamp) }}"
+    backup_dir: "{{ share_path }}/{{ slurm_backups_dir_name }}/{{ (backup_base_name | length > 0) | ternary(backup_base_name ~ '_' ~ backup_timestamp, backup_timestamp) }}"
+
+- name: Ensure slurm backups root exists
+  ansible.builtin.file:
+    path: "{{ slurm_backups_root }}"
+    state: directory
+    mode: '0755'
+
+- name: Display slurm backups root
+  ansible.builtin.debug:
+    msg: "Resolved slurm_backups_root={{ slurm_backups_root }}"
+
+- name: Create backup directory
+  ansible.builtin.file:
+    path: "{{ backup_dir }}"
+    state: directory
+    mode: '0755'
+
+- name: Create backup config directories
+  ansible.builtin.file:
+    path: "{{ backup_dir }}/{{ ctld_list[0] }}/{{ item }}"
+    state: directory
+    mode: '0755'
+  loop:
+    - etc/slurm
+    - etc/munge
+    - etc/my.cnf.d
+
+- name: Backup controller config directories
+  ansible.builtin.command:
+    cmd: cp -a "{{ slurm_config_path }}/{{ ctld_list[0] }}/{{ item }}/." "{{ backup_dir }}/{{ ctld_list[0] }}/{{ item }}/"
+    removes: "{{ slurm_config_path }}/{{ ctld_list[0] }}/{{ item }}"
+  loop:
+    - etc/slurm
+    - etc/munge
+    - etc/my.cnf.d
+  changed_when: true
+
+- name: Display backup location
+  ansible.builtin.debug:
+    msg: "Slurm config backup created at: {{ backup_dir }}/{{ ctld_list[0] }}"
diff --git a/utils/roles/slurm_config_rollback/defaults/main.yml b/utils/roles/slurm_config_rollback/defaults/main.yml
new file mode 100644
index 0000000000..601e25cd18
--- /dev/null
+++ b/utils/roles/slurm_config_rollback/defaults/main.yml
@@ -0,0 +1,5 @@
+---
+
+slurm_share_dir_name: slurm
+slurm_backups_dir_name: slurm_backups
+slurm_rollback_backup_list_limit_default: 20
diff --git a/utils/roles/slurm_config_rollback/tasks/main.yml b/utils/roles/slurm_config_rollback/tasks/main.yml
new file mode 100644
index 0000000000..e9822de876
--- /dev/null
+++ b/utils/roles/slurm_config_rollback/tasks/main.yml
@@ -0,0 +1,427 @@
+---
+
+- name: Include variable file omnia_config.yml
+  ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml"
+  tags: config_rollback
+
+- name: Include storage vars
+  ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/storage_config.yml"
+  tags: config_rollback
+
+- name: Set facts for slurm
+  ansible.builtin.set_fact:
+    nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}"
+  tags: config_rollback
+
+- name: Read the slurm mount point
+  ansible.builtin.set_fact:
+    share_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).client_share_path }}"
+  tags: config_rollback
+
+- name: Slurp remote YAML file
+  ansible.builtin.slurp:
+    src: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir/nodes/nodes.yaml"
+  register: slurped_yaml
+  tags: config_rollback
+
+- name: Parse YAML into vars
+  ansible.builtin.set_fact:
+    node_yaml: "{{ slurped_yaml.content | b64decode | from_yaml }}"
+  tags: config_rollback
+
+- name: Get name and IP mapping 1
+  ansible.builtin.set_fact:
+    tmp_ip_name_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='interfaces') }}"
+  tags: config_rollback
+
+- name: Get name and IP mapping 2
+  ansible.builtin.set_fact:
+    ip_name_map: "{{ ip_name_map | default({}) | combine({item.key: item.value[0]['ip_addrs'][0]['ip_addr']}) }}"
+  loop: "{{ tmp_ip_name_map | dict2items }}"
+  tags: config_rollback
+
+- name: Read the node name group
+  ansible.builtin.set_fact:
+    name_group_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='group') }}"
+  tags: config_rollback
+
+- name: Group the functional_groups
+  ansible.builtin.set_fact:
+    tmp_grouped_nodes: "{{ name_group_map | dict2items | groupby('value') }}"
+  tags: config_rollback
+
+- name: Re-organize the groups
+  ansible.builtin.set_fact:
+    grouped_nodes: "{{ grouped_nodes | default({}) | combine({item[0]: ((item[1] | items2dict).keys() | list)}) }}"
+  loop: "{{ tmp_grouped_nodes }}"
+  tags: config_rollback
+
+- name: Assign slurm lists
+  ansible.builtin.set_fact:
+    ctld_list: "{{ grouped_nodes | dict2items
+                   | selectattr('key', 'match', '^' ~ 'slurm_control_node_')
+                   | map(attribute='value') | list | flatten }}"
+  tags: config_rollback
+
+- name: Fail if Slurm controller list is empty
+  ansible.builtin.fail:
+    msg: "Slurm controller functional group is missing from PXE mapping file. Please update the file and rerun."
+  when: ctld_list | length == 0
+  tags: config_rollback
+
+- name: Set slurm controller IP
+  ansible.builtin.set_fact:
+    controller_ip: "{{ ip_name_map[ctld_list | first] }}"
+  when: ctld_list | length > 0
+  tags: config_rollback
+
+- name: Add slurm controller as dynamic host
+  ansible.builtin.add_host:
+    name: slurm_controller
+    ansible_host: "{{ controller_ip }}"
+    ansible_user: root
+    ansible_port: 22
+  when: controller_ip is defined
+  tags: config_rollback
+
+- name: Set slurm paths
+  ansible.builtin.set_fact:
+    slurm_config_path: "{{ share_path }}/{{ slurm_share_dir_name }}"
+    slurm_backups_root: "{{ share_path }}/{{ slurm_backups_dir_name }}"
+  tags: config_rollback
+
+- name: Find available backups
+  ansible.builtin.find:
+    paths: "{{ slurm_backups_root }}"
+    file_type: directory
+    depth: 1
+  register: backup_dirs
+  tags: config_rollback
+
+- name: Fail if no backups found
+  ansible.builtin.fail:
+    msg: "No backups found in {{ slurm_backups_root }}"
+  when: backup_dirs.files | length == 0
+  tags: config_rollback
+
+- name: Set rollback backup list limit
+  ansible.builtin.set_fact:
+    rollback_backup_list_limit_effective: "{{ lookup('vars', 'rollback_backup_list_limit', default=slurm_rollback_backup_list_limit_default) | int }}"
+  tags: config_rollback
+
+- name: Build backup choices
+  ansible.builtin.set_fact:
+    backup_choices: >-
+      {{
+        (
+          backup_dirs.files
+          | sort(attribute='mtime', reverse=true)
+          | map(attribute='path')
+          | list
+        )[:(rollback_backup_list_limit_effective | int)]
+      }}
+    total_backup_count: "{{ backup_dirs.files | length }}"
+  tags: config_rollback
+
+- name: Notify if backup list is truncated
+  ansible.builtin.debug:
+    msg: "Showing latest {{ rollback_backup_list_limit_effective }} backups out of {{ total_backup_count }}. Increase rollback_backup_list_limit to show more."
+  when: (total_backup_count | int) > (rollback_backup_list_limit_effective | int)
+  tags: config_rollback
+
+- name: Display backup list order
+  ansible.builtin.debug:
+    msg: "Backup list is sorted latest first."
+  tags: config_rollback
+
+- name: Show backup choices
+  ansible.builtin.debug:
+    msg: "{{ backup_choice_index + 1 }}: {{ item | basename }}"
+  loop: "{{ backup_choices }}"
+  loop_control:
+    index_var: backup_choice_index
+  tags: config_rollback
+
+- name: Prompt user to select backup number
+  ansible.builtin.pause:
+    prompt: "Enter the backup number to rollback to"
+  register: backup_choice_input
+  tags: config_rollback
+
+- name: Set backup choice index
+  ansible.builtin.set_fact:
+    backup_choice_index: "{{ backup_choice_input.user_input | default('') | trim }}"
+  tags: config_rollback
+
+- name: Fail if backup selection is empty
+  ansible.builtin.fail:
+    msg: "No backup number selected. Rollback aborted."
+  when: backup_choice_index | length == 0
+  tags: config_rollback
+
+- name: Validate backup choice input is within range
+  ansible.builtin.fail:
+    msg: "Invalid selection '{{ backup_choice_input.user_input | default('') }}'. Enter a number between 1 and {{ backup_choices | length }}."
+  when:
+    - (backup_choice_index | int) < 1 or (backup_choice_index | int) > (backup_choices | length)
+  tags: config_rollback
+
+- name: Set selected backup
+  ansible.builtin.set_fact:
+    selected_backup_dir: "{{ backup_choices[(backup_choice_index | int) - 1] }}"
+  tags: config_rollback
+
+- name: Set selected backup controller root
+  ansible.builtin.set_fact:
+    selected_backup_ctld_root: "{{ selected_backup_dir }}/{{ ctld_list[0] }}"
+  tags: config_rollback
+
+- name: Check slurm.conf exists in selected backup
+  ansible.builtin.stat:
+    path: "{{ selected_backup_ctld_root }}/etc/slurm/slurm.conf"
+  register: slurm_conf_stat
+  tags: config_rollback
+
+- name: Fail if slurm.conf missing in backup
+  ansible.builtin.fail:
+    msg: "Selected backup is missing {{ ctld_list[0] }}/etc/slurm/slurm.conf"
+  when: not slurm_conf_stat.stat.exists
+  tags: config_rollback
+
+- name: Check key slurm conf files existence in selected backup
+  ansible.builtin.stat:
+    path: "{{ selected_backup_ctld_root }}/etc/slurm/{{ item }}"
+  loop:
+    - slurmdbd.conf
+    - cgroup.conf
+    - gres.conf
+  register: slurm_conf_files_stats
+  tags: config_rollback
+
+- name: Compute missing slurm conf files in selected backup
+  ansible.builtin.set_fact:
+    missing_slurm_conf_files: "{{ slurm_conf_files_stats.results | rejectattr('stat.exists') | map(attribute='item') | list }}"
+  tags: config_rollback
+
+- name: Warn if slurm conf files are missing in selected backup
+  ansible.builtin.debug:
+    msg: "WARNING: Missing files in selected backup under etc/slurm: {{ missing_slurm_conf_files }}"
+  when: missing_slurm_conf_files | length > 0
+  tags: config_rollback
+
+- name: Prompt to continue if slurm conf files are missing
+  ansible.builtin.pause:
+    prompt: "Some slurm config files are missing in the selected backup. Continue anyway? (y/N)"
+  register: continue_missing_confs
+  when: missing_slurm_conf_files | length > 0
+  tags: config_rollback
+# Abort unless the operator answered y/yes (whitespace and case ignored), like the safety-backup prompt.
+- name: Fail if user does not want to continue with missing slurm conf files
+  ansible.builtin.fail:
+    msg: "Rollback aborted"
+  when:
+    - missing_slurm_conf_files | length > 0
+    - (continue_missing_confs.user_input | default('N') | trim | lower) not in ['y', 'yes']
+  tags: config_rollback
+
+- name: Check munge.key exists in selected backup
+  ansible.builtin.stat:
+    path: "{{ selected_backup_ctld_root }}/etc/munge/munge.key"
+  register: munge_key_stat
+  tags: config_rollback
+
+- name: Warn if munge.key is missing in selected backup
+  ansible.builtin.debug:
+    msg: "WARNING: munge.key is missing in selected backup under etc/munge."
+  when: not munge_key_stat.stat.exists
+  tags: config_rollback
+
+- name: Prompt to continue if munge.key is missing
+  ansible.builtin.pause:
+    prompt: "munge.key is missing in the selected backup. Continue anyway? (y/N)"
+  register: continue_missing_munge_key
+  when: not munge_key_stat.stat.exists
+  tags: config_rollback
+# Abort unless the operator answered y/yes (whitespace and case ignored), like the safety-backup prompt.
+- name: Fail if user does not want to continue without munge.key
+  ansible.builtin.fail:
+    msg: "Rollback aborted"
+  when:
+    - not munge_key_stat.stat.exists
+    - (continue_missing_munge_key.user_input | default('N') | trim | lower) not in ['y', 'yes']
+  tags: config_rollback
+
+- name: Check backup directories
+  ansible.builtin.stat:
+    path: "{{ selected_backup_ctld_root }}/{{ item }}"
+  loop:
+    - etc/slurm
+    - etc/munge
+    - etc/my.cnf.d
+  register: backup_dir_stats
+  tags: config_rollback
+
+- name: Compute missing backup directories
+  ansible.builtin.set_fact:
+    missing_backup_dirs: "{{ backup_dir_stats.results | rejectattr('stat.exists') | map(attribute='item') | list }}"
+  tags: config_rollback
+
+- name: Warn if backup directories missing
+  ansible.builtin.debug:
+    msg: "WARNING: Missing directories in backup: {{ missing_backup_dirs }}"
+  when: missing_backup_dirs | length > 0
+  tags: config_rollback
+
+- name: Prompt to continue if backup directories missing
+  ansible.builtin.pause:
+    prompt: "Some directories are missing in the backup. Continue anyway? (y/N)"
+  register: continue_missing
+  when: missing_backup_dirs | length > 0
+  tags: config_rollback
+# Abort unless the operator answered y/yes (whitespace and case ignored), like the safety-backup prompt.
+- name: Fail if user does not want to continue
+  ansible.builtin.fail:
+    msg: "Rollback aborted"
+  when:
+    - missing_backup_dirs | length > 0
+    - (continue_missing.user_input | default('N') | trim | lower) not in ['y', 'yes']
+  tags: config_rollback
+
+- name: Prompt for safety backup before rollback
+  ansible.builtin.pause:
+    prompt: "Create a safety backup of current state before rollback? (y/n)"
+  register: pre_rollback_backup
+  tags: config_rollback
+
+- name: Set pre-rollback backup choice
+  ansible.builtin.set_fact:
+    pre_rollback_backup_choice: "{{ pre_rollback_backup.user_input | default('') | trim | lower }}"
+  tags: config_rollback
+
+- name: Fail if pre-rollback backup choice is empty
+  ansible.builtin.fail:
+    msg: "No input provided for safety backup prompt. Rollback aborted."
+  when: pre_rollback_backup_choice | length == 0
+  tags: config_rollback
+
+- name: Validate pre-rollback backup choice
+  ansible.builtin.fail:
+    msg: "Invalid input '{{ pre_rollback_backup.user_input | default('') }}'. Enter 'y' or 'n'."
+  when: pre_rollback_backup_choice not in ['y', 'yes', 'n', 'no']
+  tags: config_rollback
+
+- name: Run safety backup before rollback
+  ansible.builtin.include_role:
+    name: slurm_config_backup
+    apply:
+      tags: config_rollback
+  when: pre_rollback_backup_choice in ['y', 'yes']
+  tags: config_rollback
+
+- name: Stat slurmdbd.conf before restore
+  ansible.builtin.stat:
+    path: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/slurmdbd.conf"
+    checksum_algorithm: sha1
+  register: slurmdbd_before
+  tags: config_rollback
+# Skip directories the operator already accepted as missing; any other rsync failure must stop the rollback.
+- name: Restore config directories
+  ansible.builtin.command: >-
+    rsync -a "{{ selected_backup_ctld_root }}/{{ item }}/" "{{ slurm_config_path }}/{{ ctld_list[0] }}/{{ item }}/"
+  loop:
+    - etc/slurm
+    - etc/munge
+    - etc/my.cnf.d
+  changed_when: true
+  when: item not in missing_backup_dirs
+  tags: config_rollback
+
+- name: Check slurmdbd.conf permissions after restore
+  ansible.builtin.stat:
+    path: /etc/slurm/slurmdbd.conf
+  delegate_to: slurm_controller
+  register: slurmdbd_conf_perm_stat
+  tags: config_rollback
+
+- name: Fix slurmdbd.conf permissions after restore
+  ansible.builtin.file:
+    path: /etc/slurm/slurmdbd.conf
+    mode: '0600'
+  delegate_to: slurm_controller
+  when: slurmdbd_conf_perm_stat.stat.exists
+  tags: config_rollback
+
+- name: Check munge.key permissions after restore
+  ansible.builtin.stat:
+    path: /etc/munge/munge.key
+  delegate_to: slurm_controller
+  register: munge_key_perm_stat
+  tags: config_rollback
+
+- name: Fix munge.key permissions after restore
+  ansible.builtin.file:
+    path: /etc/munge/munge.key
+    mode: '0400'
+  delegate_to: slurm_controller
+  when: munge_key_perm_stat.stat.exists
+  tags: config_rollback
+
+- name: Stat slurmdbd.conf after restore
+  ansible.builtin.stat:
+    path: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/slurmdbd.conf"
+    checksum_algorithm: sha1
+  register: slurmdbd_after
+  tags: config_rollback
+
+- name: Check slurmctld is active before reconfigure
+  ansible.builtin.command: systemctl is-active slurmctld
+  delegate_to: slurm_controller
+  register: slurmctld_active
+  changed_when: false
+  failed_when: false
+  tags: config_rollback
+
+- name: Fail if slurmctld is not active
+  ansible.builtin.fail:
+    msg: "slurmctld is not active on the controller. Rollback applied on disk, but cannot reconfigure until slurmctld is running. Verify munge and slurmctld services and restart slurmctld, then re-run rollback or run 'scontrol reconfigure' on the controller."
+  when: slurmctld_active.stdout | default('') | trim != 'active'
+  tags: config_rollback
+
+- name: Run scontrol reconfigure
+  tags: config_rollback
+  block:
+    - name: Execute scontrol reconfigure
+      ansible.builtin.command: scontrol reconfigure
+      delegate_to: slurm_controller
+      register: reconfigure_out
+      changed_when: true
+      failed_when: reconfigure_out.rc != 0
+  rescue:
+    - name: Display scontrol reconfigure error
+      ansible.builtin.debug:
+        msg: "scontrol reconfigure failed. stdout={{ reconfigure_out.stdout | default('') }} stderr={{ reconfigure_out.stderr | default('') }}"
+
+    - name: Fail with rollback guidance
+      ansible.builtin.fail:
+        msg: "Rollback applied on disk, but scontrol reconfigure failed. Recommended action: rollback to the safety backup created before this rollback (if you chose to create it)."
+
+- name: Prompt to restart slurmdbd if slurmdbd.conf changed
+  ansible.builtin.pause:
+    prompt: "slurmdbd.conf has changed. Restart slurmdbd now? (Y/n)"
+  register: restart_slurmdbd_prompt
+  when:
+    - slurmdbd_before.stat.exists
+    - slurmdbd_after.stat.exists
+    - slurmdbd_before.stat.checksum != slurmdbd_after.stat.checksum
+  tags: config_rollback
+# Restart only when slurmdbd.conf changed; an empty answer means yes, while n/no (any case, padded) declines.
+- name: Restart slurmdbd
+  ansible.builtin.command: systemctl restart slurmdbd
+  delegate_to: slurm_controller
+  when:
+    - slurmdbd_before.stat.exists
+    - slurmdbd_after.stat.exists
+    - slurmdbd_before.stat.checksum != slurmdbd_after.stat.checksum
+    - (restart_slurmdbd_prompt.user_input | default('Y') | trim | lower) not in ['n', 'no']
+  changed_when: true
+  tags: config_rollback
diff --git a/utils/slurm_config_util.yml b/utils/slurm_config_util.yml
new file mode 100644
index 0000000000..7cb5249ccd
--- /dev/null
+++ b/utils/slurm_config_util.yml
@@ -0,0 +1,26 @@
+---
+
+- name: Include input project directory
+  when: not project_dir_status | default(false) | bool
+  ansible.builtin.import_playbook: include_input_dir.yml
+  vars:
+    omnia_metadata_support: true
+  tags: always
+
+- name: Create oim group
+  ansible.builtin.import_playbook: create_container_group.yml
+  vars:
+    oim_group: true
+  tags: always
+
+- name: Slurm config utilities
+  hosts: oim
+  connection: ssh
+  gather_facts: true
+  roles:
+    - role: slurm_config_backup
+      tags: config_backup
+    - role: slurm_cleanup
+      tags: slurm_cleanup
+    - role: slurm_config_rollback
+      tags: config_rollback

From b7071143ae8eb6f05be29c10efc53cfe78ec35dc Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Fri, 6 Feb 2026 16:43:28 +0530
Subject: [PATCH 049/172] Fixed ansible lint issues

---
 .../tasks/transform_network_spec.yml             | 16 +++++++++++++++-
 .../roles/import_input_parameters/vars/main.yml  |  3 ++-
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml b/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml
index d4b3a92e29..17e742d22f 100644
--- a/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml
+++ b/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml
@@ -131,12 +131,26 @@
       (network_spec_21.Networks is not defined)
       or ((network_spec_21.Networks | select('mapping') | selectattr('ib_network', 'defined') | list | length) == 0)
 
+- name: Extract ib_network subnet from transformed output
+  ansible.builtin.set_fact:
+    ib_network_subnet: >-
+      {{
+        (
+          network_spec_21.Networks
+          | select('mapping')
+          | selectattr('ib_network', 'defined')
+          | map(attribute='ib_network')
+          | first
+          | default({})
+        ).subnet | default('')
+      }}
+
 - name: Validate mandatory ib_network.subnet is present in transformed output
   ansible.builtin.fail:
     msg: "{{ msg_ib_subnet_missing }}"
   when:
     - >-
-      ((network_spec_21.Networks | select('mapping') | selectattr('ib_network', 'defined') | map(attribute='ib_network') | first | default({})).subnet | default('') | string | trim) == ''
+      (ib_network_subnet | string | trim) == ''
 
 - name: Display transformation summary
   ansible.builtin.debug:
diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml
index a87c855751..89fddebea3 100644
--- a/upgrade/roles/import_input_parameters/vars/main.yml
+++ b/upgrade/roles/import_input_parameters/vars/main.yml
@@ -85,4 +85,5 @@ restore_input_files:
     validate_cmd: "python3 -m json.tool '{{ input_project_dir }}/software_config.json'"
   - name: pxe_mapping_file.csv
     mode: '0644'
-    validate_cmd: ""
\ No newline at end of file
+    validate_cmd: ""
+    
\ No newline at end of file

From fa4662a758aeb85f8ee7e3da40994e393ba2f86a Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Fri, 6 Feb 2026 16:53:38 +0530
Subject: [PATCH 050/172] Update main.yml

---
 upgrade/roles/import_input_parameters/vars/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml
index 89fddebea3..126f158e6e 100644
--- a/upgrade/roles/import_input_parameters/vars/main.yml
+++ b/upgrade/roles/import_input_parameters/vars/main.yml
@@ -86,4 +86,4 @@ restore_input_files:
   - name: pxe_mapping_file.csv
     mode: '0644'
     validate_cmd: ""
-    
\ No newline at end of file
+

From bae4c11877ab2af39ce9feb5694e72d99000ee75 Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Fri, 6 Feb 2026 17:30:25 +0530
Subject: [PATCH 051/172] Input validation with type check for basic types

---
 .../common_utils/slurm_conf_utils.py          | 71 ++++++++++++++++++-
 .../validation_flows/common_validation.py     |  7 +-
 .../slurm_config/tasks/check_ctld_running.yml |  1 +
 discovery/roles/slurm_config/tasks/confs.yml  | 22 ------
 discovery/roles/slurm_config/vars/main.yml    |  2 -
 5 files changed, 75 insertions(+), 28 deletions(-)

diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
index 8deb85febb..faf1b54ff0 100644
--- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
+++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
@@ -525,8 +525,9 @@ class SlurmParserEnum(str, Enum):
     "Link": S_P_STRING,  # Communication link IDs
     "Links": S_P_CSV,  # Communication link IDs
     "MultipleFiles": S_P_CSV,  # list of GRES device files
-    "Name": S_P_STRING,  # Gres name
-    "Type": S_P_STRING  # Gres type (e.g. model name)
+    "Type": S_P_STRING,  # Gres type (e.g. model name)
+    "Name": S_P_ARRAY,  # Gres name
+    "NodeName": S_P_ARRAY
 }
 
 all_confs = {
@@ -546,6 +547,72 @@ class SlurmParserEnum(str, Enum):
 _HOSTLIST_RE = re.compile(
     r'^(?P<prefix>[^\[\]]*)\[(?P<inner>[^\[\]]+)\](?P<suffix>.*)$')
 
+def validate_config_types(conf_dict, conf_name):
+    """Validate configuration keys and value types based on SlurmParserEnum.
+
+    Args:
+        conf_dict: Parsed configuration mapping each key to its value.
+        conf_name: Key into ``all_confs`` selecting the expected key/type map.
+
+    Returns:
+        dict with ``invalid_keys`` (unexpected keys), ``type_errors`` (one
+        error dict per mistyped value) and ``valid`` (True when both are empty).
+    """
+    current_conf = all_confs.get(conf_name, {})
+    invalid_keys = [key for key in conf_dict if key not in current_conf]
+    type_errors = []
+    def _converts(caster, raw):  # True when caster(str(raw)) succeeds
+        try:
+            caster(str(raw))
+        except (ValueError, TypeError):
+            return False
+        return True
+
+    for key, value in conf_dict.items():
+        if key not in current_conf:
+            continue
+        expected_type = current_conf[key].value
+        got = type(value).__name__
+        error = None
+        if expected_type == "int":
+            # bool is an int subclass, but True/False is not an integer setting.
+            if isinstance(value, bool) or not _converts(int, value):
+                error = f"Expected integer, got {got}"
+        elif expected_type == "float":
+            if isinstance(value, bool) or not _converts(float, value):
+                error = f"Expected float, got {got}"
+        elif expected_type == "bool":
+            if str(value).lower() not in ('yes', 'no', 'true', 'false', '0', '1'):
+                error = f"Expected boolean, got {got}"
+        elif expected_type == "str":
+            if not isinstance(value, str):
+                error = f"Expected string, got {got}"
+        elif expected_type == "csv":
+            if not isinstance(value, str):
+                error = f"Expected CSV string, got {got}"
+        elif expected_type == "list":
+            if not isinstance(value, list):
+                error = f"Expected list, got {got}"
+        elif expected_type == "array":
+            if not isinstance(value, list):
+                error = f"Expected array (list), got {got}"
+            elif not all(isinstance(item, dict) for item in value):
+                error = "Expected array of dicts, got mixed types"
+        # "object" values are accepted as-is: every Python value is an object.
+
+        if error:
+            type_errors.append({
+                "error_key": "omnia_config.yml",
+                "error_msg": f"{conf_name}.conf: '{key}': {error} -> '{value}'",
+                "error_value": "slurm_cluster config_sources"
+            })
+
+    return {
+        'invalid_keys': invalid_keys,
+        'type_errors': type_errors,
+        'valid': not invalid_keys and not type_errors
+    }
+
 def get_invalid_keys(conf_dict, conf_name):
     """Get invalid configuration keys by comparing against expected keys."""
     current_conf = all_confs.get(conf_name, {})
diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py
index 52fea1ced5..2eafc3884d 100644
--- a/common/library/module_utils/input_validation/validation_flows/common_validation.py
+++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py
@@ -42,7 +42,8 @@
 )
 from ansible.module_utils.input_validation.common_utils.slurm_conf_utils import (
     parse_slurm_conf,
-    get_invalid_keys
+    get_invalid_keys,
+    validate_config_types
 )
 
 file_names = config.files
@@ -1072,7 +1073,9 @@ def validate_omnia_config(
                     else: # path and also exists
                         conf_dict = parse_slurm_conf(v, k, False)
                         # module.exit_json(failed=True, result=conf_dict)
-                        invalid_keys = get_invalid_keys(conf_dict, k)
+                        type_check = validate_config_types(conf_dict, k)
+                        invalid_keys = type_check['invalid_keys']
+                        errors.extend(type_check['type_errors'])  # NOTE(review): assumes create_error_msg-shaped dicts
                         if invalid_keys:
                             errors.append(
                                 create_error_msg(input_file_path, "slurm_cluster config_sources",
diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml
index dacd879bf7..52984c2afb 100644
--- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml
+++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml
@@ -21,6 +21,7 @@
   delegate_to: localhost
   register: ssh_check
   ignore_errors: true
+  ignore_unreachable: true
 
 - name: Block when ssh_check is success
   when: ssh_check is success
diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml
index 1ff30acf34..fdf461f88c 100644
--- a/discovery/roles/slurm_config/tasks/confs.yml
+++ b/discovery/roles/slurm_config/tasks/confs.yml
@@ -152,28 +152,6 @@
   ansible.builtin.set_fact:
     conf_server: "--conf-server {{ ctld_list | map('regex_replace', '$', ':' ~ (slurm_conf_dict.get('SlurmctldPort', '6817') | string)) | join(',') }}"
 
-- name: Create backup directory with timestamp
-  ansible.builtin.file:
-    path: "{{ backup_dir }}"
-    state: directory
-    mode: '0755'
-    owner: "{{ slurm_user }}"
-    group: "{{ slurm_user_group }}"
-  when: ctld_list
-
-- name: Backup existing SLURM configuration files with timestamp
-  ansible.builtin.copy:
-    src: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/{{ item.item.key }}.conf"
-    dest: "{{ backup_dir }}/{{ item.item.key }}.conf"
-    remote_src: true
-    mode: preserve
-  loop: "{{ merged_conf.results }}"
-  when:
-    - ctld_list
-    - item.item.key in conf_files
-  register: backup_results
-  failed_when: false
-
 - name: Write merged .conf
   ansible.builtin.copy:
     content: "{{ item.ini_lines | join('\n') }}\n"
diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml
index 9722725a88..939e3ac204 100644
--- a/discovery/roles/slurm_config/vars/main.yml
+++ b/discovery/roles/slurm_config/vars/main.yml
@@ -128,5 +128,3 @@ offline_path_aarch64:
     dest_path: "{{ packages_base_dir_aarch64 }}/doca-ofed"
 
 ssh_private_key_path: /root/.ssh/oim_rsa
-
-backup_dir: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/backup_{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}"

From 69b722625b1fa162c1f7cb5ae5b982df19182781 Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Fri, 6 Feb 2026 17:30:43 +0530
Subject: [PATCH 052/172] Update main.yml

---
 upgrade/roles/import_input_parameters/vars/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml
index 126f158e6e..b208b154ca 100644
--- a/upgrade/roles/import_input_parameters/vars/main.yml
+++ b/upgrade/roles/import_input_parameters/vars/main.yml
@@ -13,7 +13,7 @@
 # limitations under the License.
 ---
 
-backup_location: /opt/omnia/backups/upgrade/input
+backup_location: /opt/omnia/backups/upgrade/input/project_default
 
 backup_dir_mode: '0755'
 default_file_mode: '0644'

From a2b49d8ed0f87edfbaa555b112708440c615066d Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Fri, 6 Feb 2026 17:33:33 +0530
Subject: [PATCH 053/172] Update main.yml

---
 upgrade/roles/import_input_parameters/vars/main.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml
index b208b154ca..bc4ca7430a 100644
--- a/upgrade/roles/import_input_parameters/vars/main.yml
+++ b/upgrade/roles/import_input_parameters/vars/main.yml
@@ -86,4 +86,3 @@ restore_input_files:
   - name: pxe_mapping_file.csv
     mode: '0644'
     validate_cmd: ""
-

From b096afb38145eef09fe01bc967463dfa18cbbe35 Mon Sep 17 00:00:00 2001
From: SOWJANYAJAGADISH123 <Sowjanya.Jagadish@dell.com>
Date: Fri, 6 Feb 2026 17:58:09 +0530
Subject: [PATCH 054/172] Update omnia.sh

---
 omnia.sh | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 67 insertions(+), 2 deletions(-)

diff --git a/omnia.sh b/omnia.sh
index 358cde2162..b2da6b6024 100755
--- a/omnia.sh
+++ b/omnia.sh
@@ -1212,8 +1212,9 @@ phase1_validate() {
         return 1
     fi
 
-    echo "[INFO] [ORCHESTRATOR] Container version validated: 1.0 (Omnia 2.0.0.0)"
+    echo "[INFO] [ORCHESTRATOR] Container version validated: 1.0 (Omnia 2.0)"
 
+   
 
     if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then
         echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1"
@@ -1264,8 +1265,61 @@ phase2_approval() {
     return 0
 }
 
+phase3_backup_creation() {
+    local backup_base="$1"
+    # Back up Omnia inputs, metadata and the quadlet file under $1 (a path inside omnia_core); returns 0/1.
+    echo "[INFO] [ORCHESTRATOR] Phase 3: Backup Creation"
+    # The backup is written through the running omnia_core container, so it must be up.
+    if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
+        echo "[ERROR] [ORCHESTRATOR] Cannot create backup because omnia_core is not running"
+        return 1
+    fi
+    # An empty destination would make the rm/mkdir below act on /input, /metadata and /configs.
+    if [ -z "$backup_base" ]; then
+        echo "[ERROR] [ORCHESTRATOR] Backup destination is empty"
+        return 1
+    fi
+    # The outer shell expands the variables below before the script reaches the container's bash.
+    if ! podman exec -u root omnia_core bash -c "
+        set -e
+        rm -rf '${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs'
+        mkdir -p '${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs'
+
+        if [ -f '$OMNIA_INPUT_DIR/default.yml' ]; then
+            cp -a '$OMNIA_INPUT_DIR/default.yml' '${backup_base%/}/input/'
+        fi
+
+        if [ -d '$OMNIA_INPUT_DIR/project_default' ]; then
+            cp -a '$OMNIA_INPUT_DIR/project_default' '${backup_base%/}/input/'
+        fi
+
+        if [ ! -f '$OMNIA_METADATA_FILE' ]; then
+            echo '[ERROR] Metadata file not found inside container: $OMNIA_METADATA_FILE' >&2
+            exit 1
+        fi
+        cp -a '$OMNIA_METADATA_FILE' '${backup_base%/}/metadata/oim_metadata.yml'
+    "; then
+        echo "[ERROR] [ORCHESTRATOR] Backup failed; cleaning up partial backup"
+        podman exec -u root omnia_core bash -c "rm -rf '${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs'" >/dev/null 2>&1 || true
+        return 1
+    fi
+    # The quadlet unit file is tested on the host, so it is copied into the container with podman cp.
+    if [ -f "/etc/containers/systemd/omnia_core.container" ]; then
+        if ! podman cp "/etc/containers/systemd/omnia_core.container" "omnia_core:${backup_base%/}/configs/omnia_core.container" >/dev/null 2>&1; then
+            echo "[ERROR] [ORCHESTRATOR] Failed to backup quadlet container file"
+            podman exec -u root omnia_core bash -c "rm -rf '${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs'" >/dev/null 2>&1 || true
+            return 1
+        fi
+    fi
+    # Reached only when every step above succeeded.
+    echo "[INFO] [ORCHESTRATOR] Backup created at: $backup_base"
+    echo "[INFO] [ORCHESTRATOR] Phase 3: Backup completed"
+    return 0
+}
+
 upgrade_omnia_core() {
     local lock_file="/var/lock/omnia_core_upgrade.lock"
+    local backup_base
 
     if [ -e "$lock_file" ]; then
         echo -e "${RED}ERROR: Upgrade lock exists at $lock_file. Another upgrade may be running.${NC}"
@@ -1288,7 +1342,18 @@ upgrade_omnia_core() {
         exit 0
     fi
 
-    echo "[INFO] [ORCHESTRATOR] Upgrade tasks for backup and container swap are deferred to a follow-up PR"
+    backup_base="$OMNIA_UPGRADE_BACKUP_PATH"
+    if [ -z "$backup_base" ]; then
+        echo "[ERROR] [ORCHESTRATOR] Backup path is empty"
+        exit 1
+    fi
+
+    if ! phase3_backup_creation "$backup_base"; then
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 3"
+        exit 1
+    fi
+
+    echo "[INFO] [ORCHESTRATOR] Upgrade tasks for container swap are deferred to a follow-up PR"
     exit 0
 }
 

From ea05124537adc8152b5fcfff60a2ad1f98a99c41 Mon Sep 17 00:00:00 2001
From: SOWJANYAJAGADISH123 <Sowjanya.Jagadish@dell.com>
Date: Fri, 6 Feb 2026 18:06:15 +0530
Subject: [PATCH 055/172] Update omnia.sh

---
 omnia.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/omnia.sh b/omnia.sh
index b2da6b6024..0239ebdf3f 100755
--- a/omnia.sh
+++ b/omnia.sh
@@ -1212,7 +1212,7 @@ phase1_validate() {
         return 1
     fi
 
-    echo "[INFO] [ORCHESTRATOR] Container version validated: 1.0 (Omnia 2.0)"
+    echo "[INFO] [ORCHESTRATOR] Container version validated: 1.0 (Omnia 2.0.0.0)"
 
    
 

From ab1b7cd147fcecddf935caf3e6343f74963d0c0b Mon Sep 17 00:00:00 2001
From: Katakam-Rakesh <katakam.rakesh@dell.com>
Date: Fri, 6 Feb 2026 18:06:31 +0530
Subject: [PATCH 056/172] add warning for user_registry and remove
 user_registry from input validation

Signed-off-by: Katakam-Rakesh <katakam.rakesh@dell.com>
---
 .../validation_flows/local_repo_validation.py | 44 ----------------
 local_repo/local_repo.yml                     |  2 +-
 .../check_additional_packages_images.yml      | 50 +++++++++++++++++++
 .../tasks/check_images_per_arch.yml           | 43 ++++++++++++++++
 local_repo/roles/validation/tasks/main.yml    |  3 ++
 local_repo/roles/validation/vars/main.yml     |  3 ++
 6 files changed, 100 insertions(+), 45 deletions(-)
 create mode 100644 local_repo/roles/validation/tasks/check_additional_packages_images.yml
 create mode 100644 local_repo/roles/validation/tasks/check_images_per_arch.yml

diff --git a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py
index efeda63c8a..bcec9f4197 100644
--- a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py
+++ b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py
@@ -129,50 +129,6 @@ def validate_local_repo_config(input_file_path, data,
     software_config_file_path = create_file_path(input_file_path, file_names["software_config"])
     software_config_json = load_json(software_config_file_path)
 
-    # Check if additional_packages is enabled and contains image packages
-    additional_packages_enabled = any(sw.get("name") == "additional_packages" for sw in software_config_json.get("softwares", []))
-    if additional_packages_enabled:
-        # Get arch values from additional_packages entry in software_config.json
-        additional_packages_archs = []
-        for software in software_config_json.get("softwares", []):
-            if software.get("name") == "additional_packages":
-                arch_list = software.get("arch", [])
-                additional_packages_archs = arch_list  # Get all archs
-                break
-
-        # Check each arch specific additional_packages.json
-        has_image_packages = False
-        for additional_packages_arch in additional_packages_archs:
-            additional_packages_path = create_file_path(
-                input_file_path,
-                f"config/{additional_packages_arch}/{software_config_json['cluster_os_type']}/{software_config_json['cluster_os_version']}/additional_packages.json"
-            )
-            
-            if os.path.exists(additional_packages_path):
-                additional_packages_data = load_json(additional_packages_path)
-                has_image_packages = False
-                
-                # Check all sections for image packages
-                for section_name, section_data in additional_packages_data.items():
-                    if isinstance(section_data, dict) and "cluster" in section_data:
-                        cluster_packages = section_data.get("cluster", [])
-                        
-                        for package in cluster_packages:
-                            if package.get("type") == "image":
-                                has_image_packages = True
-                                break
-
-                    if has_image_packages:
-                        break
-
-        # If any architecture has image packages, user_registry must be defined and not empty
-        if has_image_packages and user_registry is None:
-            errors.append(create_error_msg(
-                local_repo_yml,
-                "user_registry", 
-                "user_registry must be defined when additional_packages.json contains packages of type 'image'"
-            ))
-
     # Extra validation: custom_slurm must have <arch>_slurm_custom in user_repo_url_<arch>
     for sw in software_config_json["softwares"]:
         if sw["name"] == "slurm_custom":
diff --git a/local_repo/local_repo.yml b/local_repo/local_repo.yml
index cb394fa845..3a743c3f47 100644
--- a/local_repo/local_repo.yml
+++ b/local_repo/local_repo.yml
@@ -114,7 +114,7 @@
   connection: ssh
   gather_facts: false
   tasks:
-    - name: Read network_spec vars
+    - name: Validate Pulp Container and Endpoint
       ansible.builtin.include_role:
         name: pulp_validation
 
diff --git a/local_repo/roles/validation/tasks/check_additional_packages_images.yml b/local_repo/roles/validation/tasks/check_additional_packages_images.yml
new file mode 100644
index 0000000000..3b5663095b
--- /dev/null
+++ b/local_repo/roles/validation/tasks/check_additional_packages_images.yml
@@ -0,0 +1,50 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Pause with a warning when additional_packages.json has image-type packages but user_registry is unset or empty.
+- name: Load local_repo_config.yml
+  ansible.builtin.include_vars:
+    file: "{{ local_repo_config_file }}"
+    name: local_repo_config  # namespace read by the user_registry condition on the final task
+
+- name: Check if additional_packages is enabled in software_config
+  ansible.builtin.set_fact:
+    additional_packages_enabled: "{{ software | selectattr('name', 'equalto', 'additional_packages') | list | length > 0 }}"
+
+- name: Get additional_packages architectures
+  ansible.builtin.set_fact:
+    additional_packages_archs: "{{ (software | selectattr('name', 'equalto', 'additional_packages') | first).arch | default([]) }}"
+  when: additional_packages_enabled
+
+- name: Check for image packages in additional_packages.json
+  when: additional_packages_enabled
+  block:
+    - name: Initialize image found flag
+      ansible.builtin.set_fact:
+        has_image_packages: false  # set to true by check_images_per_arch.yml when an image package is found
+
+    - name: Check each architecture for image packages
+      ansible.builtin.include_tasks: check_images_per_arch.yml
+      loop: "{{ additional_packages_archs }}"
+      loop_control:
+        loop_var: arch_item  # name the included task file uses for the current architecture
+      when: additional_packages_archs is defined
+
+    - name: Display warning if images found in additional_packages.json but user_registry not defined
+      ansible.builtin.pause:
+        prompt: "{{ additional_packages_image_warning_msg }}"
+        seconds: "{{ warning_wait_time_warning }}"
+      when:
+        - has_image_packages | bool
+        - local_repo_config.user_registry is not defined or local_repo_config.user_registry is none or local_repo_config.user_registry | length == 0
diff --git a/local_repo/roles/validation/tasks/check_images_per_arch.yml b/local_repo/roles/validation/tasks/check_images_per_arch.yml
new file mode 100644
index 0000000000..aa20840e3e
--- /dev/null
+++ b/local_repo/roles/validation/tasks/check_images_per_arch.yml
@@ -0,0 +1,43 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Included once per architecture (loop_var: arch_item); sets has_image_packages when that arch lists an image package.
+- name: Set additional_packages.json path for {{ arch_item }}
+  ansible.builtin.set_fact:
+    additional_packages_path: "{{ project_input_path }}/config/{{ arch_item }}/{{ cluster_os_type }}/{{ cluster_os_version }}/additional_packages.json"
+
+- name: Check if additional_packages.json exists for {{ arch_item }}
+  ansible.builtin.stat:
+    path: "{{ additional_packages_path }}"
+  register: additional_packages_file
+
+- name: Load and check additional_packages.json for {{ arch_item }}
+  when: additional_packages_file.stat.exists  # a missing file for this arch is skipped without error
+  block:
+    - name: Load additional_packages.json
+      ansible.builtin.include_vars:
+        file: "{{ additional_packages_path }}"
+        name: additional_packages_data
+
+    - name: Check for image type packages in additional_packages
+      ansible.builtin.set_fact:
+        has_image_packages: true  # only ever set to true here; the flag is never reset in this file
+      when: >
+        additional_packages_data | dict2items |
+        selectattr('value.cluster', 'defined') |
+        map(attribute='value.cluster') |
+        flatten |
+        selectattr('type', 'defined') |
+        selectattr('type', 'equalto', 'image') |
+        list | length > 0
diff --git a/local_repo/roles/validation/tasks/main.yml b/local_repo/roles/validation/tasks/main.yml
index ea9c61aeb5..41f584dd15 100644
--- a/local_repo/roles/validation/tasks/main.yml
+++ b/local_repo/roles/validation/tasks/main.yml
@@ -22,6 +22,9 @@
 - name: Validate software_config.json
   ansible.builtin.include_tasks: validate_software_config_json.yml
 
+- name: Check for images in additional_packages
+  ansible.builtin.include_tasks: check_additional_packages_images.yml
+
 - name: Validate metadata
   ansible.builtin.include_tasks: validate_metadata.yml
 
diff --git a/local_repo/roles/validation/vars/main.yml b/local_repo/roles/validation/vars/main.yml
index ec343cb3ef..08a082ded7 100644
--- a/local_repo/roles/validation/vars/main.yml
+++ b/local_repo/roles/validation/vars/main.yml
@@ -146,6 +146,9 @@ user_registry_fail_host_cert_path_msg: "Failed. Each item in user_registry shoul
 time_out: 30
 user_registry_msg: "Above user registries is/are not reachable. Please make sure the user registry is accessible from the Omnia Infrastructure Manager."   # noqa: yaml[line-length]
 cert_path_failure_msg: "Certificate file path {{ item.item.cert_path }} does not exist on the Omnia Infrastructure Manager for host {{ item.item.host }}. Please verify that correct cert_path is given in {{ project_input_path }}/local_repo_config.yml"  # noqa: yaml[line-length]
+additional_packages_image_warning_msg: |
+  WARNING: additional_packages.json contains packages of type 'image', but 'user_registry' is not defined in local_repo_config.yml.
+  Please specify 'user_registry' in local_repo_config.yml if these images are coming from a user registry.
 
 # Usage: validate_user_repo_url.yml
 user_repo_url_fail_msg: "Failed. Please ensure user_repo_url is proper and should not have jinja variables.

From fc6f0e1653656d12e21c745bd89adbf55066de34 Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Fri, 6 Feb 2026 18:26:30 +0530
Subject: [PATCH 057/172] Error messages formatted

---
 .../common_utils/slurm_conf_utils.py          |  2 +-
 .../validation_flows/common_validation.py     | 28 +++++++++----------
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
index faf1b54ff0..22b38d7ad3 100644
--- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
+++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
@@ -604,7 +604,7 @@ def validate_config_types(conf_dict, conf_name):
                 type_errors.append({
                     "error_key": "omnia_config.yml",
                     "error_msg": f"{conf_name}.conf: '{key}': {error} -> '{value}'",
-                    "error_value": "slurm_cluster config_sources"
+                    "error_value": "slurm_cluster->config_sources"
                     })
     
     return {
diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py
index 2eafc3884d..8c850effbf 100644
--- a/common/library/module_utils/input_validation/validation_flows/common_validation.py
+++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py
@@ -1065,29 +1065,27 @@ def validate_omnia_config(
         cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')]
         for cfg_path_dict in cnfg_src:
             for k,v in cfg_path_dict.items():
+                conf_dict = None
                 if isinstance(v, str):
                     if not os.path.exists(v):
                         errors.append(
                             create_error_msg(input_file_path, "slurm_cluster config_sources",
                                 f"provided conf path for {k} - {v} does not exist"))
-                    else: # path and also exists
+                        continue
+                    else: # path exists
                         conf_dict = parse_slurm_conf(v, k, False)
-                        # module.exit_json(failed=True, result=conf_dict)
-                        # invalid_keys = get_invalid_keys(conf_dict, k)
-                        type_errors = validate_config_types(conf_dict, k)
-                        module.exit_json(failed=True, result=type_errors)
-                        if invalid_keys:
-                            errors.append(
-                                create_error_msg(input_file_path, "slurm_cluster config_sources",
-                                    f"invalid keys found in {k} - {invalid_keys}"))
                 else:
-                    invalid_keys = get_invalid_keys(v, k)
-                    if invalid_keys:
+                    conf_dict = v
+                
+                # Validate value types and key names once, for both a parsed conf file and a non-path value
+                if conf_dict:
+                    validation_result = validate_config_types(conf_dict, k)
+                    if validation_result['type_errors']:
+                        errors.extend(validation_result['type_errors'])
+                    if validation_result['invalid_keys']:
                         errors.append(
-                            create_error_msg(input_file_path, "slurm_cluster config_sources",
-                                f"invalid keys found in {k} - {invalid_keys}"))
-
-
+                            create_error_msg('omnia_config.yml', "slurm_cluster->config_sources",
+                                f"{k}.conf invalid keys found - {','.join(validation_result['invalid_keys'])}"))
     return errors
 
 def check_is_service_cluster_functional_groups_defined(

From c54886f03c21d5b266c1d983892b241e605e2fba Mon Sep 17 00:00:00 2001
From: Katakam-Rakesh <katakam.rakesh@dell.com>
Date: Fri, 6 Feb 2026 18:29:01 +0530
Subject: [PATCH 058/172] lint fix

Signed-off-by: Katakam-Rakesh <katakam.rakesh@dell.com>
---
 .../module_utils/local_repo/software_utils.py      | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py
index f1840be158..61b0eb31b4 100644
--- a/common/library/module_utils/local_repo/software_utils.py
+++ b/common/library/module_utils/local_repo/software_utils.py
@@ -625,14 +625,14 @@ def parse_json_data(file_path, package_types,logger, failed_list=None, subgroup_
                 for item in value:
                     # For every image, check if it is present in Pulp
                     if is_additional_packages and item.get("type") == "image":
-                            logger.info("Calling function to check %s existence in Pulp", item)
-                            tag_missing_entry = check_additional_image_in_pulp(item, logger)
-                            logger.info("tag_missing_entry: %s", tag_missing_entry)
-                            if tag_missing_entry == {}:
-                                continue
-                            if tag_missing_entry:
-                                filtered_list.append(tag_missing_entry)
+                        logger.info("Calling function to check %s existence in Pulp", item)
+                        tag_missing_entry = check_additional_image_in_pulp(item, logger)
+                        logger.info("tag_missing_entry: %s", tag_missing_entry)
+                        if tag_missing_entry == {}:
                             continue
+                        if tag_missing_entry:
+                            filtered_list.append(tag_missing_entry)
+                        continue
 
                     # Get package name
                     pkg_name = item.get("package")

From b1965a4074be43f85c103eb360e2926fe25ff471 Mon Sep 17 00:00:00 2001
From: Katakam-Rakesh <katakam.rakesh@dell.com>
Date: Fri, 6 Feb 2026 18:40:16 +0530
Subject: [PATCH 059/172] lint fix

Signed-off-by: Katakam-Rakesh <katakam.rakesh@dell.com>
---
 .../local_repo/process_parallel.py            |  2 +-
 .../module_utils/local_repo/software_utils.py | 27 +++++++++----------
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/common/library/module_utils/local_repo/process_parallel.py b/common/library/module_utils/local_repo/process_parallel.py
index 74a24504b7..2c55098c98 100644
--- a/common/library/module_utils/local_repo/process_parallel.py
+++ b/common/library/module_utils/local_repo/process_parallel.py
@@ -96,7 +96,7 @@ def load_docker_credentials(vault_yml_path, vault_password_file):
 
             if response.status_code == 200:
                 return docker_username, docker_password
-            
+
             if response.status_code == 429:
                 raise RuntimeError("Docker Hub rate limit exceeded. Please try again later.")
 
diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py
index 61b0eb31b4..0924452d32 100644
--- a/common/library/module_utils/local_repo/software_utils.py
+++ b/common/library/module_utils/local_repo/software_utils.py
@@ -230,7 +230,7 @@ def parse_repo_urls(repo_config, local_repo_config_path,
     logger.info(f"Processing repository URLs for architectures: {archs_to_process}")
 
     for arch in archs_to_process:
-        
+
         # Always ensure these are lists
         rhel_repo_entry[arch] = list(local_yaml.get(f"rhel_os_url_{arch}") or [])
         repo_entries[arch] = list(local_yaml.get(f"omnia_repo_url_rhel_{arch}") or [])
@@ -338,8 +338,8 @@ def parse_repo_urls(repo_config, local_repo_config_path,
     seen_urls = set()
     for arch, entries in repo_entries.items():
         if not entries:
-           logger.info(f"No OMNIA repository entries found for {arch}")
-           continue
+            logger.info(f"No OMNIA repository entries found for {arch}")
+            continue
 
         for repo in entries:
             name = repo.get("name", "unknown")
@@ -455,7 +455,7 @@ def get_subgroup_dict(user_data,logger):
                                     for item in user_data.get(software_name, [])]
         subgroup_dict[software_name] = subgroups if isinstance(
             user_data.get(software_name), list) else [sw['name']]
-    
+
     logger.info("Completed get_subgroup_dict(). Found %d software entries.", len(software_names))
     logger.info("Final subgroup_dict: %s", subgroup_dict)
 
@@ -479,17 +479,17 @@ def get_csv_software(file_name):
     """
 
     csv_software = []
- 
+
     if not os.path.isfile(file_name):
         return csv_software
- 
+
     with open(file_name, mode='r') as csv_file:
         reader = csv.DictReader(csv_file)
         csv_software = [row.get(CSV_COLUMNS["column1"], "").strip()
                         for row in reader]
 
     return csv_software
- 
+
 
 def get_failed_software(file_path):
     """
@@ -702,7 +702,6 @@ def get_new_packages_not_in_status(json_path, csv_path, subgroup_list,logger):
         raise
 
     names = [row['name'] for row in status_csv_content]
-    
     # Read all packages from JSON
     try:
         all_packages = parse_json_data(json_path, PACKAGE_TYPES, logger,None, subgroup_list)
@@ -710,18 +709,18 @@ def get_new_packages_not_in_status(json_path, csv_path, subgroup_list,logger):
     except Exception as e:
         logger.error("Failed to parse JSON file '%s': %s", json_path, e)
         raise
-   
+
     for pkg in all_packages:
         if pkg["type"] == "image":
             # Check exact package:tag or package:digest combination
             pkg_base = pkg.get("package", "").strip()
             pkg_identifier = pkg_base
-            
+
             if "tag" in pkg:
                 pkg_identifier += f":{pkg['tag']}"
             elif "digest" in pkg:
                 pkg_identifier += f":{pkg['digest']}"
-            
+
             if pkg_identifier not in names:
                 new_packages.append(pkg)
         else:
@@ -753,7 +752,7 @@ def process_software(software, fresh_installation, json_path, csv_path, subgroup
         failed_packages = None
         logger.info("Fresh installation detected — skipping failed package check.")
     else:
-        try:    
+        try:
             failed_packages = None if fresh_installation else get_failed_software(csv_path)
             logger.info("Failed packages: %s", failed_packages)
         except Exception as e:
@@ -771,7 +770,7 @@ def process_software(software, fresh_installation, json_path, csv_path, subgroup
             raise
     else:
         logger.info("No failed RPM packages found for: %s", software)
- 
+
     # Parse main JSON data
     try:
         combined = parse_json_data(
@@ -803,7 +802,7 @@ def get_software_names_and_arch(json_data, arch):
         sw_arch = sw_arch_dict[sw["name"]]
         if arch in sw_arch:
             result.append(sw["name"])
-    
+
     return result
 
 def remove_duplicates_from_trans(trans):

From 810a8fd0d6798866d2143f16bf7cd9aabede529b Mon Sep 17 00:00:00 2001
From: Katakam-Rakesh <katakam.rakesh@dell.com>
Date: Fri, 6 Feb 2026 19:27:33 +0530
Subject: [PATCH 060/172] skip ib network configuration if mellanox card is not
 present

Signed-off-by: Katakam-Rakesh <katakam.rakesh@dell.com>
---
 .../templates/doca-ofed/configure-ib-network.sh.j2          | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 b/discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2
index 1cb95d6f9b..249b90b6a5 100644
--- a/discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2
+++ b/discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2
@@ -1,6 +1,12 @@
 #!/bin/bash
 set -euo pipefail
 
+# Check if Mellanox hardware is present
+if ! lspci | grep -i 'mellanox'; then
+    echo "No Mellanox RDMA hardware detected. Skipping IB network configuration."
+    exit 0
+fi
+
 ADMIN_NIC_IP="{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}"
 NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}"
 IB_NETWORK_SUBNET="{{ hostvars['localhost']['ib_network_subnet'] }}"

From c90040682c776bb5bfd73d220d88a2517b8cf0dd Mon Sep 17 00:00:00 2001
From: Nagachandan-P <Nagachandan.p@dell.com>
Date: Fri, 6 Feb 2026 14:03:54 +0000
Subject: [PATCH 061/172] rollback feature update

---
 utils/roles/slurm_config_rollback/tasks/main.yml | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/utils/roles/slurm_config_rollback/tasks/main.yml b/utils/roles/slurm_config_rollback/tasks/main.yml
index e9822de876..f3aa65a3ae 100644
--- a/utils/roles/slurm_config_rollback/tasks/main.yml
+++ b/utils/roles/slurm_config_rollback/tasks/main.yml
@@ -405,10 +405,9 @@
       ansible.builtin.fail:
         msg: "Rollback applied on disk, but scontrol reconfigure failed. Recommended action: rollback to the safety backup created before this rollback (if you chose to create it)."
 
-- name: Prompt to restart slurmdbd if slurmdbd.conf changed
-  ansible.builtin.pause:
-    prompt: "slurmdbd.conf has changed. Restart slurmdbd now? (Y/n)"
-  register: restart_slurmdbd_prompt
+- name: Notify slurmdbd.conf changed
+  ansible.builtin.debug:
+    msg: "Detected slurmdbd.conf change after rollback; restarting slurmdbd."
   when:
     - slurmdbd_before.stat.exists
     - slurmdbd_after.stat.exists
@@ -422,6 +421,5 @@
     - slurmdbd_before.stat.exists
     - slurmdbd_after.stat.exists
     - slurmdbd_before.stat.checksum != slurmdbd_after.stat.checksum
-    - restart_slurmdbd_prompt.user_input | default('Y') | lower != 'n'
   changed_when: true
   tags: config_rollback

From dff8462a07b301db5004c7a886a795d8fc947fd1 Mon Sep 17 00:00:00 2001
From: Nagachandan-P <Nagachandan.p@dell.com>
Date: Fri, 6 Feb 2026 14:29:25 +0000
Subject: [PATCH 062/172] slurmdbd service before scontrol reconfig

---
 .../roles/slurm_config_backup/tasks/main.yml  |  8 +++-
 .../slurm_config_rollback/tasks/main.yml      | 45 ++++++++++---------
 2 files changed, 30 insertions(+), 23 deletions(-)

diff --git a/utils/roles/slurm_config_backup/tasks/main.yml b/utils/roles/slurm_config_backup/tasks/main.yml
index 4871ab705b..4d01014180 100644
--- a/utils/roles/slurm_config_backup/tasks/main.yml
+++ b/utils/roles/slurm_config_backup/tasks/main.yml
@@ -69,11 +69,15 @@
     backup_timestamp: "{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}"
     backup_base_name: "{{ backup_base_name_input.user_input | default('') }}"
 
+- name: Set backup name suffix
+  ansible.builtin.set_fact:
+    backup_name_suffix: "{{ (backup_base_name | length > 0) | ternary(backup_base_name ~ '_' ~ backup_timestamp, backup_timestamp) }}"
+
 - name: Set backup directory
   ansible.builtin.set_fact:
     slurm_backups_root: "{{ share_path }}/{{ slurm_backups_dir_name }}"
-    backup_id: "{{ (backup_base_name | length > 0) | ternary(backup_base_name ~ '_' ~ backup_timestamp, backup_timestamp) }}"
-    backup_dir: "{{ share_path }}/{{ slurm_backups_dir_name }}/{{ (backup_base_name | length > 0) | ternary(backup_base_name ~ '_' ~ backup_timestamp, backup_timestamp) }}"
+    backup_id: "{{ backup_name_suffix }}"
+    backup_dir: "{{ share_path }}/{{ slurm_backups_dir_name }}/{{ backup_name_suffix }}"
 
 - name: Ensure slurm backups root exists
   ansible.builtin.file:
diff --git a/utils/roles/slurm_config_rollback/tasks/main.yml b/utils/roles/slurm_config_rollback/tasks/main.yml
index f3aa65a3ae..0610313e32 100644
--- a/utils/roles/slurm_config_rollback/tasks/main.yml
+++ b/utils/roles/slurm_config_rollback/tasks/main.yml
@@ -326,8 +326,11 @@
   tags: config_rollback
 
 - name: Restore config directories
-  ansible.builtin.command: >-
-    rsync -a "{{ selected_backup_ctld_root }}/{{ item }}/" "{{ slurm_config_path }}/{{ ctld_list[0] }}/{{ item }}/"
+  ansible.builtin.copy:
+    src: "{{ selected_backup_ctld_root }}/{{ item }}/"
+    dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/{{ item }}/"
+    remote_src: true
+    directory_mode: '0755'
   loop:
     - etc/slurm
     - etc/munge
@@ -373,6 +376,25 @@
   register: slurmdbd_after
   tags: config_rollback
 
+- name: Notify slurmdbd.conf changed
+  ansible.builtin.debug:
+    msg: "Detected slurmdbd.conf change after rollback; restarting slurmdbd."
+  when:
+    - slurmdbd_before.stat.exists
+    - slurmdbd_after.stat.exists
+    - slurmdbd_before.stat.checksum != slurmdbd_after.stat.checksum
+  tags: config_rollback
+
+- name: Restart slurmdbd
+  ansible.builtin.command: systemctl restart slurmdbd
+  delegate_to: slurm_controller
+  when:
+    - slurmdbd_before.stat.exists
+    - slurmdbd_after.stat.exists
+    - slurmdbd_before.stat.checksum != slurmdbd_after.stat.checksum
+  changed_when: true
+  tags: config_rollback
+
 - name: Check slurmctld is active before reconfigure
   ansible.builtin.command: systemctl is-active slurmctld
   delegate_to: slurm_controller
@@ -404,22 +426,3 @@
     - name: Fail with rollback guidance
       ansible.builtin.fail:
         msg: "Rollback applied on disk, but scontrol reconfigure failed. Recommended action: rollback to the safety backup created before this rollback (if you chose to create it)."
-
-- name: Notify slurmdbd.conf changed
-  ansible.builtin.debug:
-    msg: "Detected slurmdbd.conf change after rollback; restarting slurmdbd."
-  when:
-    - slurmdbd_before.stat.exists
-    - slurmdbd_after.stat.exists
-    - slurmdbd_before.stat.checksum != slurmdbd_after.stat.checksum
-  tags: config_rollback
-
-- name: Restart slurmdbd
-  ansible.builtin.command: systemctl restart slurmdbd
-  delegate_to: slurm_controller
-  when:
-    - slurmdbd_before.stat.exists
-    - slurmdbd_after.stat.exists
-    - slurmdbd_before.stat.checksum != slurmdbd_after.stat.checksum
-  changed_when: true
-  tags: config_rollback

From a7595104f839fb93eb9d1bbf34567082cfe0e854 Mon Sep 17 00:00:00 2001
From: Nagachandan-P <Nagachandan.p@dell.com>
Date: Fri, 6 Feb 2026 15:00:46 +0000
Subject: [PATCH 063/172] lint issue for permission

---
 utils/roles/slurm_config_rollback/tasks/main.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/roles/slurm_config_rollback/tasks/main.yml b/utils/roles/slurm_config_rollback/tasks/main.yml
index 0610313e32..ee873529fd 100644
--- a/utils/roles/slurm_config_rollback/tasks/main.yml
+++ b/utils/roles/slurm_config_rollback/tasks/main.yml
@@ -330,6 +330,7 @@
     src: "{{ selected_backup_ctld_root }}/{{ item }}/"
     dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/{{ item }}/"
     remote_src: true
+    mode: preserve  # keep source modes; a blanket 0644 would loosen restored etc/munge contents
     directory_mode: '0755'
   loop:
     - etc/slurm

From 677182e63ad5822e68da3ae930131cb7f73ba2bf Mon Sep 17 00:00:00 2001
From: balajikumaran-c-s <balajikumaran.cs@dellteam.com>
Date: Fri, 6 Feb 2026 21:05:13 +0530
Subject: [PATCH 064/172] Fix host/container path handling for Omnia metadata
 and input

---
 omnia.sh | 50 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 32 insertions(+), 18 deletions(-)

diff --git a/omnia.sh b/omnia.sh
index 0239ebdf3f..f05c9ebe84 100755
--- a/omnia.sh
+++ b/omnia.sh
@@ -52,11 +52,15 @@ is_local_ip() {
     fi
 }
 
-OMNIA_BASE_DIR="/opt/omnia"
-OMNIA_INPUT_DIR="/opt/omnia/input"
-OMNIA_BACKUPS_DIR="/opt/omnia/backups"
-OMNIA_METADATA_DIR="/opt/omnia/.data"
-OMNIA_METADATA_FILE="/opt/omnia/.data/oim_metadata.yml"
+# Container-side paths (used inside podman exec commands)
+CONTAINER_INPUT_DIR="/opt/omnia/input"
+CONTAINER_BACKUPS_DIR="/opt/omnia/backups"
+CONTAINER_METADATA_FILE="/opt/omnia/.data/oim_metadata.yml"
+
+# Host-side paths (initialized dynamically after omnia_path is set)
+OMNIA_INPUT_DIR=""
+OMNIA_METADATA_DIR=""
+OMNIA_METADATA_FILE=""
 
 update_metadata_upgrade_backup_dir() {
     local backup_dir="$1"
@@ -68,14 +72,14 @@ update_metadata_upgrade_backup_dir() {
 
     podman exec -u root omnia_core bash -c "
         set -e
-        if [ ! -f '$OMNIA_METADATA_FILE' ]; then
-            echo '[ERROR] Metadata file not found inside container: $OMNIA_METADATA_FILE' >&2
+        if [ ! -f '$CONTAINER_METADATA_FILE' ]; then
+            echo '[ERROR] Metadata file not found inside container: $CONTAINER_METADATA_FILE' >&2
             exit 1
         fi
-        if grep -q '^upgrade_backup_dir:' '$OMNIA_METADATA_FILE'; then
-            sed -i 's|^upgrade_backup_dir:.*|upgrade_backup_dir: ${backup_dir}|' '$OMNIA_METADATA_FILE'
+        if grep -q '^upgrade_backup_dir:' '$CONTAINER_METADATA_FILE'; then
+            sed -i 's|^upgrade_backup_dir:.*|upgrade_backup_dir: ${backup_dir}|' '$CONTAINER_METADATA_FILE'
         else
-            echo 'upgrade_backup_dir: ${backup_dir}' >> '$OMNIA_METADATA_FILE'
+            echo 'upgrade_backup_dir: ${backup_dir}' >> '$CONTAINER_METADATA_FILE'
         fi
     "
 }
@@ -560,6 +564,11 @@ init_container_config() {
     # Create the pulp_ha directory if it does not exist.
     echo -e "${GREEN} Creating the pulp HA directory if it does not exist.${NC}"
     mkdir -p "$omnia_path/omnia/pulp/pulp_ha"
+
+    # Initialize host-side path variables based on user-provided omnia_path
+    OMNIA_INPUT_DIR="$omnia_path/omnia/input"
+    OMNIA_METADATA_DIR="$omnia_path/omnia/.data"
+    OMNIA_METADATA_FILE="$omnia_path/omnia/.data/oim_metadata.yml"
 }
 
 
@@ -617,6 +626,11 @@ fetch_config() {
     else
         echo -e "${GREEN} Successfully fetched data from metadata file.${NC}"
     fi
+
+    # Initialize host-side path variables based on fetched omnia_path
+    OMNIA_INPUT_DIR="$omnia_path/omnia/input"
+    OMNIA_METADATA_DIR="$omnia_path/omnia/.data"
+    OMNIA_METADATA_FILE="$omnia_path/omnia/.data/oim_metadata.yml"
 }
 
 # Validates the OIM (Omnia Infrastructure Manager) by checking if the hostname is
@@ -1242,7 +1256,7 @@ phase2_approval() {
     echo "  - Additional Package Installation"
     echo "============================================"
 
-    default_backup_dir="$OMNIA_BACKUPS_DIR/upgrade"
+    default_backup_dir="$CONTAINER_BACKUPS_DIR/upgrade"
     backup_base="$default_backup_dir"
 
     echo "[INFO] [ORCHESTRATOR] Backup destination: $backup_base"
@@ -1285,19 +1299,19 @@ phase3_backup_creation() {
         rm -rf '${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs'
         mkdir -p '${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs'
 
-        if [ -f '$OMNIA_INPUT_DIR/default.yml' ]; then
-            cp -a '$OMNIA_INPUT_DIR/default.yml' '${backup_base%/}/input/'
+        if [ -f '$CONTAINER_INPUT_DIR/default.yml' ]; then
+            cp -a '$CONTAINER_INPUT_DIR/default.yml' '${backup_base%/}/input/'
         fi
 
-        if [ -d '$OMNIA_INPUT_DIR/project_default' ]; then
-            cp -a '$OMNIA_INPUT_DIR/project_default' '${backup_base%/}/input/'
+        if [ -d '$CONTAINER_INPUT_DIR/project_default' ]; then
+            cp -a '$CONTAINER_INPUT_DIR/project_default' '${backup_base%/}/input/'
         fi
 
-        if [ ! -f '$OMNIA_METADATA_FILE' ]; then
-            echo '[ERROR] Metadata file not found inside container: $OMNIA_METADATA_FILE' >&2
+        if [ ! -f '$CONTAINER_METADATA_FILE' ]; then
+            echo '[ERROR] Metadata file not found inside container: $CONTAINER_METADATA_FILE' >&2
             exit 1
         fi
-        cp -a '$OMNIA_METADATA_FILE' '${backup_base%/}/metadata/oim_metadata.yml'
+        cp -a '$CONTAINER_METADATA_FILE' '${backup_base%/}/metadata/oim_metadata.yml'
     "; then
         echo "[ERROR] [ORCHESTRATOR] Backup failed; cleaning up partial backup"
         podman exec -u root omnia_core bash -c "rm -rf '${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs'" >/dev/null 2>&1 || true

From 1345236d2ea7e5369f7ab10ee5dbf78f9d4e7343 Mon Sep 17 00:00:00 2001
From: Nagachandan-P <Nagachandan.p@dell.com>
Date: Fri, 6 Feb 2026 15:41:36 +0000
Subject: [PATCH 065/172] lint issue for systemd

---
 .../slurm_config_rollback/tasks/main.yml      | 25 +++++++++++--------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/utils/roles/slurm_config_rollback/tasks/main.yml b/utils/roles/slurm_config_rollback/tasks/main.yml
index ee873529fd..1d3b23ec68 100644
--- a/utils/roles/slurm_config_rollback/tasks/main.yml
+++ b/utils/roles/slurm_config_rollback/tasks/main.yml
@@ -326,12 +326,11 @@
   tags: config_rollback
 
 - name: Restore config directories
-  ansible.builtin.copy:
+  ansible.posix.synchronize:
     src: "{{ selected_backup_ctld_root }}/{{ item }}/"
     dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/{{ item }}/"
-    remote_src: true
-    mode: '0644'
-    directory_mode: '0755'
+    archive: true
+    recursive: true
   loop:
     - etc/slurm
     - etc/munge
@@ -387,7 +386,9 @@
   tags: config_rollback
 
 - name: Restart slurmdbd
-  ansible.builtin.command: systemctl restart slurmdbd
+  ansible.builtin.systemd:
+    name: slurmdbd
+    state: restarted
   delegate_to: slurm_controller
   when:
     - slurmdbd_before.stat.exists
@@ -396,18 +397,20 @@
   changed_when: true
   tags: config_rollback
 
-- name: Check slurmctld is active before reconfigure
-  ansible.builtin.command: systemctl is-active slurmctld
+- name: Gather service facts on controller
+  ansible.builtin.service_facts:
   delegate_to: slurm_controller
-  register: slurmctld_active
-  changed_when: false
-  failed_when: false
+  tags: config_rollback
+
+- name: Set slurmctld state
+  ansible.builtin.set_fact:
+    slurmctld_state: "{{ ansible_facts.services['slurmctld.service'].state | default('unknown') }}"
   tags: config_rollback
 
 - name: Fail if slurmctld is not active
   ansible.builtin.fail:
     msg: "slurmctld is not active on the controller. Rollback applied on disk, but cannot reconfigure until slurmctld is running. Verify munge and slurmctld services and restart slurmctld, then re-run rollback or run 'scontrol reconfigure' on the controller."
-  when: slurmctld_active.stdout | default('') | trim != 'active'
+  when: slurmctld_state != 'running'
   tags: config_rollback
 
 - name: Run scontrol reconfigure

From 901fe1dcd51df17ee7bf1d5018c28b44b6eb3a78 Mon Sep 17 00:00:00 2001
From: Nagachandan-P <Nagachandan.p@dell.com>
Date: Fri, 6 Feb 2026 16:11:38 +0000
Subject: [PATCH 066/172] lint long line

---
 utils/roles/slurm_config_rollback/tasks/main.yml | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/utils/roles/slurm_config_rollback/tasks/main.yml b/utils/roles/slurm_config_rollback/tasks/main.yml
index 1d3b23ec68..19282515b3 100644
--- a/utils/roles/slurm_config_rollback/tasks/main.yml
+++ b/utils/roles/slurm_config_rollback/tasks/main.yml
@@ -409,7 +409,11 @@
 
 - name: Fail if slurmctld is not active
   ansible.builtin.fail:
-    msg: "slurmctld is not active on the controller. Rollback applied on disk, but cannot reconfigure until slurmctld is running. Verify munge and slurmctld services and restart slurmctld, then re-run rollback or run 'scontrol reconfigure' on the controller."
+    msg: >-
+      slurmctld is not active on the controller. Rollback applied on disk, but cannot
+      reconfigure until slurmctld is running. Verify munge and slurmctld services and
+      restart slurmctld, then re-run rollback or run 'scontrol reconfigure' on the
+      controller.
   when: slurmctld_state != 'running'
   tags: config_rollback
 
@@ -429,4 +433,7 @@
 
     - name: Fail with rollback guidance
       ansible.builtin.fail:
-        msg: "Rollback applied on disk, but scontrol reconfigure failed. Recommended action: rollback to the safety backup created before this rollback (if you chose to create it)."
+        msg: >-
+          Rollback applied on disk, but scontrol reconfigure failed. Recommended action:
+          rollback to the safety backup created before this rollback (if you chose to
+          create it).

From dd63ae0c6c4a06aabb616927f1e4271681cfd910 Mon Sep 17 00:00:00 2001
From: Nagachandan-P <Nagachandan.p@dell.com>
Date: Fri, 6 Feb 2026 16:26:49 +0000
Subject: [PATCH 067/172] copy module usage

---
 utils/roles/slurm_config_rollback/tasks/main.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/utils/roles/slurm_config_rollback/tasks/main.yml b/utils/roles/slurm_config_rollback/tasks/main.yml
index 19282515b3..0a66c096b0 100644
--- a/utils/roles/slurm_config_rollback/tasks/main.yml
+++ b/utils/roles/slurm_config_rollback/tasks/main.yml
@@ -326,11 +326,11 @@
   tags: config_rollback
 
 - name: Restore config directories
-  ansible.posix.synchronize:
+  ansible.builtin.copy:
     src: "{{ selected_backup_ctld_root }}/{{ item }}/"
     dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/{{ item }}/"
-    archive: true
-    recursive: true
+    remote_src: true
+    mode: preserve
   loop:
     - etc/slurm
     - etc/munge

From 2ddc2c306188cb024d0da3e06c77ddc3a5922a7a Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Sat, 7 Feb 2026 00:25:18 +0530
Subject: [PATCH 068/172] Just Added complex gres conf !!!

---
 .../common_utils/slurm_conf_utils.py          | 63 ++++++++++++++-----
 .../validation_flows/common_validation.py     |  4 +-
 common/library/modules/slurm_conf.py          |  2 +-
 3 files changed, 48 insertions(+), 21 deletions(-)

diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
index 22b38d7ad3..d152363616 100644
--- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
+++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 # These are the slurm options for version - 25.11
+import json
 import re
 import os
 from enum import Enum
@@ -67,6 +68,7 @@ class SlurmParserEnum(str, Enum):
 
 
 nodename_options = {
+    "NodeName": S_P_STRING,
     "BcastAddr": S_P_STRING,
     "Boards": S_P_UINT16,
     "CoreSpecCount": S_P_UINT16,
@@ -99,12 +101,14 @@ class SlurmParserEnum(str, Enum):
 
 
 nodeset_options = {
+    "NodeSet": S_P_STRING,
     "Feature": S_P_STRING,
     "Nodes": S_P_STRING
 }
 
 
 partition_options = {
+    "Partition": S_P_STRING,
     "AllocNodes": S_P_CSV,
     "AllowAccounts": S_P_CSV,
     "AllowGroups": S_P_CSV,
@@ -514,7 +518,7 @@ class SlurmParserEnum(str, Enum):
 }
 
 # From https://github.com/SchedMD/slurm/blob/slurm-<VERSION>s/src/interfaces/gres.c#L101C40-L116C2
-gres_options = {
+_gres_options = {
     "AutoDetect": S_P_STRING,
     "Count": S_P_STRING,  # Number of Gres available
     "CPUs": S_P_STRING,  # CPUs to bind to Gres resource
@@ -525,11 +529,26 @@ class SlurmParserEnum(str, Enum):
     "Link": S_P_STRING,  # Communication link IDs
     "Links": S_P_CSV,  # Communication link IDs
     "MultipleFiles": S_P_CSV,  # list of GRES device files
-    "Type": S_P_STRING,  # Gres type (e.g. model name)
-    "Name": S_P_ARRAY,  # Gres name
-    "NodeName": S_P_ARRAY
+    "Type": S_P_STRING
 }
 
+gres_options = _gres_options.copy()
+gres_options.update({
+    "Name": S_P_ARRAY,
+    "NodeName": S_P_ARRAY
+})
+
+gres_nodename_options = _gres_options.copy()
+gres_nodename_options.update({
+    "NodeName": S_P_STRING,
+    "Name": S_P_STRING
+})
+
+gres_name_options = _gres_options.copy()
+gres_name_options.update({
+    "Name": S_P_STRING
+})
+
 all_confs = {
     "slurm": slurm_options,
     "slurmdbd": slurmdbd_options,
@@ -538,19 +557,23 @@ class SlurmParserEnum(str, Enum):
     "gres": gres_options,
     # TOD: GRES can have different combinations, NodeName and Name
     # https://slurm.schedmd.com/gres.conf.html#SECTION_EXAMPLES
-    "PartitionName": partition_options,
-    "NodeName": nodename_options,
-    "DownNodes": downnodes_options,
-    "NodeSet": nodeset_options
+    "slurm->PartitionName": partition_options,
+    "slurm->NodeName": nodename_options,
+    "slurm->DownNodes": downnodes_options,
+    "slurm->NodeSet": nodeset_options,
+    "gres->Name": gres_name_options,
+    "gres->NodeName": gres_nodename_options
 }
 
 _HOSTLIST_RE = re.compile(
     r'^(?P<prefix>[^\[\]]*)\[(?P<inner>[^\[\]]+)\](?P<suffix>.*)$')
 
-def validate_config_types(conf_dict, conf_name):
+def validate_config_types(conf_dict, conf_name, module):
     """Validate configuration keys and value types based on SlurmParserEnum."""
     current_conf = all_confs.get(conf_name, {})
-    invalid_keys = set(conf_dict.keys()).difference(set(current_conf.keys()))
+    module.warn(f"current_conf: {current_conf}")
+    module.warn(f"conf_dict: {conf_dict}")
+    invalid_keys = list(set(conf_dict.keys()).difference(set(current_conf.keys())))
     type_errors = []
    
     for key, value in conf_dict.items():
@@ -593,24 +616,30 @@ def validate_config_types(conf_dict, conf_name):
             elif expected_type == "array":
                 if not isinstance(value, list):
                     error = f"Expected array (list), got {type(value).__name__}"
-                elif value and not all(isinstance(item, dict) for item in value):
-                    error = "Expected array of dicts, got mixed types"
-
+                elif value:
+                    if not all(isinstance(item, dict) for item in value):
+                        error = "Expected array of dicts, got mixed types"
+                    else:
+                        # Recursively validate each dict item in the array
+                        for item in value:
+                            item_result = validate_config_types(item, f"{conf_name}->{key}", module)
+                            module.warn(f"item: {item}")
+                            module.warn(json.dumps(item_result))
+                            type_errors.extend(item_result['type_errors'])
+                            invalid_keys.extend(item_result['invalid_keys'])
             elif expected_type == "object":
                 if not isinstance(value, (dict, object)):
                     error = f"Expected object, got {type(value).__name__}"
 
             if error:
-                type_errors.append({
+                type_errors.append({ # format for error message in input validator
                     "error_key": "omnia_config.yml",
                     "error_msg": f"{conf_name}.conf: '{key}': {error} -> '{value}'",
                     "error_value": "slurm_cluster->config_sources"
                     })
-    
     return {
         'invalid_keys': list(invalid_keys),
-        'type_errors': type_errors,
-        'valid': len(invalid_keys) == 0 and len(type_errors) == 0
+        'type_errors': type_errors
     }
 
 def get_invalid_keys(conf_dict, conf_name):
diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py
index 8c850effbf..af87ce7339 100644
--- a/common/library/module_utils/input_validation/validation_flows/common_validation.py
+++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py
@@ -1076,10 +1076,8 @@ def validate_omnia_config(
                         conf_dict = parse_slurm_conf(v, k, False)
                 else:
                     conf_dict = v
-                
-                # Validate config types once for both cases
                 if conf_dict:
-                    validation_result = validate_config_types(conf_dict, k)
+                    validation_result = validate_config_types(conf_dict, k, module)
                     if validation_result['type_errors']:
                         errors.extend(validation_result['type_errors'])
                     if validation_result['invalid_keys']:
diff --git a/common/library/modules/slurm_conf.py b/common/library/modules/slurm_conf.py
index a782cb1f79..4866077242 100644
--- a/common/library/modules/slurm_conf.py
+++ b/common/library/modules/slurm_conf.py
@@ -173,7 +173,7 @@ def slurm_conf_dict_merge(conf_dict_list, conf_name):
                         existing_dict = merged_dict.get(ky, {})
                         inner_dict = existing_dict.get(item.get(ky), {})
                         # Get the sub-options for this array type (e.g., nodename_options, partition_options)
-                        sub_options = all_confs.get(ky, {})
+                        sub_options = all_confs.get(f"{conf_name}->{ky}", {})
                         # Merge item into inner_dict, handling CSV fields specially
                         for k, v in item.items():
                             if sub_options.get(k) == SlurmParserEnum.S_P_CSV and k in inner_dict:

From 2affffc05e5b7c3f636f3443a902b967a546d59a Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Sat, 7 Feb 2026 00:27:28 +0530
Subject: [PATCH 069/172] Removed debugging lines module

---
 .../input_validation/common_utils/slurm_conf_utils.py | 11 -----------
 .../validation_flows/common_validation.py             |  1 -
 2 files changed, 12 deletions(-)

diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
index d152363616..3f6e2fac30 100644
--- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
+++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
@@ -571,8 +571,6 @@ class SlurmParserEnum(str, Enum):
 def validate_config_types(conf_dict, conf_name, module):
     """Validate configuration keys and value types based on SlurmParserEnum."""
     current_conf = all_confs.get(conf_name, {})
-    module.warn(f"current_conf: {current_conf}")
-    module.warn(f"conf_dict: {conf_dict}")
     invalid_keys = list(set(conf_dict.keys()).difference(set(current_conf.keys())))
     type_errors = []
    
@@ -623,8 +621,6 @@ def validate_config_types(conf_dict, conf_name, module):
                         # Recursively validate each dict item in the array
                         for item in value:
                             item_result = validate_config_types(item, f"{conf_name}->{key}", module)
-                            module.warn(f"item: {item}")
-                            module.warn(json.dumps(item_result))
                             type_errors.extend(item_result['type_errors'])
                             invalid_keys.extend(item_result['invalid_keys'])
             elif expected_type == "object":
@@ -642,13 +638,6 @@ def validate_config_types(conf_dict, conf_name, module):
         'type_errors': type_errors
     }
 
-def get_invalid_keys(conf_dict, conf_name):
-    """Get invalid configuration keys by comparing against expected keys."""
-    current_conf = all_confs.get(conf_name, {})
-    # get difference between conf_dict keys and current_conf keys
-    diff = set(conf_dict.keys()).difference(set(current_conf.keys()))
-    return list(diff)
-
 def parse_slurm_conf(file_path, conf_name, validate):
     """Parses the slurm.conf file and returns it as a dictionary."""
     current_conf = all_confs.get(conf_name, {})
diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py
index af87ce7339..ae4e693b9e 100644
--- a/common/library/module_utils/input_validation/validation_flows/common_validation.py
+++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py
@@ -42,7 +42,6 @@
 )
 from ansible.module_utils.input_validation.common_utils.slurm_conf_utils import (
     parse_slurm_conf,
-    get_invalid_keys,
     validate_config_types
 )
 

From 07f5a888ad97fca4a8f5eb4e8c689f5e7e61f2a7 Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Sat, 7 Feb 2026 20:58:43 +0530
Subject: [PATCH 070/172] pylint fix

---
 .../common_utils/slurm_conf_utils.py          | 72 ++++++++++++-------
 common/library/modules/slurm_conf.py          | 12 ++--
 2 files changed, 54 insertions(+), 30 deletions(-)

diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
index 3f6e2fac30..401109640b 100644
--- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
+++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
@@ -13,12 +13,12 @@
 # limitations under the License.
 
 # These are the slurm options for version - 25.11
-import json
 import re
 import os
 from enum import Enum
 from collections import OrderedDict
 
+
 class SlurmParserEnum(str, Enum):
     """Enumeration of Slurm configuration parameter types for parsing and validation."""
 
@@ -62,6 +62,7 @@ class SlurmParserEnum(str, Enum):
 
 
 downnodes_options = {
+    "DownNodes": S_P_STRING,
     "Reason": S_P_STRING,
     "State": S_P_STRING,
 }
@@ -157,7 +158,8 @@ class SlurmParserEnum(str, Enum):
     "TRESBillingWeights": S_P_CSV
 }
 
-# From https://github.com/SchedMD/slurm/blob/slurm-<VERSION>/src/common/read_config.c
+# From
+# https://github.com/SchedMD/slurm/blob/slurm-<VERSION>/src/common/read_config.c
 slurm_options = {
     "AccountingStorageBackupHost": S_P_STRING,
     "AccountingStorageEnforce": S_P_CSV,
@@ -402,7 +404,8 @@ class SlurmParserEnum(str, Enum):
     "SlurmctldHost": S_P_LIST
 }
 
-# From https://github.com/SchedMD/slurm/blob/slurm-<VERSION>/src/slurmdbd/read_config.c
+# From
+# https://github.com/SchedMD/slurm/blob/slurm-<VERSION>/src/slurmdbd/read_config.c
 slurmdbd_options = {
     "AllowNoDefAcct": S_P_BOOLEAN,
     "AllResourcesAbsolute": S_P_BOOLEAN,
@@ -473,7 +476,8 @@ class SlurmParserEnum(str, Enum):
     "TrackSlurmctldDown": S_P_BOOLEAN
 }
 
-# From https://github.com/SchedMD/slurm/blob/slurm-<VERSION>/src/interfaces/cgroup.c#L332
+# From
+# https://github.com/SchedMD/slurm/blob/slurm-<VERSION>/src/interfaces/cgroup.c#L332
 cgroup_options = {
     "CgroupAutomount": S_P_BOOLEAN,
     "CgroupMountpoint": S_P_STRING,
@@ -500,7 +504,8 @@ class SlurmParserEnum(str, Enum):
     "SystemdTimeout": S_P_UINT64
 }
 
-# From https://github.com/SchedMD/slurm/blob/slurm-<VERSION>/src/plugins/mpi/pmix/mpi_pmix.c#L83
+# From
+# https://github.com/SchedMD/slurm/blob/slurm-<VERSION>/src/plugins/mpi/pmix/mpi_pmix.c#L83
 mpi_options = {
     "PMIxCliTmpDirBase": S_P_STRING,
     "PMIxCollFence": S_P_STRING,
@@ -517,7 +522,8 @@ class SlurmParserEnum(str, Enum):
     "PMIxTlsUCX": S_P_CSV
 }
 
-# From https://github.com/SchedMD/slurm/blob/slurm-<VERSION>s/src/interfaces/gres.c#L101C40-L116C2
+# From
+# https://github.com/SchedMD/slurm/blob/slurm-<VERSION>/src/interfaces/gres.c#L101C40-L116C2
 _gres_options = {
     "AutoDetect": S_P_STRING,
     "Count": S_P_STRING,  # Number of Gres available
@@ -568,16 +574,18 @@ class SlurmParserEnum(str, Enum):
 _HOSTLIST_RE = re.compile(
     r'^(?P<prefix>[^\[\]]*)\[(?P<inner>[^\[\]]+)\](?P<suffix>.*)$')
 
+
 def validate_config_types(conf_dict, conf_name, module):
     """Validate configuration keys and value types based on SlurmParserEnum."""
     current_conf = all_confs.get(conf_name, {})
-    invalid_keys = list(set(conf_dict.keys()).difference(set(current_conf.keys())))
+    invalid_keys = list(
+        set(conf_dict.keys()).difference(set(current_conf.keys())))
     type_errors = []
-   
+
     for key, value in conf_dict.items():
         if key in current_conf:
             expected_type_enum = current_conf[key]
-            expected_type = expected_type_enum.value        
+            expected_type = expected_type_enum.value
             error = None
 
             if expected_type == "int":
@@ -586,41 +594,44 @@ def validate_config_types(conf_dict, conf_name, module):
                         int(str(value))
                     except (ValueError, TypeError):
                         error = f"Expected integer, got {type(value).__name__}"
-            
+
             elif expected_type == "float":
                 if not isinstance(value, (int, float)):
                     try:
                         float(str(value))
                     except (ValueError, TypeError):
                         error = f"Expected float, got {type(value).__name__}"
-            
+
             elif expected_type == "bool":
                 if not isinstance(value, bool):
-                    if str(value).lower() not in ['yes', 'no', 'true', 'false', '0', '1']:
+                    if str(value).lower() not in [
+                            'yes', 'no', 'true', 'false', '0', '1']:
                         error = f"Expected boolean, got {type(value).__name__}"
-            
+
             elif expected_type == "str":
                 if not isinstance(value, str):
                     error = f"Expected string, got {type(value).__name__}"
-            
+
             elif expected_type == "csv":
                 if not isinstance(value, str):
                     error = f"Expected CSV string, got {type(value).__name__}"
-            
+
             elif expected_type == "list":
                 if not isinstance(value, list):
                     error = f"Expected list, got {type(value).__name__}"
-            
+
             elif expected_type == "array":
                 if not isinstance(value, list):
-                    error = f"Expected array (list), got {type(value).__name__}"
+                    error = (
+                        f"Expected array (list), got {type(value).__name__}")
                 elif value:
                     if not all(isinstance(item, dict) for item in value):
                         error = "Expected array of dicts, got mixed types"
                     else:
                         # Recursively validate each dict item in the array
                         for item in value:
-                            item_result = validate_config_types(item, f"{conf_name}->{key}", module)
+                            item_result = validate_config_types(
+                                item, f"{conf_name}->{key}", module)
                             type_errors.extend(item_result['type_errors'])
                             invalid_keys.extend(item_result['invalid_keys'])
             elif expected_type == "object":
@@ -628,16 +639,17 @@ def validate_config_types(conf_dict, conf_name, module):
                     error = f"Expected object, got {type(value).__name__}"
 
             if error:
-                type_errors.append({ # format for error message in input validator
+                type_errors.append({  # format for error message in input validator
                     "error_key": "omnia_config.yml",
                     "error_msg": f"{conf_name}.conf: '{key}': {error} -> '{value}'",
                     "error_value": "slurm_cluster->config_sources"
-                    })
+                })
     return {
         'invalid_keys': list(invalid_keys),
         'type_errors': type_errors
     }
 
+
 def parse_slurm_conf(file_path, conf_name, validate):
     """Parses the slurm.conf file and returns it as a dictionary."""
     current_conf = all_confs.get(conf_name, {})
@@ -661,21 +673,31 @@ def parse_slurm_conf(file_path, conf_name, validate):
                 tmp_dict[key.strip()] = value.strip()
             skey = list(tmp_dict.keys())[0]
             if validate and skey not in current_conf:
-                raise ValueError(f"Invalid key while parsing {file_path}: {skey}")
+                raise ValueError(
+                    f"Invalid key while parsing {file_path}: {skey}")
             if current_conf.get(skey) == SlurmParserEnum.S_P_ARRAY:
                 slurm_dict[list(tmp_dict.keys())[0]] = list(
                     slurm_dict.get(list(tmp_dict.keys())[0], [])) + [tmp_dict]
             elif current_conf.get(skey) == SlurmParserEnum.S_P_CSV:
-                existing_values = [v.strip() for v in slurm_dict.get(skey, "").split(',') if v.strip()]
-                new_values = [v.strip() for v in tmp_dict[skey].split(',') if v.strip()]
-                slurm_dict[skey] = ",".join(list(dict.fromkeys(existing_values + new_values)))
+                existing_values = [
+                    v.strip() for v in slurm_dict.get(
+                        skey, "").split(',') if v.strip()]
+                new_values = [v.strip()
+                              for v in tmp_dict[skey].split(',') if v.strip()]
+                slurm_dict[skey] = ",".join(
+                    list(
+                        dict.fromkeys(
+                            existing_values +
+                            new_values)))
             elif current_conf.get(skey) == SlurmParserEnum.S_P_LIST:
-                slurm_dict[skey] = list(slurm_dict.get(skey, [])) + list(tmp_dict.values())
+                slurm_dict[skey] = list(slurm_dict.get(
+                    skey, [])) + list(tmp_dict.values())
             else:
                 slurm_dict.update(tmp_dict)
 
     return slurm_dict
 
+
 def expand_hostlist(expr):
     """
     Expand simple Slurm-style hostlist expressions, e.g.:
diff --git a/common/library/modules/slurm_conf.py b/common/library/modules/slurm_conf.py
index 4866077242..dcacbcae2f 100644
--- a/common/library/modules/slurm_conf.py
+++ b/common/library/modules/slurm_conf.py
@@ -161,7 +161,7 @@ def read_dict2ini(conf_dict):
     return data
 
 
-def slurm_conf_dict_merge(conf_dict_list, conf_name):
+def slurm_conf_dict_merge(conf_dict_list, conf_name, replace):
     """Merge multiple Slurm configuration dictionaries into a single dictionary."""
     merged_dict = OrderedDict()
     current_conf = all_confs.get(conf_name, {})
@@ -176,7 +176,7 @@ def slurm_conf_dict_merge(conf_dict_list, conf_name):
                         sub_options = all_confs.get(f"{conf_name}->{ky}", {})
                         # Merge item into inner_dict, handling CSV fields specially
                         for k, v in item.items():
-                            if sub_options.get(k) == SlurmParserEnum.S_P_CSV and k in inner_dict:
+                            if sub_options.get(k) == SlurmParserEnum.S_P_CSV and k in inner_dict and not replace:
                                 # Merge CSV values
                                 existing_values = [val.strip() for val in inner_dict[k].split(',') if val.strip()]
                                 new_values = [val.strip() for val in v.split(',') if val.strip()]
@@ -193,7 +193,7 @@ def slurm_conf_dict_merge(conf_dict_list, conf_name):
                 else:
                     new_items = [vl]
                 merged_dict[ky] = list(dict.fromkeys(existing_list + new_items))
-            elif current_conf.get(ky) == SlurmParserEnum.S_P_CSV:
+            elif current_conf.get(ky) == SlurmParserEnum.S_P_CSV and not replace:
                 existing_values = [v.strip() for v in merged_dict.get(ky, "").split(',') if v.strip()]
                 new_values = [v.strip() for v in vl.split(',') if v.strip()]
                 merged_dict[ky] = ",".join(list(dict.fromkeys(existing_values + new_values)))
@@ -215,7 +215,8 @@ def run_module():
         "conf_map": {'type': 'dict', 'default': {}},
         "conf_sources": {'type': 'list', 'elements': 'raw', 'default': []},
         "conf_name": {'type': 'str', 'default': 'slurm'},
-        "validate": {'type': 'bool', 'default': False}
+        "validate": {'type': 'bool', 'default': False},
+        "replace": {'type': 'bool', 'default': False}
     }
 
     result = {"changed": False, "failed": False}
@@ -230,6 +231,7 @@ def run_module():
     try:
         conf_name = module.params['conf_name']
         validate = module.params['validate']
+        replace = module.params['replace']
         # Parse the slurm.conf file
         if module.params['op'] == 'parse':
             s_dict = parse_slurm_conf(module.params['path'], conf_name, validate)
@@ -249,7 +251,7 @@ def run_module():
                     conf_dict_list.append(OrderedDict(s_dict))
                 else:
                     raise TypeError(f"Invalid type for conf_source: {type(conf_source)}")
-            merged_dict = slurm_conf_dict_merge(conf_dict_list, conf_name)
+            merged_dict = slurm_conf_dict_merge(conf_dict_list, conf_name, replace)
             result['conf_dict'] = merged_dict
             result['ini_lines'] = read_dict2ini(merged_dict)
     except (FileNotFoundError, ValueError, TypeError, AttributeError) as e:

From 3a7481e8ead97e8a6d33745a5c01ce35b59790fe Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Sat, 7 Feb 2026 21:12:44 +0530
Subject: [PATCH 071/172] Validation keys now in module utils, now removed

---
 .../roles/slurm_config/defaults/main.yml      | 226 +-----------------
 1 file changed, 1 insertion(+), 225 deletions(-)

diff --git a/discovery/roles/slurm_config/defaults/main.yml b/discovery/roles/slurm_config/defaults/main.yml
index 03ea48760c..a8fbc8e9c8 100644
--- a/discovery/roles/slurm_config/defaults/main.yml
+++ b/discovery/roles/slurm_config/defaults/main.yml
@@ -25,231 +25,7 @@ default_corespersocket: 1
 share_prefix: "/"
 conf_path_items: {}
 conf_dict_items: {}
-# This validates just the keys and not the values, as native support for this schema is not available from slurm
-__conf_keys:
-  slurm:
-# updated from version 22.08 using cmd -> scontrol show config
-    - AccountingStorageBackupHost
-    - AccountingStorageEnforce
-    - AccountingStorageHost
-    - AccountingStorageExternalHost
-    - AccountingStorageParameters
-    - AccountingStoragePort
-    - AccountingStorageTRES
-    - AccountingStorageType
-    - AccountingStorageUser
-    - AccountingStoreFlags
-    - AcctGatherEnergyType
-    - AcctGatherFilesystemType
-    - AcctGatherInterconnectType
-    - AcctGatherNodeFreq
-    - AcctGatherProfileType
-    - AllowSpecResourcesUsage
-    - AuthAltTypes
-    - AuthAltParameters
-    - AuthInfo
-    - AuthType
-    - BatchStartTimeout
-    - BcastExclude
-    - BcastParameters
-    - BurstBufferType
-    - CliFilterPlugins
-    # - ClusterName # This will be set from the input "omnia_config.yml"
-    - CommunicationParameters
-    - CompleteWait
-    - CoreSpecPlugin
-    - CpuFreqDef
-    - CpuFreqGovernors
-    - CredType
-    - DebugFlags
-    - DefMemPerNode
-    - DependencyParameters
-    - DisableRootJobs
-    - EioTimeout
-    - EnforcePartLimits
-    - Epilog
-    - EpilogMsgTime
-    - EpilogSlurmctld
-    - ExtSensorsType
-    - ExtSensorsFreq
-    - FederationParameters
-    - FirstJobId
-    - GetEnvTimeout
-    - GresTypes
-    - GpuFreqDef
-    - GroupUpdateForce
-    - GroupUpdateTime
-    - HealthCheckInterval
-    - HealthCheckNodeState
-    - HealthCheckProgram
-    - InactiveLimit
-    - InteractiveStepOptions
-    - JobAcctGatherFrequency
-    - JobAcctGatherType
-    - JobAcctGatherParams
-    - JobCompHost
-    - JobCompLoc
-    - JobCompPort
-    - JobCompType
-    - JobCompUser
-    - JobContainerType
-    - JobCredentialPrivateKey
-    - JobCredentialPublicCertificate
-    - JobDefaults
-    - JobFileAppend
-    - JobRequeue
-    - JobSubmitPlugins
-    - KillOnBadExit
-    - KillWait
-    - LaunchParameters
-    - LaunchType
-    - Licenses
-    - LogTimeFormat
-    - MailDomain
-    - MailProg
-    - MaxArraySize
-    - MaxDBDMsgs
-    - MaxJobCount
-    - MaxJobId
-    - MaxMemPerNode
-    - MaxNodeCount
-    - MaxStepCount
-    - MaxTasksPerNode
-    - MCSPlugin
-    - MCSParameters
-    - MessageTimeout
-    - MinJobAge
-    - MpiDefault
-    - MpiParams
-    - NodeFeaturesPlugins
-    - OverTimeLimit
-    - PluginDir
-    - PlugStackConfig
-    - PowerParameters
-    - PowerPlugin
-    - PreemptType
-    - PreemptExemptTime
-    - PrEpParameters
-    - PrEpPlugins
-    - PriorityParameters
-    - PrioritySiteFactorParameters
-    - PrioritySiteFactorPlugin
-    - PriorityType
-    - PrivateData
-    - ProctrackType
-    - Prolog
-    - PrologEpilogTimeout
-    - PrologSlurmctld
-    - PrologFlags
-    - PropagatePrioProcess
-    - PropagateResourceLimits
-    - PropagateResourceLimitsExcept
-    - RebootProgram
-    - ReconfigFlags
-    - RequeueExit
-    - RequeueExitHold
-    - ResumeFailProgram
-    - ResumeProgram
-    - ResumeRate
-    - ResumeTimeout
-    - ResvEpilog
-    - ResvOverRun
-    - ResvProlog
-    - ReturnToService
-    - RoutePlugin
-    - SchedulerParameters
-    - SchedulerTimeSlice
-    - SchedulerType
-    - ScronParameters
-    - SelectType
-    - SelectTypeParameters
-    - SlurmUser
-    - SlurmctldAddr
-    - SlurmctldDebug
-    - SlurmctldLogFile
-    - SlurmctldPort
-    - SlurmctldSyslogDebug
-    - SlurmctldPrimaryOffProg
-    - SlurmctldPrimaryOnProg
-    - SlurmctldTimeout
-    - SlurmctldParameters
-    - SlurmdDebug
-    - SlurmdLogFile
-    - SlurmdParameters
-    - SlurmdPidFile
-    - SlurmdPort
-    - SlurmdSpoolDir
-    - SlurmdSyslogDebug
-    - SlurmdTimeout
-    - SlurmdUser
-    - SlurmSchedLogFile
-    - SlurmSchedLogLevel
-    - SlurmctldPidFile
-    - SlurmctldPlugstack
-    - SrunEpilog
-    - SrunPortRange
-    - SrunProlog
-    - StateSaveLocation
-    - SuspendExcNodes
-    - SuspendExcParts
-    - SuspendProgram
-    - SuspendRate
-    - SuspendTime
-    - SuspendTimeout
-    - SwitchParameters
-    - SwitchType
-    - TaskEpilog
-    - TaskPlugin
-    - TaskPluginParam
-    - TaskProlog
-    - TCPTimeout
-    - TmpFS
-    - TopologyParam
-    - TopologyPlugin
-    - TrackWCKey
-    - TreeWidth
-    - UsePam
-    - UnkillableStepProgram
-    - UnkillableStepTimeout
-    - VSizeFactor
-    - WaitTime
-    - X11Parameters
-  mpi:
-    - PMIxCliTmpDirBase
-    - PMIxCollFence
-    - PMIxDebug
-    - PMIxDirectConn
-    - PMIxDirectConnEarly
-    - PMIxDirectConnUCX
-    - PMIxDirectSameArch
-    - PMIxEnv
-    - PMIxFenceBarrier
-    - PMIxNetDevicesUCX
-    - PMIxShareServerTopology
-    - PMIxTimeout
-    - PMIxTlsUCX
-  cgroup:
-    - CgroupMountpoint
-    - CgroupPlugin
-    - CgroupSlice
-    - SystemdTimeout
-    - IgnoreSystemd
-    - IgnoreSystemdOnFailure
-    - EnableControllers
-    - EnableExtraControllers
-    - AllowedRAMSpace
-    - AllowedSwapSpace
-    - ConstrainCores
-    - ConstrainDevices
-    - ConstrainRAMSpace
-    - ConstrainSwapSpace
-    - MaxRAMPercent
-    - MaxSwapPercent
-    - MemorySwappiness
-    - MinRAMSpace
-    - SignalChildrenProcesses
-  slurmdbd: {}
-  gres: {}
+
 __default_config:
   cgroup:
     # CgroupAutomount: true

From bef1a76842e6702d0b0e27a913e431f72e694d3c Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Mon, 9 Feb 2026 00:31:07 +0530
Subject: [PATCH 072/172] Added all possible confs from the slurm source code
 with types

---
 .../common_utils/slurm_conf_utils.py          | 239 ++++++++++++++++--
 1 file changed, 212 insertions(+), 27 deletions(-)

diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
index 401109640b..20d61afc98 100644
--- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
+++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
@@ -61,14 +61,14 @@ class SlurmParserEnum(str, Enum):
 S_P_LIST = SlurmParserEnum.S_P_LIST
 
 
-downnodes_options = {
+slurm_downnodes_options = {
     "DownNodes": S_P_STRING,
     "Reason": S_P_STRING,
     "State": S_P_STRING,
 }
 
 
-nodename_options = {
+slurm_nodename_options = {
     "NodeName": S_P_STRING,
     "BcastAddr": S_P_STRING,
     "Boards": S_P_UINT16,
@@ -101,14 +101,14 @@ class SlurmParserEnum(str, Enum):
 }
 
 
-nodeset_options = {
+slurm_nodeset_options = {
     "NodeSet": S_P_STRING,
     "Feature": S_P_STRING,
     "Nodes": S_P_STRING
 }
 
 
-partition_options = {
+slurm_partitionname_options = {
     "Partition": S_P_STRING,
     "AllocNodes": S_P_CSV,
     "AllowAccounts": S_P_CSV,
@@ -504,24 +504,6 @@ class SlurmParserEnum(str, Enum):
     "SystemdTimeout": S_P_UINT64
 }
 
-# From
-# https://github.com/SchedMD/slurm/blob/slurm-<VERSION>/src/plugins/mpi/pmix/mpi_pmix.c#L83
-mpi_options = {
-    "PMIxCliTmpDirBase": S_P_STRING,
-    "PMIxCollFence": S_P_STRING,
-    "PMIxDebug": S_P_UINT32,
-    "PMIxDirectConn": S_P_BOOLEAN,
-    "PMIxDirectConnEarly": S_P_BOOLEAN,
-    "PMIxDirectConnUCX": S_P_BOOLEAN,
-    "PMIxDirectSameArch": S_P_BOOLEAN,
-    "PMIxEnv": S_P_STRING,
-    "PMIxFenceBarrier": S_P_BOOLEAN,
-    "PMIxNetDevicesUCX": S_P_STRING,
-    "PMIxShareServerTopology": S_P_BOOLEAN,
-    "PMIxTimeout": S_P_UINT32,
-    "PMIxTlsUCX": S_P_CSV
-}
-
 # From
 # https://github.com/SchedMD/slurm/blob/slurm-<VERSION>s/src/interfaces/gres.c#L101C40-L116C2
 _gres_options = {
@@ -555,20 +537,223 @@ class SlurmParserEnum(str, Enum):
     "Name": S_P_STRING
 })
 
+# From
+# https://github.com/SchedMD/slurm/blob/slurm-<VERSION>/src/plugins/mpi/pmix/mpi_pmix.c#L83
+mpi_options = {
+    "PMIxCliTmpDirBase": S_P_STRING,
+    "PMIxCollFence": S_P_STRING,
+    "PMIxDebug": S_P_UINT32,
+    "PMIxDirectConn": S_P_BOOLEAN,
+    "PMIxDirectConnEarly": S_P_BOOLEAN,
+    "PMIxDirectConnUCX": S_P_BOOLEAN,
+    "PMIxDirectSameArch": S_P_BOOLEAN,
+    "PMIxEnv": S_P_STRING,
+    "PMIxFenceBarrier": S_P_BOOLEAN,
+    "PMIxNetDevicesUCX": S_P_STRING,
+    "PMIxShareServerTopology": S_P_BOOLEAN,
+    "PMIxTimeout": S_P_UINT32,
+    "PMIxTlsUCX": S_P_CSV
+}
+
+# src/common/oci_config.c
+oci_options = {
+    "ContainerPath": S_P_STRING,
+    "CreateEnvFile": S_P_STRING,
+    "DisableHooks": S_P_STRING,
+    "EnvExclude": S_P_STRING,
+    "MountSpoolDir": S_P_STRING,
+    "RunTimeCreate": S_P_STRING,
+    "RunTimeDelete": S_P_STRING,
+    "RunTimeKill": S_P_STRING,
+    "RunTimeEnvExclude": S_P_STRING,
+    "RunTimeQuery": S_P_STRING,
+    "RunTimeRun": S_P_STRING,
+    "RunTimeStart": S_P_STRING,
+    "SrunPath": S_P_STRING,
+    "SrunArgs": S_P_LIST,
+    "DisableCleanup": S_P_BOOLEAN,
+    "StdIODebug": S_P_STRING,
+    "SyslogDebug": S_P_STRING,
+    "FileDebug": S_P_STRING,
+    "DebugFlags": S_P_STRING,
+    "IgnoreFileConfigJson": S_P_BOOLEAN
+}
+
+# From
+# src/plugins/acct_gather_*/*
+acct_gather_options = {
+    "EnergyIPMIDriverType": S_P_UINT32,
+    "EnergyIPMIDisableAutoProbe": S_P_UINT32,
+    "EnergyIPMIDriverAddress": S_P_UINT32,
+    "EnergyIPMIRegisterSpacing": S_P_UINT32,
+    "EnergyIPMIDriverDevice": S_P_STRING,
+    "EnergyIPMIProtocolVersion": S_P_UINT32,
+    "EnergyIPMIUsername": S_P_STRING,
+    "EnergyIPMIPassword": S_P_STRING,
+    "EnergyIPMIPrivilegeLevel": S_P_UINT32,
+    "EnergyIPMIAuthenticationType": S_P_UINT32,
+    "EnergyIPMICipherSuiteId": S_P_UINT32,
+    "EnergyIPMISessionTimeout": S_P_UINT32,
+    "EnergyIPMIRetransmissionTimeout": S_P_UINT32,
+    "EnergyIPMIWorkaroundFlags": S_P_UINT32,
+    "EnergyIPMIRereadSdrCache": S_P_BOOLEAN,
+    "EnergyIPMIIgnoreNonInterpretableSensors": S_P_BOOLEAN,
+    "EnergyIPMIBridgeSensors": S_P_BOOLEAN,
+    "EnergyIPMIInterpretOemData": S_P_BOOLEAN,
+    "EnergyIPMISharedSensors": S_P_BOOLEAN,
+    "EnergyIPMIDiscreteReading": S_P_BOOLEAN,
+    "EnergyIPMIIgnoreScanningDisabled": S_P_BOOLEAN,
+    "EnergyIPMIAssumeBmcOwner": S_P_BOOLEAN,
+    "EnergyIPMIEntitySensorNames": S_P_BOOLEAN,
+    "EnergyIPMIFrequency": S_P_UINT32,
+    "EnergyIPMICalcAdjustment": S_P_BOOLEAN,
+    "EnergyIPMIPowerSensors": S_P_STRING,
+    "EnergyIPMITimeout": S_P_UINT32,
+    "EnergyIPMIVariable": S_P_STRING,
+    "ProfileHDF5Dir": S_P_STRING,
+    "ProfileHDF5Default": S_P_STRING,
+    "ProfileInfluxDBDatabase": S_P_STRING,
+    "ProfileInfluxDBDefault": S_P_STRING,
+    "ProfileInfluxDBFrequency": S_P_UINT32,
+    "ProfileInfluxDBHost": S_P_STRING,
+    "ProfileInfluxDBPass": S_P_STRING,
+    "ProfileInfluxDBRTPolicy": S_P_STRING,
+    "ProfileInfluxDBTimeout": S_P_UINT32,
+    "ProfileInfluxDBUser": S_P_STRING,
+    "InterconnectOFEDPort": S_P_UINT32,
+    "InfinibandOFEDPort": S_P_UINT32,
+    "SysfsInterfaces": S_P_STRING
+}
+
+# src/plugins/burst_buffer/common/burst_buffer_common.c
+burst_buffer_options = {
+    "AllowUsers": S_P_STRING,
+    "CreateBuffer": S_P_STRING,
+    "DefaultPool": S_P_STRING,
+    "DenyUsers": S_P_STRING,
+    "DestroyBuffer": S_P_STRING,
+    "Directive": S_P_STRING,
+    "Flags": S_P_STRING,
+    "GetSysState": S_P_STRING,
+    "GetSysStatus": S_P_STRING,
+    "Granularity": S_P_STRING,
+    "OtherTimeout": S_P_UINT32,
+    "PollInterval": S_P_UINT32,
+    "Pools": S_P_STRING,
+    "StageInTimeout": S_P_UINT32,
+    "StageOutTimeout": S_P_UINT32,
+    "StartStageIn": S_P_STRING,
+    "StartStageOut": S_P_STRING,
+    "StopStageIn": S_P_STRING,
+    "StopStageOut": S_P_STRING,
+    "ValidateTimeout": S_P_UINT32
+}
+
+# src/plugins/node_features/helpers/node_features_helpers.c
+helpers_options = {
+    "AllowUserBoot": S_P_STRING,
+    "BootTime": S_P_UINT32,
+    "ExecTime": S_P_UINT32,
+    "Feature": S_P_ARRAY,
+    "MutuallyExclusive": S_P_LIST,
+    "NodeName": S_P_ARRAY
+}
+
+helpers_nodename_options = {
+    "AllowUserBoot": S_P_STRING,
+    "BootTime": S_P_UINT32,
+    "ExecTime": S_P_UINT32,
+    "Feature": S_P_CSV,
+    "MutuallyExclusive": S_P_LIST
+}
+
+helpers_feature_options = {
+    "Feature": S_P_CSV,
+    "Helper": S_P_STRING,
+    "Flags": S_P_STRING
+}
+
+# src/plugins/namespace/tmpfs/read_jcconf.c
+job_container_options = {
+    "AutoBasePath": S_P_BOOLEAN,
+    "InitScript": S_P_STRING,
+    "BasePath": S_P_ARRAY,
+    "EntireStepInNS": S_P_BOOLEAN,
+    "NodeName": S_P_ARRAY,
+    "Shared": S_P_BOOLEAN,
+    "CloneNSScript": S_P_STRING,
+    "CloneNSEpilog": S_P_STRING,
+    "CloneNSScript_Wait": S_P_UINT32,
+    "CloneNSEpilog_Wait": S_P_UINT32
+}
+
+job_container_nodename_options = {
+    "AutoBasePath": S_P_BOOLEAN,
+    "BasePath": S_P_STRING,
+    "Dirs": S_P_STRING,
+    "EntireStepInNS": S_P_BOOLEAN,
+    "NodeName": S_P_STRING,
+    "Shared": S_P_BOOLEAN,
+    "CloneNSScript": S_P_STRING,
+    "CloneNSEpilog": S_P_STRING,
+    "CloneNSScript_Wait": S_P_UINT32,
+    "CloneNSEpilog_Wait": S_P_UINT32
+}
+
+job_container_basename_options = {
+    "BasePath": S_P_STRING,
+    "Dirs": S_P_STRING
+}
+
+# src/plugins/topology/tree/switch_record.c
+topology_options = {
+    "SwitchName": S_P_ARRAY,
+    "LinkSpeed": S_P_UINT32,
+    "Nodes": S_P_STRING,
+    "Switches": S_P_STRING,
+    "BlockName": S_P_ARRAY,
+    "BlockSizes": S_P_STRING
+}
+
+topology_switchname_options = {
+    "SwitchName": S_P_STRING,
+    "LinkSpeed": S_P_UINT32,
+    "Nodes": S_P_STRING,
+    "Switches": S_P_STRING
+}
+
+topology_blockname_options = {
+    "BlockName": S_P_STRING,
+    "BlockSizes": S_P_STRING,
+    "Nodes": S_P_STRING
+}
+
 all_confs = {
     "slurm": slurm_options,
     "slurmdbd": slurmdbd_options,
     "cgroup": cgroup_options,
     "mpi": mpi_options,
+    "oci": oci_options,
+    "acct_gather": acct_gather_options,
+    "burst_buffer": burst_buffer_options,
+    "helpers": helpers_options,
+    "job_container": job_container_options,
+    "topology": topology_options,
     "gres": gres_options,
     # TOD: GRES can have different combinations, NodeName and Name
     # https://slurm.schedmd.com/gres.conf.html#SECTION_EXAMPLES
-    "slurm->PartitionName": partition_options,
-    "slurm->NodeName": nodename_options,
-    "slurm->DownNodes": downnodes_options,
-    "slurm->NodeSet": nodeset_options,
+    "slurm->PartitionName": slurm_partitionname_options,
+    "slurm->NodeName": slurm_nodename_options,
+    "slurm->DownNodes": slurm_downnodes_options,
+    "slurm->NodeSet": slurm_nodeset_options,
     "gres->Name": gres_name_options,
-    "gres->NodeName": gres_nodename_options
+    "gres->NodeName": gres_nodename_options,
+    "job_container->NodeName": job_container_nodename_options,
+    "job_container->BasePath": job_container_basename_options,  # sub-options of the BasePath array key
+    "topology->SwitchName": topology_switchname_options,
+    "topology->BlockName": topology_blockname_options,
+    "helpers->NodeName": helpers_nodename_options,
+    "helpers->Feature": helpers_feature_options
 }
 
 _HOSTLIST_RE = re.compile(

From 0db953b7b0af24c617e4fdca53c069060913376a Mon Sep 17 00:00:00 2001
From: Abhishek S A <abhishek.sa3@dell.com>
Date: Mon, 9 Feb 2026 10:00:43 +0530
Subject: [PATCH 073/172] update sfm and ome details

---
 utils/external_kafka_connect_details.yml      |  7 +++---
 utils/external_victoria_connect_details.yml   |  7 +++---
 .../tasks/main.yml                            |  2 ++
 .../tasks/main.yml                            | 25 ++++++++++++++-----
 4 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/utils/external_kafka_connect_details.yml b/utils/external_kafka_connect_details.yml
index a51a75aa3f..1f2093e54e 100644
--- a/utils/external_kafka_connect_details.yml
+++ b/utils/external_kafka_connect_details.yml
@@ -21,10 +21,11 @@
     - name: Fail if service_kube_control_plane group is missing or empty
       ansible.builtin.fail:
         msg: >-
-          Inventory must define a non-empty 'service_kube_control_plane' group.
-          Run with '-i <inventory>' and ensure at least one host is in that group.
+          Inventory must define a 'service_kube_control_plane' group with exactly one host.
+          Provide either the service kube control plane VIP or one of the service kube control plane node IPs.
+          Run with '-i <inventory>' and ensure exactly one host is in that group.
       when:
-        - groups['service_kube_control_plane'] is not defined or (groups['service_kube_control_plane'] | length) == 0
+        - groups['service_kube_control_plane'] is not defined or (groups['service_kube_control_plane'] | length) != 1
 
 - name: Fetch external Kafka connection details
   hosts: service_kube_control_plane
diff --git a/utils/external_victoria_connect_details.yml b/utils/external_victoria_connect_details.yml
index ad4ed542df..f955bbbc78 100644
--- a/utils/external_victoria_connect_details.yml
+++ b/utils/external_victoria_connect_details.yml
@@ -21,10 +21,11 @@
     - name: Fail if service_kube_control_plane group is missing or empty
       ansible.builtin.fail:
         msg: >-
-          Inventory must define a non-empty 'service_kube_control_plane' group.
-          Run with '-i <inventory>' and ensure at least one host is in that group.
+          Inventory must define a 'service_kube_control_plane' group with exactly one host.
+          Provide either the service kube control plane VIP or one of the service kube control plane node IPs.
+          Run with '-i <inventory>' and ensure exactly one host is in that group.
       when:
-        - groups['service_kube_control_plane'] is not defined or (groups['service_kube_control_plane'] | length) == 0
+        - groups['service_kube_control_plane'] is not defined or (groups['service_kube_control_plane'] | length) != 1
 
 - name: Fetch external Victoria connection details
   hosts: service_kube_control_plane
diff --git a/utils/roles/external_kafka_connect_details/tasks/main.yml b/utils/roles/external_kafka_connect_details/tasks/main.yml
index 0c4d525a82..8964652ce1 100644
--- a/utils/roles/external_kafka_connect_details/tasks/main.yml
+++ b/utils/roles/external_kafka_connect_details/tasks/main.yml
@@ -183,8 +183,10 @@
           'OME note (mTLS):',
           '  Use ca.crt as the server certificate in OME.',
           '  Create a client certificate in .pfx format (provide a passphrase when prompted):',
+          '    cd ' ~ kafka_output_dir,
           '    openssl pkcs12 -export -out user.pfx -inkey user.key -in user.crt',
           '  Use user.pfx as the client certificate in OME.',
+          '  If you are using the OME UI from a different system than the OIM host, copy ca.crt and user.pfx from the OIM host to that system before selecting/uploading them in the UI.',
           ''
         ]
       }}
diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml
index c44c145921..63c5301db2 100644
--- a/utils/roles/external_victoria_connect_details/tasks/main.yml
+++ b/utils/roles/external_victoria_connect_details/tasks/main.yml
@@ -182,6 +182,15 @@
         else ''
       }}
 
+- name: Build SFM hosts entry for vmselect
+  ansible.builtin.set_fact:
+    victoria_sfm_hosts_entry_vmselect: >-
+      {{
+        'echo "' ~ (vmselect_lb_ip.stdout | trim) ~ ' vmselect.' ~ victoria_namespace ~ '.svc.cluster.local" >> /etc/hosts'
+        if (vmselect_lb_ip.stdout | trim | length) > 0
+        else ''
+      }}
+
 - name: Set Victoria external port fallbacks
   ansible.builtin.set_fact:
     vminsert_port: "8480"
@@ -211,8 +220,9 @@
           server_crt: "{{ victoria_tls_cert }}"
         notes:
           sfm:
-            vminsert_write_url: "https://{{ vminsert_host }}:{{ vminsert_port }}/insert/0/prometheus/api/v1/write"
+            vminsert_write_url: "https://vminsert.{{ victoria_namespace }}.svc.cluster.local:8480/insert/0/prometheus/api/v1/write"
             hosts_entry: "{{ victoria_sfm_hosts_entry }}"
+            hosts_entry_vmselect: "{{ victoria_sfm_hosts_entry_vmselect }}"
 
 - name: Ensure output directory exists
   ansible.builtin.file:
@@ -242,17 +252,20 @@
           'Mode: ' ~ victoria_deployment_mode,
           '',
           'Endpoints:',
-          '  vminsert write: https://' ~ vminsert_host ~ ':' ~ vminsert_port ~ '/insert/0/prometheus/api/v1/write',
-          '  vmselect query: https://' ~ vmselect_host ~ ':' ~ vmselect_port ~ '/select/0/prometheus/api/v1/query',
-          '  vmselect UI:    https://' ~ vmselect_host ~ ':' ~ vmselect_port ~ '/select/0/vmui',
+          '  vminsert write: https://vminsert.' ~ victoria_namespace ~ '.svc.cluster.local:8480/insert/0/prometheus/api/v1/write',
+          '  vmselect query: https://vmselect.' ~ victoria_namespace ~ '.svc.cluster.local:8481/select/0/prometheus/api/v1/query',
+          '  vmselect UI:    https://vmselect.' ~ victoria_namespace ~ '.svc.cluster.local:8481/select/0/vmui',
           '',
           'TLS:',
           '  server.crt: ' ~ victoria_tls_cert,
           '',
           'SFM note:',
-          '  Use vminsert write URL for SFM: https://' ~ vminsert_host ~ ':' ~ vminsert_port ~ '/insert/0/prometheus/api/v1/write',
+          '  Use vminsert write URL for SFM: https://vminsert.' ~ victoria_namespace ~ '.svc.cluster.local:8480/insert/0/prometheus/api/v1/write',
+          '  Add this entry to /etc/hosts on the SFM server:',
+          '    ' ~ (victoria_sfm_hosts_entry if (victoria_sfm_hosts_entry | length) > 0 else 'vminsert LoadBalancer IP not available; cannot generate /etc/hosts entry.'),
           '  Add this entry to /etc/hosts on the SFM server:',
-          '    ' ~ (victoria_sfm_hosts_entry if (victoria_sfm_hosts_entry | length) > 0 else 'LoadBalancer IP not available; cannot generate /etc/hosts entry.')
+          '    ' ~ (victoria_sfm_hosts_entry_vmselect if (victoria_sfm_hosts_entry_vmselect | length) > 0 else 'vmselect LoadBalancer IP not available; cannot generate /etc/hosts entry.'),
+          '  If you are using the SFM UI from a different system than the OIM host, copy server.crt from the OIM host to that system before selecting/uploading it in the UI.'
         ]
       }}
   delegate_to: localhost

From f8bd103bd322b7da99b622fdd353e92a83958501 Mon Sep 17 00:00:00 2001
From: Abhishek S A <abhishek.sa3@dell.com>
Date: Mon, 9 Feb 2026 10:08:42 +0530
Subject: [PATCH 074/172] update kafka and victoria lint issues

---
 .../tasks/main.yml                            |  3 +-
 .../tasks/main.yml                            | 45 ++++++++++++++-----
 2 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/utils/roles/external_kafka_connect_details/tasks/main.yml b/utils/roles/external_kafka_connect_details/tasks/main.yml
index 8964652ce1..207d93bfe6 100644
--- a/utils/roles/external_kafka_connect_details/tasks/main.yml
+++ b/utils/roles/external_kafka_connect_details/tasks/main.yml
@@ -186,7 +186,8 @@
           '    cd ' ~ kafka_output_dir,
           '    openssl pkcs12 -export -out user.pfx -inkey user.key -in user.crt',
           '  Use user.pfx as the client certificate in OME.',
-          '  If you are using the OME UI from a different system than the OIM host, copy ca.crt and user.pfx from the OIM host to that system before selecting/uploading them in the UI.',
+          '  If you are using the OME UI from a different system than the OIM host,',
+          '  copy ca.crt and user.pfx from the OIM host to that system before selecting/uploading them in the UI.',
           ''
         ]
       }}
diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml
index 63c5301db2..38b0ce3045 100644
--- a/utils/roles/external_victoria_connect_details/tasks/main.yml
+++ b/utils/roles/external_victoria_connect_details/tasks/main.yml
@@ -51,7 +51,7 @@
   when: victoria_deployment_mode != 'cluster'
 
 - name: Get Victoria pods status
-  ansible.builtin.shell: >-
+  ansible.builtin.command: >-
     kubectl get pods -n {{ victoria_namespace }}
     -l "app in (vminsert,vmselect,vmstorage,victoriametrics)"
     -o wide
@@ -60,7 +60,7 @@
   failed_when: victoria_pods_wide.rc != 0
 
 - name: Get Victoria pods status (json)
-  ansible.builtin.shell: >-
+  ansible.builtin.command: >-
     kubectl get pods -n {{ victoria_namespace }}
     -l "app in (vminsert,vmselect,vmstorage,victoriametrics)"
     -o json
@@ -191,6 +191,27 @@
         else ''
       }}
 
+- name: Set endpoint urls and SFM note strings
+  ansible.builtin.set_fact:
+    victoria_vminsert_write_url: >-
+      https://vminsert.{{ victoria_namespace }}.svc.cluster.local:8480/insert/0/prometheus/api/v1/write
+    victoria_vmselect_query_url: >-
+      https://vmselect.{{ victoria_namespace }}.svc.cluster.local:8481/select/0/prometheus/api/v1/query
+    victoria_vmselect_ui_url: >-
+      https://vmselect.{{ victoria_namespace }}.svc.cluster.local:8481/select/0/vmui
+    victoria_sfm_hosts_entry_vminsert_display: >-
+      {{
+        victoria_sfm_hosts_entry
+        if (victoria_sfm_hosts_entry | length) > 0
+        else 'vminsert LoadBalancer IP not available; cannot generate /etc/hosts entry.'
+      }}
+    victoria_sfm_hosts_entry_vmselect_display: >-
+      {{
+        victoria_sfm_hosts_entry_vmselect
+        if (victoria_sfm_hosts_entry_vmselect | length) > 0
+        else 'vmselect LoadBalancer IP not available; cannot generate /etc/hosts entry.'
+      }}
+
 - name: Set Victoria external port fallbacks
   ansible.builtin.set_fact:
     vminsert_port: "8480"
@@ -220,7 +241,7 @@
           server_crt: "{{ victoria_tls_cert }}"
         notes:
           sfm:
-            vminsert_write_url: "https://vminsert.{{ victoria_namespace }}.svc.cluster.local:8480/insert/0/prometheus/api/v1/write"
+            vminsert_write_url: "{{ victoria_vminsert_write_url }}"
             hosts_entry: "{{ victoria_sfm_hosts_entry }}"
             hosts_entry_vmselect: "{{ victoria_sfm_hosts_entry_vmselect }}"
 
@@ -252,20 +273,20 @@
           'Mode: ' ~ victoria_deployment_mode,
           '',
           'Endpoints:',
-          '  vminsert write: https://vminsert.' ~ victoria_namespace ~ '.svc.cluster.local:8480/insert/0/prometheus/api/v1/write',
-          '  vmselect query: https://vmselect.' ~ victoria_namespace ~ '.svc.cluster.local:8481/select/0/prometheus/api/v1/query',
-          '  vmselect UI:    https://vmselect.' ~ victoria_namespace ~ '.svc.cluster.local:8481/select/0/vmui',
+          '  vminsert write: ' ~ victoria_vminsert_write_url,
+          '  vmselect query: ' ~ victoria_vmselect_query_url,
+          '  vmselect UI:    ' ~ victoria_vmselect_ui_url,
           '',
           'TLS:',
           '  server.crt: ' ~ victoria_tls_cert,
           '',
           'SFM note:',
-          '  Use vminsert write URL for SFM: https://vminsert.' ~ victoria_namespace ~ '.svc.cluster.local:8480/insert/0/prometheus/api/v1/write',
-          '  Add this entry to /etc/hosts on the SFM server:',
-          '    ' ~ (victoria_sfm_hosts_entry if (victoria_sfm_hosts_entry | length) > 0 else 'vminsert LoadBalancer IP not available; cannot generate /etc/hosts entry.'),
-          '  Add this entry to /etc/hosts on the SFM server:',
-          '    ' ~ (victoria_sfm_hosts_entry_vmselect if (victoria_sfm_hosts_entry_vmselect | length) > 0 else 'vmselect LoadBalancer IP not available; cannot generate /etc/hosts entry.'),
-          '  If you are using the SFM UI from a different system than the OIM host, copy server.crt from the OIM host to that system before selecting/uploading it in the UI.'
+          '  Use vminsert write URL for SFM: ' ~ victoria_vminsert_write_url,
+          '  Add these entries to /etc/hosts on the SFM server:',
+          '    ' ~ victoria_sfm_hosts_entry_vminsert_display,
+          '    ' ~ victoria_sfm_hosts_entry_vmselect_display,
+          '  If you are using the SFM UI from a different system than the OIM host,',
+          '  copy server.crt from the OIM host to that system before selecting/uploading it in the UI.'
         ]
       }}
   delegate_to: localhost

From 2e7b3cae1a7554b5a71546117dba1bf1ebe1b2f9 Mon Sep 17 00:00:00 2001
From: Katakam Rakesh Naga Sai
 <125246792+Katakam-Rakesh@users.noreply.github.com>
Date: Mon, 9 Feb 2026 10:30:44 +0530
Subject: [PATCH 075/172] Update copyright for container_repo_utils.py

Signed-off-by: Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com>
---
 common/library/module_utils/local_repo/container_repo_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/library/module_utils/local_repo/container_repo_utils.py b/common/library/module_utils/local_repo/container_repo_utils.py
index 3b8eb29662..0a4abb35fb 100644
--- a/common/library/module_utils/local_repo/container_repo_utils.py
+++ b/common/library/module_utils/local_repo/container_repo_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From aae7e28ebde2e207e93c97852a7abfd19aebe215 Mon Sep 17 00:00:00 2001
From: SOWJANYAJAGADISH123 <Sowjanya.Jagadish@dell.com>
Date: Mon, 9 Feb 2026 10:48:31 +0530
Subject: [PATCH 076/172] Create test.sh

---
 test.sh | 1555 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1555 insertions(+)
 create mode 100644 test.sh

diff --git a/test.sh b/test.sh
new file mode 100644
index 0000000000..cd1f8e63e7
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,1555 @@
+#!/bin/bash
+
+# Copyright © 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This script is used to generate the Omnia core docker image.
+# The image is based on Fedora and uses systemd to start all of the necessary
+# services.
+#
+# This script prompts the user for the Omnia shared path and the root
+# password. It then checks if the Omnia shared path exists.
+#
+# The script checks if the ssh key file exists. If it does not exist, a new ssh key pair is generated.
+
+# ANSI color escape sequences used with `echo -e` for status messages.
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color (resets terminal attributes)
+YELLOW='\033[0;33m'
+omnia_release=2.1.0.0  # written to oim_metadata.yml as omnia_version by setup_container
+
+core_container_status=false
+omnia_path=""       # Omnia shared path on the host; set by init_container_config or fetch_config
+hashed_passwd=""    # hash of the omnia_core root password; set by init_container_config or fetch_config
+domain_name=""      # set by validate_oim from `hostname -d`
+
+is_local_ip() {
+    local ip_to_check="$1"
+
+    # Get all local IP addresses (excluding loopback)
+    local local_ips
+    local_ips=$(hostname -I)
+
+    # Check if the IP matches any local IP
+    if echo "$local_ips" | grep -qw "$ip_to_check"; then
+        return 0  # IP is local
+    else
+        return 1  # IP is not local
+    fi
+}
+
+OMNIA_BASE_DIR="/opt/omnia"                              # setup_container mounts <omnia_path>/omnia here in omnia_core
+OMNIA_INPUT_DIR="/opt/omnia/input"
+OMNIA_BACKUPS_DIR="/opt/omnia/backups"
+OMNIA_METADATA_DIR="/opt/omnia/.data"
+OMNIA_METADATA_FILE="/opt/omnia/.data/oim_metadata.yml"  # read inside the container by update_metadata_upgrade_backup_dir
+
+update_metadata_upgrade_backup_dir() {
+    # Set (or append) "upgrade_backup_dir: <$1>" in the metadata file inside omnia_core.
+    local backup_dir="$1"
+    if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
+        echo "[ERROR] [ORCHESTRATOR] omnia_core container is not running"
+        return 1
+    fi
+    # Values go in as positional arguments so quotes in them cannot break the inner script.
+    podman exec -u root omnia_core bash -c '
+        set -e
+        file=$1; dir=$2
+        [ -f "$file" ] || { echo "[ERROR] Metadata file not found inside container: $file" >&2; exit 1; }
+        if grep -q "^upgrade_backup_dir:" "$file"; then
+            # Escape the characters that are special in a sed replacement: \ & and the | delimiter.
+            esc=$(printf "%s" "$dir" | sed "s/[\\\\&|]/\\\\&/g")
+            sed -i "s|^upgrade_backup_dir:.*|upgrade_backup_dir: ${esc}|" "$file"
+        else
+            echo "upgrade_backup_dir: $dir" >> "$file"
+        fi
+    ' _ "$OMNIA_METADATA_FILE" "$backup_dir"
+}
+
+
+
+check_internal_nfs_export() {
+    # Verify that an "internal" NFS share is usable.
+    #   $1 - NFS server IP, which must be an address of this host
+    #   $2 - path that must appear in the server's export list
+    # Exits the script with status 1 when any check fails.
+    nfs_server_ip=$1
+    nfs_server_share_path=$2
+
+    if ! is_local_ip "$nfs_server_ip"; then
+        echo "The provided NFS server IP ($nfs_server_ip) is NOT the current system's IP."
+        exit 1
+    fi
+    echo "The provided NFS server IP ($nfs_server_ip) belongs to the current system."
+
+    # Ask the server for its export list; a non-zero showmount status is fatal.
+    if ! exports=$(showmount -e "$nfs_server_ip" 2>/dev/null); then
+        echo -e "${RED}ERROR: Unable to contact NFS server at $nfs_server_ip. Ensure NFS and rpcbind are running, and firewall allows access.${NC}"
+        exit 1
+    fi
+
+    # Only the first column of each line is compared, as a fixed whole-line string.
+    if ! awk '{print $1}' <<< "$exports" | grep -Fxq "$nfs_server_share_path"; then
+        echo -e "${RED}ERROR: Path $nfs_server_share_path is NOT exported by $nfs_server_ip.${NC}"
+        exit 1
+    fi
+    echo -e "${GREEN}Path $nfs_server_share_path is exported by $nfs_server_ip.${NC}"
+}
+
+display_supported_use_cases() {
+    # Print guidance and a table of supported shared-path options (bold colors, kept local).
+    local BLUE='\033[1;34m'
+    local YELLOW='\033[1;33m'
+    local GREEN='\033[1;32m'
+    local NC='\033[0m' # No Color
+
+    # Introductory Guidance
+    echo -e "${BLUE} ----------------- Omnia Shared Path Configuration ---------------- ${NC}"
+    echo -e "${BLUE} Please choose the type of Omnia shared path in Omnia Infrastructure Manager (OIM): ${NC}"
+    echo -e "${BLUE} It is recommended to use an external NFS share for the Omnia shared path. ${NC}"
+    echo -e "${BLUE} If you are not using NFS, make sure enough space is available on the disk. ${NC}"
+    echo -e "${YELLOW} Using an External NFS share is mandatory for Omnia shared path if you are planning to have high availability in OIM or require K8s service cluster. ${NC}"
+    echo -e "\nSupported Use Cases:\n"
+
+    # Table content: tab-separated rows, aligned into columns by `column`
+    {
+        echo -e "Share Option\tType\tDescription\tAdditional Info"
+        echo -e "${GREEN}NFS\tExternal\tExternal NFS server(outside OIM) created by user\tMust be reachable from OIM and service nodes. Mounts on OIM. Recommended for HA and hierarchical clusters.${NC}"
+        echo -e "NFS\tInternal\tNFS server created by user in OIM\tUsed only for flat provisioning. No HA or k8s service cluster support. No mount performed."
+        echo -e "Local\tDisk\tDisk storage in OIM\tUsed only for flat provisioning. No HA or hierarchical support."
+    } | column -t -s $'\t'
+}
+
+
+# This function is responsible for initializing the Omnia core container
+# It prompts the user for the Omnia shared path and the root password.
+# It checks if the Omnia shared path exists.
+setup_omnia_core() {
+    # Validate the system environment
+    validate_oim
+
+    # Initialize the container configuration
+    init_container_config
+
+    # Setup the container
+    setup_container
+
+    # Post container setup configuration
+    post_setup_config
+
+    # Start the container
+    start_container_session
+}
+
+
+# This function is responsible for cleaning up the Omnia core container.
+# After a y/n confirmation it fetches the config, removes the container and cleans up.
+cleanup_omnia_core() {
+    # Block if critical service containers exist
+    critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd')
+    if [ -n "$critical_running" ]; then
+        echo -e "${RED}Failed to initiate omnia_core container cleanup. There are other critical service containers still running:${NC}"
+        echo "$critical_running"
+        echo -e "${GREEN}Run oim_cleanup.yml first to cleanup all containers.${NC}"
+        exit 1
+    fi
+    echo -e "${RED} WARNING: This will remove Omnia core container and all files in Omnia Shared Path.${NC}"
+    echo -e "${GREEN} You can abort and take backup if you want.${NC}"
+    read -r -p " Are you sure you want to continue with the cleanup? (y/n): " confirm
+    case "$confirm" in
+        n|N)
+            echo -e "${GREEN}Aborting.${NC}"
+            exit 0
+            ;;
+        y|Y)
+            # Fetch the configuration first: the later steps need omnia_path, share_option and nfs_type.
+            fetch_config
+            remove_container
+            cleanup_config
+            ;;
+        *)  # Any other answer used to be ignored silently; say so explicitly.
+            echo -e "${RED} Invalid input '$confirm'. Cleanup was not performed.${NC}"
+            ;;
+    esac
+}
+
+
+# This function is responsible for cleaning up the Omnia core container configuration.
+# It removes the public key from the authorized_keys file.
+# It removes the SSH key pair and the known_hosts / ssh config entries.
+# It removes the Omnia core configuration from the shared path.
+# For external NFS it also unmounts the share and drops its /etc/fstab entry.
+# Reads the globals omnia_path, share_option and nfs_type.
+cleanup_config(){
+
+    # Set the path to the ssh public key.
+    ssh_key_file="$HOME/.ssh/oim_rsa.pub"
+
+    # Remove the public key from the authorized_keys file.
+    if [ -f "$ssh_key_file" ]; then
+        # Remove the line from the authorized_keys file.
+        sed -i "\|^$(cat "$ssh_key_file")$|d" "$HOME/.ssh/authorized_keys"
+        echo -e "${GREEN} Public key has been removed from authorized_keys.${NC}"
+    else
+        echo -e "${RED} Public key file not found.${NC}"
+    fi
+
+    # Remove the SSH key pair.
+    ssh_key_file="$HOME/.ssh/oim_rsa"
+    ssh_key_file_pub="${ssh_key_file}.pub"
+    if [ -f "$ssh_key_file" ] && [ -f "$ssh_key_file_pub" ]; then
+        rm -f "$ssh_key_file" "$ssh_key_file_pub"
+        echo -e "${GREEN} SSH key pair have been removed.${NC}"
+    else
+        echo -e "${RED} SSH key file not found.${NC}"
+    fi
+
+    # Remove the ssh key from the known_hosts file.
+    echo -e "${BLUE} Removing ssh key from known_hosts file.${NC}"
+    ssh-keygen -R "[localhost]:2222" >/dev/null 2>&1
+
+
+    # Remove the host entry (the "Host omnia_core" line plus the 5 lines after it) from ~/.ssh/config.
+    ssh_config_file="$HOME/.ssh/config"
+    if [ -f "$ssh_config_file" ]; then
+        sed -i '/Host omnia_core/,+5d' "$ssh_config_file"
+        echo -e "${GREEN} Host entry has been removed from config file.${NC}"
+    else
+        echo -e "${RED} Config file not found.${NC}"
+    fi
+
+    # Remove the Omnia core configuration. ${omnia_path:?} aborts instead of expanding to /omnia/... when empty.
+    echo -e "${BLUE} Removing Omnia core configuration.${NC}"
+    rm -rf "${omnia_path:?}"/omnia/{hosts,input,log,pulp,provision,pcs,ssh_config,tmp,.data}
+
+    # Unmount the NFS shared path if the share option is NFS.
+    if [ "$share_option" = "NFS" ] && [ "$nfs_type" = "external" ]; then
+        umount "$omnia_path"
+        if [ $? -eq 0 ]; then
+            echo -e "${GREEN} NFS shared path has been unmounted.${NC}"
+        else
+            echo -e "${RED} Failed to unmount NFS shared path.${NC}"
+        fi
+        # Remove the entry from /etc/fstab
+        fstab_file="/etc/fstab"
+        if [ -f "$fstab_file" ]; then
+            # Create a backup of the fstab file.
+            cp "$fstab_file" "$fstab_file.bak"
+            # Remove only the line whose mount-point field is $omnia_path; a bare
+            # substring match could also delete unrelated entries below that path.
+            sed -i "\#[[:space:]]${omnia_path}[[:space:]]#d" "$fstab_file"
+            if [ $? -ne 0 ]; then
+                echo -e "${RED} Failed to remove the entry from /etc/fstab.${NC}"
+            fi
+        fi
+    fi
+
+    echo -e "${GREEN} Omnia core configuration has been cleaned up.${NC}"
+}
+
+# This function is responsible for removing the Omnia core container.
+#
+# It stops omnia_core.service with systemctl and, on success, deletes the
+# generated unit symlinks, the Quadlet .container file and the service file.
+# It exits early if other critical service containers are still running.
+remove_container() {
+    # Block if critical service containers exist
+    critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd')
+    if [ -n "$critical_running" ]; then
+        echo -e "${RED}Failed to initiate omnia_core container cleanup. There are other critical service containers still running:${NC}"
+        echo "$critical_running"
+        echo -e "${GREEN}Run oim_cleanup.yml first to cleanup all containers.${NC}"
+        exit 1
+    fi
+
+    # Remove the container.
+    echo -e "${BLUE} Removing the Omnia core container.${NC}"
+    if systemctl stop omnia_core.service; then
+        echo -e "${GREEN} Omnia core container has been removed.${NC}"
+        # Remove the systemd generator symlinks.
+        echo -e "${GREEN} Cleaning up systemd generator symlinks.${NC}"
+        rm -f /run/systemd/generator/omnia_core.service
+        rm -f /run/systemd/generator/multi-user.target.wants/omnia_core.service
+        rm -f /run/systemd/generator/default.target.wants/omnia_core.service
+
+        echo -e "${GREEN} Cleaning up omnia_core.container.${NC}"
+        rm -f /etc/containers/systemd/omnia_core.container
+
+        # Remove the omnia_core.service file.
+        rm -f /etc/systemd/system/omnia_core.service
+        systemctl daemon-reload
+        systemctl reset-failed omnia_core.service
+        # Check if the service is removed: `systemctl status` succeeding means it still exists.
+        if systemctl status omnia_core.service >/dev/null 2>&1; then
+            echo -e "${RED} Failed to remove Omnia core service.${NC}"
+        else
+            echo -e "${GREEN} Omnia core service has been removed.${NC}"
+        fi
+    else
+        echo -e "${RED} Failed to remove Omnia core container.${NC}"
+    fi
+
+    # Image removal is currently disabled; the omnia_core image is left in place.
+    # if podman rmi omnia_core; then
+    #     echo -e "${GREEN} Omnia core image has been removed.${NC}"
+    # else
+    #     echo -e "${RED} Failed to remove Omnia core image.${NC}"
+    # fi
+}
+
+
+# This function is responsible for initializing the Omnia core container.
+#
+# It prompts the user for the Omnia shared path type (NFS or Local), the
+# related paths/IPs and the root password, and validates them.
+#
+# For NFS it installs the client packages and, for external NFS, mounts
+# the share and records it in /etc/fstab.
+#
+# The function generates the ssh key pair if needed, copies the keys and
+# ssh config to the Omnia shared path and authorizes the public key.
+#
+# The function creates the necessary log, hosts and pulp directories/files.
+init_container_config() {
+
+    share_option=""
+    # Display the supported use cases
+    display_supported_use_cases
+
+    # Display the choices for the user
+    echo -e "${BLUE} Choose the type of Omnia shared path:${NC}"
+    options=( "NFS (recommended)" "Local"  )
+
+    PS3="Select the option number: "
+
+    select opt in "${options[@]}"; do
+        case $opt in
+            "NFS (recommended)")
+                share_option="NFS"
+                break
+                ;;
+            "Local")
+                share_option="Local"
+                break
+                ;;
+            *)
+                echo -e "${RED} Invalid option.${NC}"
+                continue
+        esac
+    done
+
+    case $share_option in
+        "Local")
+            # Prompt the user for the Omnia shared path.
+            echo -e "${BLUE} Please provide Omnia shared path:${NC}"
+            read -r -p "Omnia shared path: " omnia_path
+
+            # Check if the Omnia shared path is absolute path and path exists.
+            if [[ "$omnia_path" != /* ]] || [ ! -d "$omnia_path" ]; then
+                echo -e "${RED} Omnia shared path is not an absolute path or does not exist! Please re-run omnia.sh --install with valid Omnia shared path.${NC}"
+                exit 1
+            fi
+            ;;
+        "NFS")
+            echo -e "${BLUE} Select NFS type:${NC}"
+            select nfs_type in "External (Recommended)" "Internal"; do
+                case $nfs_type in
+                    "External (Recommended)")
+                        echo -e "${BLUE} Please provide the external NFS server IP:${NC}"
+                        read -r -p "External NFS server IP: " nfs_server_ip
+
+                        echo -e "${BLUE} Please provide the external NFS server share path:${NC}"
+                        read -r -p "External NFS share path: " nfs_server_share_path
+
+                        echo -e "${BLUE} Please provide the OIM client share path (mount target):${NC}"
+                        read -r -p "Omnia shared path: " omnia_path
+
+                        # Validate Omnia shared path is absolute
+                        if [[ "$omnia_path" != /* ]]; then
+                            echo -e "${RED}Omnia shared path must be an absolute path.${NC}"
+                            exit 1
+                        fi
+
+                        nfs_type="external"
+                        break
+                        ;;
+                    "Internal")
+                        echo -e "${BLUE} Please provide the OIM server IP:${NC}"
+                        read -r -p "OIM server IP: " nfs_server_ip
+
+                        echo -e "${BLUE} Please provide the OIM server share path:${NC}"
+                        read -r -p "OIM server share path: " nfs_server_share_path
+
+                        echo -e "${BLUE} Checking if the OIM server share path is mounted${NC}"
+                        check_internal_nfs_export "$nfs_server_ip" "$nfs_server_share_path"
+
+                        # Note: No mounting performed here
+                        echo -e "${YELLOW}Note: Internal NFS does not support HA OIM or hierarchical cluster. Proceeding...${NC}"
+                        nfs_type="internal"
+                        omnia_path="$nfs_server_share_path"
+                        break
+                        ;;
+                    *)
+                        echo -e "${RED}Invalid option. Please choose 1 or 2.${NC}"
+                        ;;
+                esac
+            done
+            ;;
+    esac
+
+
+    # Prompt the user for the Omnia core root password.
+    echo -e "${BLUE} Please provide Omnia core root password for accessing container:${NC}"
+
+    read -r -s -p " Enter: " passwd
+
+    # Prompt the user for the Omnia core root password confirmation.
+    echo -e "\n${BLUE} Please confirm password:${NC}"
+    read -r -s -p " Enter: " cnf_passwd
+
+    # Check if the provided passwords match.
+    if [ "$passwd" != "$cnf_passwd" ]; then
+        echo -e "${RED} Invalid Omnia core root password, passwords do not match!${NC}"
+        exit 1
+    fi
+
+    # Check if the password contains any of the invalid characters
+    invalid_chars='[\\|&;`"><*?!$(){}[\]]'
+    if [[ "$passwd" =~ $invalid_chars ]]; then
+        echo -e "${RED} Invalid password, passwords must not contain any of these special characters: [\\|&;\`\"><*?!$(){}[\]]${NC}"
+        exit 1
+    fi
+
+    # Install NFS client package if option NFS is selected
+    if [[ "$share_option" == "NFS" ]]; then
+        # Install NFS client package
+        echo -e "${BLUE} Installing NFS client package.${NC}"
+        dnf install -y nfs-utils nfs4-acl-tools
+
+        # Create omnia_path directory if it does not exist
+        echo -e "${BLUE} Creating omnia shared path directory if it does not exist.${NC}"
+        mkdir -p "$omnia_path"
+
+        # Mount NFS server share path in Omnia share path
+        if [[ "$nfs_type" == "external" ]]; then
+
+            if is_local_ip "$nfs_server_ip"; then
+                echo -e "${RED} Error: NFS server $nfs_server_ip is a local IP.${NC}"
+                echo -e "${RED} Please provide an external NFS server IP or re-run omnia.sh --install with valid options.${NC}"
+                exit 1
+            fi
+
+            # Validate if NFS server is reachable
+            echo -e "${BLUE} Validating if NFS server is reachable.${NC}"
+            ping -c1 -W1 "$nfs_server_ip" > /dev/null
+            if [ $? -ne 0 ]; then
+                echo -e "${RED} NFS server $nfs_server_ip is not reachable.${NC}"
+                exit 1
+            fi
+
+            echo -e "${BLUE} Mounting NFS server share path in Omnia share path.${NC}"
+            mount -t nfs -o nosuid,rw,sync,hard,intr,timeo=30 "$nfs_server_ip:$nfs_server_share_path" "$omnia_path"
+            if [[ $? -ne 0 ]]; then
+                echo -e "${RED} Failed to mount NFS. Please check the IP and path.${NC}"
+                exit 1
+            fi
+            # Validate if NFS server share path is mounted
+            echo -e "${BLUE} Validating if NFS server share path is mounted.${NC}"
+            # strip the trailing slash from nfs_server_share_path
+            nfs_server_share_path="${nfs_server_share_path%/}"
+            if grep -qs "$nfs_server_ip:$nfs_server_share_path" /proc/mounts; then
+                echo -e "${GREEN} NFS server share path is mounted.${NC}"
+            else
+                echo -e "${RED} NFS server share path is not mounted. Provide valid NFS server details. ${NC}"
+                exit 1
+            fi
+            # Add NFS server share to /etc/fstab to mount on startup
+            echo "$nfs_server_ip:$nfs_server_share_path $omnia_path nfs nosuid,rw,sync,hard,intr" >> /etc/fstab
+        else
+            echo -e "${BLUE} Using internal NFS path without mounting.${NC}"
+        fi
+
+    fi
+    # Hash via stdin: the password is not word-split and never appears in the process list.
+    hashed_passwd=$(printf '%s\n' "$passwd" | openssl passwd -1 -stdin)
+    ssh_key_file="/root/.ssh/oim_rsa"
+    ssh_port=2222
+
+    # Generate a new ssh key pair.
+    if [ -f "$ssh_key_file" ]; then
+        echo -e "\n${BLUE} Skipping generating new ssh key pair.${NC}"
+    else
+        echo -e "\n${GREEN} Generating a new ssh key pair.${NC}"
+        ssh-keygen -t rsa -b 4096 -C "omnia_oim" -q -N '' -f "$ssh_key_file"
+        {
+            echo "Host omnia_core"
+            echo "    Hostname localhost"
+            echo "    Port $ssh_port"
+            echo "    User root"
+            echo "    IdentityFile ~/.ssh/oim_rsa"
+            echo "    IdentitiesOnly yes"
+        } >> "$HOME/.ssh/config"
+    fi
+
+    # Create the ssh configuration directory if it does not exist.
+    echo -e "${GREEN} Creating the ssh configuration directory if it does not exist.${NC}"
+    mkdir -p "$omnia_path/omnia/ssh_config/.ssh"
+
+    # Copy the omnia_core ssh config to the shared path.
+    echo -e "${GREEN} Copying the omnia_core ssh config to the omnia shared path.${NC}"
+    cp "$HOME/.ssh/config" "$omnia_path/omnia/ssh_config/.ssh/config"
+
+    # Copy the oim_rsa ssh key to the shared path.
+    echo -e "${GREEN} Copying the oim_rsa ssh key to the omnia shared path.${NC}"
+    cp "$HOME/.ssh/oim_rsa" "$omnia_path/omnia/ssh_config/.ssh/oim_rsa"
+
+    # Copy the ssh private key to the omnia shared path.
+    echo -e "${GREEN} Copying the ssh private key to the omnia shared path.${NC}"
+    cp "$ssh_key_file" "$omnia_path/omnia/ssh_config/.ssh/id_rsa"
+
+    # Copy the ssh public key to the omnia shared path.
+    echo -e "${GREEN} Copying the ssh public key to the omnia shared path.${NC}"
+    cp "$ssh_key_file.pub" "$omnia_path/omnia/ssh_config/.ssh/id_rsa.pub"
+
+    # Get the ssh public key.
+    ssh_public_key="$(cat /root/.ssh/oim_rsa.pub)"
+
+    validate_nfs_server
+
+    # Add ssh public key to the authorized_keys (-F: match the key as a fixed string, not a regex).
+    echo -e "${GREEN} Adding ssh public key to the authorized_keys.${NC}"
+    if grep -qsF -- "$ssh_public_key" "$HOME/.ssh/authorized_keys"; then
+        echo -e "${BLUE} Skipping adding ssh public key to the authorized_keys.${NC}"
+    else
+        echo "$ssh_public_key" >> "$HOME/.ssh/authorized_keys"
+        chmod 600 "$HOME/.ssh/authorized_keys"
+    fi
+
+    # Add ssh public key to the authorized_keys in the ssh_config directory.
+    echo -e "${GREEN} Adding ssh public key to the authorized_keys in the Omnia ssh_config directory.${NC}"
+    if [ -f "$omnia_path/omnia/ssh_config/.ssh/authorized_keys" ] && grep -qF -- "$ssh_public_key" "$omnia_path/omnia/ssh_config/.ssh/authorized_keys"; then
+        echo -e "${BLUE} Skipping adding ssh public key to the authorized_keys in the Omnia ssh_config directory.${NC}"
+    else
+        echo "$ssh_public_key" >> "$omnia_path/omnia/ssh_config/.ssh/authorized_keys"
+        chmod 600 "$omnia_path/omnia/ssh_config/.ssh/authorized_keys"
+    fi
+
+    # Create the log directory if it does not exist.
+    echo -e "${GREEN} Creating the log directory if it does not exist.${NC}"
+    mkdir -p "$omnia_path/omnia/log/core/container"
+    mkdir -p "$omnia_path/omnia/log/core/playbooks"
+
+    # Create the hosts file for cluster in $omnia_path/omnia/hosts
+    echo -e "${GREEN} Creating the hosts file for cluster.${NC}"
+    touch "$omnia_path/omnia/hosts"
+
+    # Create the pulp_ha directory if it does not exist.
+    echo -e "${GREEN} Creating the pulp HA directory if it does not exist.${NC}"
+    mkdir -p "$omnia_path/omnia/pulp/pulp_ha"
+}
+
+
+# This function is responsible for fetching the configuration from the Omnia core.
+# It uses podman exec to read /opt/omnia/.data/oim_metadata.yml inside omnia_core
+# and parses its "key: value" lines into the globals omnia_path, hashed_passwd,
+# nfs_server_ip, nfs_server_share_path, share_option and nfs_type.
+# It exits with status 1 when omnia_path or hashed_passwd could not be read.
+fetch_config() {
+
+    # Fetch the metadata from the oim_metadata.yml file.
+    echo -e "${GREEN} Fetching the metadata from the oim_metadata.yml file.${NC}"
+    # No -t/-i here: `cat` needs no terminal, and a pseudo-TTY would add
+    # carriage returns to the captured output.
+    core_config=$(podman exec omnia_core /bin/bash -c 'cat /opt/omnia/.data/oim_metadata.yml')
+
+    # Walk the metadata line by line and extract the required configuration.
+    while IFS= read -r line; do
+        # Skip anything that is not a "key: value" line.
+        [[ "$line" == *:* ]] || continue
+
+        # Split on the FIRST ':' only, so values containing ':' stay intact,
+        # then drop all whitespace from the value.
+        key=${line%%:*}
+        value=${line#*:}
+        value=${value//[[:space:]]/}
+
+        # Check the key and assign the value to the corresponding variable.
+        case $key in
+            oim_shared_path)
+                omnia_path=$value
+                ;;
+            omnia_core_hashed_passwd)
+                hashed_passwd=$value
+                ;;
+            nfs_server_ip)
+                nfs_server_ip=$value
+                ;;
+            nfs_server_share_path)
+                nfs_server_share_path=$value
+                ;;
+            omnia_share_option)
+                share_option=$value
+                ;;
+            nfs_type)
+                nfs_type=$value
+                ;;
+        esac
+    done <<< "$core_config"
+
+    # Check if the required configuration is extracted successfully.
+    if [ -z "$omnia_path" ] || [ -z "$hashed_passwd" ]; then
+        echo -e "${RED} Failed to fetch data from metadata file.${NC}"
+        exit 1
+    else
+        echo -e "${GREEN} Successfully fetched data from metadata file.${NC}"
+    fi
+}
+
+# Validates the OIM (Omnia Infrastructure Manager) host before installation.
+# Checks that the hostname is set, static and has a domain name, detects the
+# timezone, confirms Podman is installed, then enables and starts podman.socket.
+# Sets the globals hostname_value, static_hostname, current_hostname, domain_name, oim_timezone.
+validate_oim() {
+    # The hostname must not be empty
+    hostname_value=$(hostname)
+    if [ -z "$hostname_value" ]; then
+        echo -e "${RED}Hostname is not set!${NC}"
+        exit 1
+    fi
+
+    # The static hostname must be identical to the current hostname
+    static_hostname=$(hostnamectl --static)
+    current_hostname=$(hostname)
+    if [ "$static_hostname" != "$current_hostname" ]; then
+        echo -e "${RED}Static Hostname is unset. Current: '$current_hostname', Static: '$static_hostname'${NC}"
+        echo -e "${RED}Please set the static hostname and try again.${NC}"
+        echo -e "${BLUE}Command to set hostname: hostnamectl set-hostname <hostname>${NC}"
+        echo -e "${RED}Exiting...${NC}"
+        exit 1
+    fi
+
+    # A domain name is mandatory; bail out first, report success afterwards
+    domain_name=$(hostname -d)
+    if [ -z "$domain_name" ]; then
+        echo -e "${RED}Invalid hostname, hostname is not configured with a domain name!${NC}"
+        exit 1
+    fi
+    echo -e "${BLUE}Hostname is configured with a domain name: $domain_name${NC}"
+
+    # Ask systemd for the timezone first; errors from timedatectl are
+    # discarded and simply leave the variable empty.
+    oim_timezone=$(timedatectl show -p Timezone --value 2>/dev/null)
+
+    # Fallbacks when timedatectl gave nothing: /etc/timezone if it is a
+    # regular file, otherwise the target of the /etc/localtime symlink.
+    if [ -z "$oim_timezone" ]; then
+        if [ -f /etc/timezone ]; then
+            # File content is the zone name
+            oim_timezone=$(< /etc/timezone)
+        elif [ -L /etc/localtime ]; then
+            # Keep only the part of the resolved path after "zoneinfo/"
+            oim_timezone=$(readlink -f /etc/localtime | sed -n 's|^.*zoneinfo/||p')
+        fi
+    fi
+
+    # Podman must be installed; the exit status of the command decides.
+    # (Its version line is printed once here and once in the message below.)
+    if ! podman --version; then
+        echo -e "${RED} Podman is not installed.${NC}"
+        exit 1
+    fi
+    echo -e "${BLUE} Podman is installed. Version: $(podman --version)${NC}"
+
+    # Make podman.socket start at boot
+    echo -e "${BLUE} Enabling podman.socket...${NC}"
+    systemctl enable podman.socket
+
+    # ...and start it right away
+    echo -e "${BLUE} Starting podman.socket...${NC}"
+    systemctl start podman.socket
+
+    # Final status message (printed regardless of the systemctl exit codes)
+    echo -e "${GREEN} Podman socket has been enabled and started.${NC}"
+}
+
+# Verifies that everything an existing Omnia configuration needs still exists.
+# Any missing file or directory under $omnia_path is reported together with
+# recovery instructions, and the script exits with status 1.
+check_required_directories() {
+    required_paths=(
+        "$omnia_path/omnia"
+        "$omnia_path/omnia/ssh_config/.ssh"
+        "$omnia_path/omnia/log/core/container"
+        "$omnia_path/omnia/hosts"
+        "$omnia_path/omnia/pulp/pulp_ha"
+    )
+    missing_paths=()
+
+    # -e is true for files as well as directories
+    for path in "${required_paths[@]}"; do
+        [ -e "$path" ] || missing_paths+=("$path")
+    done
+
+    # Nothing missing: return successfully without printing anything.
+    (( ${#missing_paths[@]} == 0 )) && return 0
+
+    echo -e "${RED}Error: The following required files or directories are missing:${NC}"
+    echo -e "${RED}${missing_paths[*]}${NC}"
+    echo -e "User can not Retain Existing configuration"
+    echo
+    echo -e "${YELLOW}Instructions:${NC}"
+    echo -e "${YELLOW}* Backup any existing files if required${NC}"
+    echo -e "${YELLOW}* Run ./omnia.sh --install and choose:${NC}"
+    echo -e "${YELLOW}    Options:${NC}"
+    echo -e "${YELLOW}      -> Reinstall the container${NC}"
+    echo -e "${YELLOW}      -> Overwrite and create new configuration${NC}"
+    exit 1
+}
+
+# Sets up the Omnia core container.
+# Writes a Quadlet .container file for omnia_core, creates the oim_metadata.yml
+# file in the shared path, starts the generated systemd service, waits up to
+# 30 seconds for the container to run and opens port 2222/tcp in firewalld.
+setup_container() {
+    container_name="omnia_core"
+    echo "==> Setting up $container_name container"
+
+    # SELinux option handling: no :z relabel suffix on volumes for external NFS
+    selinux_option=":z"
+    if [ "$share_option" = "NFS" ] && [ "$nfs_type" = "external" ]; then
+        selinux_option=""
+    fi
+
+    # Check if RHEL subscription is enabled
+    subscription_enabled=false
+    if [ -d "/etc/pki/entitlement" ] && [ "$(ls -A /etc/pki/entitlement/*.pem 2>/dev/null)" ]; then
+        subscription_enabled=true
+    fi
+
+    # --- Generate Quadlet container file ---
+    cat > /etc/containers/systemd/${container_name}.container <<EOF
+# ===============================================================
+# $container_name Quadlet Service
+# Generated dynamically by omnia.sh
+# ===============================================================
+[Unit]
+Description=${container_name^} Container
+
+[Container]
+ContainerName=${container_name}
+HostName=${container_name}
+Image=${container_name}:1.1
+Network=host
+
+# Capabilities
+AddCapability=CAP_AUDIT_WRITE
+
+# Volumes
+Volume=${omnia_path}/omnia:/opt/omnia${selinux_option}
+Volume=${omnia_path}/omnia/ssh_config/.ssh:/root/.ssh${selinux_option}
+Volume=${omnia_path}/omnia/log/core/container:/var/log${selinux_option}
+Volume=${omnia_path}/omnia/hosts:/etc/hosts${selinux_option}
+Volume=${omnia_path}/omnia/pulp/pulp_ha:/root/.config/pulp${selinux_option}
+EOF
+
+    # Add subscription volume mounts only if subscription is enabled
+    if [ "$subscription_enabled" = true ]; then
+        cat >> /etc/containers/systemd/${container_name}.container <<EOF
+Volume=/etc/pki/entitlement:/etc/pki/entitlement:ro,z
+Volume=/etc/yum.repos.d/redhat.repo:/etc/yum.repos.d/redhat.repo:ro,z
+EOF
+    fi
+
+    cat >> /etc/containers/systemd/${container_name}.container <<EOF
+
+[Service]
+Restart=always
+
+[Install]
+WantedBy=multi-user.target default.target
+
+EOF
+
+    # Create the .data directory on the HOST side of the /opt/omnia volume.
+    # OMNIA_METADATA_* are container paths and only match the host when omnia_path is /opt.
+    echo -e "${GREEN} Creating the .data directory if it does not exist.${NC}"
+    mkdir -p "$omnia_path/omnia/.data"
+
+    oim_metadata_file="$omnia_path/omnia/.data/oim_metadata.yml"
+
+    if [ ! -f "$oim_metadata_file" ]; then
+        echo -e "${GREEN} Creating oim_metadata file${NC}"
+        {
+            echo "oim_crt: \"podman\""
+            echo "oim_shared_path: $omnia_path"
+            echo "omnia_version: $omnia_release"
+            echo "oim_hostname: $(hostname)"
+            echo "oim_node_name: $(hostname -s)"
+            echo "domain_name: $domain_name"
+            echo "oim_timezone: $oim_timezone"
+            echo "omnia_core_hashed_passwd: $hashed_passwd"
+            echo "omnia_share_option: $share_option"
+        } >> "$oim_metadata_file"
+        if [ "$share_option" = "NFS" ]; then
+            {
+            echo "nfs_server_ip: $nfs_server_ip"
+            echo "nfs_server_share_path: $nfs_server_share_path"
+            echo "nfs_type: $nfs_type"
+        } >> "$oim_metadata_file"
+        fi
+    fi
+
+    # --- Remove old service if exists ---
+    if systemctl list-unit-files | grep -q "${container_name}.service"; then
+        systemctl stop ${container_name}.service
+        systemctl disable ${container_name}.service
+        rm -f /etc/systemd/system/${container_name}.service
+    fi
+
+    # --- Reload systemd so Quadlet generates the service ---
+    systemctl daemon-reexec
+    systemctl daemon-reload
+    systemctl start ${container_name}.service
+
+    # --- Start the container via Quadlet ---
+    echo "==> ${container_name} container deployed and starting via Quadlet"
+
+    # --- Wait for container to be running (polls once per second, 30 tries) ---
+    echo "Waiting for $container_name container to start..."
+    for i in {1..30}; do
+        if podman ps --format '{{.Names}}' | grep -qw "$container_name"; then
+            echo "$container_name container is running."
+            break
+        else
+            sleep 1
+        fi
+    done
+
+    if ! podman ps --format '{{.Names}}' | grep -qw "$container_name"; then
+        echo -e "${RED}Error: $container_name container failed to start.${NC}"
+        rm -rf "$oim_metadata_file"
+        exit 1
+    fi
+
+    systemctl start firewalld
+    systemctl enable firewalld
+    firewall-cmd --permanent --zone=public --add-port=2222/tcp
+    firewall-cmd --reload
+}
+
+# post_setup_config: prepare host directories and seed the input directory.
+#  It creates the Ansible tmp directory and the input directory, writes default.yml when it is missing,
+#  moves the input files shipped under /omnia/input into /opt/omnia/input/project_default (via podman exec,
+#  so the omnia_core container must be running), and finally calls init_ssh_config.
+post_setup_config() {
+
+    # Create the ansible tmp directory if it does not exist and set its mode to 757.
+    mkdir -p "$omnia_path/omnia/tmp/.ansible/tmp"
+    chmod 757 "$omnia_path/omnia/tmp/.ansible/tmp"
+    # Create the input directory if it does not exist.
+    echo -e "${GREEN} Creating the input directory if it does not exist.${NC}"
+    mkdir -p "$OMNIA_INPUT_DIR/"
+
+    # Create the default.yml file if it does not exist.
+    # This file contains the name of the project.
+    if [ ! -f "$OMNIA_INPUT_DIR/default.yml" ]; then
+        echo -e "${BLUE} Creating default.yml file.${NC}"
+        {
+            echo "# This file defines the project name."
+            echo "# The name of the project should be set in a directory under input."
+            echo "project_name: project_default"
+        } >> "$OMNIA_INPUT_DIR/default.yml"
+    fi
+
+    # Inside omnia_core: git pull in /omnia, copy /omnia/input/* to /opt/omnia/input/project_default, then delete /omnia/input and /omnia/omnia.sh.
+    podman exec -u root omnia_core bash -c "cd /omnia && git pull"
+    echo -e "${BLUE} Moving input files from /omnia dir to project_default folder.${NC}"
+    podman exec -u root omnia_core bash -c "
+    mkdir -p /opt/omnia/input/project_default
+    cp -r /omnia/input/* /opt/omnia/input/project_default
+    rm -rf /omnia/input
+    rm -rf /omnia/omnia.sh"
+
+    init_ssh_config
+}
+
+validate_nfs_server() {
+    # Verify that root ownership works on the share at $omnia_path (needs no_root_squash).
+    # Only runs when share_option is "NFS"; exits 1 as soon as a check fails.
+    if [ "$share_option" = "NFS" ]; then
+        # Create a temporary file inside $omnia_path
+        temp_file="$omnia_path/temp_file"
+        touch "$temp_file"
+        # Check if the file can be chown to root
+        if chown root:root "$temp_file"; then
+            rm "$temp_file"
+        else
+            echo "Error: Unable to chown file to root in $omnia_path. NFS server permission validation failed. Please ensure no_root_squash option is enabled in the NFS export configuration."
+            exit 1
+        fi
+        # stat prints owner:group directly, so no ls output is parsed and the path stays quoted.
+        if [ "$(stat -c '%U:%G' "$omnia_path/omnia/ssh_config/.ssh/id_rsa")" != "root:root" ]; then
+            echo "Error: The $omnia_path/omnia/ssh_config/.ssh/id_rsa file should be owned by root:root. NFS server permission validation failed. Please verify the NFS export configuration."
+            exit 1
+        fi
+    fi
+}
+
+init_ssh_config() {
+    touch "$HOME/.ssh/known_hosts"
+    # Refresh the known_hosts entry for [localhost]:2222 to prevent errors caused by a stale or missing known host
+    ssh-keygen -R "[localhost]:2222" >/dev/null 2>&1  # Remove existing entry if it exists
+    ssh-keyscan -p 2222 localhost 2>/dev/null | grep -v "^#" >> "$HOME/.ssh/known_hosts"  # Scan and add the new key
+}
+
+start_container_session() {
+    # Print the usage banner below, then open an SSH session to the omnia_core host alias.
+    echo -e "${GREEN}
+    ------------------------------------------------------------------------------------------------------------------------------------------
+            Omnia Core container running successfully.
+
+            Entering the container from Omnia Infrastructure Manager(OIM):
+            Through podman:
+            # podman exec -it -u root omnia_core bash
+
+            Direct SSH:
+            # ssh omnia_core
+
+            You are now in the Omnia environment.
+
+            The following are the main directories available in the Omnia core container:
+
+            - The shared directory, which is mapped to $omnia_path in OIM: /opt/omnia
+            - The input directory: /opt/omnia/input
+            - The Omnia source code directory: /omnia
+            - The Omnia playbooks logs directory: /opt/omnia/log/core/playbooks
+
+            It's important to note:
+                - Files placed in the shared directory should not be manually deleted.
+                - Use the playbook /omnia/utils/oim_cleanup.yml to safely remove the shared directory and Omnia containers (except the core container).
+                - If you need to delete the core container, please run the omnia.sh script with --uninstall option.
+                - If you need to  redeploy the core container with new input configs, please rerun the omnia.sh script with --install option.
+                - Provide any file paths (ISO, mapping files, etc.) that are mentioned in input files in the /opt/omnia directory.
+                - The domain name that will be used for Omnia is $domain_name, if you wish to change the domain name please cleanup Omnia,
+                  change the Omnia Infrastructure Manager's domain name and rerun omnia.sh script with --install option.
+
+    --------------------------------------------------------------------------------------------------------------------------------------------------
+    ${NC}"
+
+    # Enter the Omnia core container over SSH; this call blocks until the session ends.
+    ssh omnia_core
+}
+
+show_help() {  # Print the command-line usage, one line per printf argument
+    printf '%s\n' "Usage: $0 [--install | --uninstall | --upgrade | --version | --help]" \
+        "  -i, --install     Install and start the Omnia core container" \
+        "  -u, --uninstall   Uninstall the Omnia core container and clean up configuration" \
+        "      --upgrade     Upgrade the Omnia core container from image tag 1.0 to 1.1" \
+        "  -v, --version     Display Omnia version information" \
+        "  -h, --help        More information about usage"
+}
+
+install_omnia_core() {
+    local omnia_core_tag="1.1"
+    local omnia_core_registry=""
+    # Install flow: require a local omnia_core image, refuse while other Omnia containers exist, then set up, enter or reinstall the core container.
+    # Check if local omnia_core:1.1 exists
+    if podman inspect omnia_core:${omnia_core_tag} >/dev/null 2>&1; then
+        echo -e "${GREEN}✓ Omnia core image (omnia_core:${omnia_core_tag}) found locally.${NC}"
+    # Check if latest exists for backward compatibility
+    elif podman inspect omnia_core:latest >/dev/null 2>&1; then
+        echo -e "${GREEN}✓ Omnia core image (omnia_core:latest) found locally.${NC}"
+        # Tag it as 1.1 for consistency
+        podman tag omnia_core:latest omnia_core:${omnia_core_tag}
+    else
+        echo -e "${RED}ERROR: Omnia core image (omnia_core:${omnia_core_tag}) not found locally.${NC}"
+        echo -e "${YELLOW}Omnia no longer pulls images from Docker Hub. Build/load the image locally and retry.${NC}"
+        echo ""
+        echo -e "${YELLOW}One way to build the image locally:${NC}"
+        echo -e "1. Clone the Omnia Artifactory repository:"
+        echo -e "   git clone https://github.com/dell/omnia-artifactory -b omnia-container"
+        echo -e "2. Navigate to the repository directory:"
+        echo -e "   cd omnia-artifactory"
+        echo -e "3. Build the core image locally (loads into local Podman by default):"
+        echo -e "   ./build_images.sh core omnia_branch=<version/branch_name>"
+        echo ""
+        echo -e "${YELLOW}Then re-run:${NC}"
+        echo -e "   ./omnia.sh --install"
+        exit 1
+    fi
+
+    # Check if any other containers with 'omnia' in their name exist (podman ps -a also lists stopped ones)
+    other_containers=$(podman ps -a --format '{{.Names}}' | grep -E 'omnia' | grep -v 'omnia_core')
+
+    # If there are any, exit
+    if [ -n "$other_containers" ]; then
+        echo -e "${RED} Failed to initiate omnia_core container cleanup. There are other omnia containers running.${NC}"
+        echo -e "${GREEN} Execute oim_cleanup.yml first to cleanup all containers.${NC}"
+        ssh omnia_core
+        exit 1
+    fi
+
+    # Check if the omnia_core container already exists (in any state)
+    running_containers=$(podman ps -a --format '{{.Names}} {{.State}}' | grep -E 'omnia_core')
+    core_container_status=false  # reset so a value inherited from the environment cannot fake an existing container
+    # If yes, set the variable to true
+    if [ -n "$running_containers" ]; then
+        core_container_status=true
+    fi
+
+    # If core container exists
+    if [ "$core_container_status" = true ]; then
+        if [ -n "$(echo "$running_containers" | grep -E 'running')" ]; then
+            echo -e "${GREEN} Omnia core container is already running.${NC}"
+            echo -e "${GREEN} Do you want to:${NC}"
+            PS3="Select the option number: "
+
+            select opt in "Enter omnia_core container" "Reinstall the container" "Exit"; do
+                case $opt in
+                    "Enter omnia_core container")
+                        choice=1
+                        break
+                        ;;
+                    "Reinstall the container")
+                        choice=2
+                        break
+                        ;;
+                    "Exit")
+                        echo "Exiting the script."
+                        exit 0
+                        ;;
+                    *)
+                        echo "Invalid choice. Please try again."
+                        continue
+                        ;;
+                esac
+            done
+
+            # If the user wants to enter omnia_core container
+            if [ "$choice" = "1" ]; then
+                start_container_session
+            fi
+            # If the user wants to reinstall, call the remove_container function, and then call the setup_omnia_core function
+            if [ "$choice" = "2" ]; then
+                # Block if critical service containers exist
+                critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd')
+                if [ -n "$critical_running" ]; then
+                    echo -e "${RED}Failed to initiate omnia_core container cleanup. There are other critical service containers still running:${NC}"
+                    echo "$critical_running"
+                    echo -e "${GREEN}Run oim_cleanup.yml first to cleanup all containers.${NC}"
+                    exit 1
+                fi
+                echo -e "${GREEN} What configuration do you want to use for reinstallation:${NC}"
+
+                PS3="Select the option number: "
+
+                select opt in "Retain Existing configuration" "Overwrite and create new configuration" "Exit"; do
+                    case $opt in
+                        "Retain Existing configuration")
+                            choice=1
+                            break
+                            ;;
+                        "Overwrite and create new configuration")
+                            choice=2
+                            break
+                            ;;
+                        "Exit")
+                            echo "Exiting the script."
+                            exit 0
+                            ;;
+                        *)
+                            echo "Invalid choice. Please try again."
+                            continue
+                            ;;
+                    esac
+                done
+
+                # If the user wants to retain existing configuration, call the remove_container function
+                if [ "$choice" = "1" ]; then
+                    fetch_config
+                    check_required_directories
+                    remove_container
+                    setup_container
+                    init_ssh_config
+                    start_container_session
+                # If the user wants to overwrite and create new configuration, call the cleanup_omnia_core function
+                elif [ "$choice" = "2" ]; then
+                    cleanup_omnia_core
+                    setup_omnia_core
+                fi
+            fi
+        else
+            # If omnia_core container exists and is not running call the remove_container function
+
+            echo -e "${RED} The Omnia Core container is present but not in running state.${NC}"
+            echo -e "${GREEN} Only the core container cleanup can be performed.${NC}"
+            echo -e "${GREEN} Container Configurations in the shared directory will not be cleaned up.${NC}"
+            echo -e "${GREEN} Do you want to perform cleanup:${NC}"
+            echo -e "${GREEN} 1. Yes.${NC}"
+            echo -e "${GREEN} 2. No. ${NC}"
+            read -p " Enter your choice (1 or 2): " choice
+            if [ "$choice" = "1" ]; then
+                remove_container
+            elif [ "$choice" = "2" ]; then
+                exit
+            fi
+        fi
+
+    # If core container is not present
+    else
+
+        # Start the container setup
+        echo -e "${GREEN}Starting Omnia core container setup.${NC}"
+        setup_omnia_core
+    fi
+}
+
+# Check if Omnia core container is running
+check_container_status() {
+    # Check if the Omnia core container is running
+    if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
+        echo -e "${RED}ERROR: Omnia core container is not running.${NC}"
+        exit 1
+    fi
+}
+
+# Function to display version information; exits 0 on success, 1 when the version cannot be determined
+display_version() {
+    # check_container_status exits 1 unless the Omnia core container is running
+    check_container_status
+
+    # Fetch the metadata from the oim_metadata.yml file in the container
+    echo -e "${GREEN} Fetching metadata from omnia_core container...${NC}"
+    core_config=$(podman exec omnia_core /bin/bash -c 'cat /opt/omnia/.data/oim_metadata.yml')
+
+    # Extract Omnia version; the key is anchored (as in phase1_validate) so keys that merely contain "omnia_version:" are ignored
+    omnia_version=$(echo "$core_config" | grep "^omnia_version:" | cut -d':' -f2 | tr -d ' \t\n\r')
+    [ -n "$omnia_version" ] || { echo -e "${RED}ERROR: omnia_version not found in oim_metadata.yml.${NC}"; exit 1; }
+    # Display version information
+    echo "Omnia version: $omnia_version"
+
+    # Return exit code 0 on success
+    exit 0
+}
+
+phase1_validate() {
+    local current_image
+    local core_config
+    local previous_omnia_version
+    local shared_path
+    # Pre-upgrade checks: returns 1 at the first failed check, 0 when all pass. Sets the global omnia_path.
+    echo "[INFO] [ORCHESTRATOR] Phase 1: Pre-Upgrade Validation"
+    # Must be root, or have sudo that works without prompting (sudo -n).
+    if [ "$(id -u)" -ne 0 ]; then
+        if ! sudo -n true >/dev/null 2>&1; then
+            echo "[ERROR] [ORCHESTRATOR] Prerequisite failed: run as root or configure passwordless sudo"
+            return 1
+        fi
+    fi
+    # The container must be running because the metadata is read through podman exec.
+    if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
+        echo "[ERROR] [ORCHESTRATOR] Prerequisite failed: omnia_core container is not running"
+        return 1
+    fi
+
+    core_config=$(podman exec omnia_core /bin/bash -c 'cat /opt/omnia/.data/oim_metadata.yml' 2>/dev/null)
+    if [ -z "$core_config" ]; then
+        echo "[ERROR] [ORCHESTRATOR] Unable to read oim_metadata.yml from omnia_core container"
+        return 1
+    fi
+
+    previous_omnia_version=$(echo "$core_config" | grep "^omnia_version:" | cut -d':' -f2 | tr -d ' \t\n\r')
+    if [ -z "$previous_omnia_version" ]; then
+        echo "[ERROR] [ORCHESTRATOR] omnia_version not found in oim_metadata.yml"
+        return 1
+    fi
+    # Only an upgrade from exactly 2.0.0.0 is accepted.
+    if [ "$previous_omnia_version" != "2.0.0.0" ]; then
+        echo "[ERROR] [ORCHESTRATOR] Previous Omnia version mismatch: expected 2.0.0.0, got: $previous_omnia_version"
+        return 1
+    fi
+    # cut -f2- keeps everything after the first ':' so a path containing ':' is not truncated.
+    shared_path=$(echo "$core_config" | grep "^oim_shared_path:" | cut -d':' -f2- | tr -d ' \t\n\r')
+    if [ -z "$shared_path" ]; then
+        echo "[ERROR] [ORCHESTRATOR] oim_shared_path not found in oim_metadata.yml"
+        return 1
+    fi
+    # Publish the shared path globally; phase4_container_swap refuses to run when omnia_path is empty.
+    omnia_path="$shared_path"
+
+    if [ ! -d "$omnia_path" ]; then
+        echo "[ERROR] [ORCHESTRATOR] Shared path from metadata does not exist on host: $omnia_path"
+        return 1
+    fi
+
+    if [ ! -w "$omnia_path" ]; then
+        echo "[ERROR] [ORCHESTRATOR] Permission denied: no write permission on shared path: $omnia_path"
+        return 1
+    fi
+    # Image name of the existing container; it must carry a ":1.0" or "@1.0" suffix.
+    current_image=$(podman inspect omnia_core --format '{{.ImageName}}' 2>/dev/null)
+    if [ -z "$current_image" ]; then
+        echo "[ERROR] [ORCHESTRATOR] Unable to inspect omnia_core container image"
+        return 1
+    fi
+
+    if ! echo "$current_image" | grep -qE '(:|@)1\.0(\b|$)'; then
+        echo "[ERROR] [ORCHESTRATOR] Container version mismatch: expected 1.0, got: $current_image"
+        return 1
+    fi
+
+    echo "[INFO] [ORCHESTRATOR] Container version validated: 1.0 (Omnia 2.0)"
+
+    if [ ! -d "$OMNIA_BASE_DIR" ]; then
+        echo "[ERROR] [ORCHESTRATOR] Mount/path invalid: expected directory not found: $OMNIA_BASE_DIR"
+        echo "[ERROR] [ORCHESTRATOR] Fix: ensure /opt/omnia exists and is mounted (if using external mount)"
+        return 1
+    fi
+
+    if [ ! -w "$OMNIA_BASE_DIR" ]; then
+        echo "[ERROR] [ORCHESTRATOR] Permission denied: no write permission on $OMNIA_BASE_DIR"
+        echo "[ERROR] [ORCHESTRATOR] Fix: run as root or fix permissions on /opt/omnia"
+        return 1
+    fi
+    # The target image must already be available locally; this function never pulls it.
+    if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then
+        echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1"
+        echo "[ERROR] [ORCHESTRATOR] Omnia does not pull from Docker Hub. Build/load the image locally and retry."
+        return 1
+    fi
+
+    echo "[INFO] [ORCHESTRATOR] Phase 1: Validation passed"
+    return 0
+}
+
+phase2_approval() {
+    local backup_base default_backup_dir confirm
+    # Approval gate: show the summary, ask for confirmation, and only then record the backup directory in metadata. Returns 1 on cancel or failure.
+    echo "[INFO] [ORCHESTRATOR] Phase 2: Approval Gate"
+    echo "============================================"
+    echo "OMNIA UPGRADE SUMMARY"
+    echo "============================================"
+    echo "Current Container Tag: 1.0"
+    echo "Target Container Tag:  1.1"
+    echo "Current Omnia Release: 2.0.0.0"
+    echo "Target Omnia Release:  2.1.0.0"
+    echo "New Features:"
+    echo "  - Add and remove node for slurm cluster"
+    echo "  - Additional Package Installation"
+    echo "============================================"
+
+    default_backup_dir="$OMNIA_BACKUPS_DIR/upgrade"
+    backup_base="$default_backup_dir"
+
+    echo "[INFO] [ORCHESTRATOR] Backup destination: $backup_base"
+
+    read -r -p "Proceed with upgrade? (y/N): " confirm
+    if [ "$confirm" != "y" ] && [ "$confirm" != "Y" ]; then
+        echo "[INFO] [ORCHESTRATOR] Upgrade cancelled by user"
+        return 1
+    fi
+
+    if ! update_metadata_upgrade_backup_dir "$backup_base"; then
+        echo "[ERROR] [ORCHESTRATOR] Failed to update upgrade backup directory in metadata"
+        return 1
+    fi
+
+    OMNIA_UPGRADE_BACKUP_PATH="$backup_base"
+    export OMNIA_UPGRADE_BACKUP_PATH
+
+    echo "[INFO] [ORCHESTRATOR] Phase 2: Approval granted"
+    return 0
+}
+
+generate_backup_manifest() {
+    local backup_path="$1"
+    local manifest="$backup_path/manifest.txt"
+    {   # header fields first, then one "  - <path>" line per file (the manifest itself is excluded)
+        printf '%s\n' \
+            "backup_version: 1.0" \
+            "timestamp: $(date -Iseconds)" \
+            "source_container_tag: 1.0" \
+            "target_container_tag: 1.1" \
+            "source_omnia_release: 2.0.x" \
+            "target_omnia_release: 2.1.0.0" \
+            "hostname: $(hostname)" \
+            "" \
+            "files:"
+        find "$backup_path" -type f ! -name "manifest.txt" -exec echo "  - {}" \;
+    } > "$manifest"
+}
+
+verify_backup_integrity() {
+    local backup_path="$1"
+
+    [ -d "$backup_path" ] || return 1
+    [ -d "$backup_path/input" ] || return 1
+    [ -d "$backup_path/metadata" ] || return 1
+    [ -d "$backup_path/configs" ] || return 1
+    [ -f "$backup_path/metadata/oim_metadata.yml" ] || return 1
+    [ -f "$backup_path/manifest.txt" ] || return 1
+
+    return 0
+}
+
+create_backup() {
+    local backup_path="$1"
+    # Phase 3: copy input, metadata and the quadlet file to $1, a path used inside omnia_core. Returns 1 on any failure.
+    echo "[INFO] [ORCHESTRATOR] Phase 3: Backup Creation"
+
+    if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
+        echo "[ERROR] [ORCHESTRATOR] Cannot create backup because omnia_core is not running"
+        return 1
+    fi
+    # The script is double-quoted: $backup_path, $OMNIA_INPUT_DIR and $OMNIA_METADATA_FILE expand here on the host; \$-escaped parts run in the container.
+    if ! podman exec -u root omnia_core bash -c "
+        set -e
+        mkdir -p '$backup_path/input' '$backup_path/metadata' '$backup_path/configs'
+
+        if [ -d '$OMNIA_INPUT_DIR' ]; then
+            cp -a '$OMNIA_INPUT_DIR' '$backup_path/'
+        fi
+
+        if [ ! -f '$OMNIA_METADATA_FILE' ]; then
+            echo '[ERROR] Metadata file not found inside container: $OMNIA_METADATA_FILE' >&2
+            exit 1
+        fi
+        cp -a '$OMNIA_METADATA_FILE' '$backup_path/metadata/oim_metadata.yml'
+
+        ts=\"\$(date -Iseconds)\"
+        hn=\"\$(hostname)\"
+        {
+            echo 'backup_version: 1.0'
+            echo \"timestamp: \$ts\"
+            echo 'source_container_tag: 1.0'
+            echo 'target_container_tag: 1.1'
+            echo 'source_omnia_release: 2.0.x'
+            echo 'target_omnia_release: 2.1.0.0'
+            echo \"hostname: \$hn\"
+        } > '$backup_path/manifest.txt'
+    "; then
+        echo "[ERROR] [ORCHESTRATOR] Failed to create backup inside omnia_core container"
+        return 1
+    fi
+    # The quadlet file lives on the host, so it is copied into the container backup with podman cp (skipped when absent).
+    if [ -f "/etc/containers/systemd/omnia_core.container" ]; then
+        if ! podman cp "/etc/containers/systemd/omnia_core.container" "omnia_core:$backup_path/configs/omnia_core.container" >/dev/null 2>&1; then
+            echo "[ERROR] [ORCHESTRATOR] Failed to backup quadlet container file into container backup path"
+            return 1
+        fi
+    fi
+
+    echo "[INFO] [ORCHESTRATOR] Backup created at: $backup_path"
+    echo "[INFO] [ORCHESTRATOR] Phase 3: Backup completed"
+    return 0
+}
+
+wait_for_container_health() {
+    local timeout="${1:-60}"
+    local attempt
+
+    # Poll once per second, for at most $timeout attempts (default 60), until
+    # omnia_core appears among the running containers. Returns 1 on timeout.
+    for attempt in $(seq 1 "$timeout"); do
+        podman ps --format '{{.Names}}' | grep -qw "omnia_core" && return 0
+        sleep 1
+    done
+    return 1
+}
+
+update_metadata_version() {
+    local metadata_file="$OMNIA_METADATA_FILE"
+
+    if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
+        echo "[ERROR] [ORCHESTRATOR] omnia_core container is not running"
+        return 1
+    fi
+
+    podman exec -u root omnia_core bash -c "
+        set -e
+        if [ ! -f '$OMNIA_METADATA_FILE' ]; then
+            echo '[ERROR] Metadata file not found inside container: $OMNIA_METADATA_FILE' >&2
+            exit 1
+        fi
+        if grep -q '^omnia_version:' '$OMNIA_METADATA_FILE'; then
+            sed -i 's/^omnia_version:.*/omnia_version: 2.1.0.0/' '$OMNIA_METADATA_FILE'
+        else
+            echo 'omnia_version: 2.1.0.0' >> '$OMNIA_METADATA_FILE'
+        fi
+    "
+}
+
+sync_input_to_shared_path() {
+    if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
+        echo "[ERROR] [ORCHESTRATOR] Cannot sync input because omnia_core is not running"
+        return 1
+    fi
+
+    if ! podman exec -u root omnia_core bash -c "
+        set -e
+        if [ -d /omnia/input ]; then
+            mkdir -p /opt/omnia/input/project_default
+            cp -r /omnia/input/* /opt/omnia/input/project_default
+            rm -rf /omnia/input
+        fi
+    "; then
+        echo "[ERROR] [ORCHESTRATOR] Failed to copy /omnia/input to /opt/omnia/input/project_default"
+        return 1
+    fi
+    return 0
+}
+
+phase4_container_swap() {
+    echo "[INFO] [ORCHESTRATOR] Phase 4: Container Swap"
+    # Point the quadlet at omnia_core:1.1, restart the service, then update metadata, input files and SSH known_hosts. Returns 1 on failure.
+    if [ -z "${omnia_path}" ]; then
+        echo "[ERROR] [ORCHESTRATOR] Shared path (omnia_path) is empty. Phase 1 validation may not have run."
+        return 1
+    fi
+
+    if [ ! -f "/etc/containers/systemd/omnia_core.container" ]; then
+        echo "[ERROR] [ORCHESTRATOR] Quadlet file not found: /etc/containers/systemd/omnia_core.container"
+        echo "[ERROR] [ORCHESTRATOR] Cannot proceed with upgrade container swap"
+        return 1
+    fi
+
+    if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then
+        echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1"
+        echo "[ERROR] [ORCHESTRATOR] Omnia does not pull from Docker Hub. Build/load the image locally and retry."
+        return 1
+    fi
+    # Stop the service only after every precondition passed, so a failed check leaves the running container untouched.
+    if systemctl list-unit-files | grep -q "omnia_core.service"; then
+        systemctl stop omnia_core.service >/dev/null 2>&1 || true
+    fi
+
+    if ! sed -i 's/^Image=omnia_core:.*/Image=omnia_core:1.1/' /etc/containers/systemd/omnia_core.container; then
+        echo "[ERROR] [ORCHESTRATOR] Failed to update Image in quadlet file"
+        return 1
+    fi
+    # NOTE(review): with '|' as the s-command delimiter GNU sed reads '\|' as a literal '|', so the substitution below may never match the lines grep found, and the doubled backslashes in escaped_omnia_path would land in the file if it did. Confirm the intended target path before changing it.
+    escaped_omnia_path=$(printf '%s\n' "$omnia_path" | sed 's/[\/&]/\\\\&/g')
+    if grep -q '^Volume=/omnia\(/\|:\)' /etc/containers/systemd/omnia_core.container; then
+        if ! sed -i "s|^Volume=/omnia\(/\|:\)|Volume=${escaped_omnia_path}\\1|g" /etc/containers/systemd/omnia_core.container; then
+            echo "[ERROR] [ORCHESTRATOR] Failed to update Volume paths in quadlet file"
+            return 1
+        fi
+    fi
+
+    systemctl daemon-reload || return 1
+    if ! systemctl restart omnia_core.service; then
+        echo "[ERROR] [ORCHESTRATOR] Failed to restart omnia_core.service"
+        systemctl status omnia_core.service --no-pager -l || true
+        journalctl -xeu omnia_core.service --no-pager | tail -n 120 || true
+        return 1
+    fi
+
+    if ! wait_for_container_health 60; then
+        echo "[ERROR] [ORCHESTRATOR] Container failed health check after swap"
+        return 1
+    fi
+
+    if ! update_metadata_version; then
+        return 1
+    fi
+
+    if ! sync_input_to_shared_path; then
+        return 1
+    fi
+
+    init_ssh_config
+
+    echo "[INFO] [ORCHESTRATOR] Phase 4: Container swap completed"
+    return 0
+}
+
+upgrade_omnia_core() {
+    local lock_file="/var/lock/omnia_core_upgrade.lock"
+    local backup_path
+    # Run the upgrade phases (validate, approve, backup, container swap) under a lock file; every path ends in exit.
+    if [ -e "$lock_file" ]; then
+        echo -e "${RED}ERROR: Upgrade lock exists at $lock_file. Another upgrade may be running.${NC}"
+        exit 1
+    fi
+    # noclobber makes '>' fail when the file already exists, so two racing runs cannot both take the lock.
+    mkdir -p "$(dirname "$lock_file")" 2>/dev/null || true
+    ( set -o noclobber; echo "$$" > "$lock_file" ) 2>/dev/null || {
+        echo -e "${RED}ERROR: Failed to create lock file: $lock_file${NC}"
+        exit 1
+    }
+    trap 'rm -f "$lock_file"' EXIT
+
+    if ! phase1_validate; then
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 1"
+        exit 1
+    fi
+
+    if ! phase2_approval; then
+        exit 0
+    fi
+
+    backup_path="$OMNIA_UPGRADE_BACKUP_PATH"
+    if [ -z "$backup_path" ]; then
+        echo "[ERROR] [ORCHESTRATOR] Backup path is empty"
+        exit 1
+    fi
+
+    if ! create_backup "$backup_path"; then
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 3"
+        exit 1
+    fi
+
+    if ! phase4_container_swap; then
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 4"
+        exit 1
+    fi
+
+    echo "[INFO] [ORCHESTRATOR] Upgrade completed successfully"
+    echo "[INFO] [ORCHESTRATOR] Backup location: $backup_path"
+    exit 0
+}
+
+# Entry point: dispatch on the first command-line argument.
+# --install/-i, --uninstall/-u, --upgrade and --version/-v call the matching function;
+# --help/-h or no argument prints usage; any other value prints usage and exits 1.
+main() {
+    case "$1" in
+        --install|-i)
+            install_omnia_core
+            ;;
+        --uninstall|-u)
+            cleanup_omnia_core
+            ;;
+        --upgrade)
+            upgrade_omnia_core
+            ;;
+        --version|-v)
+            display_version
+            ;;
+        --help|-h|"")
+            show_help
+            ;;
+        *)
+            echo "Unknown option: $1"
+            show_help
+            exit 1
+            ;;
+    esac
+}
+
+# Call the main function; only the first argument is forwarded, any further arguments are ignored
+main "$1"

From 2b69697dfedad273f93fafc887de2a4e0b80824c Mon Sep 17 00:00:00 2001
From: Nagachandan-P <Nagachandan.p@dell.com>
Date: Mon, 9 Feb 2026 06:55:43 +0000
Subject: [PATCH 077/172] reduced inputs in each role

---
 utils/roles/slurm_cleanup/tasks/main.yml      |  18 ---
 .../roles/slurm_config_backup/tasks/main.yml  |  51 ---------
 .../slurm_config_rollback/tasks/main.yml      |  84 --------------
 utils/slurm_config_util.yml                   | 106 +++++++++++++++++-
 4 files changed, 102 insertions(+), 157 deletions(-)

diff --git a/utils/roles/slurm_cleanup/tasks/main.yml b/utils/roles/slurm_cleanup/tasks/main.yml
index 5c59cae2d0..7acd38e571 100644
--- a/utils/roles/slurm_cleanup/tasks/main.yml
+++ b/utils/roles/slurm_cleanup/tasks/main.yml
@@ -1,23 +1,5 @@
 ---
 
-- name: Include variable file omnia_config.yml
-  ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml"
-  tags: slurm_cleanup
-
-- name: Include storage vars
-  ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/storage_config.yml"
-  tags: slurm_cleanup
-
-- name: Set facts for slurm
-  ansible.builtin.set_fact:
-    nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}"
-  tags: slurm_cleanup
-
-- name: Read the slurm mount point
-  ansible.builtin.set_fact:
-    share_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).client_share_path }}"
-  tags: slurm_cleanup
-
 - name: Set slurm_config_path
   ansible.builtin.set_fact:
     slurm_config_path: "{{ share_path }}/{{ slurm_share_dir_name }}"
diff --git a/utils/roles/slurm_config_backup/tasks/main.yml b/utils/roles/slurm_config_backup/tasks/main.yml
index 4d01014180..401a086493 100644
--- a/utils/roles/slurm_config_backup/tasks/main.yml
+++ b/utils/roles/slurm_config_backup/tasks/main.yml
@@ -1,56 +1,5 @@
 ---
 
-- name: Include variable file omnia_config.yml
-  ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml"
-
-- name: Include storage vars
-  ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/storage_config.yml"
-
-- name: Set facts for slurm
-  ansible.builtin.set_fact:
-    nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}"
-
-- name: Read the slurm mount point
-  ansible.builtin.set_fact:
-    share_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).client_share_path }}"
-
-- name: Display resolved slurm share path
-  ansible.builtin.debug:
-    msg: "Resolved share_path={{ share_path }} (nfs_storage_name={{ nfs_storage_name }})"
-
-- name: Slurp remote YAML file
-  ansible.builtin.slurp:
-    src: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir/nodes/nodes.yaml"
-  register: slurped_yaml
-
-- name: Parse YAML into vars
-  ansible.builtin.set_fact:
-    node_yaml: "{{ slurped_yaml.content | b64decode | from_yaml }}"
-
-- name: Read the node name group
-  ansible.builtin.set_fact:
-    name_group_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='group') }}"
-
-- name: Group the functional_groups
-  ansible.builtin.set_fact:
-    tmp_grouped_nodes: "{{ name_group_map | dict2items | groupby('value') }}"
-
-- name: Re-organize the groups
-  ansible.builtin.set_fact:
-    grouped_nodes: "{{ grouped_nodes | default({}) | combine({item[0]: ((item[1] | items2dict).keys() | list)}) }}"
-  loop: "{{ tmp_grouped_nodes }}"
-
-- name: Assign slurm lists
-  ansible.builtin.set_fact:
-    ctld_list: "{{ grouped_nodes | dict2items
-                   | selectattr('key', 'match', '^' ~ 'slurm_control_node_')
-                   | map(attribute='value') | list | flatten }}"
-
-- name: Fail if Slurm controller list is empty
-  ansible.builtin.fail:
-    msg: "Slurm controller functional group is missing from PXE mapping file. Please update the file and rerun."
-  when: ctld_list | length == 0
-
 - name: Set slurm_config_path
   ansible.builtin.set_fact:
     slurm_config_path: "{{ share_path }}/{{ slurm_share_dir_name }}"
diff --git a/utils/roles/slurm_config_rollback/tasks/main.yml b/utils/roles/slurm_config_rollback/tasks/main.yml
index 0a66c096b0..6e185f2028 100644
--- a/utils/roles/slurm_config_rollback/tasks/main.yml
+++ b/utils/roles/slurm_config_rollback/tasks/main.yml
@@ -1,89 +1,5 @@
 ---
 
-- name: Include variable file omnia_config.yml
-  ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml"
-  tags: config_rollback
-
-- name: Include storage vars
-  ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/storage_config.yml"
-  tags: config_rollback
-
-- name: Set facts for slurm
-  ansible.builtin.set_fact:
-    nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}"
-  tags: config_rollback
-
-- name: Read the slurm mount point
-  ansible.builtin.set_fact:
-    share_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).client_share_path }}"
-  tags: config_rollback
-
-- name: Slurp remote YAML file
-  ansible.builtin.slurp:
-    src: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir/nodes/nodes.yaml"
-  register: slurped_yaml
-  tags: config_rollback
-
-- name: Parse YAML into vars
-  ansible.builtin.set_fact:
-    node_yaml: "{{ slurped_yaml.content | b64decode | from_yaml }}"
-  tags: config_rollback
-
-- name: Get name and IP mapping 1
-  ansible.builtin.set_fact:
-    tmp_ip_name_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='interfaces') }}"
-  tags: config_rollback
-
-- name: Get name and IP mapping 2
-  ansible.builtin.set_fact:
-    ip_name_map: "{{ ip_name_map | default({}) | combine({item.key: item.value[0]['ip_addrs'][0]['ip_addr']}) }}"
-  loop: "{{ tmp_ip_name_map | dict2items }}"
-  tags: config_rollback
-
-- name: Read the node name group
-  ansible.builtin.set_fact:
-    name_group_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='group') }}"
-  tags: config_rollback
-
-- name: Group the functional_groups
-  ansible.builtin.set_fact:
-    tmp_grouped_nodes: "{{ name_group_map | dict2items | groupby('value') }}"
-  tags: config_rollback
-
-- name: Re-organize the groups
-  ansible.builtin.set_fact:
-    grouped_nodes: "{{ grouped_nodes | default({}) | combine({item[0]: ((item[1] | items2dict).keys() | list)}) }}"
-  loop: "{{ tmp_grouped_nodes }}"
-  tags: config_rollback
-
-- name: Assign slurm lists
-  ansible.builtin.set_fact:
-    ctld_list: "{{ grouped_nodes | dict2items
-                   | selectattr('key', 'match', '^' ~ 'slurm_control_node_')
-                   | map(attribute='value') | list | flatten }}"
-  tags: config_rollback
-
-- name: Fail if Slurm controller list is empty
-  ansible.builtin.fail:
-    msg: "Slurm controller functional group is missing from PXE mapping file. Please update the file and rerun."
-  when: ctld_list | length == 0
-  tags: config_rollback
-
-- name: Set slurm controller IP
-  ansible.builtin.set_fact:
-    controller_ip: "{{ ip_name_map[ctld_list | first] }}"
-  when: ctld_list | length > 0
-  tags: config_rollback
-
-- name: Add slurm controller as dynamic host
-  ansible.builtin.add_host:
-    name: slurm_controller
-    ansible_host: "{{ controller_ip }}"
-    ansible_user: root
-    ansible_port: 22
-  when: controller_ip is defined
-  tags: config_rollback
-
 - name: Set slurm paths
   ansible.builtin.set_fact:
     slurm_config_path: "{{ share_path }}/{{ slurm_share_dir_name }}"
diff --git a/utils/slurm_config_util.yml b/utils/slurm_config_util.yml
index 7cb5249ccd..fd42e4c202 100644
--- a/utils/slurm_config_util.yml
+++ b/utils/slurm_config_util.yml
@@ -17,10 +17,108 @@
   hosts: oim
   connection: ssh
   gather_facts: true
-  roles:
-    - role: slurm_config_backup
+  tasks:
+    - name: Include variable file omnia_config.yml
+      ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml"
+      tags: always
+
+    - name: Include storage vars
+      ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/storage_config.yml"
+      tags: always
+
+    - name: Set facts for slurm
+      ansible.builtin.set_fact:
+        nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}"
+      tags: always
+
+    - name: Read the slurm mount point
+      ansible.builtin.set_fact:
+        share_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).client_share_path }}"
+      tags: always
+
+    - name: Slurp remote YAML file
+      ansible.builtin.slurp:
+        src: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir/nodes/nodes.yaml"
+      register: slurped_yaml
+      tags: always
+
+    - name: Parse YAML into vars
+      ansible.builtin.set_fact:
+        node_yaml: "{{ slurped_yaml.content | b64decode | from_yaml }}"
+      tags: always
+
+    - name: Get name and IP mapping 1
+      ansible.builtin.set_fact:
+        tmp_ip_name_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='interfaces') }}"
+      tags: always
+
+    - name: Get name and IP mapping 2
+      ansible.builtin.set_fact:
+        ip_name_map: "{{ ip_name_map | default({}) | combine({item.key: item.value[0]['ip_addrs'][0]['ip_addr']}) }}"
+      loop: "{{ tmp_ip_name_map | dict2items }}"
+      tags: always
+
+    - name: Read the node name group
+      ansible.builtin.set_fact:
+        name_group_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='group') }}"
+      tags: always
+
+    - name: Group the functional_groups
+      ansible.builtin.set_fact:
+        tmp_grouped_nodes: "{{ name_group_map | dict2items | groupby('value') }}"
+      tags: always
+
+    - name: Re-organize the groups
+      ansible.builtin.set_fact:
+        grouped_nodes: "{{ grouped_nodes | default({}) | combine({item[0]: ((item[1] | items2dict).keys() | list)}) }}"
+      loop: "{{ tmp_grouped_nodes }}"
+      tags: always
+
+    - name: Assign slurm lists
+      ansible.builtin.set_fact:
+        ctld_list: "{{ grouped_nodes | dict2items
+                       | selectattr('key', 'match', '^' ~ 'slurm_control_node_')
+                       | map(attribute='value') | list | flatten }}"
+      tags: always
+
+    - name: Fail if Slurm controller list is empty
+      ansible.builtin.fail:
+        msg: "Slurm controller functional group is missing from PXE mapping file. Please update the file and rerun."
+      when: ctld_list | length == 0
+      tags: always
+
+    - name: Set slurm controller IP
+      ansible.builtin.set_fact:
+        controller_ip: "{{ ip_name_map[ctld_list | first] }}"
+      when: ctld_list | length > 0
+      tags: always
+
+    - name: Add slurm controller as dynamic host
+      ansible.builtin.add_host:
+        name: slurm_controller
+        ansible_host: "{{ controller_ip }}"
+        ansible_user: root
+        ansible_port: 22
+      when: controller_ip is defined
+      tags: always
+
+    - name: Run slurm config backup
+      ansible.builtin.include_role:
+        name: slurm_config_backup
+        apply:
+          tags: config_backup
       tags: config_backup
-    - role: slurm_cleanup
+
+    - name: Run slurm cleanup
+      ansible.builtin.include_role:
+        name: slurm_cleanup
+        apply:
+          tags: slurm_cleanup
       tags: slurm_cleanup
-    - role: slurm_config_rollback
+
+    - name: Run slurm config rollback
+      ansible.builtin.include_role:
+        name: slurm_config_rollback
+        apply:
+          tags: config_rollback
       tags: config_rollback

From b31391ddcca48280d11d96954d724a0bc7ba2ed8 Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Mon, 9 Feb 2026 12:28:16 +0530
Subject: [PATCH 078/172] slurm parameters json for reference in schema, not yet
 actively used in code

---
 .../schema/slurm_config_parameters.json       | 501 ++++++++++++++++++
 1 file changed, 501 insertions(+)
 create mode 100644 common/library/module_utils/input_validation/schema/slurm_config_parameters.json

diff --git a/common/library/module_utils/input_validation/schema/slurm_config_parameters.json b/common/library/module_utils/input_validation/schema/slurm_config_parameters.json
new file mode 100644
index 0000000000..19480de228
--- /dev/null
+++ b/common/library/module_utils/input_validation/schema/slurm_config_parameters.json
@@ -0,0 +1,501 @@
+{
+  "slurm.conf": {
+    "AccountingStorageBackupHost": "S_P_STRING",
+    "AccountingStorageEnforce": "S_P_STRING",
+    "AccountingStorageExternalHost": "S_P_STRING",
+    "AccountingStorageHost": "S_P_STRING",
+    "AccountingStorageParameters": "S_P_STRING",
+    "AccountingStoragePass": "S_P_STRING",
+    "AccountingStoragePort": "S_P_UINT16",
+    "AccountingStorageTRES": "S_P_STRING",
+    "AccountingStorageType": "S_P_STRING",
+    "AccountingStorageUser": "S_P_STRING",
+    "AccountingStoreFlags": "S_P_STRING",
+    "AccountingStoreJobComment": "S_P_BOOLEAN",
+    "AcctGatherEnergyType": "S_P_STRING",
+    "AcctGatherFilesystemType": "S_P_STRING",
+    "AcctGatherInfinibandType": "S_P_STRING",
+    "AcctGatherInterconnectType": "S_P_STRING",
+    "AcctGatherNodeFreq": "S_P_UINT16",
+    "AcctGatherProfileType": "S_P_STRING",
+    "AllowSpecResourcesUsage": "S_P_BOOLEAN",
+    "AuthAltParameters": "S_P_STRING",
+    "AuthAltTypes": "S_P_STRING",
+    "AuthInfo": "S_P_STRING",
+    "AuthType": "S_P_STRING",
+    "BackupAddr": "S_P_STRING",
+    "BackupController": "S_P_STRING",
+    "BatchStartTimeout": "S_P_UINT16",
+    "BcastExclude": "S_P_STRING",
+    "BcastParameters": "S_P_STRING",
+    "BurstBufferParameters": "S_P_STRING",
+    "BurstBufferType": "S_P_STRING",
+    "CertgenType": "S_P_STRING",
+    "CertgenParameters": "S_P_STRING",
+    "CertmgrType": "S_P_STRING",
+    "CertmgrParameters": "S_P_STRING",
+    "CliFilterParameters": "S_P_STRING",
+    "CliFilterPlugins": "S_P_STRING",
+    "ClusterName": "S_P_STRING",
+    "CommunicationParameters": "S_P_STRING",
+    "CompleteWait": "S_P_UINT16",
+    "ControlAddr": "S_P_STRING",
+    "ControlMachine": "S_P_STRING",
+    "CoreSpecPlugin": "S_P_STRING",
+    "CpuFreqDef": "S_P_STRING",
+    "CpuFreqGovernors": "S_P_STRING",
+    "CredType": "S_P_STRING",
+    "CryptoType": "S_P_STRING",
+    "DataParserParameters": "S_P_STRING",
+    "DebugFlags": "S_P_STRING",
+    "DefCPUPerGPU": "S_P_UINT64",
+    "DefMemPerCPU": "S_P_UINT64",
+    "DefMemPerGPU": "S_P_UINT64",
+    "DefMemPerNode": "S_P_UINT64",
+    "DependencyParameters": "S_P_STRING",
+    "DisableRootJobs": "S_P_BOOLEAN",
+    "EioTimeout": "S_P_UINT16",
+    "EnforcePartLimits": "S_P_STRING",
+    "Epilog": "S_P_ARRAY",
+    "EpilogMsgTime": "S_P_UINT32",
+    "EpilogSlurmctld": "S_P_ARRAY",
+    "EpilogTimeout": "S_P_UINT16",
+    "ExtSensorsFreq": "S_P_UINT16",
+    "ExtSensorsType": "S_P_STRING",
+    "FairShareDampeningFactor": "S_P_UINT16",
+    "FastSchedule": "S_P_UINT16",
+    "FederationParameters": "S_P_STRING",
+    "FirstJobId": "S_P_UINT32",
+    "GetEnvTimeout": "S_P_UINT16",
+    "GpuFreqDef": "S_P_STRING",
+    "GresTypes": "S_P_STRING",
+    "GroupUpdateForce": "S_P_UINT16",
+    "GroupUpdateTime": "S_P_UINT16",
+    "HashPlugin": "S_P_STRING",
+    "HealthCheckInterval": "S_P_UINT16",
+    "HealthCheckNodeState": "S_P_STRING",
+    "HealthCheckProgram": "S_P_STRING",
+    "HttpParserType": "S_P_STRING",
+    "InactiveLimit": "S_P_UINT16",
+    "InteractiveStepOptions": "S_P_STRING",
+    "JobAcctGatherFrequency": "S_P_STRING",
+    "JobAcctGatherParams": "S_P_STRING",
+    "JobAcctGatherType": "S_P_STRING",
+    "JobCompHost": "S_P_STRING",
+    "JobCompLoc": "S_P_STRING",
+    "JobCompParams": "S_P_STRING",
+    "JobCompPass": "S_P_STRING",
+    "JobCompPassScript": "S_P_STRING",
+    "JobCompPort": "S_P_UINT32",
+    "JobCompType": "S_P_STRING",
+    "JobCompUser": "S_P_STRING",
+    "JobContainerType": "S_P_STRING",
+    "JobCredentialPrivateKey": "S_P_STRING",
+    "JobCredentialPublicCertificate": "S_P_STRING",
+    "JobFileAppend": "S_P_UINT16",
+    "JobRequeue": "S_P_UINT16",
+    "JobSubmitPlugins": "S_P_STRING",
+    "KeepAliveTime": "S_P_UINT32",
+    "KillOnBadExit": "S_P_UINT16",
+    "KillWait": "S_P_UINT16",
+    "LaunchParameters": "S_P_STRING",
+    "LaunchType": "S_P_STRING",
+    "Licenses": "S_P_STRING",
+    "LogTimeFormat": "S_P_STRING",
+    "MailDomain": "S_P_STRING",
+    "MailProg": "S_P_STRING",
+    "MaxArraySize": "S_P_UINT32",
+    "MaxBatchRequeue": "S_P_UINT32",
+    "MaxDBDMsgs": "S_P_UINT32",
+    "MaxJobCount": "S_P_UINT32",
+    "MaxJobId": "S_P_UINT32",
+    "MaxMemPerCPU": "S_P_UINT64",
+    "MaxMemPerNode": "S_P_UINT64",
+    "MaxNodeCount": "S_P_UINT32",
+    "MaxStepCount": "S_P_UINT32",
+    "MaxTasksPerNode": "S_P_UINT16",
+    "MCSParameters": "S_P_STRING",
+    "MCSPlugin": "S_P_STRING",
+    "MessageTimeout": "S_P_UINT16",
+    "MetricsType": "S_P_STRING",
+    "MinJobAge": "S_P_UINT32",
+    "MpiDefault": "S_P_STRING",
+    "MpiParams": "S_P_STRING",
+    "NamespaceType": "S_P_STRING",
+    "NodeFeaturesPlugins": "S_P_STRING",
+    "OverTimeLimit": "S_P_UINT16",
+    "PluginDir": "S_P_STRING",
+    "PlugStackConfig": "S_P_STRING",
+    "PowerParameters": "S_P_STRING",
+    "PowerPlugin": "S_P_STRING",
+    "PreemptExemptTime": "S_P_STRING",
+    "PreemptMode": "S_P_STRING",
+    "PreemptParameters": "S_P_STRING",
+    "PreemptType": "S_P_STRING",
+    "PrEpParameters": "S_P_STRING",
+    "PrEpPlugins": "S_P_STRING",
+    "PriorityCalcPeriod": "S_P_STRING",
+    "PriorityDecayHalfLife": "S_P_STRING",
+    "PriorityFavorSmall": "S_P_BOOLEAN",
+    "PriorityFlags": "S_P_STRING",
+    "PriorityMaxAge": "S_P_STRING",
+    "PriorityParameters": "S_P_STRING",
+    "PrioritySiteFactorParameters": "S_P_STRING",
+    "PrioritySiteFactorPlugin": "S_P_STRING",
+    "PriorityType": "S_P_STRING",
+    "PriorityUsageResetPeriod": "S_P_STRING",
+    "PriorityWeightAge": "S_P_UINT32",
+    "PriorityWeightAssoc": "S_P_UINT32",
+    "PriorityWeightFairshare": "S_P_UINT32",
+    "PriorityWeightJobSize": "S_P_UINT32",
+    "PriorityWeightPartition": "S_P_UINT32",
+    "PriorityWeightQOS": "S_P_UINT32",
+    "PriorityWeightTRES": "S_P_STRING",
+    "PrivateData": "S_P_STRING",
+    "ProctrackType": "S_P_STRING",
+    "Prolog": "S_P_ARRAY",
+    "PrologEpilogTimeout": "S_P_UINT16",
+    "PrologFlags": "S_P_STRING",
+    "PrologSlurmctld": "S_P_ARRAY",
+    "PrologTimeout": "S_P_UINT16",
+    "PropagatePrioProcess": "S_P_UINT16",
+    "PropagateResourceLimits": "S_P_STRING",
+    "PropagateResourceLimitsExcept": "S_P_STRING",
+    "RebootProgram": "S_P_STRING",
+    "ReconfigFlags": "S_P_STRING",
+    "RequeueExit": "S_P_STRING",
+    "RequeueExitHold": "S_P_STRING",
+    "ResumeFailProgram": "S_P_STRING",
+    "ResumeProgram": "S_P_STRING",
+    "ResumeRate": "S_P_UINT16",
+    "ResumeTimeout": "S_P_UINT16",
+    "ResvEpilog": "S_P_STRING",
+    "ResvOverRun": "S_P_UINT16",
+    "ResvProlog": "S_P_STRING",
+    "ReturnToService": "S_P_UINT16",
+    "RoutePlugin": "S_P_STRING",
+    "SallocDefaultCommand": "S_P_STRING",
+    "SbcastParameters": "S_P_STRING",
+    "SchedulerParameters": "S_P_STRING",
+    "SchedulerTimeSlice": "S_P_UINT16",
+    "SchedulerType": "S_P_STRING",
+    "ScronParameters": "S_P_STRING",
+    "SelectType": "S_P_STRING",
+    "SelectTypeParameters": "S_P_STRING",
+    "SlurmctldAddr": "S_P_STRING",
+    "SlurmctldDebug": "S_P_STRING",
+    "SlurmctldLogFile": "S_P_STRING",
+    "SlurmctldParameters": "S_P_STRING",
+    "SlurmctldPidFile": "S_P_STRING",
+    "SlurmctldPort": "S_P_STRING",
+    "SlurmctldPrimaryOffProg": "S_P_STRING",
+    "SlurmctldPrimaryOnProg": "S_P_STRING",
+    "SlurmctldSyslogDebug": "S_P_STRING",
+    "SlurmctldTimeout": "S_P_UINT16",
+    "SlurmdDebug": "S_P_STRING",
+    "SlurmdLogFile": "S_P_STRING",
+    "SlurmdParameters": "S_P_STRING",
+    "SlurmdPidFile": "S_P_STRING",
+    "SlurmdPort": "S_P_UINT32",
+    "SlurmdSpoolDir": "S_P_STRING",
+    "SlurmdSyslogDebug": "S_P_STRING",
+    "SlurmdTimeout": "S_P_UINT16",
+    "SlurmdUser": "S_P_STRING",
+    "SlurmSchedLogFile": "S_P_STRING",
+    "SlurmSchedLogLevel": "S_P_UINT16",
+    "SlurmUser": "S_P_STRING",
+    "SrunEpilog": "S_P_STRING",
+    "SrunPortRange": "S_P_STRING",
+    "SrunProlog": "S_P_STRING",
+    "StateSaveLocation": "S_P_STRING",
+    "SuspendExcNodes": "S_P_STRING",
+    "SuspendExcParts": "S_P_STRING",
+    "SuspendExcStates": "S_P_STRING",
+    "SuspendProgram": "S_P_STRING",
+    "SuspendRate": "S_P_UINT16",
+    "SuspendTime": "S_P_STRING",
+    "SuspendTimeout": "S_P_UINT16",
+    "SwitchParameters": "S_P_STRING",
+    "SwitchType": "S_P_STRING",
+    "TaskEpilog": "S_P_STRING",
+    "TaskPlugin": "S_P_STRING",
+    "TaskPluginParam": "S_P_STRING",
+    "TaskProlog": "S_P_STRING",
+    "TCPTimeout": "S_P_UINT16",
+    "TLSParameters": "S_P_STRING",
+    "TLSType": "S_P_STRING",
+    "TmpFS": "S_P_STRING",
+    "TopologyParam": "S_P_STRING",
+    "TopologyPlugin": "S_P_STRING",
+    "TrackWCKey": "S_P_BOOLEAN",
+    "TreeWidth": "S_P_UINT16",
+    "UnkillableStepProgram": "S_P_STRING",
+    "UnkillableStepTimeout": "S_P_UINT16",
+    "UrlParserType": "S_P_STRING",
+    "UsePAM": "S_P_BOOLEAN",
+    "VSizeFactor": "S_P_UINT16",
+    "WaitTime": "S_P_UINT16",
+    "X11Parameters": "S_P_STRING",
+    "DownNodes": "S_P_ARRAY",
+    "NodeName": "S_P_ARRAY",
+    "NodeSet": "S_P_ARRAY",
+    "PartitionName": "S_P_ARRAY",
+    "SlurmctldHost": "S_P_ARRAY"
+  },
+  "slurmdbd.conf": {
+    "AllowNoDefAcct": "S_P_BOOLEAN",
+    "AllResourcesAbsolute": "S_P_BOOLEAN",
+    "ArchiveDir": "S_P_STRING",
+    "ArchiveEvents": "S_P_BOOLEAN",
+    "ArchiveJobs": "S_P_BOOLEAN",
+    "ArchiveResvs": "S_P_BOOLEAN",
+    "ArchiveScript": "S_P_STRING",
+    "ArchiveSteps": "S_P_BOOLEAN",
+    "ArchiveSuspend": "S_P_BOOLEAN",
+    "ArchiveTXN": "S_P_BOOLEAN",
+    "ArchiveUsage": "S_P_BOOLEAN",
+    "AuthAltTypes": "S_P_STRING",
+    "AuthAltParameters": "S_P_STRING",
+    "AuthInfo": "S_P_STRING",
+    "AuthType": "S_P_STRING",
+    "CommitDelay": "S_P_UINT16",
+    "CommunicationParameters": "S_P_STRING",
+    "DbdAddr": "S_P_STRING",
+    "DbdBackupHost": "S_P_STRING",
+    "DbdHost": "S_P_STRING",
+    "DbdPort": "S_P_UINT16",
+    "DebugFlags": "S_P_STRING",
+    "DebugLevel": "S_P_STRING",
+    "DebugLevelSyslog": "S_P_STRING",
+    "DefaultQOS": "S_P_STRING",
+    "DisableCoordDBD": "S_P_BOOLEAN",
+    "DisableArchiveCommands": "S_P_BOOLEAN",
+    "HashPlugin": "S_P_STRING",
+    "JobPurge": "S_P_UINT32",
+    "LogFile": "S_P_STRING",
+    "LogTimeFormat": "S_P_STRING",
+    "MaxPurgeLimit": "S_P_UINT32",
+    "MaxQueryTimeRange": "S_P_STRING",
+    "MessageTimeout": "S_P_UINT16",
+    "Parameters": "S_P_STRING",
+    "PidFile": "S_P_STRING",
+    "PluginDir": "S_P_STRING",
+    "PrivateData": "S_P_STRING",
+    "PurgeEventAfter": "S_P_STRING",
+    "PurgeJobAfter": "S_P_STRING",
+    "PurgeResvAfter": "S_P_STRING",
+    "PurgeStepAfter": "S_P_STRING",
+    "PurgeSuspendAfter": "S_P_STRING",
+    "PurgeTXNAfter": "S_P_STRING",
+    "PurgeUsageAfter": "S_P_STRING",
+    "PurgeEventMonths": "S_P_UINT32",
+    "PurgeJobMonths": "S_P_UINT32",
+    "PurgeStepMonths": "S_P_UINT32",
+    "PurgeSuspendMonths": "S_P_UINT32",
+    "PurgeTXNMonths": "S_P_UINT32",
+    "PurgeUsageMonths": "S_P_UINT32",
+    "SlurmUser": "S_P_STRING",
+    "StepPurge": "S_P_UINT32",
+    "StorageBackupHost": "S_P_STRING",
+    "StorageHost": "S_P_STRING",
+    "StorageLoc": "S_P_STRING",
+    "StorageParameters": "S_P_STRING",
+    "StoragePass": "S_P_STRING",
+    "StoragePassScript": "S_P_STRING",
+    "StoragePort": "S_P_UINT16",
+    "StorageType": "S_P_STRING",
+    "StorageUser": "S_P_STRING",
+    "TCPTimeout": "S_P_UINT16",
+    "TLSParameters": "S_P_STRING",
+    "TLSType": "S_P_STRING",
+    "TrackWCKey": "S_P_BOOLEAN",
+    "TrackSlurmctldDown": "S_P_BOOLEAN"
+  },
+  "cgroup.conf": {
+    "CgroupAutomount": "S_P_BOOLEAN",
+    "CgroupMountpoint": "S_P_STRING",
+    "CgroupSlice": "S_P_STRING",
+    "ConstrainCores": "S_P_BOOLEAN",
+    "ConstrainRAMSpace": "S_P_BOOLEAN",
+    "AllowedRAMSpace": "S_P_FLOAT",
+    "MaxRAMPercent": "S_P_FLOAT",
+    "MinRAMSpace": "S_P_UINT64",
+    "ConstrainSwapSpace": "S_P_BOOLEAN",
+    "AllowedSwapSpace": "S_P_FLOAT",
+    "MaxSwapPercent": "S_P_FLOAT",
+    "MemoryLimitEnforcement": "S_P_BOOLEAN",
+    "MemoryLimitThreshold": "S_P_FLOAT",
+    "ConstrainDevices": "S_P_BOOLEAN",
+    "AllowedDevicesFile": "S_P_STRING",
+    "MemorySwappiness": "S_P_UINT64",
+    "CgroupPlugin": "S_P_STRING",
+    "IgnoreSystemd": "S_P_BOOLEAN",
+    "IgnoreSystemdOnFailure": "S_P_BOOLEAN",
+    "EnableControllers": "S_P_BOOLEAN",
+    "EnableExtraControllers": "S_P_STRING",
+    "SignalChildrenProcesses": "S_P_BOOLEAN",
+    "SystemdTimeout": "S_P_UINT64"
+  },
+  "gres.conf": {
+    "AutoDetect": "S_P_STRING",
+    "Count": "S_P_STRING",
+    "CPUs": "S_P_STRING",
+    "Cores": "S_P_STRING",
+    "File": "S_P_STRING",
+    "Files": "S_P_STRING",
+    "Flags": "S_P_STRING",
+    "Link": "S_P_STRING",
+    "Links": "S_P_STRING",
+    "MultipleFiles": "S_P_STRING",
+    "Name": "S_P_STRING",
+    "Type": "S_P_STRING"
+  },
+  "oci.conf": {
+    "ContainerPath": "S_P_STRING",
+    "CreateEnvFile": "S_P_STRING",
+    "DisableHooks": "S_P_STRING",
+    "EnvExclude": "S_P_STRING",
+    "MountSpoolDir": "S_P_STRING",
+    "RunTimeCreate": "S_P_STRING",
+    "RunTimeDelete": "S_P_STRING",
+    "RunTimeKill": "S_P_STRING",
+    "RunTimeEnvExclude": "S_P_STRING",
+    "RunTimeQuery": "S_P_STRING",
+    "RunTimeRun": "S_P_STRING",
+    "RunTimeStart": "S_P_STRING",
+    "SrunPath": "S_P_STRING",
+    "SrunArgs": "S_P_ARRAY",
+    "DisableCleanup": "S_P_BOOLEAN",
+    "StdIODebug": "S_P_STRING",
+    "SyslogDebug": "S_P_STRING",
+    "FileDebug": "S_P_STRING",
+    "DebugFlags": "S_P_STRING",
+    "IgnoreFileConfigJson": "S_P_BOOLEAN"
+  },
+  "acct_gather.conf": {
+    "EnergyIPMIDriverType": "S_P_UINT32",
+    "EnergyIPMIDisableAutoProbe": "S_P_UINT32",
+    "EnergyIPMIDriverAddress": "S_P_UINT32",
+    "EnergyIPMIRegisterSpacing": "S_P_UINT32",
+    "EnergyIPMIDriverDevice": "S_P_STRING",
+    "EnergyIPMIProtocolVersion": "S_P_UINT32",
+    "EnergyIPMIUsername": "S_P_STRING",
+    "EnergyIPMIPassword": "S_P_STRING",
+    "EnergyIPMIPrivilegeLevel": "S_P_UINT32",
+    "EnergyIPMIAuthenticationType": "S_P_UINT32",
+    "EnergyIPMICipherSuiteId": "S_P_UINT32",
+    "EnergyIPMISessionTimeout": "S_P_UINT32",
+    "EnergyIPMIRetransmissionTimeout": "S_P_UINT32",
+    "EnergyIPMIWorkaroundFlags": "S_P_UINT32",
+    "EnergyIPMIRereadSdrCache": "S_P_BOOLEAN",
+    "EnergyIPMIIgnoreNonInterpretableSensors": "S_P_BOOLEAN",
+    "EnergyIPMIBridgeSensors": "S_P_BOOLEAN",
+    "EnergyIPMIInterpretOemData": "S_P_BOOLEAN",
+    "EnergyIPMISharedSensors": "S_P_BOOLEAN",
+    "EnergyIPMIDiscreteReading": "S_P_BOOLEAN",
+    "EnergyIPMIIgnoreScanningDisabled": "S_P_BOOLEAN",
+    "EnergyIPMIAssumeBmcOwner": "S_P_BOOLEAN",
+    "EnergyIPMIEntitySensorNames": "S_P_BOOLEAN",
+    "EnergyIPMIFrequency": "S_P_UINT32",
+    "EnergyIPMICalcAdjustment": "S_P_BOOLEAN",
+    "EnergyIPMIPowerSensors": "S_P_STRING",
+    "EnergyIPMITimeout": "S_P_UINT32",
+    "EnergyIPMIVariable": "S_P_STRING",
+    "ProfileHDF5Dir": "S_P_STRING",
+    "ProfileHDF5Default": "S_P_STRING",
+    "ProfileInfluxDBDatabase": "S_P_STRING",
+    "ProfileInfluxDBDefault": "S_P_STRING",
+    "ProfileInfluxDBFrequency": "S_P_UINT32",
+    "ProfileInfluxDBHost": "S_P_STRING",
+    "ProfileInfluxDBPass": "S_P_STRING",
+    "ProfileInfluxDBRTPolicy": "S_P_STRING",
+    "ProfileInfluxDBTimeout": "S_P_UINT32",
+    "ProfileInfluxDBUser": "S_P_STRING",
+    "InterconnectOFEDPort": "S_P_UINT32",
+    "InfinibandOFEDPort": "S_P_UINT32",
+    "SysfsInterfaces": "S_P_STRING"
+  },
+  "burst_buffer.conf": {
+    "AllowUsers": "S_P_STRING",
+    "CreateBuffer": "S_P_STRING",
+    "DefaultPool": "S_P_STRING",
+    "DenyUsers": "S_P_STRING",
+    "DestroyBuffer": "S_P_STRING",
+    "Directive": "S_P_STRING",
+    "Flags": "S_P_STRING",
+    "GetSysState": "S_P_STRING",
+    "GetSysStatus": "S_P_STRING",
+    "Granularity": "S_P_STRING",
+    "OtherTimeout": "S_P_UINT32",
+    "PollInterval": "S_P_UINT32",
+    "Pools": "S_P_STRING",
+    "StageInTimeout": "S_P_UINT32",
+    "StageOutTimeout": "S_P_UINT32",
+    "StartStageIn": "S_P_STRING",
+    "StartStageOut": "S_P_STRING",
+    "StopStageIn": "S_P_STRING",
+    "StopStageOut": "S_P_STRING",
+    "ValidateTimeout": "S_P_UINT32"
+  },
+  "helpers.conf": {
+    "AllowUserBoot": "S_P_STRING",
+    "BootTime": "S_P_UINT32",
+    "ExecTime": "S_P_UINT32",
+    "Feature": "S_P_ARRAY",
+    "MutuallyExclusive": "S_P_LIST",
+    "NodeName": "S_P_ARRAY"
+  },
+  "job_container.conf": {
+    "AutoBasePath": "S_P_BOOLEAN",
+    "BasePath": "S_P_ARRAY",
+    "EntireStepInNS": "S_P_BOOLEAN",
+    "InitScript": "S_P_STRING",
+    "Shared": "S_P_BOOLEAN",
+    "CloneNSScript": "S_P_STRING",
+    "CloneNSEpilog": "S_P_STRING",
+    "CloneNSScript_Wait": "S_P_UINT32",
+    "CloneNSEpilog_Wait": "S_P_UINT32"
+  },
+  "mpi.conf": {
+    "PMIxCliTmpDirBase": "S_P_STRING",
+    "PMIxCollFence": "S_P_STRING",
+    "PMIxDebug": "S_P_UINT32",
+    "PMIxDirectConn": "S_P_BOOLEAN",
+    "PMIxDirectConnEarly": "S_P_BOOLEAN",
+    "PMIxDirectConnUCX": "S_P_BOOLEAN",
+    "PMIxDirectSameArch": "S_P_BOOLEAN",
+    "PMIxEnv": "S_P_STRING",
+    "PMIxFenceBarrier": "S_P_BOOLEAN",
+    "PMIxNetDevicesUCX": "S_P_STRING",
+    "PMIxShareServerTopology": "S_P_BOOLEAN",
+    "PMIxTimeout": "S_P_UINT32",
+    "PMIxTlsUCX": "S_P_CSV"
+  },
+  "topology.conf": {
+    "SwitchName": "S_P_ARRAY",
+    "LinkSpeed": "S_P_UINT32",
+    "Nodes": "S_P_STRING",
+    "Switches": "S_P_STRING",
+    "BlockName": "S_P_ARRAY",
+    "BlockSizes": "S_P_STRING"
+  },
+  "type_definitions": {
+    "S_P_IGNORE": "Any instance of specified key and associated value in a file will be allowed, but the value will not be stored",
+    "S_P_STRING": "String value",
+    "S_P_PLAIN_STRING": "Plain string value (not expanded in S_P_EXPLINE contexts)",
+    "S_P_LONG": "Long integer value",
+    "S_P_UINT16": "Unsigned 16-bit integer",
+    "S_P_UINT32": "Unsigned 32-bit integer",
+    "S_P_UINT64": "Unsigned 64-bit integer",
+    "S_P_POINTER": "Pointer type (custom handler)",
+    "S_P_ARRAY": "Array of values (allows multiple occurrences)",
+    "S_P_LIST": "List of values (allows multiple occurrences)",
+    "S_P_CSV": "Comma-separated values",
+    "S_P_BOOLEAN": "Boolean value (true/false, yes/no)",
+    "S_P_LINE": "Nested configuration line with sub-options",
+    "S_P_EXPLINE": "Expanded line with hostlist expansion support",
+    "S_P_FLOAT": "Floating point value",
+    "S_P_DOUBLE": "Double precision floating point",
+    "S_P_LONG_DOUBLE": "Long double precision floating point"
+  }
+}

From 6030165253d7817f19f4f407646dc5e169a54a11 Mon Sep 17 00:00:00 2001
From: Katakam Rakesh Naga Sai
 <125246792+Katakam-Rakesh@users.noreply.github.com>
Date: Mon, 9 Feb 2026 13:03:21 +0530
Subject: [PATCH 079/172] Update config.py

Signed-off-by: Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com>
---
 .../library/module_utils/input_validation/common_utils/config.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py
index 4de8aafa88..a2cd35b9c4 100644
--- a/common/library/module_utils/input_validation/common_utils/config.py
+++ b/common/library/module_utils/input_validation/common_utils/config.py
@@ -76,6 +76,7 @@
     "storage": [files["storage_config"]],
     "prepare_oim": [
         files["network_spec"],
+        files["software_config"]
     ],
     # "high_availability": [files["high_availability_config"]],
     # "additional_software": [files["additional_software"]],

From 220ef45651136da5504ba8904d82410ace242a00 Mon Sep 17 00:00:00 2001
From: Vrinda_Marwah <vrinda.marwah@dell.com>
Date: Mon, 9 Feb 2026 08:49:03 +0000
Subject: [PATCH 080/172] Enhance input validation for powervault config in
 storage_config.yml

Signed-off-by: Vrinda_Marwah <vrinda.marwah@dell.com>
---
 .../schema/storage_config.json                | 11 +++++---
 ...ci-group-slurm_control_node_x86_64.yaml.j2 |  2 +-
 input/storage_config.yml                      | 25 ++++++++-----------
 3 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/common/library/module_utils/input_validation/schema/storage_config.json b/common/library/module_utils/input_validation/schema/storage_config.json
index 41746905f1..114d88f525 100644
--- a/common/library/module_utils/input_validation/schema/storage_config.json
+++ b/common/library/module_utils/input_validation/schema/storage_config.json
@@ -51,7 +51,8 @@
         "minItems": 1
       },
       "powervault_config": {
-        "required": ["ip", "isci_initiators", "volume_id"],
+        "type": "object",
+        "required": ["ip", "iscsi_initiators", "volume_id"],
         "properties": {
           "ip": {
             "description": "List of target controller IP addresses",
@@ -69,14 +70,16 @@
             "type": "integer"
           },
 
-          "isci_initiators": {
+          "iscsi_initiators": {
             "description": "iSCSI initiator IQN",
-            "type": "string"
+            "type": "string",
+            "pattern": "^iqn\\.[a-zA-Z0-9.-]+(?::[a-zA-Z0-9._:-]+)?$"
           },
 
           "volume_id": {
             "description": "Volume identifier (hex string)",
-            "type": "string"
+            "type": "string",
+            "pattern": "^[a-fA-F0-9]+$"
           }
         }
       }
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2
index 2f2721d7eb..d99d9dc90f 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2
@@ -92,7 +92,7 @@
 
             PORTALS=({% for ip in powervault_config.ip %}"{{ ip }}" {% endfor %})
             PORT="{{ powervault_config.port | default(3260) }}"
-            INITIATOR_IQN="{{ powervault_config.isci_initiators | default('') }}"
+            INITIATOR_IQN="{{ powervault_config.iscsi_initiators | default('') }}"
             VOLUME_ID="{{ powervault_config.volume_id | default('') }}"
             FS_TYPE="{{ powervault_config.fs_type | default('xfs') }}"
             MOUNT_OPTS="{{ powervault_config.mount_options | default('defaults,_netdev,noatime') }}"
diff --git a/input/storage_config.yml b/input/storage_config.yml
index 48eac2d5cc..3ad961b405 100644
--- a/input/storage_config.yml
+++ b/input/storage_config.yml
@@ -19,28 +19,23 @@
 
 # -----------------------------Powervault-------------------------------------------
 # powervault_config
-# ip: ipv4
-# A list of PowerVault controller IP addresses used for iSCSI target discovery and login.
-# In this configuration, a single controller portal is provided.
-
-# port:
-# Defines the TCP port for the iSCSI target service.
-# Port 3260 is the standard port for iSCSI communication.
+# Mandatory when using PowerVault for persistent storage.
+# The parameters below are mandatory when powervault_config is defined:
+    # ip: A list of PowerVault controller IPv4 addresses used for iSCSI target discovery and login.
+    # iscsi_initiators: Specifies the InitiatorName used by the host when connecting to the iSCSI target. This IQN uniquely identifies the host to the storage array.
+    # volume_id: This is the unique WWN/identifier for the specific volume that should be used for persistent storage. This value is used for multipath scanning to select the correct mapped device.
 
-# isci_initiators:
-# Specifies the InitiatorName used by the host when connecting to the iSCSI target.
-# This IQN uniquely identifies the host to the storage array.
+# Below are the optional parameters when powervault_config is defined
+    # port: Defines the TCP port for the iSCSI target service. When port is not specified, the default port 3260 is used.
 
-# volume_id:
-# This is the unique WWN/identifier for the
-# specific volume that should be used for persistent storage.
-# The script uses this value during multipath scanning to select the correct mapped device
+# Below is an example of how to configure powervault_config.
+# In this configuration, a single controller portal is provided.
 
 #powervault_config:
 #  ip:
 #    - 172.1.2.3
 #  port: 3260
-#  isci_initiators: iqn.initiator.com.example:7d7d7d7d7d7
+#  iscsi_initiators: iqn.initiator.com.example:7d7d7d7d7d7
 #  volume_id: 00c0ff4343f1f1f1001c8c4e6901000000
 
 

From 8f02f5c460d5d0bf7939474c7a78aaf620f20dcc Mon Sep 17 00:00:00 2001
From: Vrinda_Marwah <vrinda.marwah@dell.com>
Date: Mon, 9 Feb 2026 08:51:29 +0000
Subject: [PATCH 081/172] Update copyright in storage_config.yml

Signed-off-by: Vrinda_Marwah <vrinda.marwah@dell.com>
---
 input/storage_config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/input/storage_config.yml b/input/storage_config.yml
index 3ad961b405..9492f15558 100644
--- a/input/storage_config.yml
+++ b/input/storage_config.yml
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.

From 9a44bcf3195d93f42b8a015e01e8f830a27dcfe7 Mon Sep 17 00:00:00 2001
From: pullan1 <sudha.pullalaravu@dell.com>
Date: Mon, 9 Feb 2026 14:49:22 +0530
Subject: [PATCH 082/172] config file update

Signed-off-by: pullan1 <sudha.pullalaravu@dell.com>
---
 common/library/module_utils/local_repo/config.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py
index 5c515c527c..e26e8a6e71 100644
--- a/common/library/module_utils/local_repo/config.py
+++ b/common/library/module_utils/local_repo/config.py
@@ -142,10 +142,7 @@
     "get_repo_version": "pulp container repository show --href %s",
     "list_tags_by_version": "pulp show --href /pulp/api/v3/content/container/tags/?repository_version=%s",
     "rename_repository": "pulp container repository update --name %s --new-name %s",
-    "orphan_cleanup": "pulp orphan cleanup"
-
-
-    "update_container_remote_auth": "pulp container remote update --name %s --url %s --upstream-name %s --policy %s --include-tags '%s' --username %s --password '%s'",
+    "orphan_cleanup": "pulp orphan cleanup",
     "container_distribution_show": "pulp container distribution show --name %s | jq .repository",
     "show_repository_version": "pulp container repository show --href %s | jq .latest_version_href",
     "list_image_tags": "pulp show --href /pulp/api/v3/content/container/tags/?repository_version=%s"

From 88c526923f7bc3b16681e4d7f766bcc8fc09efd5 Mon Sep 17 00:00:00 2001
From: balajikumaran-c-s <balajikumaran.cs@dellteam.com>
Date: Mon, 9 Feb 2026 15:14:24 +0530
Subject: [PATCH 083/172] Skip podman image pull when image already exists for
 pulp and openchami

---
 .../openchami/tasks/deployment_prereq.yml     | 19 +++++++++++++++----
 .../pulp/tasks/deployment_prereq.yml          | 15 +++++++++++++--
 2 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/prepare_oim/roles/deploy_containers/openchami/tasks/deployment_prereq.yml b/prepare_oim/roles/deploy_containers/openchami/tasks/deployment_prereq.yml
index 109bc725f3..1558152a50 100644
--- a/prepare_oim/roles/deploy_containers/openchami/tasks/deployment_prereq.yml
+++ b/prepare_oim/roles/deploy_containers/openchami/tasks/deployment_prereq.yml
@@ -13,18 +13,29 @@
 # limitations under the License.
 ---
 
-- name: Pull OpenCHAMI images using Podman
+- name: Check if OpenCHAMI images already exist
   ansible.builtin.command:
-    cmd: "podman pull {{ item }}"
+    cmd: "podman image exists {{ item }}"
   loop: "{{ openchami_images }}"
+  register: openchami_image_exists
+  changed_when: false
+  failed_when: false
+
+- name: Pull OpenCHAMI images using Podman when missing
+  ansible.builtin.command:
+    cmd: "podman pull {{ item.item }}"
+  loop: "{{ openchami_image_exists.results }}"
+  loop_control:
+    label: "{{ item.item }}"
   register: pull_result
   retries: "{{ pull_image_retries }}"
   delay: "{{ pull_image_delay }}"
   until: pull_result.rc == 0
   changed_when: false
+  when: item.rc != 0
 
 - name: Fail if any OpenCHAMI image pull failed
   ansible.builtin.fail:
     msg: "Failed to pull OpenCHAMI image: {{ item.item }}. Error: {{ item.stderr }}"
-  loop: "{{ pull_result.results }}"
-  when: item.rc != 0
+  loop: "{{ pull_result.results | default([]) }}"
+  when: item.rc is defined and item.rc != 0
diff --git a/prepare_oim/roles/deploy_containers/pulp/tasks/deployment_prereq.yml b/prepare_oim/roles/deploy_containers/pulp/tasks/deployment_prereq.yml
index 09ec52e6a4..dc143b03c5 100644
--- a/prepare_oim/roles/deploy_containers/pulp/tasks/deployment_prereq.yml
+++ b/prepare_oim/roles/deploy_containers/pulp/tasks/deployment_prereq.yml
@@ -38,7 +38,14 @@
   when: hostname_enabled
   no_log: true
 
-- name: Pull Pulp image using Podman
+- name: Check if Pulp image already exists
+  ansible.builtin.command:
+    cmd: "podman image exists {{ pulp_image }}"
+  register: pulp_image_exists
+  changed_when: false
+  failed_when: false
+
+- name: Pull Pulp image using Podman when missing
   ansible.builtin.command:
     cmd: "podman pull {{ pulp_image }}"
   register: pulp_pull_result
@@ -46,11 +53,15 @@
   delay: "{{ pull_image_delay }}"
   until: pulp_pull_result is not failed
   changed_when: false
+  when: pulp_image_exists.rc != 0
 
 - name: Fail if Pulp image pull failed
   ansible.builtin.fail:
     msg: "Failed to pull Pulp image: {{ pulp_image }}. Error: {{ pulp_pull_result.stderr }}"
-  when: pulp_pull_result.rc != 0
+  when:
+    - pulp_image_exists.rc != 0
+    - pulp_pull_result.rc is defined
+    - pulp_pull_result.rc != 0
 
 - name: Invoke Pulp Container Deployment Tasks for HTTP
   ansible.builtin.include_tasks: deploy_pulp_container_http.yml

From d4cdf690b31f2a97dd05616aa10fe54140e7f770 Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Mon, 9 Feb 2026 15:18:16 +0530
Subject: [PATCH 084/172] defect fix for local repo validation when
 subscription is enabled

---
 .../input_validation/common_utils/config.py   |   8 +-
 .../validation_flows/local_repo_validation.py | 100 ++++++++++++------
 .../roles/validate_input/tasks/main.yml       |  26 +++--
 .../roles/validate_input/vars/main.yml        |   3 +
 4 files changed, 92 insertions(+), 45 deletions(-)

diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py
index 4de8aafa88..b5d3676165 100644
--- a/common/library/module_utils/input_validation/common_utils/config.py
+++ b/common/library/module_utils/input_validation/common_utils/config.py
@@ -26,8 +26,12 @@
 # log path for input validator
 INPUT_VALIDATOR_LOG_PATH = '/opt/omnia/log/core/playbooks/'
 
-ENTITLEMENT_PEM = '/opt/omnia/rhel_repo_certs/*.pem'
-REDHAT_REPO_FILE = '/opt/omnia/rhel_repo_certs/redhat.repo'
+# Subscription checking paths - checked in order of priority
+SYSTEM_ENTITLEMENT_PATH = '/etc/pki/entitlement/*.pem'
+SYSTEM_REDHAT_REPO = '/etc/yum.repos.d/redhat.repo'
+
+OMNIA_ENTITLEMENT_PATH = '/opt/omnia/rhel_repo_certs/*.pem'
+OMNIA_REDHAT_REPO = '/opt/omnia/rhel_repo_certs/redhat.repo'
 
 # dict to hold the file names. If any file's name changes just change it here.
 files = {
diff --git a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py
index bcec9f4197..343a4f3de1 100644
--- a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py
+++ b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py
@@ -29,43 +29,77 @@
 
 def check_subscription_status(logger=None):
     """
-    Check if the system has an active Red Hat subscription.
-    Subscription status is considered True if either entitlement
-    certificates exist or the required Red Hat repository URLs are present.
-    
-    Checks mounted host paths (/etc/pki/entitlement, /etc/yum.repos.d/redhat.repo).
+    Check if the system has an active Red Hat subscription enabled.
+    If system entitlement certificates are found in /etc/pki/entitlement,
+    only system paths are checked. Otherwise, Omnia paths are checked.
+    Subscription is enabled only if entitlement certificates and required
+    Red Hat repository URLs are found in the same source (system or Omnia).
 
     Returns:
-        bool: True if the system is subscribed (either entitlement certs
-              exist or required repos are present), False otherwise.
-    """    
-    # 1. Check entitlement certs
-    entitlement_certs = glob.glob(config.ENTITLEMENT_PEM)
-    has_entitlement = len(entitlement_certs) > 0
-    if logger:
-        logger.info(f"Entitlement certs in {config.ENTITLEMENT_PEM}: {len(entitlement_certs)} found")
-
-    # 2. Check redhat repos in redhat.repo
+        bool: True if subscription is enabled (both entitlement certs
+              and repos are found in the same source), False otherwise.
+    """
+    # 1. Check system entitlement certs first
+    system_entitlement_certs = glob.glob(config.SYSTEM_ENTITLEMENT_PATH)
+    has_system_entitlement = len(system_entitlement_certs) > 0
+    
+    if has_system_entitlement:
+        # System entitlement found - use system paths only
+        entitlement_certs = system_entitlement_certs
+        has_entitlement = True
+        repo_file_to_check = config.SYSTEM_REDHAT_REPO
+        
+        if logger:
+            logger.info(f"Found {len(system_entitlement_certs)} system entitlement certs - using system paths only")
+    else:
+        # No system entitlement - check Omnia paths
+        omnia_entitlement_certs = glob.glob(config.OMNIA_ENTITLEMENT_PATH)
+        entitlement_certs = omnia_entitlement_certs
+        has_entitlement = len(omnia_entitlement_certs) > 0
+        repo_file_to_check = config.OMNIA_REDHAT_REPO
+        
+        if logger:
+            logger.info(f"No system entitlement found - checking Omnia paths: {len(omnia_entitlement_certs)} certs found")
+
+    # 2. Check repos based on which entitlement path was used
+    has_repos = False
     repo_urls = []
-    redhat_repo = config.REDHAT_REPO_FILE
-    if os.path.exists(redhat_repo):
-        with open(redhat_repo, "r") as f:
-            for line in f:
-                if line.startswith("baseurl ="):
-                    url = line.split("=", 1)[1].strip()
-                    if re.search(r"(codeready-builder|baseos|appstream)", url, re.IGNORECASE):
-                        repo_urls.append(url)
-
-    has_repos = len(repo_urls) > 0
-    if logger:
-        logger.info(f"Repo URLs in {redhat_repo}: {len(repo_urls)} found")
-
-    # 3. Subscription status logic
-    subscription_status = has_entitlement or has_repos
+    redhat_repo_used = None
+    
+    if os.path.exists(repo_file_to_check):
+        try:
+            with open(repo_file_to_check, "r") as f:
+                for line in f:
+                    if line.startswith("baseurl ="):
+                        url = line.split("=", 1)[1].strip()
+                        if re.search(r"(codeready-builder|baseos|appstream)", url, re.IGNORECASE):
+                            repo_urls.append(url)
+            
+            if repo_urls:
+                has_repos = True
+                redhat_repo_used = repo_file_to_check
+                if logger:
+                    logger.info(f"Found {len(repo_urls)} repo URLs in {repo_file_to_check}")
+            elif logger:
+                logger.info(f"No required repo URLs found in {repo_file_to_check}")
+        except (IOError, OSError) as e:
+            if logger:
+                logger.warning(f"Error reading {repo_file_to_check}: {e}")
+    elif logger:
+        logger.info(f"Repo file {repo_file_to_check} does not exist")
+
+    # 3. Subscription enabled if entitlement and repos are found in the same source
+    subscription_enabled = has_entitlement and has_repos
+    
     if logger:
-        logger.info(f"Subscription status: {subscription_status} (entitlement={has_entitlement}, repos={has_repos})")
-
-    return subscription_status
+        logger.info(
+            f"Subscription enabled: {subscription_enabled} "
+            f"(entitlement={has_entitlement}, repos={has_repos}, "
+            f"entitlement_source={entitlement_certs[0] if entitlement_certs else 'None'}, "
+            f"repo_source={redhat_repo_used})"
+        )
+
+    return subscription_enabled
 
 # Below is a validation function for each file in the input folder
 def validate_local_repo_config(input_file_path, data,
diff --git a/input_validation/roles/validate_input/tasks/main.yml b/input_validation/roles/validate_input/tasks/main.yml
index ff11c79950..6a1c773ee5 100644
--- a/input_validation/roles/validate_input/tasks/main.yml
+++ b/input_validation/roles/validate_input/tasks/main.yml
@@ -23,14 +23,20 @@
     # then the "all" tag should be removed so that only the config files related to that playbook are validated.
     input_validate_tags: "{{ omnia_run_tags | default([]) | difference(['all']) if (omnia_run_tags | length) >= 2
       else omnia_run_tags | default([]) }}"
-  validate_input:
-    omnia_base_dir: "{{ (input_dir + '/../') | ansible.builtin.realpath }}"
-    project_name: "{{ project_name }}"
-    tag_names: "{{ input_validate_tags }}"
-    module_utils_path: "{{ (role_path + '/../../../common/library/module_utils/') | ansible.builtin.realpath }}"
-  register: validation_status
-  when: (input_validate_tags | length) > 0
+  block:
+    - name: Run validation
+      validate_input:
+        omnia_base_dir: "{{ (input_dir + '/../') | ansible.builtin.realpath }}"
+        project_name: "{{ project_name }}"
+        tag_names: "{{ input_validate_tags }}"
+        module_utils_path: "{{ (role_path + '/../../../common/library/module_utils/') | ansible.builtin.realpath }}"
+      register: validation_status
+      when: (input_validate_tags | length) > 0
 
-- name: Debug validation status
-  ansible.builtin.debug:
-    msg: "{{ messages.validation_success }}"
+    - name: Debug validation status
+      ansible.builtin.debug:
+        msg: "{{ messages.validation_success }}"
+  rescue:
+    - name: Failed due to validation failure
+      ansible.builtin.fail:
+        msg: "{{ messages.validation_error }}"
diff --git a/input_validation/roles/validate_input/vars/main.yml b/input_validation/roles/validate_input/vars/main.yml
index 3c6f2b1aff..4655e7b25a 100644
--- a/input_validation/roles/validate_input/vars/main.yml
+++ b/input_validation/roles/validate_input/vars/main.yml
@@ -18,3 +18,6 @@ project_name: "{{ hostvars['localhost']['project_name'] }}"
 
 messages:
   validation_success: "Successfully validated Omnia input config file(s)"
+  validation_error: >
+    Input validation failed.
+    For detailed validation errors, see: {{ ansible_failed_result.log_file }}

From c4f60eebc7e7dc5029bda6b51a091ea94f12c328 Mon Sep 17 00:00:00 2001
From: Kratika_Patidar <Kratika.Patidar@dell.com>
Date: Mon, 9 Feb 2026 10:06:28 +0000
Subject: [PATCH 085/172] defect fix input validation mismatch between
 login_compiler and slurm_node

---
 .../validation_flows/provision_validation.py  | 60 +++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/common/library/module_utils/input_validation/validation_flows/provision_validation.py b/common/library/module_utils/input_validation/validation_flows/provision_validation.py
index 7eef7bef20..cc6b4d8e76 100644
--- a/common/library/module_utils/input_validation/validation_flows/provision_validation.py
+++ b/common/library/module_utils/input_validation/validation_flows/provision_validation.py
@@ -91,6 +91,65 @@ def validate_functional_groups_separation(pxe_mapping_file_path):
     if errors:
         raise ValueError("PXE mapping file group separation validation errors: " + "; ".join([str(e) for e in errors]))
 
+def validate_slurm_login_compiler_prefix(pxe_mapping_file_path):
+    """Validate that slurm_node and login_compiler_node groups use the same architecture suffix.
+
+    Rows whose FUNCTIONAL_GROUP_NAME is slurm_node or login_compiler_node followed by
+    _x86_64 or _aarch64 (case-sensitive) are collected; all other rows are ignored.
+    Nothing is checked unless both groups are present.
+
+    Args:
+        pxe_mapping_file_path: Path to the PXE mapping CSV file.
+
+    Raises:
+        ValueError: If the file or a required column is missing, or the suffixes differ.
+    """
+    if not pxe_mapping_file_path or not os.path.isfile(pxe_mapping_file_path):
+        raise ValueError(f"PXE mapping file not found: {pxe_mapping_file_path}")
+
+    with open(pxe_mapping_file_path, "r", encoding="utf-8") as fh:
+        # Blank lines are dropped, so row numbers below count non-blank lines only.
+        non_blank_lines = [ln for ln in fh if ln.strip()]
+
+    reader = csv.DictReader(non_blank_lines)
+    # fieldnames is None for an empty file; treat that as having no columns.
+    fieldname_map = {fn.strip().upper(): fn for fn in (reader.fieldnames or [])}
+    fg_col = fieldname_map.get("FUNCTIONAL_GROUP_NAME")
+    hostname_col = fieldname_map.get("HOSTNAME")
+
+    if not fg_col or not hostname_col:
+        raise ValueError("FUNCTIONAL_GROUP_NAME or HOSTNAME column not found in PXE mapping file")
+
+    arch_map = {"slurm_node": [], "login_compiler_node": []}
+
+    for row_idx, row in enumerate(reader, start=2):
+        fg_name = (row.get(fg_col) or "").strip()
+        hostname = (row.get(hostname_col) or "").strip()
+        if not fg_name or not hostname:
+            continue
+
+        for suffix in ("_x86_64", "_aarch64"):
+            fg_base = fg_name[: -len(suffix)]
+            if fg_name.endswith(suffix) and fg_base in arch_map:
+                arch_map[fg_base].append((suffix[1:], row_idx))
+                break
+
+    if not arch_map["slurm_node"] or not arch_map["login_compiler_node"]:
+        return
+
+    # Only the first recorded entry of each group is compared.
+    slurm_arch, _ = arch_map["slurm_node"][0]
+    login_arch, _ = arch_map["login_compiler_node"][0]
+    if slurm_arch != login_arch:
+        slurm_rows = ", ".join(str(r[1]) for r in arch_map["slurm_node"])
+        login_rows = ", ".join(str(r[1]) for r in arch_map["login_compiler_node"])
+        raise ValueError(
+            "Architecture suffix mismatch between slurm_node and login_compiler_node. "
+            f"slurm_node suffix '{slurm_arch}' (rows {slurm_rows}) vs "
+            f"login_compiler_node suffix '{login_arch}' (rows {login_rows}). "
+            "Ensure both use the same suffix (_x86_64 or _aarch64)."
+        )
+
 def validate_duplicate_hostnames_in_mapping_file(pxe_mapping_file_path):
     """
     Validates that HOSTNAME values in the mapping file are unique.
@@ -684,6 +743,7 @@ def validate_provision_config(
             validate_group_parent_service_tag_consistency_in_mapping_file(pxe_mapping_file_path)
             validate_functional_groups_separation(pxe_mapping_file_path)
             validate_parent_service_tag_hierarchy(pxe_mapping_file_path)
+            validate_slurm_login_compiler_prefix(pxe_mapping_file_path)
 
             # Validate ADMIN_IPs against network_spec.yml ranges
             network_spec_path = create_file_path(input_file_path, file_names["network_spec"])

From f89761e9970a944af6efd3ded5a93fb184284ae4 Mon Sep 17 00:00:00 2001
From: SOWJANYAJAGADISH123 <Sowjanya.Jagadish@dell.com>
Date: Mon, 9 Feb 2026 17:26:36 +0530
Subject: [PATCH 086/172] Update omnia.sh

---
 omnia.sh | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 111 insertions(+), 4 deletions(-)

diff --git a/omnia.sh b/omnia.sh
index f05c9ebe84..746fb8fd34 100755
--- a/omnia.sh
+++ b/omnia.sh
@@ -825,6 +825,13 @@ EOF
             echo "nfs_type: $nfs_type"
         } >> "$oim_metadata_file"
         fi
+    else
+        sed -i '/^upgrade_backup_dir:/d' "$oim_metadata_file" >/dev/null 2>&1 || true
+        if grep -q '^omnia_version:' "$oim_metadata_file"; then
+            sed -i "s/^omnia_version:.*/omnia_version: $omnia_release/" "$oim_metadata_file" >/dev/null 2>&1 || true
+        else
+            echo "omnia_version: $omnia_release" >> "$oim_metadata_file"
+        fi
     fi
 
     # --- Remove old service if exists ---
@@ -924,6 +931,7 @@ validate_nfs_server() {
 }
 
 init_ssh_config() {
+    mkdir -p "$HOME/.ssh"
     touch $HOME/.ssh/known_hosts
     # Add entry to /root/.ssh/known_hosts file to prevent errors caused by Known host
     ssh-keygen -R "[localhost]:2222" >/dev/null 2>&1  # Remove existing entry if it exists
@@ -964,6 +972,8 @@ start_container_session() {
     --------------------------------------------------------------------------------------------------------------------------------------------------
     ${NC}"
 
+    init_ssh_config
+
     # Entering Omnia-core container
     ssh omnia_core
 }
@@ -1192,6 +1202,11 @@ phase1_validate() {
         return 1
     fi
 
+    if [ "$previous_omnia_version" = "2.1.0.0" ]; then
+        echo "[ERROR] [ORCHESTRATOR] Upgrade already performed. Current Omnia version is 2.1.0.0. No further upgrade required."
+        return 1
+    fi
+
     if [ "$previous_omnia_version" != "2.0.0.0" ]; then
         echo "[ERROR] [ORCHESTRATOR] Previous Omnia version mismatch: expected 2.0.0.0, got: $previous_omnia_version"
         return 1
@@ -1241,7 +1256,7 @@ phase1_validate() {
 }
 
 phase2_approval() {
-    local backup_base default_backup_dir
+    local backup_base default_backup_dir current_omnia_version
 
     echo "[INFO] [ORCHESTRATOR] Phase 2: Approval Gate"
     echo "============================================"
@@ -1256,10 +1271,16 @@ phase2_approval() {
     echo "  - Additional Package Installation"
     echo "============================================"
 
-    default_backup_dir="$CONTAINER_BACKUPS_DIR/upgrade"
+    current_omnia_version=$(podman exec -u root omnia_core /bin/bash -c "grep '^omnia_version:' '$CONTAINER_METADATA_FILE' | cut -d':' -f2 | tr -d ' \t\n\r'" 2>/dev/null)
+    if [ -z "$current_omnia_version" ]; then
+        echo "[ERROR] [ORCHESTRATOR] Failed to read omnia_version from metadata inside container"
+        return 1
+    fi
+
+    default_backup_dir="$CONTAINER_BACKUPS_DIR/upgrade/version_${current_omnia_version}"
     backup_base="$default_backup_dir"
 
-    echo "[INFO] [ORCHESTRATOR] Backup destination: $backup_base"
+    echo "[INFO] [ORCHESTRATOR] Backup destination (inside omnia_core container): $backup_base"
 
     if ! update_metadata_upgrade_backup_dir "$backup_base"; then
         echo "[ERROR] [ORCHESTRATOR] Failed to update upgrade backup directory in metadata"
@@ -1331,6 +1352,85 @@ phase3_backup_creation() {
     return 0
 }
 
+phase4_container_swap() {
+    # Stop omnia_core, repoint its Quadlet unit at omnia_core:1.1, restart, set omnia_version; returns 0 or 1.
+    local i quadlet_file="/etc/containers/systemd/omnia_core.container"
+
+    echo "[INFO] [ORCHESTRATOR] Phase 4: Container Swap"
+
+    if [ ! -f "$quadlet_file" ]; then
+        echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Quadlet file not found: $quadlet_file"
+        return 1
+    fi
+
+    # Verify the target image before stopping anything, so a missing image does not leave omnia_core down.
+    if ! podman image exists "omnia_core:1.1"; then
+        echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1"
+        return 1
+    fi
+
+    echo "[INFO] [ORCHESTRATOR] Stopping omnia_core 1.0 container"
+    systemctl stop omnia_core.service >/dev/null 2>&1 || true
+
+    if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
+        echo "[WARN] [ORCHESTRATOR] omnia_core still running; forcing stop"
+        podman stop -t 30 omnia_core >/dev/null 2>&1 || true
+    fi
+
+    if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
+        echo "[ERROR] [ORCHESTRATOR] Failed to stop omnia_core container"
+        return 1
+    fi
+
+    echo "[INFO] [ORCHESTRATOR] Starting omnia_core 1.1 Quadlet unit"
+    # sed exits 0 even when no line matched, so confirm the Image line was actually rewritten.
+    if ! sed -i 's/^Image=omnia_core:.*/Image=omnia_core:1.1/' "$quadlet_file" || ! grep -q '^Image=omnia_core:1.1$' "$quadlet_file"; then
+        echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to 1.1 in quadlet file"
+        return 1
+    fi
+
+    systemctl daemon-reload || {
+        echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: systemctl daemon-reload failed"
+        return 1
+    }
+
+    systemctl start omnia_core.service || {
+        echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to start omnia_core.service"
+        return 1
+    }
+
+    echo "[INFO] [ORCHESTRATOR] Waiting for omnia_core 1.1 health check (60s)"
+    for i in $(seq 1 60); do
+        podman ps --format '{{.Names}}' | grep -qw "omnia_core" && break
+        sleep 1
+    done
+
+    if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
+        echo "[ERROR] [ORCHESTRATOR] Phase 4.4 failed: Container failed health check after swap"
+        return 1
+    fi
+
+    echo "[INFO] [ORCHESTRATOR] Updating metadata omnia_version to 2.1.0.0"
+    if ! podman exec -u root omnia_core bash -c "
+        set -e
+        if [ ! -f '$CONTAINER_METADATA_FILE' ]; then
+            echo '[ERROR] Metadata file not found inside container: $CONTAINER_METADATA_FILE' >&2
+            exit 1
+        fi
+        if grep -q '^omnia_version:' '$CONTAINER_METADATA_FILE'; then
+            sed -i 's/^omnia_version:.*/omnia_version: 2.1.0.0/' '$CONTAINER_METADATA_FILE'
+        else
+            echo 'omnia_version: 2.1.0.0' >> '$CONTAINER_METADATA_FILE'
+        fi
+    "; then
+        echo "[ERROR] [ORCHESTRATOR] Phase 4.5 failed: Failed to update metadata version"
+        return 1
+    fi
+
+    echo "[INFO] [ORCHESTRATOR] Phase 4: Container swap completed"
+    return 0
+}
+
 upgrade_omnia_core() {
     local lock_file="/var/lock/omnia_core_upgrade.lock"
     local backup_base
@@ -1367,7 +1467,14 @@ upgrade_omnia_core() {
         exit 1
     fi
 
-    echo "[INFO] [ORCHESTRATOR] Upgrade tasks for container swap are deferred to a follow-up PR"
+    if ! phase4_container_swap; then
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 4"
+        exit 1
+    fi
+
+    echo "[INFO] [ORCHESTRATOR] Upgrade completed successfully"
+    echo "[INFO] [ORCHESTRATOR] Backup location (inside omnia_core container): $backup_base"
+    start_container_session
     exit 0
 }
 

From e2228b62028af13647bf426924fe10a1a953065c Mon Sep 17 00:00:00 2001
From: SOWJANYAJAGADISH123 <Sowjanya.Jagadish@dell.com>
Date: Mon, 9 Feb 2026 17:27:34 +0530
Subject: [PATCH 087/172] Revert "Create test.sh"

This reverts commit aae7e28ebde2e207e93c97852a7abfd19aebe215.
---
 test.sh | 1555 -------------------------------------------------------
 1 file changed, 1555 deletions(-)
 delete mode 100644 test.sh

diff --git a/test.sh b/test.sh
deleted file mode 100644
index cd1f8e63e7..0000000000
--- a/test.sh
+++ /dev/null
@@ -1,1555 +0,0 @@
-#!/bin/bash
-
-# Copyright © 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-# This script is used to generate the Omnia core docker image.
-# The image is based on Fedora and uses systemd to start all of the necessary
-# services.
-#
-# This script prompts the user for the Omnia shared path and the root
-# password. It then checks if the Omnia shared path exists.
-#
-# The script checks if the ssh key file exists. If it does not exist, a new ssh
-
-# Color Definitions
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-BLUE='\033[0;34m'
-NC='\033[0m' # No Color
-YELLOW='\033[0;33m'
-omnia_release=2.1.0.0
-
-core_container_status=false
-omnia_path=""
-hashed_passwd=""
-domain_name=""
-
-is_local_ip() {
-    local ip_to_check="$1"
-
-    # Get all local IP addresses (excluding loopback)
-    local local_ips
-    local_ips=$(hostname -I)
-
-    # Check if the IP matches any local IP
-    if echo "$local_ips" | grep -qw "$ip_to_check"; then
-        return 0  # IP is local
-    else
-        return 1  # IP is not local
-    fi
-}
-
-OMNIA_BASE_DIR="/opt/omnia"
-OMNIA_INPUT_DIR="/opt/omnia/input"
-OMNIA_BACKUPS_DIR="/opt/omnia/backups"
-OMNIA_METADATA_DIR="/opt/omnia/.data"
-OMNIA_METADATA_FILE="/opt/omnia/.data/oim_metadata.yml"
-
-update_metadata_upgrade_backup_dir() {
-    local backup_dir="$1"
-
-    if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
-        echo "[ERROR] [ORCHESTRATOR] omnia_core container is not running"
-        return 1
-    fi
-
-    podman exec -u root omnia_core bash -c "
-        set -e
-        if [ ! -f '$OMNIA_METADATA_FILE' ]; then
-            echo '[ERROR] Metadata file not found inside container: $OMNIA_METADATA_FILE' >&2
-            exit 1
-        fi
-        if grep -q '^upgrade_backup_dir:' '$OMNIA_METADATA_FILE'; then
-            sed -i 's|^upgrade_backup_dir:.*|upgrade_backup_dir: ${backup_dir}|' '$OMNIA_METADATA_FILE'
-        else
-            echo 'upgrade_backup_dir: ${backup_dir}' >> '$OMNIA_METADATA_FILE'
-        fi
-    "
-}
-
-
-
-check_internal_nfs_export() {
-    nfs_server_ip=$1
-    nfs_server_share_path=$2
-
-    if is_local_ip "$nfs_server_ip"; then
-        echo "The provided NFS server IP ($nfs_server_ip) belongs to the current system."
-    else
-        echo "The provided NFS server IP ($nfs_server_ip) is NOT the current system's IP."
-        exit 1
-    fi
-
-    # Query the remote server for exports
-    exports=$(showmount -e "$nfs_server_ip" 2>/dev/null)
-
-    if [[ $? -ne 0 ]]; then
-        echo -e "${RED}ERROR: Unable to contact NFS server at $nfs_server_ip. Ensure NFS and rpcbind are running, and firewall allows access.${NC}"
-        exit 1
-    fi
-
-    # Check if path is in the export list
-    if echo "$exports" | awk '{print $1}' | grep -Fxq "$nfs_server_share_path"; then
-        echo -e "${GREEN}Path $nfs_server_share_path is exported by $nfs_server_ip.${NC}"
-    else
-        echo -e "${RED}ERROR: Path $nfs_server_share_path is NOT exported by $nfs_server_ip.${NC}"
-        exit 1
-    fi
-}
-
-display_supported_use_cases() {
-    # Color definitions
-    BLUE='\033[1;34m'
-    YELLOW='\033[1;33m'
-    GREEN='\033[1;32m'
-    NC='\033[0m' # No Color
-
-    # Introductory Guidance
-    echo -e "${BLUE} ----------------- Omnia Shared Path Configuration ---------------- ${NC}"
-    echo -e "${BLUE} Please choose the type of Omnia shared path in Omnia Infrastructure Manager (OIM): ${NC}"
-    echo -e "${BLUE} It is recommended to use a external NFS share for the Omnia shared path. ${NC}"
-    echo -e "${BLUE} If you are not using NFS, make sure enough space is available on the disk. ${NC}"
-    echo -e "${YELLOW} Using a Extrenal NFS share is mandatory for Omnia shared path if you are planning to have high availability in OIM or require K8s service cluster. ${NC}"
-    echo -e "\nSupported Use Cases:\n"
-
-    # Table content
-    {
-        echo -e "Share Option\tType\tDescription\tAdditional Info"
-        echo -e "${GREEN}NFS\tExternal\tExternal NFS server(outside OIM) created by user\tMust be reachable from OIM and service nodes. Mounts on OIM. Recommended for HA and hierarchical clusters.${NC}"
-        echo -e "NFS\tInternal\tNFS server created by user in OIM\tUsed only for flat provisioning. No HA or k8s service cluster support. No mount performed."
-        echo -e "Local\tDisk\tDisk storage in OIM\tUsed only for flat provisioning. No HA or hierarchical support."
-    } | column -t -s $'\t'
-}
-
-
-# This function is responsible for initializing the Omnia core container
-# It prompts the user for the Omnia shared path and the root password.
-# It checks if the Omnia shared path exists.
-setup_omnia_core() {
-    # Validate the system environment
-    validate_oim
-
-    # Initialize the container configuration
-    init_container_config
-
-    # Setup the container
-    setup_container
-
-    # Post container setup configuration
-    post_setup_config
-
-    # Start the container
-    start_container_session
-}
-
-
-# This function is responsible for cleaning up the Omnia core container.
-# It removes the container and performs the necessary cleanup steps.
-cleanup_omnia_core() {
-    # Block if critical service containers exist
-    critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd')
-    if [ -n "$critical_running" ]; then
-        echo -e "${RED}Failed to intiatiate omnia_core container cleanup. There are other critical service containers still running:${NC}"
-        echo "$critical_running"
-        echo -e "${GREEN}Run oim_cleanup.yml first to cleanup all containers.${NC}"
-        exit 1
-    fi
-
-    echo -e "${RED} WARNING: This will remove Omnia core container and all files in Omnia Shared Path.${NC}"
-    echo -e "${GREEN} You can abort and take backup if you want.${NC}"
-    read -p " Are you sure you want to continue with the cleanup? (y/n): " confirm
-    if [ "$confirm" = "n" ] || [ "$confirm" = "N" ]; then
-        echo -e "${GREEN}Aborting.${NC}"
-        exit 0
-    elif [ "$confirm" = "y" ] || [ "$confirm" = "Y" ]; then
-
-        # Fetch the configuration from the Omnia core container.
-        fetch_config
-
-        # Remove the container
-        remove_container
-
-        # Perform the necessary cleanup steps
-        cleanup_config
-    fi
-}
-
-
-# This function is responsible for cleaning up the Omnia core container configuration.
-# It removes the public key from the authorized_keys file.
-# It removes the private key.
-# It removes the ssh key from the known_hosts file.
-# It removes the Omnia core configuration.
-#
-cleanup_config(){
-
-    # Set the path to the ssh public key.
-    ssh_key_file="$HOME/.ssh/oim_rsa.pub"
-
-    # Remove the public key from the authorized_keys file.
-    if [ -f "$ssh_key_file" ]; then
-        # Remove the line from the authorized_keys file.
-        sed -i "\|^$(cat $ssh_key_file)$|d" $HOME/.ssh/authorized_keys
-        echo -e "${GREEN} Public key has been removed from authorized_keys.${NC}"
-    else
-        echo -e "${RED} Public key file not found.${NC}"
-    fi
-
-    # Remove the SSH key pair.
-    ssh_key_file="$HOME/.ssh/oim_rsa"
-    ssh_key_file_pub="${ssh_key_file}.pub"
-    if [ -f "$ssh_key_file" ] && [ -f "$ssh_key_file_pub" ]; then
-        rm -f "$ssh_key_file" "$ssh_key_file_pub"
-        echo -e "${GREEN} SSH key pair have been removed.${NC}"
-    else
-        echo -e "${RED} SSH key file not found.${NC}"
-    fi
-
-    # Remove the ssh key from the known_hosts file.
-    echo -e "${BLUE} Removing ssh key from known_hosts file.${NC}"
-    ssh-keygen -R "[localhost]:2222" >/dev/null 2>&1
-
-
-    # Remove the host entry from the config file in .ssh folder.
-    ssh_config_file="$HOME/.ssh/config"
-    if [ -f "$ssh_config_file" ]; then
-        sed -i '/Host omnia_core/,+5d' "$ssh_config_file"
-        echo -e "${GREEN} Host entry has been removed from config file.${NC}"
-    else
-        echo -e "${RED} Config file not found.${NC}"
-    fi
-
-    # Remove the Omnia core configuration.
-    echo -e "${BLUE} Removing Omnia core configuration.${NC}"
-    rm -rf $omnia_path/omnia/{hosts,input,log,pulp,provision,pcs,ssh_config,tmp,.data}
-
-    # Unmount the NFS shared path if the share option is NFS.
-    if [ "$share_option" = "NFS" ] && [ "$nfs_type" = "external" ]; then
-        umount "$omnia_path"
-        if [ $? -eq 0 ]; then
-            echo -e "${GREEN} NFS shared path has been unmounted.${NC}"
-        else
-            echo -e "${RED} Failed to unmount NFS shared path.${NC}"
-        fi
-        # Remove the entry from /etc/fstab
-        fstab_file="/etc/fstab"
-        if [ -f "$fstab_file" ]; then
-            # Create a backup of the fstab file.
-            cp "$fstab_file" "$fstab_file.bak"
-
-            # Remove the line from the fstab file.
-             sed -i "\#$omnia_path#d" "$fstab_file"
-             if [ $? -ne 0 ]; then
-                echo -e "${RED} Failed to remove the entry from /etc/fstab.${NC}"
-            fi
-        fi
-    fi
-
-    echo -e "${GREEN} Omnia core configuration has been cleaned up.${NC}"
-}
-
-# This function is responsible for removing the Omnia core container.
-#
-# It removes the container using the 'podman rm -f' command.
-# If the container is removed successfully, it prints a success message.
-# Otherwise, it prints an error message.
-remove_container() {
-    # Block if critical service containers exist
-    critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd')
-    if [ -n "$critical_running" ]; then
-        echo -e "${RED}Failed to intiatiate omnia_core container cleanup. There are other critical service containers still running:${NC}"
-        echo "$critical_running"
-        echo -e "${GREEN}Run oim_cleanup.yml first to cleanup all containers.${NC}"
-        exit 1
-    fi
-
-    # Remove the container.
-    echo -e "${BLUE} Removing the Omnia core container.${NC}"
-    if systemctl stop omnia_core.service; then
-        echo -e "${GREEN} Omnia core container has been removed.${NC}"
-        # Remove the systemd generator symlinks.
-        echo -e "${GREEN} Cleaning up systemd generator symlinks.${NC}"
-        rm -f /run/systemd/generator/omnia_core.service
-        rm -f /run/systemd/generator/multi-user.target.wants/omnia_core.service
-        rm -f /run/systemd/generator/default.target.wants/omnia_core.service
-
-        echo -e "${GREEN} Cleaning up omnia_core.container.${NC}"
-        rm -f /etc/containers/systemd/omnia_core.container
-
-    # Remove the omnia_core.service file.
-        rm -f /etc/systemd/system/omnia_core.service
-        systemctl daemon-reload
-        systemctl reset-failed omnia_core.service
-    # check if service is removed
-        if systemctl status omnia_core.service >/dev/null 2>&1; then
-            echo -e "${RED} Failed to remove Omnia core service.${NC}"
-        else
-            echo -e "${GREEN} Omnia core service has been removed.${NC}"
-        fi    
-    else
-        echo -e "${RED} Failed to remove Omnia core container.${NC}"
-    fi
-
-    # Remove the container image.
-    # if podman rmi omnia_core; then
-    #     echo -e "${GREEN} Omnia core image has been removed.${NC}"
-    # else
-    #     echo -e "${RED} Failed to remove Omnia core image.${NC}"
-    # fi
-}
-
-
-# This function is responsible for initializing the Omnia core container.
-#
-# It prompts the user for the Omnia shared path and the root
-# password. It then checks if the Omnia shared path exists.
-#
-# The function generates the ssh key pair and copies the private
-# key to the Omnia shared path.
-#
-# The function also copies the ssh public key to the
-# authorized_keys file.
-#
-# The function creates the necessary log directories.
-init_container_config() {
-
-    share_option=""
-    # Display the supported use cases
-    display_supported_use_cases
-
-    # Display the choices for the user
-    echo -e "${BLUE} Choose the type of Omnia shared path:${NC}"
-    options=( "NFS (recommended)" "Local"  )
-
-    PS3="Select the option number: "
-
-    select opt in "${options[@]}"; do
-        case $opt in
-            "NFS (recommended)")
-                share_option="NFS"
-                break
-                ;;
-            "Local")
-                share_option="Local"
-                break
-                ;;
-            *)
-                echo -e "${RED} Invalid option.${NC}"
-                continue
-        esac
-    done
-
-    case $share_option in
-        "Local")
-            # Prompt the user for the Omnia shared path.
-            echo -e "${BLUE} Please provide Omnia shared path:${NC}"
-            read -p "Omnia shared path: " omnia_path
-
-            # Check if the Omnia shared path is absolute path and path exists.
-            if [[ "$omnia_path" != /* ]] || [ ! -d "$omnia_path" ]; then
-                echo -e "${RED} Omnia shared path is not an absolute path or does not exist! Please re-run omnia.sh --install with valid Omnia shared path.${NC}"
-                exit 1
-            fi
-            ;;
-        "NFS")
-            echo -e "${BLUE} Select NFS type:${NC}"
-            select nfs_type in "External (Recommended)" "Internal"; do
-                case $nfs_type in
-                    "External (Recommended)")
-                        echo -e "${BLUE} Please provide the external NFS server IP:${NC}"
-                        read -p "External NFS server IP: " nfs_server_ip
-
-                        echo -e "${BLUE} Please provide the external NFS server share path:${NC}"
-                        read -p "External NFS share path: " nfs_server_share_path
-
-                        echo -e "${BLUE} Please provide the OIM client share path (mount target):${NC}"
-                        read -p "Omnia shared path: " omnia_path
-
-                        # Validate Omnia shared path is absolute
-                        if [[ "$omnia_path" != /* ]]; then
-                            echo -e "${RED}Omnia shared path must be an absolute path.${NC}"
-                            exit 1
-                        fi
-
-                        nfs_type="external"
-                        break
-                        ;;
-                    "Internal")
-                        echo -e "${BLUE} Please provide the OIM server IP:${NC}"
-                        read -p "OIM server IP: " nfs_server_ip
-
-                        echo -e "${BLUE} Please provide the OIM server share path:${NC}"
-                        read -p "OIM server share path: " nfs_server_share_path
-
-                        echo -e "${BLUE} Checking if the OIM server share path is mounted${NC}"
-                        check_internal_nfs_export "$nfs_server_ip" "$nfs_server_share_path"
-
-                        # Note: No mounting performed here
-                        echo -e "${YELLOW}Note: Internal NFS does not support HA OIM or hierarchical cluster. Proceeding...${NC}"
-                        nfs_type="internal"
-                        omnia_path="$nfs_server_share_path"
-                        break
-                        ;;
-                    *)
-                        echo -e "${RED}Invalid option. Please choose 1 or 2.${NC}"
-                        ;;
-                esac
-            done
-            ;;
-    esac
-
-
-    # Prompt the user for the Omnia core root password.
-    echo -e "${BLUE} Please provide Omnia core root password for accessing container:${NC}"
-
-    read -p " Enter: " -s passwd
-
-    # Prompt the user for the Omnia core root password confirmation.
-    echo -e "\n${BLUE} Please confirm password:${NC}"
-    read -s -p " Enter: " cnf_passwd
-
-    # Check if the provided passwords match.
-    if [ "$passwd" != "$cnf_passwd" ]; then
-        echo -e "${RED} Invalid Omnia core root password, passwords do not match!${NC}"
-        exit 1
-    fi
-
-    # Check if the password contains any of the invalid characters
-    invalid_chars='[\\|&;`"><*?!$(){}[\]]'
-    if [[ "$passwd" =~ $invalid_chars ]]; then
-        echo -e "${RED} Invalid password, passwords must not contain any of these special characters: [\\|&;\`\"><*?!$(){}[\]]${NC}"
-        exit 1
-    fi
-
-    # Install NFS client package if option NFS is selected
-    if [[ "$share_option" == "NFS" ]]; then
-        # Install NFS client package
-        echo -e "${BLUE} Installing NFS client package.${NC}"
-        dnf install -y nfs-utils nfs4-acl-tools
-
-        # Create omnia_path directory if it does not exist
-        echo -e "${BLUE} Creating omnia shared path directory if it does not exist.${NC}"
-        mkdir -p $omnia_path
-
-        # Mount NFS server share path in Omnia share path
-        if [[ "$nfs_type" == "external" ]]; then
-
-            if is_local_ip "$nfs_server_ip"; then
-                echo -e "${RED} Error: NFS server $nfs_server_ip is a local IP.${NC}"
-                echo -e "${RED} Please provide an external NFS server IP or re-run omnia.sh --install with valid options.${NC}"
-                exit 1
-            fi
-
-            # Validate if NFS server is reachable
-            echo -e "${BLUE} Validating if NFS server is reachable.${NC}"
-            ping -c1 -W1 $nfs_server_ip > /dev/null
-            if [ $? -ne 0 ]; then
-                echo -e "${RED} NFS server $nfs_server_ip is not reachable.${NC}"
-                exit 1
-            fi
-
-            echo -e "${BLUE} Mounting NFS server share path in Omnia share path.${NC}"
-            mount -t nfs -o nosuid,rw,sync,hard,intr,timeo=30 "$nfs_server_ip:$nfs_server_share_path" "$omnia_path"
-            if [[ $? -ne 0 ]]; then
-                echo -e "${RED} Failed to mount NFS. Please check the IP and path.${NC}"
-                exit 1
-            fi
-            # Validate if NFS server share path is mounted
-            echo -e "${BLUE} Validating if NFS server share path is mounted.${NC}"
-            # strip the trailing slash from nfs_server_share_path
-            nfs_server_share_path="${nfs_server_share_path%/}"
-            if grep -qs "$nfs_server_ip:$nfs_server_share_path" /proc/mounts; then
-                echo -e "${GREEN} NFS server share path is mounted.${NC}"
-            else
-                echo -e "${RED} NFS server share path is not mounted. Provide valid NFS server details. ${NC}"
-                exit 1
-            fi
-            # Add NFS server share to /etc/fstab to mount on startup
-            echo "$nfs_server_ip:$nfs_server_share_path $omnia_path nfs nosuid,rw,sync,hard,intr" >> /etc/fstab
-        else
-            echo -e "${BLUE} Using internal NFS path without mounting.${NC}"
-        fi
-
-    fi
-
-    hashed_passwd=$(openssl passwd -1 $passwd)
-    ssh_key_file="/root/.ssh/oim_rsa"
-    ssh_port=2222
-
-    # Generate a new ssh key pair.
-    if [ -f "$ssh_key_file" ]; then
-        echo -e "\n${BLUE} Skipping generating new ssh key pair.${NC}"
-    else
-        echo -e "\n${GREEN} Generating a new ssh key pair.${NC}"
-        ssh-keygen -t rsa -b 4096 -C "omnia_oim" -q -N '' -f /root/.ssh/oim_rsa
-        {
-            echo "Host omnia_core"
-            echo "    Hostname localhost"
-            echo "    Port $ssh_port"
-            echo "    User root"
-            echo "    IdentityFile ~/.ssh/oim_rsa"
-            echo "    IdentitiesOnly yes"
-        } >> $HOME/.ssh/config
-    fi
-
-    # Create the ssh configuration directory if it does not exist.
-    echo -e "${GREEN} Creating the ssh configuration directory if it does not exist.${NC}"
-    mkdir -p "$omnia_path/omnia/ssh_config/.ssh"
-
-    # Copy the omnia_core ssh config to the shared path.
-    echo -e "${GREEN} Copying the omnia_core ssh config to the omnia shared path.${NC}"
-    cp "$HOME/.ssh/config" "$omnia_path/omnia/ssh_config/.ssh/config"
-
-    # Copy the oim_rsa ssh key to the shared path.
-    echo -e "${GREEN} Copying the oim_rsa ssh key to the omnia shared path.${NC}"
-    cp "$HOME/.ssh/oim_rsa" "$omnia_path/omnia/ssh_config/.ssh/oim_rsa"
-
-    # Copy the ssh private key to the omnia shared path.
-    echo -e "${GREEN} Copying the ssh private key to the omnia shared path.${NC}"
-    cp $ssh_key_file "$omnia_path/omnia/ssh_config/.ssh/id_rsa"
-
-    # Copy the ssh public key to the omnia shared path.
-    echo -e "${GREEN} Copying the ssh public key to the omnia shared path.${NC}"
-    cp $ssh_key_file.pub "$omnia_path/omnia/ssh_config/.ssh/id_rsa.pub"
-
-    # Get the ssh public key.
-    ssh_public_key="$(cat /root/.ssh/oim_rsa.pub)"
-
-    validate_nfs_server
-
-    # Add ssh public key to the authorized_keys.
-    echo -e "${GREEN} Adding ssh public key to the authorized_keys.${NC}"
-    if grep -q "$ssh_public_key" $HOME/.ssh/authorized_keys; then
-        echo -e "${BLUE} Skipping adding ssh public key to the authorized_keys.${NC}"
-    else
-        echo "$ssh_public_key" >> $HOME/.ssh/authorized_keys
-        chmod 600 $HOME/.ssh/authorized_keys
-    fi
-
-    # Add ssh public key to the authorized_keys in the ssh_config directory.
-    echo -e "${GREEN} Adding ssh public key to the authorized_keys in the Omnia ssh_config directory.${NC}"
-    if [ -f "$omnia_path/omnia/ssh_config/.ssh/authorized_keys" ] && grep -q "$ssh_public_key" "$omnia_path/omnia/ssh_config/.ssh/authorized_keys"; then
-        echo -e "${BLUE} Skipping adding ssh public key to the authorized_keys in the Omnia ssh_config directory.${NC}"
-    else
-        echo "$ssh_public_key" >> "$omnia_path/omnia/ssh_config/.ssh/authorized_keys"
-        chmod 600 "$omnia_path/omnia/ssh_config/.ssh/authorized_keys"
-    fi
-
-    # Create the log directory if it does not exist.
-    echo -e "${GREEN} Creating the log directory if it does not exist.${NC}"
-    mkdir -p "$omnia_path/omnia/log/core/container"
-    mkdir -p "$omnia_path/omnia/log/core/playbooks"
-
-    # Create the hosts file for cluster in $omnia_path/omnia/hosts
-    echo -e "${GREEN} Creating the hosts file for cluster.${NC}"
-    touch "$omnia_path/omnia/hosts"
-
-    # Create the pulp_ha directory if it does not exist.
-    echo -e "${GREEN} Creating the pulp HA directory if it does not exist.${NC}"
-    mkdir -p "$omnia_path/omnia/pulp/pulp_ha"
-}
-
-
-# This function is responsible for fetching the configuration from the Omnia core.
-# It uses podman exec to run a command in the Omnia core container.
-# The command retrieves the metadata from the oim_metadata.yml file.
-# The metadata is then parsed and the required configuration is extracted.
-fetch_config() {
-
-    # Fetch the metadata from the oim_metadata.yml file.
-    echo -e "${GREEN} Fetching the metadata from the oim_metadata.yml file.${NC}"
-        core_config=$(podman exec -ti omnia_core /bin/bash -c 'cat /opt/omnia/.data/oim_metadata.yml')
-
-    # Split the metadata into separate lines.
-    IFS=$'\n' read -r -d '' -a config_lines <<<"$core_config"
-
-    # Loop through the lines and extract the required configuration.
-    for line in "${config_lines[@]}"; do
-        # Extract the key and value from the line.
-        key=$(echo "$line" | awk -F ':' '{print $1}')
-        value=$(echo "$line" | awk -F ':' '{print $2}')
-
-        # Check the key and assign the value to the corresponding variable.
-        case $key in
-            oim_shared_path)
-                # Assign the shared path.
-                omnia_path=$(echo "$value" | tr -d '[:space:]')
-                ;;
-            omnia_core_hashed_passwd)
-                # Assign the hashed password.
-                hashed_passwd=$(echo "$value" | tr -d '[:space:]')
-                ;;
-            nfs_server_ip)
-                # Assign the nfs server ip.
-                nfs_server_ip=$(echo "$value" | tr -d '[:space:]')
-                ;;
-            nfs_server_share_path)
-                # Assign the nfs server share path.
-                nfs_server_share_path=$(echo "$value" | tr -d '[:space:]')
-                ;;
-            omnia_share_option)
-                # Assign the share option.
-                share_option=$(echo "$value" | tr -d '[:space:]')
-                ;;
-            nfs_type)
-                # Assign the share option.
-                nfs_type=$(echo "$value" | tr -d '[:space:]')
-                ;;
-        esac
-    done
-    # Check if the required configuration is extracted successfully.
-    if [ -z "$omnia_path" ] || [ -z "$hashed_passwd" ]; then
-        echo -e "${RED} Failed to fetch data from metadata file.${NC}"
-        exit 1
-    else
-        echo -e "${GREEN} Successfully fetched data from metadata file.${NC}"
-    fi
-}
-
-# Validates the OIM (Omnia Infrastructure Manager) by checking if the hostname is
-# configured with a domain name, checking if Podman is installed, enabling and
-# starting the Podman socket.
-validate_oim() {
-    # Check if the hostname is set
-    hostname_value=$(hostname)
-    if [[ -z "$hostname_value" ]]; then
-        echo -e "${RED}Hostname is not set!${NC}"
-        exit 1
-    fi
-
-    # Check if the hostname is static
-    static_hostname=$(hostnamectl --static)
-    current_hostname=$(hostname)
-    if [[ "$static_hostname" != "$current_hostname" ]]; then
-        echo -e "${RED}Static Hostname is unset. Current: '$current_hostname', Static: '$static_hostname'${NC}"
-        echo -e "${RED}Please set the static hostname and try again.${NC}"
-        echo -e "${BLUE}Command to set hostname: hostnamectl set-hostname <hostname>${NC}"
-        echo -e "${RED}Exiting...${NC}"
-        exit 1
-    fi
-
-    # Check if the hostname is configured with a domain name.
-    domain_name=$(hostname -d)
-    if [[ -n "$domain_name" ]]; then
-        echo -e "${BLUE}Hostname is configured with a domain name: $domain_name${NC}"
-    else
-        echo -e "${RED}Invalid hostname, hostname is not configured with a domain name!${NC}"
-        exit 1
-    fi
-
-    # Detect OIM timezone from systemd in a stable, case‑independent way
-    oim_timezone=$(timedatectl show -p Timezone --value 2>/dev/null)
-
-    # Fallbacks if needed (non‑systemd or old timedatectl)
-    if [[ -z "$oim_timezone" ]]; then
-        if [[ -f /etc/timezone ]]; then
-            # Debian/Ubuntu style
-            oim_timezone=$(< /etc/timezone)
-        elif [[ -L /etc/localtime ]]; then
-            # Derive from /etc/localtime symlink
-            oim_timezone=$(readlink -f /etc/localtime | sed -n 's|^.*zoneinfo/||p')
-        fi
-    fi
-
-    podman --version
-
-    # Capture the exit status
-    if [ $? -eq 0 ]; then
-        echo -e "${BLUE} Podman is installed. Version: $(podman --version)${NC}"
-    else
-        echo -e "${RED} Podman is not installed.${NC}"
-        exit 1
-    fi
-
-    # Enable the podman socket to start at boot
-    echo -e "${BLUE} Enabling podman.socket...${NC}"
-    systemctl enable podman.socket
-
-    # Start the podman socket now
-    echo -e "${BLUE} Starting podman.socket...${NC}"
-    systemctl start podman.socket
-
-    # Print a success message after enabling and starting the podman socket
-    echo -e "${GREEN} Podman socket has been enabled and started.${NC}"
-}
-
-# Checks if the required directories for Omnia are present.
-# This function iterates over a list of required directories/files and checks if each one exists.
-check_required_directories() {
-    required_paths=(
-        "$omnia_path/omnia"
-        "$omnia_path/omnia/ssh_config/.ssh"
-        "$omnia_path/omnia/log/core/container"
-        "$omnia_path/omnia/hosts"
-        "$omnia_path/omnia/pulp/pulp_ha"
-    )
-
-    missing_paths=()
-
-    for path in "${required_paths[@]}"; do
-        if [ ! -e "$path" ]; then  # Checks both files and directories
-            missing_paths+=("$path")
-        fi
-    done
-
-    if [ "${#missing_paths[@]}" -ne 0 ]; then
-        echo -e "${RED}Error: The following required files or directories are missing:${NC}"
-        echo -e "${RED}${missing_paths[*]}${NC}"
-        echo -e "User can not Retain Existing configuration"
-        echo
-        echo -e "${YELLOW}Instructions:${NC}"
-        echo -e "${YELLOW}* Backup any existing files if required${NC}"
-        echo -e "${YELLOW}* Run ./omnia.sh --install and choose:${NC}"
-        echo -e "${YELLOW}    Options:${NC}"
-        echo -e "${YELLOW}      -> Reinstall the container${NC}"
-        echo -e "${YELLOW}      -> Overwrite and create new configuration${NC}"
-        exit 1
-    fi
-}
-
-# Sets up the Omnia core container.
-# This function pulls the Omnia core Podman image and runs the container.
-# Creates a Quadlet service for the container and also creates a metadata file.
-# It defines the container options and runs the container.
-setup_container() {
-    container_name="omnia_core"
-    echo "==> Setting up $container_name container"
-
-    # SELinux option handling
-    selinux_option=":z"
-    if [ "$share_option" = "NFS" ] && [ "$nfs_type" = "external" ]; then
-        selinux_option=""
-    fi
-
-    # Check if RHEL subscription is enabled
-    subscription_enabled=false
-    if [ -d "/etc/pki/entitlement" ] && [ "$(ls -A /etc/pki/entitlement/*.pem 2>/dev/null)" ]; then
-        subscription_enabled=true
-    fi
-
-    # --- Generate Quadlet container file ---
-    cat > /etc/containers/systemd/${container_name}.container <<EOF
-# ===============================================================
-# $container_name Quadlet Service
-# Generated dynamically by omnia.sh
-# ===============================================================
-[Unit]
-Description=${container_name^} Container
-
-[Container]
-ContainerName=${container_name}
-HostName=${container_name}
-Image=${container_name}:1.1
-Network=host
-
-# Capabilities
-AddCapability=CAP_AUDIT_WRITE
-
-# Volumes
-Volume=${omnia_path}/omnia:/opt/omnia${selinux_option}
-Volume=${omnia_path}/omnia/ssh_config/.ssh:/root/.ssh${selinux_option}
-Volume=${omnia_path}/omnia/log/core/container:/var/log${selinux_option}
-Volume=${omnia_path}/omnia/hosts:/etc/hosts${selinux_option}
-Volume=${omnia_path}/omnia/pulp/pulp_ha:/root/.config/pulp${selinux_option}
-EOF
-
-    # Add subscription volume mounts only if subscription is enabled
-    if [ "$subscription_enabled" = true ]; then
-        cat >> /etc/containers/systemd/${container_name}.container <<EOF
-Volume=/etc/pki/entitlement:/etc/pki/entitlement:ro,z
-Volume=/etc/yum.repos.d/redhat.repo:/etc/yum.repos.d/redhat.repo:ro,z
-EOF
-    fi
-
-    cat >> /etc/containers/systemd/${container_name}.container <<EOF
-
-[Service]
-Restart=always
-
-[Install]
-WantedBy=multi-user.target default.target
-
-EOF
-
-    # Create the .data directory if it does not exist.
-    # This is where the oim_metadata.yml file is stored.
-    echo -e "${GREEN} Creating the .data directory if it does not exist.${NC}"
-    mkdir -p "$OMNIA_METADATA_DIR"
-
-    oim_metadata_file="$OMNIA_METADATA_FILE"
-
-    if [ ! -f "$oim_metadata_file" ]; then
-        echo -e "${GREEN} Creating oim_metadata file${NC}"
-        {
-            echo "oim_crt: \"podman\""
-            echo "oim_shared_path: $omnia_path"
-            echo "omnia_version: $omnia_release"
-            echo "oim_hostname: $(hostname)"
-            echo "oim_node_name: $(hostname -s)"
-            echo "domain_name: $domain_name"
-            echo "oim_timezone: $oim_timezone"
-            echo "omnia_core_hashed_passwd: $hashed_passwd"
-            echo "omnia_share_option: $share_option"
-        } >> "$oim_metadata_file"
-        if [ "$share_option" = "NFS" ]; then
-            {
-            echo "nfs_server_ip: $nfs_server_ip"
-            echo "nfs_server_share_path: $nfs_server_share_path"
-            echo "nfs_type: $nfs_type"
-        } >> "$oim_metadata_file"
-        fi
-    fi
-
-    # --- Remove old service if exists ---
-    if systemctl list-unit-files | grep -q "${container_name}.service"; then
-        systemctl stop ${container_name}.service
-        systemctl disable ${container_name}.service
-        rm -f /etc/systemd/system/${container_name}.service
-    fi
-
-    # --- Reload systemd so Quadlet generates the service ---
-    systemctl daemon-reexec
-    systemctl daemon-reload
-    systemctl start ${container_name}.service
-
-    # --- Start the container via Quadlet ---
-    echo "==> ${container_name} container deployed and starting via Quadlet"
-
-    # --- Wait for container to be running ---
-    echo "Waiting for $container_name container to start..."
-    for i in {1..30}; do
-        if podman ps --format '{{.Names}}' | grep -qw "$container_name"; then
-            echo "$container_name container is running."
-            break
-        else
-            sleep 1
-        fi
-    done
-
-    if ! podman ps --format '{{.Names}}' | grep -qw "$container_name"; then
-        echo -e "${RED}Error: $container_name container failed to start.${NC}"
-        rm -rf "$OMNIA_METADATA_FILE"
-        exit 1
-    fi
-
-    systemctl start firewalld
-    systemctl enable firewalld
-    firewall-cmd --permanent --zone=public --add-port=2222/tcp
-    firewall-cmd --reload
-}
-
-# This function sets up the configuration for the Omnia core.
-#  post_setup_config is a function that sets up the configuration for the Omnia core.
-#  It creates the necessary directories and files, copies input files from the Omnia container,
-#  and creates the oim_metadata.yml file.
-post_setup_config() {
-
-    # Create the ansible tmp directory if it does not exist.
-    mkdir -p "$omnia_path/omnia/tmp/.ansible/tmp"
-    chmod 757 "$omnia_path/omnia/tmp/.ansible/tmp"
-    # Create the input directory if it does not exist.
-    echo -e "${GREEN} Creating the input directory if it does not exist.${NC}"
-    mkdir -p "$OMNIA_INPUT_DIR/"
-
-    # Create the default.yml file if it does not exist.
-    # This file contains the name of the project.
-    if [ ! -f "$OMNIA_INPUT_DIR/default.yml" ]; then
-        echo -e "${BLUE} Creating default.yml file.${NC}"
-        {
-            echo "# This file defines the project name."
-            echo "# The name of the project should be set in a directory under input."
-            echo "project_name: project_default"
-        } >> "$OMNIA_INPUT_DIR/default.yml"
-    fi
-
-    # Copy input files from /omnia to /opt/omnia/project_default/ inside omnia_core container
-    podman exec -u root omnia_core bash -c "cd /omnia && git pull"
-    echo -e "${BLUE} Moving input files from /omnia dir to project_default folder.${NC}"
-    podman exec -u root omnia_core bash -c "
-    mkdir -p /opt/omnia/input/project_default
-    cp -r /omnia/input/* /opt/omnia/input/project_default
-    rm -rf /omnia/input
-    rm -rf /omnia/omnia.sh"
-
-    init_ssh_config
-}
-
-validate_nfs_server() {
-
-    # Validate NFS server permission
-    if [ "$share_option" = "NFS" ]; then
-        # Create a temporary file inside $omnia_path
-        temp_file="$omnia_path/temp_file"
-        touch "$temp_file"
-        # Check if the file can be chown to root
-        if chown root:root "$temp_file"; then
-            rm "$temp_file"
-        else
-            echo "Error: Unable to chown file to root in $omnia_path. NFS server permission validation failed. Please ensure no_root_squash option is enabled in the NFS export configuration."
-            exit 1
-        fi
-        if [ "`ls -ld $omnia_path/omnia/ssh_config/.ssh/id_rsa | awk '{print $3 ":" $4}'`" != "root:root" ]; then
-            echo "Error: The $omnia_path/omnia/ssh_config/.ssh/id_rsa file should be owned by root:root. NFS server permission validation failed. Please verify the NFS export configuration."
-            exit 1
-        fi
-    fi
-
-}
-
-init_ssh_config() {
-    touch $HOME/.ssh/known_hosts
-    # Add entry to /root/.ssh/known_hosts file to prevent errors caused by Known host
-    ssh-keygen -R "[localhost]:2222" >/dev/null 2>&1  # Remove existing entry if it exists
-    ssh-keyscan -p 2222 localhost 2>/dev/null | grep -v "^#" >> $HOME/.ssh/known_hosts  # Scan and add the new key
-}
-
-start_container_session() {
-
-    echo -e "${GREEN}
-    ------------------------------------------------------------------------------------------------------------------------------------------
-            Omnia Core container running successfully.
-
-            Entering the container from Omnia Infrastructure Manager(OIM):
-            Through podman:
-            # podman exec -it -u root omnia_core bash
-
-            Direct SSH:
-            # ssh omnia_core
-
-            You are now in the Omnia environment.
-
-            The following are the main directories available in the Omnia core container:
-
-            - The shared directory, which is mapped to $omnia_path in OIM: /opt/omnia
-            - The input directory: /opt/omnia/input
-            - The Omnia source code directory: /omnia
-            - The Omnia playbooks logs directory: /opt/omnia/log/core/playbooks
-
-            It's important to note:
-                - Files placed in the shared directory should not be manually deleted.
-                - Use the playbook /omnia/utils/oim_cleanup.yml to safely remove the shared directory and Omnia containers (except the core container).
-                - If you need to delete the core container, please run the omnia.sh script with --uninstall option.
-                - If you need to  redeploy the core container with new input configs, please rerun the omnia.sh script with --install option.
-                - Provide any file paths (ISO, mapping files, etc.) that are mentioned in input files in the /opt/omnia directory.
-                - The domain name that will be used for Omnia is $domain_name, if you wish to change the domain name please cleanup Omnia,
-                  change the Omnia Infrastructure Manager's domain name and rerun omnia.sh script with --install option.
-
-    --------------------------------------------------------------------------------------------------------------------------------------------------
-    ${NC}"
-
-    # Entering Omnia-core container
-    ssh omnia_core
-}
-
-show_help() {
-    echo "Usage: $0 [--install | --uninstall | --upgrade | --version | --help]"
-    echo "  -i, --install     Install and start the Omnia core container"
-    echo "  -u, --uninstall   Uninstall the Omnia core container and clean up configuration"
-    echo "      --upgrade     Upgrade the Omnia core container from image tag 1.0 to 1.1"
-    echo "  -v, --version     Display Omnia version information"
-    echo "  -h, --help        More information about usage"
-}
-
-install_omnia_core() {
-    local omnia_core_tag="1.1"
-    local omnia_core_registry=""
-    
-    # Check if local omnia_core:1.1 exists
-    if podman inspect omnia_core:${omnia_core_tag} >/dev/null 2>&1; then
-        echo -e "${GREEN}✓ Omnia core image (omnia_core:${omnia_core_tag}) found locally.${NC}"
-    # Check if latest exists for backward compatibility
-    elif podman inspect omnia_core:latest >/dev/null 2>&1; then
-        echo -e "${GREEN}✓ Omnia core image (omnia_core:latest) found locally.${NC}"
-        # Tag it as 1.1 for consistency
-        podman tag omnia_core:latest omnia_core:${omnia_core_tag}
-    else
-        echo -e "${RED}ERROR: Omnia core image (omnia_core:${omnia_core_tag}) not found locally.${NC}"
-        echo -e "${YELLOW}Omnia no longer pulls images from Docker Hub. Build/load the image locally and retry.${NC}"
-        echo ""
-        echo -e "${YELLOW}One way to build the image locally:${NC}"
-        echo -e "1. Clone the Omnia Artifactory repository:"
-        echo -e "   git clone https://github.com/dell/omnia-artifactory -b omnia-container"
-        echo -e "2. Navigate to the repository directory:"
-        echo -e "   cd omnia-artifactory"
-        echo -e "3. Build the core image locally (loads into local Podman by default):"
-        echo -e "   ./build_images.sh core omnia_branch=<version/branch_name>"
-        echo ""
-        echo -e "${YELLOW}Then re-run:${NC}"
-        echo -e "   ./omnia.sh --install"
-        exit 1
-    fi
-
-    # Check if any other containers with 'omnia' in their name are running
-    other_containers=$(podman ps -a --format '{{.Names}}' | grep -E 'omnia' | grep -v 'omnia_core')
-
-    # If there are any, exit
-    if [ -n "$other_containers" ]; then
-        echo -e "${RED} Failed to intiatiate omnia_core container cleanup. There are other omnia container running.${NC}"
-        echo -e "${GREEN} Execute oim_cleanup.yml first to cleanup all containers.${NC}"
-        ssh omnia_core
-        exit 1
-    fi
-
-    # Check if the omnia_core container is already running
-    running_containers=$(podman ps -a --format '{{.Names}} {{.State}}' | grep -E 'omnia_core')
-
-    # If yes, set the variable to true
-    if [ -n "$running_containers" ]; then
-        core_container_status=true
-    fi
-
-    # If core container is running
-    if [ "$core_container_status" = true ]; then
-        if [ -n "$(echo "$running_containers" | grep -E 'running')" ]; then
-            echo -e "${GREEN} Omnia core container is already running.${NC}"
-            echo -e "${GREEN} Do you want to:${NC}"
-            PS3="Select the option number: "
-
-            select opt in "Enter omnia_core container" "Reinstall the container" "Exit"; do
-                case $opt in
-                    "Enter omnia_core container")
-                        choice=1
-                        break
-                        ;;
-                    "Reinstall the container")
-                        choice=2
-                        break
-                        ;;
-                    "Exit")
-                        echo "Exiting the script."
-                        exit 0
-                        ;;
-                    *)
-                        echo "Invalid choice. Please try again."
-                        continue
-                        ;;
-                esac
-            done
-
-            # If the user wants to enter omnia_core container
-            if [ "$choice" = "1" ]; then
-                start_container_session
-            fi
-            # If the user wants to reinstall, call the remove_container function, and then call the setup_omnia_core function
-            if [ "$choice" = "2" ]; then
-                # Block if critical service containers exist
-                critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd')
-                if [ -n "$critical_running" ]; then
-                    echo -e "${RED}Failed to intiatiate omnia_core container cleanup. There are other critical service containers still running:${NC}"
-                    echo "$critical_running"
-                    echo -e "${GREEN}Run oim_cleanup.yml first to cleanup all containers.${NC}"
-                    exit 1
-                fi
-                echo -e "${GREEN} What configuration do you want to use for reinstallation:${NC}"
-
-                PS3="Select the option number: "
-
-                select opt in "Retain Existing configuration" "Overwrite and create new configuration" "Exit"; do
-                    case $opt in
-                        "Retain Existing configuration")
-                            choice=1
-                            break
-                            ;;
-                        "Overwrite and create new configuration")
-                            choice=2
-                            break
-                            ;;
-                        "Exit")
-                            echo "Exiting the script."
-                            exit 0
-                            ;;
-                        *)
-                            echo "Invalid choice. Please try again."
-                            continue
-                            ;;
-                    esac
-                done
-
-                # If the user wants to retain existing configuration, call the remove_container function
-                if [ "$choice" = "1" ]; then
-                    fetch_config
-                    check_required_directories
-                    remove_container
-                    setup_container
-                    init_ssh_config
-                    start_container_session
-                # If the user wants to overwrite and create new configuration, call the cleanup_omnia_core function
-                elif [ "$choice" = "2" ]; then
-                    cleanup_omnia_core
-                    setup_omnia_core
-                fi
-            fi
-        else
-            # If omnia_core container exists and is not running call the remove_container function
-
-            echo -e "${RED} The Omnia Core container is present but not in running state.${NC}"
-            echo -e "${GREEN} Only the core container can be cleanup can be performed.${NC}"
-            echo -e "${GREEN} Container Configurations in the shared directory will not be cleaned up.${NC}"
-            echo -e "${GREEN} Do you want to perform cleanup:${NC}"
-            echo -e "${GREEN} 1. Yes.${NC}"
-            echo -e "${GREEN} 2. No. ${NC}"
-            read -p " Enter your choice (1 or 2): " choice
-            if [ "$choice" = "1" ]; then
-                remove_container
-            elif [ "$choice" = "2" ]; then
-                exit
-            fi
-        fi
-
-    # If core container is not present
-    else
-
-        # Start the container setup
-        echo -e "${GREEN}Starting Omnia core container setup.${NC}"
-        setup_omnia_core
-    fi
-}
-
-# Check if Omnia core container is running
-check_container_status() {
-    # Check if the Omnia core container is running
-    if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
-        echo -e "${RED}ERROR: Omnia core container is not running.${NC}"
-        exit 1
-    fi
-}
-
-# Function to display version information
-display_version() {
-    # Check if metadata file exists and Omnia core container is running
-    check_container_status
-    
-    # Fetch the metadata from the oim_metadata.yml file in the container
-    echo -e "${GREEN} Fetching metadata from omnia_core container...${NC}"
-    core_config=$(podman exec omnia_core /bin/bash -c 'cat /opt/omnia/.data/oim_metadata.yml')
-    
-    # Extract Omnia version from metadata file
-    omnia_version=$(echo "$core_config" | grep "omnia_version:" | cut -d':' -f2 | tr -d ' \t\n\r')
-    
-    # Display version information
-    echo "Omnia version: $omnia_version"
-    
-    # Return exit code 0 on success
-    exit 0
-}
-
-phase1_validate() {
-    local current_image
-    local core_config
-    local previous_omnia_version
-    local shared_path
-
-    echo "[INFO] [ORCHESTRATOR] Phase 1: Pre-Upgrade Validation"
-
-    if [ "$(id -u)" -ne 0 ]; then
-        if ! sudo -n true >/dev/null 2>&1; then
-            echo "[ERROR] [ORCHESTRATOR] Prerequisite failed: run as root or configure passwordless sudo"
-            return 1
-        fi
-    fi
-
-    if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
-        echo "[ERROR] [ORCHESTRATOR] Prerequisite failed: omnia_core container is not running"
-        return 1
-    fi
-
-    core_config=$(podman exec omnia_core /bin/bash -c 'cat /opt/omnia/.data/oim_metadata.yml' 2>/dev/null)
-    if [ -z "$core_config" ]; then
-        echo "[ERROR] [ORCHESTRATOR] Unable to read oim_metadata.yml from omnia_core container"
-        return 1
-    fi
-
-    previous_omnia_version=$(echo "$core_config" | grep "^omnia_version:" | cut -d':' -f2 | tr -d ' \t\n\r')
-    if [ -z "$previous_omnia_version" ]; then
-        echo "[ERROR] [ORCHESTRATOR] omnia_version not found in oim_metadata.yml"
-        return 1
-    fi
-
-    if [ "$previous_omnia_version" != "2.0.0.0" ]; then
-        echo "[ERROR] [ORCHESTRATOR] Previous Omnia version mismatch: expected 2.0.0.0, got: $previous_omnia_version"
-        return 1
-    fi
-
-    shared_path=$(echo "$core_config" | grep "^oim_shared_path:" | cut -d':' -f2- | tr -d ' \t\n\r')
-    if [ -z "$shared_path" ]; then
-        echo "[ERROR] [ORCHESTRATOR] oim_shared_path not found in oim_metadata.yml"
-        return 1
-    fi
-
-    omnia_path="$shared_path"
-
-    if [ ! -d "$omnia_path" ]; then
-        echo "[ERROR] [ORCHESTRATOR] Shared path from metadata does not exist on host: $omnia_path"
-        return 1
-    fi
-
-    if [ ! -w "$omnia_path" ]; then
-        echo "[ERROR] [ORCHESTRATOR] Permission denied: no write permission on shared path: $omnia_path"
-        return 1
-    fi
-
-    current_image=$(podman inspect omnia_core --format '{{.ImageName}}' 2>/dev/null)
-    if [ -z "$current_image" ]; then
-        echo "[ERROR] [ORCHESTRATOR] Unable to inspect omnia_core container image"
-        return 1
-    fi
-
-    if ! echo "$current_image" | grep -qE '(:|@)1\.0(\b|$)'; then
-        echo "[ERROR] [ORCHESTRATOR] Container version mismatch: expected 1.0, got: $current_image"
-        return 1
-    fi
-
-    echo "[INFO] [ORCHESTRATOR] Container version validated: 1.0 (Omnia 2.0)"
-
-    if [ ! -d "$OMNIA_BASE_DIR" ]; then
-        echo "[ERROR] [ORCHESTRATOR] Mount/path invalid: expected directory not found: $OMNIA_BASE_DIR"
-        echo "[ERROR] [ORCHESTRATOR] Fix: ensure /opt/omnia exists and is mounted (if using external mount)"
-        return 1
-    fi
-
-    if [ ! -w "$OMNIA_BASE_DIR" ]; then
-        echo "[ERROR] [ORCHESTRATOR] Permission denied: no write permission on $OMNIA_BASE_DIR"
-        echo "[ERROR] [ORCHESTRATOR] Fix: run as root or fix permissions on /opt/omnia"
-        return 1
-    fi
-
-    if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then
-        echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1"
-        echo "[ERROR] [ORCHESTRATOR] Omnia does not pull from Docker Hub. Build/load the image locally and retry."
-        return 1
-    fi
-
-    echo "[INFO] [ORCHESTRATOR] Phase 1: Validation passed"
-    return 0
-}
-
-phase2_approval() {
-    local backup_base default_backup_dir
-
-    echo "[INFO] [ORCHESTRATOR] Phase 2: Approval Gate"
-    echo "============================================"
-    echo "OMNIA UPGRADE SUMMARY"
-    echo "============================================"
-    echo "Current Container Tag: 1.0"
-    echo "Target Container Tag:  1.1"
-    echo "Current Omnia Release: 2.0.0.0"
-    echo "Target Omnia Release:  2.1.0.0"
-    echo "New Features:"
-    echo "  - Add and remove node for slurm cluster"
-    echo "  - Additional Package Installation"
-    echo "============================================"
-
-    default_backup_dir="$OMNIA_BACKUPS_DIR/upgrade"
-    backup_base="$default_backup_dir"
-
-    echo "[INFO] [ORCHESTRATOR] Backup destination: $backup_base"
-
-    if ! update_metadata_upgrade_backup_dir "$backup_base"; then
-        echo "[ERROR] [ORCHESTRATOR] Failed to update upgrade backup directory in metadata"
-        return 1
-    fi
-
-    read -p "Proceed with upgrade? (y/N): " confirm
-    if [ "$confirm" != "y" ] && [ "$confirm" != "Y" ]; then
-        echo "[INFO] [ORCHESTRATOR] Upgrade cancelled by user"
-        return 1
-    fi
-
-    OMNIA_UPGRADE_BACKUP_PATH="$backup_base"
-    export OMNIA_UPGRADE_BACKUP_PATH
-
-    echo "[INFO] [ORCHESTRATOR] Phase 2: Approval granted"
-    return 0
-}
-
-generate_backup_manifest() {
-    local backup_path="$1"
-    local manifest_file="$backup_path/manifest.txt"
-
-    {
-        echo "backup_version: 1.0"
-        echo "timestamp: $(date -Iseconds)"
-        echo "source_container_tag: 1.0"
-        echo "target_container_tag: 1.1"
-        echo "source_omnia_release: 2.0.x"
-        echo "target_omnia_release: 2.1.0.0"
-        echo "hostname: $(hostname)"
-        echo ""
-        echo "files:"
-        find "$backup_path" -type f ! -name "manifest.txt" -exec echo "  - {}" \;
-    } > "$manifest_file"
-}
-
-verify_backup_integrity() {
-    local backup_path="$1"
-
-    [ -d "$backup_path" ] || return 1
-    [ -d "$backup_path/input" ] || return 1
-    [ -d "$backup_path/metadata" ] || return 1
-    [ -d "$backup_path/configs" ] || return 1
-    [ -f "$backup_path/metadata/oim_metadata.yml" ] || return 1
-    [ -f "$backup_path/manifest.txt" ] || return 1
-
-    return 0
-}
-
-create_backup() {
-    local backup_path="$1"
-
-    echo "[INFO] [ORCHESTRATOR] Phase 3: Backup Creation"
-
-    if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
-        echo "[ERROR] [ORCHESTRATOR] Cannot create backup because omnia_core is not running"
-        return 1
-    fi
-
-    if ! podman exec -u root omnia_core bash -c "
-        set -e
-        mkdir -p '$backup_path/input' '$backup_path/metadata' '$backup_path/configs'
-
-        if [ -d '$OMNIA_INPUT_DIR' ]; then
-            cp -a '$OMNIA_INPUT_DIR' '$backup_path/'
-        fi
-
-        if [ ! -f '$OMNIA_METADATA_FILE' ]; then
-            echo '[ERROR] Metadata file not found inside container: $OMNIA_METADATA_FILE' >&2
-            exit 1
-        fi
-        cp -a '$OMNIA_METADATA_FILE' '$backup_path/metadata/oim_metadata.yml'
-
-        ts=\"\$(date -Iseconds)\"
-        hn=\"\$(hostname)\"
-        {
-            echo 'backup_version: 1.0'
-            echo \"timestamp: \$ts\"
-            echo 'source_container_tag: 1.0'
-            echo 'target_container_tag: 1.1'
-            echo 'source_omnia_release: 2.0.x'
-            echo 'target_omnia_release: 2.1.0.0'
-            echo \"hostname: \$hn\"
-        } > '$backup_path/manifest.txt'
-    "; then
-        echo "[ERROR] [ORCHESTRATOR] Failed to create backup inside omnia_core container"
-        return 1
-    fi
-
-    if [ -f "/etc/containers/systemd/omnia_core.container" ]; then
-        if ! podman cp "/etc/containers/systemd/omnia_core.container" "omnia_core:$backup_path/configs/omnia_core.container" >/dev/null 2>&1; then
-            echo "[ERROR] [ORCHESTRATOR] Failed to backup quadlet container file into container backup path"
-            return 1
-        fi
-    fi
-
-    echo "[INFO] [ORCHESTRATOR] Backup created at: $backup_path"
-    echo "[INFO] [ORCHESTRATOR] Phase 3: Backup completed"
-    return 0
-}
-
-wait_for_container_health() {
-    local timeout="${1:-60}"
-    local i
-
-    for i in $(seq 1 "$timeout"); do
-        if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
-            return 0
-        fi
-        sleep 1
-    done
-    return 1
-}
-
-update_metadata_version() {
-    local metadata_file="$OMNIA_METADATA_FILE"
-
-    if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
-        echo "[ERROR] [ORCHESTRATOR] omnia_core container is not running"
-        return 1
-    fi
-
-    podman exec -u root omnia_core bash -c "
-        set -e
-        if [ ! -f '$OMNIA_METADATA_FILE' ]; then
-            echo '[ERROR] Metadata file not found inside container: $OMNIA_METADATA_FILE' >&2
-            exit 1
-        fi
-        if grep -q '^omnia_version:' '$OMNIA_METADATA_FILE'; then
-            sed -i 's/^omnia_version:.*/omnia_version: 2.1.0.0/' '$OMNIA_METADATA_FILE'
-        else
-            echo 'omnia_version: 2.1.0.0' >> '$OMNIA_METADATA_FILE'
-        fi
-    "
-}
-
-sync_input_to_shared_path() {
-    if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
-        echo "[ERROR] [ORCHESTRATOR] Cannot sync input because omnia_core is not running"
-        return 1
-    fi
-
-    if ! podman exec -u root omnia_core bash -c "
-        set -e
-        if [ -d /omnia/input ]; then
-            mkdir -p /opt/omnia/input/project_default
-            cp -r /omnia/input/* /opt/omnia/input/project_default
-            rm -rf /omnia/input
-        fi
-    "; then
-        echo "[ERROR] [ORCHESTRATOR] Failed to copy /omnia/input to /opt/omnia/input/project_default"
-        return 1
-    fi
-    return 0
-}
-
-phase4_container_swap() {
-    echo "[INFO] [ORCHESTRATOR] Phase 4: Container Swap"
-
-    if systemctl list-unit-files | grep -q "omnia_core.service"; then
-        systemctl stop omnia_core.service >/dev/null 2>&1 || true
-    fi
-
-    if [ -z "${omnia_path}" ]; then
-        echo "[ERROR] [ORCHESTRATOR] Shared path (omnia_path) is empty. Phase 1 validation may not have run."
-        return 1
-    fi
-
-    if [ ! -f "/etc/containers/systemd/omnia_core.container" ]; then
-        echo "[ERROR] [ORCHESTRATOR] Quadlet file not found: /etc/containers/systemd/omnia_core.container"
-        echo "[ERROR] [ORCHESTRATOR] Cannot proceed with upgrade container swap"
-        return 1
-    fi
-
-    if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then
-        echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1"
-        echo "[ERROR] [ORCHESTRATOR] Omnia does not pull from Docker Hub. Build/load the image locally and retry."
-        return 1
-    fi
-
-    if ! sed -i 's/^Image=omnia_core:.*/Image=omnia_core:1.1/' /etc/containers/systemd/omnia_core.container; then
-        echo "[ERROR] [ORCHESTRATOR] Failed to update Image in quadlet file"
-        return 1
-    fi
-
-    escaped_omnia_path=$(printf '%s\n' "$omnia_path" | sed 's/[\/&]/\\\\&/g')
-    if grep -q '^Volume=/omnia\(/\|:\)' /etc/containers/systemd/omnia_core.container; then
-        if ! sed -i "s|^Volume=/omnia\(/\|:\)|Volume=${escaped_omnia_path}\\1|g" /etc/containers/systemd/omnia_core.container; then
-            echo "[ERROR] [ORCHESTRATOR] Failed to update Volume paths in quadlet file"
-            return 1
-        fi
-    fi
-
-    systemctl daemon-reload || return 1
-    if ! systemctl restart omnia_core.service; then
-        echo "[ERROR] [ORCHESTRATOR] Failed to restart omnia_core.service"
-        systemctl status omnia_core.service --no-pager -l || true
-        journalctl -xeu omnia_core.service --no-pager | tail -n 120 || true
-        return 1
-    fi
-
-    if ! wait_for_container_health 60; then
-        echo "[ERROR] [ORCHESTRATOR] Container failed health check after swap"
-        return 1
-    fi
-
-    if ! update_metadata_version; then
-        return 1
-    fi
-
-    if ! sync_input_to_shared_path; then
-        return 1
-    fi
-
-    init_ssh_config
-
-    echo "[INFO] [ORCHESTRATOR] Phase 4: Container swap completed"
-    return 0
-}
-
-upgrade_omnia_core() {
-    local lock_file="/var/lock/omnia_core_upgrade.lock"
-    local backup_path
-
-    if [ -e "$lock_file" ]; then
-        echo -e "${RED}ERROR: Upgrade lock exists at $lock_file. Another upgrade may be running.${NC}"
-        exit 1
-    fi
-
-    mkdir -p "$(dirname "$lock_file")" 2>/dev/null || true
-    echo "$$" > "$lock_file" || {
-        echo -e "${RED}ERROR: Failed to create lock file: $lock_file${NC}"
-        exit 1
-    }
-    trap 'rm -f "$lock_file"' EXIT
-
-    if ! phase1_validate; then
-        echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 1"
-        exit 1
-    fi
-
-    if ! phase2_approval; then
-        exit 0
-    fi
-
-    backup_path="$OMNIA_UPGRADE_BACKUP_PATH"
-    if [ -z "$backup_path" ]; then
-        echo "[ERROR] [ORCHESTRATOR] Backup path is empty"
-        exit 1
-    fi
-
-    if ! create_backup "$backup_path"; then
-        echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 3"
-        exit 1
-    fi
-
-    if ! phase4_container_swap; then
-        echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 4"
-        exit 1
-    fi
-
-    echo "[INFO] [ORCHESTRATOR] Upgrade completed successfully"
-    echo "[INFO] [ORCHESTRATOR] Backup location: $backup_path"
-    exit 0
-}
-
-# Main function to check if omnia_core container is already running.
-# If yes, ask the user if they want to enter the container or reinstall.
-# If no, set it up.
-main() {
-    case "$1" in
-        --install|-i)
-            install_omnia_core
-            ;;
-        --uninstall|-u)
-            cleanup_omnia_core
-            ;;
-        --upgrade)
-            upgrade_omnia_core
-            ;;
-        --version|-v)
-            display_version
-            ;;
-        --help|-h|"")
-            show_help
-            ;;
-        *)
-            echo "Unknown option: $1"
-            show_help
-            exit 1
-            ;;
-    esac
-}
-
-# Call the main function
-main "$1"

From 66339a9754810d6f7cc98815791b0d9c50a521dd Mon Sep 17 00:00:00 2001
From: Nagachandan-P <Nagachandan.p@dell.com>
Date: Mon, 9 Feb 2026 13:13:16 +0000
Subject: [PATCH 088/172] custom slurm confs

---
 ...-group-login_compiler_node_aarch64.yaml.j2 |  28 ++--
 ...i-group-login_compiler_node_x86_64.yaml.j2 |  28 ++--
 .../ci-group-login_node_aarch64.yaml.j2       |  28 ++--
 .../ci-group-login_node_x86_64.yaml.j2        |  27 ++--
 ...ci-group-slurm_control_node_x86_64.yaml.j2 |   9 +-
 .../ci-group-slurm_node_aarch64.yaml.j2       |  46 ++++--
 .../ci-group-slurm_node_x86_64.yaml.j2        |  45 ++++--
 discovery/roles/slurm_config/tasks/confs.yml  |  20 ++-
 .../tasks/extract_path_overrides.yml          | 147 ++++++++++++++++++
 .../tasks/validate_path_overrides.yml         |  83 ++++++++++
 10 files changed, 379 insertions(+), 82 deletions(-)
 create mode 100644 discovery/roles/slurm_config/tasks/extract_path_overrides.yml
 create mode 100644 discovery/roles/slurm_config/tasks/validate_path_overrides.yml

diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2
index de236ed958..c273b54f90 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2
@@ -204,11 +204,10 @@
       runcmd:
         - /usr/local/bin/set-ssh.sh
         - /usr/local/bin/install_cuda_toolkit.sh
-        - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools
-        - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm  /var/log/slurm   nfs defaults,_netdev 0 0" >> /etc/fstab
-        - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool      /var/spool       nfs defaults,_netdev 0 0" >> /etc/fstab
+        - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools
+        - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm  {{ slurm_slurmd_log_dir_effective }}   nfs defaults,_netdev 0 0" >> /etc/fstab
+        - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd      {{ slurm_slurmd_spool_dir_effective }}       nfs defaults,_netdev 0 0" >> /etc/fstab
         - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d     /etc/slurm/epilog.d      nfs defaults,_netdev 0 0" >> /etc/fstab
-        - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool      /var/spool       nfs defaults,_netdev 0 0" >> /etc/fstab
         - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge      /etc/munge       nfs defaults,_netdev 0 0" >> /etc/fstab
         - echo "{{ trackfile_nfs_path }}    /var/log/track       nfs defaults,_netdev 0 0" >> /etc/fstab
         - echo "{{ cloud_init_nfs_path}}/hpc_tools  /hpc_tools   nfs defaults,_netdev 0 0" >> /etc/fstab
@@ -221,17 +220,22 @@
         - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh
         - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/
         - /usr/local/bin/check_slurm_controller_status.sh
-        - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm
-        - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm
-        - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool
-        - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm
+        - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }}
+        - case "{{ slurm_slurmd_pid_dir_effective }}" in /run|/run/|/var/run|/var/run/) ;; *) chown -R {{ slurm_user }}:{{ slurm_user }} "{{ slurm_slurmd_pid_dir_effective }}" ;; esac # never re-own the system runtime dir
+        - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }}
         - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key
-        - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm
+        - chmod {{ file_mode_755 }} {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }}
         - chmod {{ file_mode_400 }} /etc/munge/munge.key
         - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/
-        - mkdir -p /var/spool/slurmd
-        - chmod {{ file_mode_755 }} /var/spool/slurmd
-        - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd
+{% for epath in slurm_epilog_custom_paths %}
+        - bash -c 'if [ ! -f "{{ epath }}" ]; then mkdir -p "$(dirname "{{ epath }}")"; printf "#!/bin/bash\n# Custom epilog script placeholder\n# Add your epilog commands here\n" > "{{ epath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ epath }}"; chmod {{ file_mode_755 }} "{{ epath }}"; fi'
+{% endfor %}
+{% for ppath in slurm_prolog_custom_paths %}
+        - bash -c 'if [ ! -f "{{ ppath }}" ]; then mkdir -p "$(dirname "{{ ppath }}")"; printf "#!/bin/bash\n# Custom prolog script placeholder\n# Add your prolog commands here\n" > "{{ ppath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ ppath }}"; chmod {{ file_mode_755 }} "{{ ppath }}"; fi'
+{% endfor %}
+        - mkdir -p {{ slurm_slurmd_spool_dir_effective }}
+        - chmod {{ file_mode_755 }} {{ slurm_slurmd_spool_dir_effective }}
+        - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }}
         - setenforce 0
         - systemctl enable firewalld
         - systemctl start firewalld
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
index 3195fad9e3..b7b23c1d33 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
@@ -214,11 +214,10 @@
 
         # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia)
         - mkdir -p {{ client_mount_path }}/slurm/ssh 
-        - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools
-        - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm  /var/log/slurm   nfs defaults,_netdev 0 0" >> /etc/fstab
-        - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool      /var/spool       nfs defaults,_netdev 0 0" >> /etc/fstab
+        - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools # epilog.d is a fixed NFS mount point below
+        - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm  {{ slurm_slurmd_log_dir_effective }}   nfs defaults,_netdev 0 0" >> /etc/fstab
+        - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd      {{ slurm_slurmd_spool_dir_effective }}       nfs defaults,_netdev 0 0" >> /etc/fstab
         - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d     /etc/slurm/epilog.d      nfs defaults,_netdev 0 0" >> /etc/fstab
-        - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool      /var/spool       nfs defaults,_netdev 0 0" >> /etc/fstab
         - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge      /etc/munge       nfs defaults,_netdev 0 0" >> /etc/fstab
         - echo "{{ trackfile_nfs_path }}    /var/log/track       nfs defaults,_netdev 0 0" >> /etc/fstab
         - echo "{{ cloud_init_nfs_path}}/hpc_tools  /hpc_tools   nfs defaults,_netdev 0 0" >> /etc/fstab
@@ -233,17 +232,22 @@
         - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh
         - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/
         - /usr/local/bin/check_slurm_controller_status.sh
-        - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm
-        - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm
-        - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool
-        - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm
+        - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }}
+        - case "{{ slurm_slurmd_pid_dir_effective }}" in /run|/run/|/var/run|/var/run/) ;; *) chown -R {{ slurm_user }}:{{ slurm_user }} "{{ slurm_slurmd_pid_dir_effective }}" ;; esac # never re-own the system runtime dir
+        - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }}
         - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key
-        - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm
+        - chmod {{ file_mode_755 }} {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }}
         - chmod {{ file_mode_400 }} /etc/munge/munge.key
         - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/
-        - mkdir -p /var/spool/slurmd
-        - chmod {{ file_mode_755 }} /var/spool/slurmd
-        - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd
+{% for epath in slurm_epilog_custom_paths %}
+        - bash -c 'if [ ! -f "{{ epath }}" ]; then mkdir -p "$(dirname "{{ epath }}")"; printf "#!/bin/bash\n# Custom epilog script placeholder\n# Add your epilog commands here\n" > "{{ epath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ epath }}"; chmod {{ file_mode_755 }} "{{ epath }}"; fi'
+{% endfor %}
+{% for ppath in slurm_prolog_custom_paths %}
+        - bash -c 'if [ ! -f "{{ ppath }}" ]; then mkdir -p "$(dirname "{{ ppath }}")"; printf "#!/bin/bash\n# Custom prolog script placeholder\n# Add your prolog commands here\n" > "{{ ppath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ ppath }}"; chmod {{ file_mode_755 }} "{{ ppath }}"; fi'
+{% endfor %}
+        - mkdir -p {{ slurm_slurmd_spool_dir_effective }}
+        - chmod {{ file_mode_755 }} {{ slurm_slurmd_spool_dir_effective }}
+        - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }}
         - setenforce 0
         - systemctl enable firewalld
         - systemctl start firewalld
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2
index f869d7d8fe..8b3d771592 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2
@@ -116,11 +116,10 @@
       runcmd:
         - /usr/local/bin/set-ssh.sh
 
-        - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts
-        - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm  /var/log/slurm   nfs defaults,_netdev 0 0" >> /etc/fstab
-        - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool      /var/spool       nfs defaults,_netdev 0 0" >> /etc/fstab
+        - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts # epilog.d is a fixed NFS mount point below
+        - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm  {{ slurm_slurmd_log_dir_effective }}   nfs defaults,_netdev 0 0" >> /etc/fstab
+        - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd      {{ slurm_slurmd_spool_dir_effective }}       nfs defaults,_netdev 0 0" >> /etc/fstab
         - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d     /etc/slurm/epilog.d      nfs defaults,_netdev 0 0" >> /etc/fstab
-        - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool      /var/spool       nfs defaults,_netdev 0 0" >> /etc/fstab
         - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge      /etc/munge       nfs defaults,_netdev 0 0" >> /etc/fstab
         - echo "{{ trackfile_nfs_path }}    /var/log/track       nfs defaults,_netdev 0 0" >> /etc/fstab
         - echo "{{ cloud_init_nfs_path}}/hpc_tools/container_images  /hpc_tools/container_images   nfs defaults,_netdev 0 0" >> /etc/fstab
@@ -134,17 +133,22 @@
         - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh
         - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/
         - /usr/local/bin/check_slurm_controller_status.sh
-        - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm
-        - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm
-        - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool
-        - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm
+        - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }}
+        - case "{{ slurm_slurmd_pid_dir_effective }}" in /run|/run/|/var/run|/var/run/) ;; *) chown -R {{ slurm_user }}:{{ slurm_user }} "{{ slurm_slurmd_pid_dir_effective }}" ;; esac # never re-own the system runtime dir
+        - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }}
         - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key
-        - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm
+        - chmod {{ file_mode_755 }} {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }}
         - chmod {{ file_mode_400 }} /etc/munge/munge.key
         - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/
-        - mkdir -p /var/spool/slurmd
-        - chmod {{ file_mode_755 }} /var/spool/slurmd
-        - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd
+{% for epath in slurm_epilog_custom_paths %}
+        - bash -c 'if [ ! -f "{{ epath }}" ]; then mkdir -p "$(dirname "{{ epath }}")"; printf "#!/bin/bash\n# Custom epilog script placeholder\n# Add your epilog commands here\n" > "{{ epath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ epath }}"; chmod {{ file_mode_755 }} "{{ epath }}"; fi'
+{% endfor %}
+{% for ppath in slurm_prolog_custom_paths %}
+        - bash -c 'if [ ! -f "{{ ppath }}" ]; then mkdir -p "$(dirname "{{ ppath }}")"; printf "#!/bin/bash\n# Custom prolog script placeholder\n# Add your prolog commands here\n" > "{{ ppath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ ppath }}"; chmod {{ file_mode_755 }} "{{ ppath }}"; fi'
+{% endfor %}
+        - mkdir -p {{ slurm_slurmd_spool_dir_effective }}
+        - chmod {{ file_mode_755 }} {{ slurm_slurmd_spool_dir_effective }}
+        - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }}
         - setenforce 0
         - systemctl enable firewalld
         - systemctl start firewalld
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2
index 82646da1c6..4e68ba8d81 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2
@@ -123,10 +123,10 @@
         - /usr/local/bin/set-ssh.sh
         # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia)
         - mkdir -p {{ client_mount_path }}/slurm/ssh
-        - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts
+        - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts # epilog.d is a fixed NFS mount point below
         - echo "{{ cloud_init_nfs_path }}/cert  /cert   nfs defaults,_netdev 0 0" >> /etc/fstab
-        - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm  /var/log/slurm   nfs defaults,_netdev 0 0" >> /etc/fstab
-        - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd      /var/spool/slurmd       nfs defaults,_netdev 0 0" >> /etc/fstab
+        - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm  {{ slurm_slurmd_log_dir_effective }}   nfs defaults,_netdev 0 0" >> /etc/fstab
+        - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd      {{ slurm_slurmd_spool_dir_effective }}       nfs defaults,_netdev 0 0" >> /etc/fstab
         - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d     /etc/slurm/epilog.d      nfs defaults,_netdev 0 0" >> /etc/fstab
         - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge      /etc/munge       nfs defaults,_netdev 0 0" >> /etc/fstab
         - echo "{{ trackfile_nfs_path }}    /var/log/track       nfs defaults,_netdev 0 0" >> /etc/fstab
@@ -144,18 +144,23 @@
         - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh
         - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/
         - /usr/local/bin/check_slurm_controller_status.sh
-        - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm
-        - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm
-        - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool
-        - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm
+        - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }}
+        - case "{{ slurm_slurmd_pid_dir_effective }}" in /run|/run/|/var/run|/var/run/) ;; *) chown -R {{ slurm_user }}:{{ slurm_user }} "{{ slurm_slurmd_pid_dir_effective }}" ;; esac # never re-own the system runtime dir
+        - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }}
         - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key
-        - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm
+        - chmod {{ file_mode_755 }} {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }}
         - chmod {{ file_mode_400 }} /etc/munge/munge.key
         - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/
         - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/logout_user.sh
-        - mkdir -p /var/spool/slurmd
-        - chmod {{ file_mode_755 }} /var/spool/slurmd
-        - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd
+{% for epath in slurm_epilog_custom_paths %}
+        - bash -c 'if [ ! -f "{{ epath }}" ]; then mkdir -p "$(dirname "{{ epath }}")"; printf "#!/bin/bash\n# Custom epilog script placeholder\n# Add your epilog commands here\n" > "{{ epath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ epath }}"; chmod {{ file_mode_755 }} "{{ epath }}"; fi'
+{% endfor %}
+{% for ppath in slurm_prolog_custom_paths %}
+        - bash -c 'if [ ! -f "{{ ppath }}" ]; then mkdir -p "$(dirname "{{ ppath }}")"; printf "#!/bin/bash\n# Custom prolog script placeholder\n# Add your prolog commands here\n" > "{{ ppath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ ppath }}"; chmod {{ file_mode_755 }} "{{ ppath }}"; fi'
+{% endfor %}
+        - mkdir -p {{ slurm_slurmd_spool_dir_effective }}
+        - chmod {{ file_mode_755 }} {{ slurm_slurmd_spool_dir_effective }}
+        - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }}
         - setenforce 0
         - systemctl enable firewalld
         - systemctl start firewalld
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2
index 2f2721d7eb..a8c3b8d88c 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2
@@ -469,15 +469,18 @@
 
         # slurm user and group created in the users module
         # Create directories for nfs and mount all
-         - mkdir -p /var/log/slurm /etc/slurm {{ home_dir }} /etc/my.cnf.d /etc/munge /var/lib/mysql /var/log/mariadb /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts
+         - mkdir -p {{ slurm_ctld_log_dir_effective }} {{ slurmdbd_log_dir_effective }} {{ slurm_ctld_pid_dir_effective }} {{ slurmdbd_pid_dir_effective }} {{ slurm_state_save_location_effective }} {% if slurm_sched_log_dir_effective %}{{ slurm_sched_log_dir_effective }} {% endif %}/etc/slurm {{ home_dir }} /etc/my.cnf.d /etc/munge /var/lib/mysql /var/log/mariadb /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts
          - echo "{{ cloud_init_nfs_path }}/cert  /cert   nfs defaults,_netdev 0 0" >> /etc/fstab
          - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm      /etc/slurm       nfs defaults,_netdev 0 0" >> /etc/fstab
          - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/my.cnf.d   /etc/my.cnf.d    nfs defaults,_netdev 0 0" >> /etc/fstab
          - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/mariadb /var/log/mariadb nfs defaults,_netdev 0 0" >> /etc/fstab
-         - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm  /var/log/slurm   nfs defaults,_netdev 0 0" >> /etc/fstab
+         - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm  {{ slurm_ctld_log_dir_effective }}   nfs defaults,_netdev 0 0" >> /etc/fstab
+{% if slurmdbd_log_dir_effective != slurm_ctld_log_dir_effective %}
+         - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm  {{ slurmdbd_log_dir_effective }}   nfs defaults,_netdev 0 0" >> /etc/fstab
+{% endif %}
 {% if powervault_config is not defined %}
          - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/lib/mysql  /var/lib/mysql   nfs defaults,_netdev 0 0" >> /etc/fstab
-         - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmctld     /var/spool/slurmctld      nfs defaults,_netdev 0 0" >> /etc/fstab
+         - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmctld     {{ slurm_state_save_location_effective }}      nfs defaults,_netdev 0 0" >> /etc/fstab
 {% endif %}
          - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge      /etc/munge       nfs defaults,_netdev 0 0" >> /etc/fstab
          - echo "{{ trackfile_nfs_path }}    /var/log/track       nfs defaults,_netdev 0 0" >> /etc/fstab
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
index cc784bdd10..a81d564ba6 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
@@ -237,13 +237,12 @@
             echo "[INFO] ===== Starting directory creation and NFS mounts for Pulp cert, Slurm and Munge (aarch64) ====="
 
             echo "[INFO] Creating base directories for Slurm and Munge"
-            mkdir -pv /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts
+            mkdir -pv {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts  # epilog.d is a fixed NFS mount point below
 
             echo "[INFO] Updating /etc/fstab with NFS entries for Pulp cert, Slurm and Munge paths"
-            echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm  /var/log/slurm   nfs defaults,_netdev 0 0" >> /etc/fstab
-            echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool      /var/spool       nfs defaults,_netdev 0 0" >> /etc/fstab
+            echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm  {{ slurm_slurmd_log_dir_effective }}   nfs defaults,_netdev 0 0" >> /etc/fstab
+            echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd      {{ slurm_slurmd_spool_dir_effective }}       nfs defaults,_netdev 0 0" >> /etc/fstab
             echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d     /etc/slurm/epilog.d      nfs defaults,_netdev 0 0" >> /etc/fstab
-            echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool      /var/spool       nfs defaults,_netdev 0 0" >> /etc/fstab
             echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge      /etc/munge       nfs defaults,_netdev 0 0" >> /etc/fstab
             echo "{{ trackfile_nfs_path }}    /var/log/track       nfs defaults,_netdev 0 0" >> /etc/fstab
             echo "{{ cloud_init_nfs_path}}/hpc_tools/container_images  /hpc_tools/container_images   nfs defaults,_netdev 0 0" >> /etc/fstab
@@ -274,22 +273,43 @@
             bash /usr/local/bin/check_slurm_controller_status.sh
 
             echo "[INFO] Setting ownership for Slurm directories"
-            chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm
-            chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm
-            chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool
-            chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm
+            chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }}
+            case "{{ slurm_slurmd_pid_dir_effective }}" in /run|/run/|/var/run|/var/run/) ;; *) chown -R {{ slurm_user }}:{{ slurm_user }} "{{ slurm_slurmd_pid_dir_effective }}" ;; esac  # never re-own the system runtime dir
+            chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }}
 
             echo "[INFO] Setting permissions for Slurm directories"
-            chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm
+            chmod {{ file_mode_755 }} {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }}
 
             echo "[INFO] Ensuring Slurm epilog directory and logout script permissions"
             chmod {{ file_mode_755 }} /etc/slurm/epilog.d/
             chmod {{ file_mode_755 }} /etc/slurm/epilog.d/logout_user.sh
+{% for epath in slurm_epilog_custom_paths %}
+
+            echo "[INFO] Checking custom epilog script: {{ epath }}"
+            if [ ! -f "{{ epath }}" ]; then
+              echo "[INFO] Creating stub epilog script at {{ epath }}"
+              mkdir -p "$(dirname '{{ epath }}')"
+              printf '#!/bin/bash\n# Custom epilog script placeholder\n# Add your epilog commands here\n' > "{{ epath }}"
+              chown {{ slurm_user }}:{{ slurm_user }} "{{ epath }}"
+              chmod {{ file_mode_755 }} "{{ epath }}"
+            fi
+{% endfor %}
+{% for ppath in slurm_prolog_custom_paths %}
+
+            echo "[INFO] Checking custom prolog script: {{ ppath }}"
+            if [ ! -f "{{ ppath }}" ]; then
+              echo "[INFO] Creating stub prolog script at {{ ppath }}"
+              mkdir -p "$(dirname '{{ ppath }}')"
+              printf '#!/bin/bash\n# Custom prolog script placeholder\n# Add your prolog commands here\n' > "{{ ppath }}"
+              chown {{ slurm_user }}:{{ slurm_user }} "{{ ppath }}"
+              chmod {{ file_mode_755 }} "{{ ppath }}"
+            fi
+{% endfor %}
 
-            echo "[INFO] Creating and configuring /var/spool/slurmd"
-            mkdir -p /var/spool/slurmd
-            chmod {{ file_mode_755 }} /var/spool/slurmd
-            chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd
+            echo "[INFO] Creating and configuring slurmd spool directory"
+            mkdir -p {{ slurm_slurmd_spool_dir_effective }}
+            chmod {{ file_mode_755 }} {{ slurm_slurmd_spool_dir_effective }}
+            chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }}
 
             echo "[INFO] ===== Completed slurmd setup (aarch64) ====="
 
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
index 5128aee1d1..5d930bef47 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
@@ -256,12 +256,12 @@
             # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia)
             mkdir -p {{ client_mount_path }}/slurm/ssh
             echo "[INFO] Creating base directories for Pulp cert, Slurm and Munge"
-            mkdir -pv /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts
+            mkdir -pv {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts  # epilog.d is a fixed NFS mount point below
 
             echo "[INFO] Updating /etc/fstab with NFS entries for Pulp cert, Slurm and Munge paths"
             echo "{{ cloud_init_nfs_path }}/cert  /cert   nfs defaults,_netdev 0 0" >> /etc/fstab
-            echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm  /var/log/slurm   nfs defaults,_netdev 0 0" >> /etc/fstab
-            echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd      /var/spool/slurmd       nfs defaults,_netdev 0 0" >> /etc/fstab
+            echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm  {{ slurm_slurmd_log_dir_effective }}   nfs defaults,_netdev 0 0" >> /etc/fstab
+            echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd      {{ slurm_slurmd_spool_dir_effective }}       nfs defaults,_netdev 0 0" >> /etc/fstab
             echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d     /etc/slurm/epilog.d      nfs defaults,_netdev 0 0" >> /etc/fstab
             echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge      /etc/munge       nfs defaults,_netdev 0 0" >> /etc/fstab
             echo "{{ trackfile_nfs_path }}    /var/log/track       nfs defaults,_netdev 0 0" >> /etc/fstab
@@ -292,22 +292,43 @@
             bash /usr/local/bin/check_slurm_controller_status.sh
 
             echo "[INFO] Setting ownership for Slurm directories"
-            chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm
-            chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm
-            chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool
-            chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm
+            chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }}
+            case "{{ slurm_slurmd_pid_dir_effective }}" in /run|/run/|/var/run|/var/run/) ;; *) chown -R {{ slurm_user }}:{{ slurm_user }} "{{ slurm_slurmd_pid_dir_effective }}" ;; esac  # never re-own the system runtime dir
+            chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }}
 
             echo "[INFO] Setting permissions for Slurm directories"
-            chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm
+            chmod {{ file_mode_755 }} {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }}
 
             echo "[INFO] Ensuring Slurm epilog directory and logout script permissions"
             chmod {{ file_mode_755 }} /etc/slurm/epilog.d/
             chmod {{ file_mode_755 }} /etc/slurm/epilog.d/logout_user.sh
+{% for epath in slurm_epilog_custom_paths %}
+
+            echo "[INFO] Checking custom epilog script: {{ epath }}"
+            if [ ! -f "{{ epath }}" ]; then
+              echo "[INFO] Creating stub epilog script at {{ epath }}"
+              mkdir -p "$(dirname '{{ epath }}')"
+              printf '#!/bin/bash\n# Custom epilog script placeholder\n# Add your epilog commands here\n' > "{{ epath }}"
+              chown {{ slurm_user }}:{{ slurm_user }} "{{ epath }}"
+              chmod {{ file_mode_755 }} "{{ epath }}"
+            fi
+{% endfor %}
+{% for ppath in slurm_prolog_custom_paths %}
+
+            echo "[INFO] Checking custom prolog script: {{ ppath }}"
+            if [ ! -f "{{ ppath }}" ]; then
+              echo "[INFO] Creating stub prolog script at {{ ppath }}"
+              mkdir -p "$(dirname '{{ ppath }}')"
+              printf '#!/bin/bash\n# Custom prolog script placeholder\n# Add your prolog commands here\n' > "{{ ppath }}"
+              chown {{ slurm_user }}:{{ slurm_user }} "{{ ppath }}"
+              chmod {{ file_mode_755 }} "{{ ppath }}"
+            fi
+{% endfor %}
 
-            echo "[INFO] Creating and configuring /var/spool/slurmd"
-            mkdir -p /var/spool/slurmd
-            chmod {{ file_mode_755 }} /var/spool/slurmd
-            chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd
+            echo "[INFO] Creating and configuring slurmd spool directory"
+            mkdir -p {{ slurm_slurmd_spool_dir_effective }}
+            chmod {{ file_mode_755 }} {{ slurm_slurmd_spool_dir_effective }}
+            chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }}
 
             echo "[INFO] ===== Completed slurmd setup ====="
 
diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml
index 1ff30acf34..641efc7ab9 100644
--- a/discovery/roles/slurm_config/tasks/confs.yml
+++ b/discovery/roles/slurm_config/tasks/confs.yml
@@ -112,6 +112,12 @@
     slurm_conf_dict: "{{ (merged_conf.results | selectattr('item.key', 'equalto', 'slurm') | first).conf_dict }}"
   when: "'slurm' in conf_merge_dict"
 
+- name: Extract effective path parameters from merged configs
+  ansible.builtin.include_tasks: extract_path_overrides.yml
+
+- name: Validate path parameters are absolute
+  ansible.builtin.include_tasks: validate_path_overrides.yml
+
 - name: Get nodes from normal partition and compare with cmpt_list
   ansible.builtin.set_fact:
     normal_partition: "{{ slurm_conf_dict.PartitionName | default([]) | selectattr('PartitionName', 'equalto', slurm_partition_name) | first | default({}) }}"
@@ -134,17 +140,17 @@
     - nodes_in_normal_not_in_cmpt is defined
     - nodes_in_normal_not_in_cmpt | length > 0
 
-- name: Create directories from conf values
+- name: Create directories from conf values (NFS server-side always uses defaults)
   ansible.builtin.include_tasks: exist_dir.yml
   loop:
     - "{{ ctld_list
-     | product([slurm_conf_dict.get('StateSaveLocation', '/var/spool/slurmctld'),
-      (slurm_conf_dict.get('SlurmctldLogFile', '/var/log/slurmctld.log') | dirname),
-      (slurm_conf_dict.get('SlurmctldPidFile', '/var/run/slurmctld.pid') | dirname)]) }}"
+     | product(['/var/spool/slurmctld',
+      '/var/log/slurm',
+      '/var/run']) }}"
     - "{{ (cmpt_list + login_list + compiler_login_list)
-     | product([slurm_conf_dict.get('SlurmdSpoolDir', '/var/spool/slurmd'),
-      (slurm_conf_dict.get('SlurmdLogFile', '/var/log/slurmd.log') | dirname),
-      (slurm_conf_dict.get('SlurmdPidFile', '/var/run/slurmd.pid') | dirname)]) }}"
+     | product(['/var/spool/slurmd',
+      '/var/log/slurm',
+      '/var/run']) }}"
   loop_control:
     loop_var: product
 
diff --git a/discovery/roles/slurm_config/tasks/extract_path_overrides.yml b/discovery/roles/slurm_config/tasks/extract_path_overrides.yml
new file mode 100644
index 0000000000..45565dc4e7
--- /dev/null
+++ b/discovery/roles/slurm_config/tasks/extract_path_overrides.yml
@@ -0,0 +1,147 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# ── Extract merged dicts ──────────────────────────────────────────────
+
+- name: Extract slurm.conf merged dict
+  ansible.builtin.set_fact:
+    slurm_merged_dict: "{{ (merged_conf.results | selectattr('item.key', 'equalto', 'slurm') | first).conf_dict }}"
+  when: "'slurm' in conf_merge_dict"
+
+- name: Extract slurmdbd.conf merged dict
+  ansible.builtin.set_fact:
+    slurmdbd_merged_dict: "{{ (merged_conf.results | selectattr('item.key', 'equalto', 'slurmdbd') | first).conf_dict }}"
+  when: "'slurmdbd' in conf_merge_dict"
+
+- name: Extract cgroup.conf merged dict
+  ansible.builtin.set_fact:
+    cgroup_merged_dict: "{{ (merged_conf.results | selectattr('item.key', 'equalto', 'cgroup') | first).conf_dict }}"
+  when: "'cgroup' in conf_merge_dict"
+
+# ── slurm.conf: controller path params ────────────────────────────────
+# Per key: first element if the value is a non-string iterable, else the value itself, else the default; *File keys keep only their dirname.
+- name: Extract effective controller directories from slurm.conf
+  ansible.builtin.set_fact:
+    slurm_ctld_log_dir_effective: "{{ (slurm_merged_dict.get('SlurmctldLogFile', ['/var/log/slurm/slurmctld.log']) | first if slurm_merged_dict.get('SlurmctldLogFile') is iterable and slurm_merged_dict.get('SlurmctldLogFile') is not string else slurm_merged_dict.get('SlurmctldLogFile', '/var/log/slurm/slurmctld.log')) | dirname }}"
+    slurm_state_save_location_effective: "{{ (slurm_merged_dict.get('StateSaveLocation', ['/var/spool/slurmctld']) | first if slurm_merged_dict.get('StateSaveLocation') is iterable and slurm_merged_dict.get('StateSaveLocation') is not string else slurm_merged_dict.get('StateSaveLocation', '/var/spool/slurmctld')) }}"
+    slurm_ctld_pid_dir_effective: "{{ (slurm_merged_dict.get('SlurmctldPidFile', ['/var/run/slurmctld.pid']) | first if slurm_merged_dict.get('SlurmctldPidFile') is iterable and slurm_merged_dict.get('SlurmctldPidFile') is not string else slurm_merged_dict.get('SlurmctldPidFile', '/var/run/slurmctld.pid')) | dirname }}"
+    slurm_sched_log_dir_effective: "{{ ((slurm_merged_dict.get('SlurmSchedLogFile', ['']) | first if slurm_merged_dict.get('SlurmSchedLogFile') is iterable and slurm_merged_dict.get('SlurmSchedLogFile') is not string else slurm_merged_dict.get('SlurmSchedLogFile', '')) | default('', true) | dirname | default('', true)) }}"
+  when: slurm_merged_dict is defined
+
+# ── slurm.conf: compute path params ──────────────────────────────────
+# Per key: first element if the value is a non-string iterable, else the value itself, else the default; file-valued keys keep only their dirname.
+- name: Extract effective compute directories from slurm.conf
+  ansible.builtin.set_fact:
+    slurm_slurmd_log_dir_effective: "{{ (slurm_merged_dict.get('SlurmdLogFile', ['/var/log/slurm/slurmd.log']) | first if slurm_merged_dict.get('SlurmdLogFile') is iterable and slurm_merged_dict.get('SlurmdLogFile') is not string else slurm_merged_dict.get('SlurmdLogFile', '/var/log/slurm/slurmd.log')) | dirname }}"
+    slurm_slurmd_spool_dir_effective: "{{ (slurm_merged_dict.get('SlurmdSpoolDir', ['/var/spool/slurmd']) | first if slurm_merged_dict.get('SlurmdSpoolDir') is iterable and slurm_merged_dict.get('SlurmdSpoolDir') is not string else slurm_merged_dict.get('SlurmdSpoolDir', '/var/spool/slurmd')) }}"
+    slurm_slurmd_pid_dir_effective: "{{ (slurm_merged_dict.get('SlurmdPidFile', ['/var/run/slurmd.pid']) | first if slurm_merged_dict.get('SlurmdPidFile') is iterable and slurm_merged_dict.get('SlurmdPidFile') is not string else slurm_merged_dict.get('SlurmdPidFile', '/var/run/slurmd.pid')) | dirname }}"
+    slurm_epilog_dir_effective: "{{ (slurm_merged_dict.get('Epilog', ['/etc/slurm/epilog.d/logout_user.sh']) | first if slurm_merged_dict.get('Epilog') is iterable and slurm_merged_dict.get('Epilog') is not string else slurm_merged_dict.get('Epilog', '/etc/slurm/epilog.d/logout_user.sh')) | dirname }}"
+    slurm_prolog_dir_effective: "{{ ((slurm_merged_dict.get('Prolog', ['']) | first if slurm_merged_dict.get('Prolog') is iterable and slurm_merged_dict.get('Prolog') is not string else slurm_merged_dict.get('Prolog', '')) | default('', true) | dirname | default('', true)) }}"
+  when: slurm_merged_dict is defined
+
+# ── slurm.conf: all epilog/prolog dirs and custom file paths ─────────
+
+- name: Extract all epilog paths from merged Epilog list
+  ansible.builtin.set_fact:
+    slurm_epilog_paths_all: >-
+      {{ (slurm_merged_dict.get('Epilog', []) if slurm_merged_dict.get('Epilog') is iterable and slurm_merged_dict.get('Epilog') is not string
+          else [slurm_merged_dict.get('Epilog', '')])
+         | reject('equalto', '') | list }}
+    slurm_epilog_dirs_all: >-
+      {{ (slurm_merged_dict.get('Epilog', []) if slurm_merged_dict.get('Epilog') is iterable and slurm_merged_dict.get('Epilog') is not string
+          else [slurm_merged_dict.get('Epilog', '')])
+         | map('dirname') | unique | reject('equalto', '') | list }}
+  when: slurm_merged_dict is defined
+
+- name: Extract custom epilog paths (non-default)
+  ansible.builtin.set_fact:
+    slurm_epilog_custom_paths: >-
+      {{ slurm_epilog_paths_all | reject('search', '^/etc/slurm/epilog\\.d/') | list }}
+  when: slurm_merged_dict is defined
+
+- name: Extract all prolog paths from merged Prolog list
+  ansible.builtin.set_fact:
+    slurm_prolog_paths_all: >-
+      {{ (slurm_merged_dict.get('Prolog', []) if slurm_merged_dict.get('Prolog') is iterable and slurm_merged_dict.get('Prolog') is not string
+          else [slurm_merged_dict.get('Prolog', '')])
+         | reject('equalto', '') | list }}
+    slurm_prolog_dirs_all: >-
+      {{ (slurm_merged_dict.get('Prolog', []) if slurm_merged_dict.get('Prolog') is iterable and slurm_merged_dict.get('Prolog') is not string
+          else [slurm_merged_dict.get('Prolog', '')])
+         | map('dirname') | unique | reject('equalto', '') | list }}
+  when: slurm_merged_dict is defined
+
+- name: Extract custom prolog paths (non-default)
+  ansible.builtin.set_fact:
+    slurm_prolog_custom_paths: >-
+      {{ slurm_prolog_paths_all | list }}
+  when: slurm_merged_dict is defined
+
+# ── slurm.conf: plugin dir (both controller and compute) ─────────────
+
+- name: Extract effective plugin directory from slurm.conf
+  ansible.builtin.set_fact:
+    slurm_plugin_dir_effective: "{{ (slurm_merged_dict.get('PluginDir', ['/usr/lib64/slurm']) | first if slurm_merged_dict.get('PluginDir') is iterable and slurm_merged_dict.get('PluginDir') is not string else slurm_merged_dict.get('PluginDir', '/usr/lib64/slurm')) }}"
+  when: slurm_merged_dict is defined
+
+# ── slurmdbd.conf path params ────────────────────────────────────────
+
+- name: Extract effective directories from slurmdbd.conf
+  ansible.builtin.set_fact:
+    slurmdbd_log_dir_effective: "{{ (slurmdbd_merged_dict.get('LogFile', ['/var/log/slurm/slurmdbd.log']) | first if slurmdbd_merged_dict.get('LogFile') is iterable and slurmdbd_merged_dict.get('LogFile') is not string else slurmdbd_merged_dict.get('LogFile', '/var/log/slurm/slurmdbd.log')) | dirname }}"
+    slurmdbd_pid_dir_effective: "{{ (slurmdbd_merged_dict.get('PidFile', ['/var/run/slurmdbd.pid']) | first if slurmdbd_merged_dict.get('PidFile') is iterable and slurmdbd_merged_dict.get('PidFile') is not string else slurmdbd_merged_dict.get('PidFile', '/var/run/slurmdbd.pid')) | dirname }}"
+    slurmdbd_plugin_dir_effective: "{{ (slurmdbd_merged_dict.get('PluginDir', ['/usr/lib64/slurm']) | first if slurmdbd_merged_dict.get('PluginDir') is iterable and slurmdbd_merged_dict.get('PluginDir') is not string else slurmdbd_merged_dict.get('PluginDir', '/usr/lib64/slurm')) }}"
+  when: slurmdbd_merged_dict is defined
+
+# ── cgroup.conf path params ──────────────────────────────────────────
+
+- name: Extract effective cgroup mountpoint from cgroup.conf
+  ansible.builtin.set_fact:
+    slurm_cgroup_mountpoint_effective: "{{ ((cgroup_merged_dict.get('CgroupMountpoint', ['']) | first if cgroup_merged_dict.get('CgroupMountpoint') is iterable and cgroup_merged_dict.get('CgroupMountpoint') is not string else cgroup_merged_dict.get('CgroupMountpoint', '')) | default('', true)) }}"
+  when: cgroup_merged_dict is defined
+
+# ── Defaults when confs are not merged ────────────────────────────────
+
+- name: Set default effective directories if slurm.conf not merged
+  ansible.builtin.set_fact:
+    slurm_ctld_log_dir_effective: "/var/log/slurm"
+    slurm_slurmd_log_dir_effective: "/var/log/slurm"
+    slurm_state_save_location_effective: "/var/spool/slurmctld"
+    slurm_slurmd_spool_dir_effective: "/var/spool/slurmd"
+    slurm_ctld_pid_dir_effective: "/var/run"
+    slurm_slurmd_pid_dir_effective: "/var/run"
+    slurm_epilog_dir_effective: "/etc/slurm/epilog.d"
+    slurm_prolog_dir_effective: ""
+    slurm_sched_log_dir_effective: ""
+    slurm_plugin_dir_effective: "/usr/lib64/slurm"
+    slurm_epilog_dirs_all: ["/etc/slurm/epilog.d"]
+    slurm_epilog_paths_all: ["/etc/slurm/epilog.d/logout_user.sh"]
+    slurm_epilog_custom_paths: []
+    slurm_prolog_dirs_all: []
+    slurm_prolog_paths_all: []
+    slurm_prolog_custom_paths: []
+  when: slurm_merged_dict is not defined
+
+- name: Set default effective directories if slurmdbd.conf not merged
+  ansible.builtin.set_fact:
+    slurmdbd_log_dir_effective: "/var/log/slurm"
+    slurmdbd_pid_dir_effective: "/var/run"
+    slurmdbd_plugin_dir_effective: "/usr/lib64/slurm"
+  when: slurmdbd_merged_dict is not defined
+
+- name: Set default effective cgroup mountpoint if cgroup.conf not merged
+  ansible.builtin.set_fact:
+    slurm_cgroup_mountpoint_effective: ""
+  when: cgroup_merged_dict is not defined
diff --git a/discovery/roles/slurm_config/tasks/validate_path_overrides.yml b/discovery/roles/slurm_config/tasks/validate_path_overrides.yml
new file mode 100644
index 0000000000..140b1d4bda
--- /dev/null
+++ b/discovery/roles/slurm_config/tasks/validate_path_overrides.yml
@@ -0,0 +1,83 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# ── slurm.conf path validation ───────────────────────────────────────
+# List values are validated entry by entry: any non-absolute entry (e.g. a later Epilog/Prolog path) fails the play.
+- name: Validate slurm.conf path parameters are absolute
+  ansible.builtin.fail:
+    msg: "slurm.conf {{ item }} must be an absolute path (start with /). Current value: {{ slurm_merged_dict.get(item) }}"
+  when:
+    - slurm_merged_dict is defined
+    - slurm_merged_dict.get(item) is defined
+    - slurm_merged_dict.get(item) is not none
+    - (slurm_merged_dict.get(item) is string and slurm_merged_dict.get(item) | length > 0) or (slurm_merged_dict.get(item) is iterable and slurm_merged_dict.get(item) | list | length > 0)
+    - not ((slurm_merged_dict.get(item) is string and slurm_merged_dict.get(item) | regex_search('^/')) or (slurm_merged_dict.get(item) is not string and slurm_merged_dict.get(item) is iterable and (slurm_merged_dict.get(item) | list | reject('match', '^/') | list | length == 0)))
+  loop:
+    - SlurmctldLogFile
+    - SlurmdLogFile
+    - StateSaveLocation
+    - SlurmdSpoolDir
+    - SlurmctldPidFile
+    - SlurmdPidFile
+    - Epilog
+    - Prolog
+    - EpilogSlurmctld
+    - PrologSlurmctld
+    - SlurmSchedLogFile
+    - PluginDir
+    - PlugStackConfig
+    - SrunEpilog
+    - SrunProlog
+    - TaskEpilog
+    - TaskProlog
+    - HealthCheckProgram
+    - RebootProgram
+    - UnkillableStepProgram
+    - ResvEpilog
+    - ResvProlog
+    - TmpFS
+    - JobCompLoc
+    - JobCredentialPrivateKey
+    - JobCredentialPublicCertificate
+
+# ── slurmdbd.conf path validation ────────────────────────────────────
+# List values are validated entry by entry: any non-absolute entry fails the play.
+- name: Validate slurmdbd.conf path parameters are absolute
+  ansible.builtin.fail:
+    msg: "slurmdbd.conf {{ item }} must be an absolute path (start with /). Current value: {{ slurmdbd_merged_dict.get(item) }}"
+  when:
+    - slurmdbd_merged_dict is defined
+    - slurmdbd_merged_dict.get(item) is defined
+    - slurmdbd_merged_dict.get(item) is not none
+    - (slurmdbd_merged_dict.get(item) is string and slurmdbd_merged_dict.get(item) | length > 0) or (slurmdbd_merged_dict.get(item) is iterable and slurmdbd_merged_dict.get(item) | list | length > 0)
+    - not ((slurmdbd_merged_dict.get(item) is string and slurmdbd_merged_dict.get(item) | regex_search('^/')) or (slurmdbd_merged_dict.get(item) is not string and slurmdbd_merged_dict.get(item) is iterable and (slurmdbd_merged_dict.get(item) | list | reject('match', '^/') | list | length == 0)))
+  loop:
+    - LogFile
+    - PidFile
+    - PluginDir
+
+# ── cgroup.conf path validation ──────────────────────────────────────
+# List values are validated entry by entry: any non-absolute entry fails the play.
+- name: Validate cgroup.conf path parameters are absolute
+  ansible.builtin.fail:
+    msg: "cgroup.conf {{ item }} must be an absolute path (start with /). Current value: {{ cgroup_merged_dict.get(item) }}"
+  when:
+    - cgroup_merged_dict is defined
+    - cgroup_merged_dict.get(item) is defined
+    - cgroup_merged_dict.get(item) is not none
+    - (cgroup_merged_dict.get(item) is string and cgroup_merged_dict.get(item) | length > 0) or (cgroup_merged_dict.get(item) is iterable and cgroup_merged_dict.get(item) | list | length > 0)
+    - not ((cgroup_merged_dict.get(item) is string and cgroup_merged_dict.get(item) | regex_search('^/')) or (cgroup_merged_dict.get(item) is not string and cgroup_merged_dict.get(item) is iterable and (cgroup_merged_dict.get(item) | list | reject('match', '^/') | list | length == 0)))
+  loop:
+    - CgroupMountpoint

From a68829764b761d53282d1c7db523029a2a03da60 Mon Sep 17 00:00:00 2001
From: pullan1 <sudha.pullalaravu@dell.com>
Date: Mon, 9 Feb 2026 18:58:32 +0530
Subject: [PATCH 089/172] Fixed issue where pulp repo resync skipped pub/dist
 creation after a crash

Signed-off-by: pullan1 <sudha.pullalaravu@dell.com>
---
 common/library/modules/process_rpm_config.py |  91 ++++++++++-
 common/library/modules/pulp_cleanup.py       | 153 ++++++++++++++++---
 local_repo/pulp_cleanup.yml                  |   6 +-
 3 files changed, 219 insertions(+), 31 deletions(-)

diff --git a/common/library/modules/process_rpm_config.py b/common/library/modules/process_rpm_config.py
index 002923d50c..89a8f0e1ca 100644
--- a/common/library/modules/process_rpm_config.py
+++ b/common/library/modules/process_rpm_config.py
@@ -467,6 +467,27 @@ def check_publication_exists(repo_name, log):
         log.error("Error checking publication for '%s': %s", repo_name, str(e))
         return False
 
+def check_distribution_exists(repo_name, log):
+    """
+    Report whether a distribution already exists for the repository.
+
+    Args:
+        repo_name (str): Repository name substituted into the check command.
+        log (logging.Logger): Logger for progress and error messages.
+
+    Returns:
+        bool: Truthiness of the command result; False if the check raised.
+    """
+    try:
+        check_cmd = pulp_rpm_commands["check_distribution"] % repo_name
+        log.info("Checking if distribution exists for repository '%s'", repo_name)
+        # Any truthy command result counts as "distribution present".
+        return bool(execute_command(check_cmd, log))
+    except Exception as exc:
+        log.error("Error checking distribution for '%s': %s", repo_name, str(exc))
+        return False
+
+
 def delete_old_publications(repo_name, log):
     """
     Delete all existing publications for a repository.
@@ -792,9 +813,43 @@ def process_sync_results(sync_results, rpm_config, resync_repos, log):
     version_changed_repos = [name for success, name, actually_synced, version_changed in sync_results if success and actually_synced and version_changed]
     log.info(f"Repos with version change: {len(version_changed_repos)} - {version_changed_repos}")
     
-    # If no versions changed, skip publication and distribution entirely
+    # If no versions changed, check for missing publication/distribution
+    # This handles the crash recovery case: process failed after sync but before pub/dist
     if not version_changed_repos:
-        log.info("No version changes detected. Skipping publication and distribution.")
+        log.info("No version changes detected. Checking for missing publication/distribution.")
+
+        # Check all synced repos (including previously synced) for missing pub/dist
+        repos_missing_pub_dist = []
+        all_repo_names = []
+        for repo in rpm_config:
+            repo_name = repo["package"]
+            version = repo.get("version")
+            if version and version != "null":
+                repo_name = f"{repo_name}_{version}"
+            all_repo_names.append(repo_name)
+
+            # If resync_repos is a specific list, only check those repos
+            if resync_repos and resync_repos != "all":
+                resync_list = resync_repos if isinstance(resync_repos, list) else [r.strip() for r in resync_repos.split(",")]
+                if repo_name not in resync_list:
+                    continue
+
+            pub_exists = check_publication_exists(repo_name, log)
+            dist_exists = check_distribution_exists(repo_name, log)
+
+            if not pub_exists or not dist_exists:
+                log.info(f"{repo_name} missing publication={not pub_exists}, distribution={not dist_exists}. Including for pub/dist creation.")
+                repo_copy = repo.copy()
+                repo_copy["_version_changed"] = False
+                repos_missing_pub_dist.append(repo_copy)
+
+        if repos_missing_pub_dist:
+            missing_names = [r["package"] for r in repos_missing_pub_dist]
+            log.info(f"Found {len(repos_missing_pub_dist)} repo(s) missing publication/distribution: {missing_names}")
+            return repos_missing_pub_dist, False, ""
+
+        # All repos have publication and distribution - safe to skip
+        log.info("All repos have existing publication and distribution. Skipping.")
         if actually_synced_repos:
             # Repos were synced but no metadata change
             synced_list = ", ".join(actually_synced_repos)
@@ -820,9 +875,37 @@ def process_sync_results(sync_results, rpm_config, resync_repos, log):
                 repos_for_pub_dist.append(repo_copy)
         return repos_for_pub_dist, False, ""
     else:
-        # If no repos were actually synced, skip publication and distribution
+        # If no repos were actually synced, check for missing pub/dist (crash recovery)
         if not actually_synced_repos:
-            log.info("No repos were actually synced. Skipping publication and distribution.")
+            log.info("No repos were actually synced. Checking for missing publication/distribution.")
+            repos_missing_pub_dist = []
+            for repo in rpm_config:
+                repo_name = repo["package"]
+                version = repo.get("version")
+                if version and version != "null":
+                    repo_name = f"{repo_name}_{version}"
+
+                # If resync_repos is a specific list, only check those repos
+                if resync_repos and resync_repos != "all":
+                    resync_list = resync_repos if isinstance(resync_repos, list) else [r.strip() for r in resync_repos.split(",")]
+                    if repo_name not in resync_list:
+                        continue
+
+                pub_exists = check_publication_exists(repo_name, log)
+                dist_exists = check_distribution_exists(repo_name, log)
+
+                if not pub_exists or not dist_exists:
+                    log.info(f"{repo_name} missing publication={not pub_exists}, distribution={not dist_exists}. Including for pub/dist creation.")
+                    repo_copy = repo.copy()
+                    repo_copy["_version_changed"] = False
+                    repos_missing_pub_dist.append(repo_copy)
+
+            if repos_missing_pub_dist:
+                missing_names = [r["package"] for r in repos_missing_pub_dist]
+                log.info(f"Found {len(repos_missing_pub_dist)} repo(s) missing publication/distribution: {missing_names}")
+                return repos_missing_pub_dist, False, ""
+
+            log.info("All repos have existing publication and distribution. No updates required.")
             return [], True, "All repositories already synced - no updates required"
 
         # Filter rpm_config to only include repos with version change
diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py
index 91b863144a..217ca9b308 100644
--- a/common/library/modules/pulp_cleanup.py
+++ b/common/library/modules/pulp_cleanup.py
@@ -216,6 +216,16 @@ def file_exists_in_status(name: str, base_path: str, logger) -> bool:
     except Exception:
         return False
 
+def get_all_repositories(logger) -> List[str]:
+    """Return the names of all RPM repositories Pulp lists; [] if the command fails."""
+    listing = run_cmd(pulp_rpm_commands["list_repositories"], logger)
+    if listing["rc"] != 0:
+        logger.error(f"Failed to list repositories: {listing['stderr']}")
+        return []
+    # Entries whose name is missing or empty are dropped.
+    names = (entry.get('name') for entry in safe_json_parse(listing["stdout"]))
+    return [name for name in names if name]
+
 
 # =============================================================================
 # CLEANUP FUNCTIONS
@@ -708,11 +718,12 @@ def remove_from_status_files(artifact_name: str, artifact_type: str, base_path:
         return {}
 
 
-def mark_software_partial(affected_software: Dict[str, List[str]], base_path: str, logger, artifact_type: str = None):
+def mark_software_partial(affected_software, base_path: str, logger, artifact_type: str = None):
     """Mark software entries as partial in software.csv.
     
     Args:
-        affected_software: Dict mapping architecture to list of affected software names
+        affected_software: Either a List[str] of software names (from remove_rpms_from_repository)
+                          or a Dict[str, List[str]] mapping arch to software names (from remove_from_status_files)
         base_path: Base path for software.csv
         logger: Logger instance
         artifact_type: Type of artifact being removed (for logging purposes)
@@ -721,39 +732,119 @@ def mark_software_partial(affected_software: Dict[str, List[str]], base_path: st
     if not affected_software:
         logger.info("No affected software to mark as partial")
         return
+
+    # Normalize input: if a flat list is passed, apply to all architectures
+    if isinstance(affected_software, list):
+        arch_software_map = {arch: affected_software for arch in ARCH_SUFFIXES}
+    else:
+        arch_software_map = affected_software
         
     try:
-        # Only mark architectures where artifacts were actually removed
-        for arch, software_names in affected_software.items():
-            logger.info(f"Processing arch: {arch}, software_names: {software_names}")
+        for arch, software_names in arch_software_map.items():
             if not software_names:
                 continue
 
             software_file = f"{base_path}/{arch}/software.csv"
             logger.info(f"Looking for software file: {software_file}")
-            if os.path.exists(software_file):
-                rows = []
-                updated = False
-                with open(software_file, 'r') as f:
-                    reader = csv.DictReader(f)
-                    fieldnames = reader.fieldnames
-                    for row in reader:
-                        logger.info(f"Checking row: {row}")
-                        if row.get('name') in software_names:
-                            row['status'] = 'partial'
-                            updated = True
-                            logger.info(f"Marked '{row.get('name')}' as {GREEN}partial{RESET} in {arch}/software.csv ({artifact_type} cleanup)")
-                        rows.append(row)
+            if not os.path.exists(software_file):
+                logger.warning(f"Software file not found: {software_file}")
+                continue
 
-                if fieldnames and rows:
-                    with open(software_file, 'w', newline='') as f:
-                        writer = csv.DictWriter(f, fieldnames=fieldnames)
-                        writer.writeheader()
-                        writer.writerows(rows)
-                    logger.info(f"Successfully wrote updated software.csv for {arch}")
+            rows = []
+            updated = False
+            with open(software_file, 'r') as f:
+                reader = csv.DictReader(f)
+                fieldnames = reader.fieldnames
+                for row in reader:
+                    if row.get('name') in software_names:
+                        row['status'] = 'partial'
+                        updated = True
+                        logger.info(f"Marked '{row.get('name')}' as partial in {arch}/software.csv ({artifact_type} cleanup)")
+                    rows.append(row)
+            
+            if fieldnames and rows and updated:
+                with open(software_file, 'w', newline='') as f:
+                    writer = csv.DictWriter(f, fieldnames=fieldnames)
+                    writer.writeheader()
+                    writer.writerows(rows)
+                logger.info(f"Successfully wrote updated software.csv for {arch}")
     except Exception as e:
         logger.error(f"Failed to update software.csv: {e}")
 
+def software_has_rpms(software_name: str, arch: str, base_path: str, logger) -> bool:
+    """Check if a software has any RPM dependencies in its status.csv.
+
+    Args:
+        software_name: Name of the software
+        arch: Architecture (x86_64 or aarch64)
+        base_path: Base path for status files
+        logger: Logger instance
+
+    Returns:
+        True if any row's 'type' column equals 'rpm' (case-insensitive);
+        False if the file is missing, has no such row, or cannot be read.
+    """
+    status_file = f"{base_path}/{arch}/{software_name}/status.csv"
+    if not os.path.exists(status_file):
+        return False
+
+    try:
+        with open(status_file, 'r') as f:
+            # A short row yields None for 'type'; treat it as "not rpm"
+            # instead of aborting the scan before later RPM rows are seen.
+            return any((row.get('type') or '').lower() == 'rpm'
+                       for row in csv.DictReader(f))
+    except Exception as e:
+        logger.error(f"Error checking RPMs for {software_name}: {e}")
+        return False
+
+
+def mark_all_software_partial(base_path: str, logger):
+    """Mark 'success' software as partial in software.csv for all architectures.
+    
+    Intended for cleanup_repos=all. For each arch in ARCH_SUFFIXES a row is
+    changed only when its status is 'success' and software_has_rpms() is true;
+    the file is rewritten only if at least one row changed. Errors are logged.
+    
+    Args:
+        base_path: Base path for software.csv files
+        logger: Logger instance
+    """
+    logger.info("Marking software with RPM dependencies as partial (cleanup_repos=all)")
+    try:
+        for arch in ARCH_SUFFIXES:
+            software_file = f"{base_path}/{arch}/software.csv"
+            logger.info(f"Processing software file: {software_file}")
+            
+            if not os.path.exists(software_file):
+                logger.info(f"Software file not found: {software_file}")
+                continue
+            # Keep every row (changed or not) so the file can be rewritten in full.
+            rows = []
+            updated = False
+            with open(software_file, 'r') as f:
+                reader = csv.DictReader(f)
+                fieldnames = reader.fieldnames
+                for row in reader:
+                    software_name = row.get('name', '')
+                    if row.get('status') == 'success':
+                        # Only mark as partial if software has RPM dependencies
+                        if software_has_rpms(software_name, arch, base_path, logger):
+                            row['status'] = 'partial'
+                            updated = True
+                            logger.info(f"Marked '{software_name}' as partial in {arch}/software.csv (has RPM deps)")
+                        else:
+                            logger.info(f"Skipping '{software_name}' - no RPM dependencies")
+                    rows.append(row)
+            # Rewrite only when a row changed, reusing the header read from the file.
+            if fieldnames and rows and updated:
+                with open(software_file, 'w', newline='') as f:
+                    writer = csv.DictWriter(f, fieldnames=fieldnames)
+                    writer.writeheader()
+                    writer.writerows(rows)
+                logger.info(f"Successfully updated {software_file}")
+    except Exception as e:
+        logger.error(f"Failed to mark all software as partial: {e}")
 
 def write_cleanup_status(results: List[Dict], base_path: str):
     """Write cleanup results to status file."""
@@ -794,6 +885,16 @@ def run_module():
     os.makedirs(base_path, exist_ok=True)
     logger = setup_standard_logger(log_dir)
     
+    # Handle 'all' keyword for repositories only
+    cleanup_all_repos = cleanup_repos and len(cleanup_repos) == 1 and cleanup_repos[0].lower() == 'all'
+    # 'all' must be the sole entry; it is then replaced by the repository names Pulp reports.
+    if cleanup_all_repos:
+        logger.info("cleanup_repos='all' - fetching all repositories from Pulp")
+        cleanup_repos = get_all_repositories(logger)
+        if not cleanup_repos:
+            module.fail_json(msg="Failed to retrieve repository list from Pulp. Please check if Pulp services are running.")
+        logger.info(f"Found {len(cleanup_repos)} repositories to cleanup: {cleanup_repos}")
+
     logger.info(f"Starting cleanup - repos: {cleanup_repos}, containers: {cleanup_containers}, files: {cleanup_files}")
 
     all_results = []
@@ -804,6 +905,10 @@ def run_module():
         all_results.append(result)
         logger.info(f"Repository {repo}: {result['status']} - {result['message']}")
 
+    # If cleanup_repos=all, mark software with RPM dependencies as partial
+    if cleanup_all_repos and any(r['status'] == 'Success' for r in all_results if r['type'] == 'repository'):
+        mark_all_software_partial(base_path, logger)
+
     # Process containers
     for container in cleanup_containers:
         result = cleanup_container(container, base_path, logger)
diff --git a/local_repo/pulp_cleanup.yml b/local_repo/pulp_cleanup.yml
index 123b3a481f..f999b3a2dc 100644
--- a/local_repo/pulp_cleanup.yml
+++ b/local_repo/pulp_cleanup.yml
@@ -40,9 +40,9 @@
     # Step 2: User Confirmation
     - name: Parse cleanup lists
       ansible.builtin.set_fact:
-        repo_list: "{{ (cleanup_repos.split(',') if cleanup_repos != 'all' else []) if cleanup_repos is defined else [] }}"
-        container_list: "{{ (cleanup_containers.split(',') if cleanup_containers is string else cleanup_containers) | default([]) }}"
-        file_list: "{{ (cleanup_files.split(',') if cleanup_files is string else cleanup_files) | default([]) }}"
+        repo_list: "{{ cleanup_repos.split(',') | map('trim') | select | list if cleanup_repos is string else (cleanup_repos | default([])) }}"
+        container_list: "{{ cleanup_containers.split(',') | map('trim') | select | list if cleanup_containers is string else (cleanup_containers | default([])) }}"
+        file_list: "{{ cleanup_files.split(',') | map('trim') | select | list if cleanup_files is string else (cleanup_files | default([])) }}"
 
     - name: Display cleanup summary
       ansible.builtin.debug:

From 64e12f90d65203221dfdac424baee6b8750bc2b9 Mon Sep 17 00:00:00 2001
From: sakshi-singla-1735 <sakshi.s@dell.com>
Date: Mon, 9 Feb 2026 13:30:36 +0000
Subject: [PATCH 090/172] code fix

Signed-off-by: sakshi-singla-1735 <sakshi.s@dell.com>
---
 ...i-group-login_compiler_node_x86_64.yaml.j2 | 64 +++++++-------
 .../ci-group-slurm_node_x86_64.yaml.j2        |  2 +-
 .../hpc_tools/configure_nvhpc_env.sh.j2       | 12 +--
 .../hpc_tools/configure_ucx_openmpi_env.sh.j2 | 15 +---
 .../hpc_tools/export_nvhpc_env.sh.j2          | 21 +----
 .../hpc_tools/install_nvhpc_sdk.sh.j2         | 36 ++++----
 .../templates/hpc_tools/install_openmpi.sh.j2 | 85 ++++++++++++++++---
 .../templates/hpc_tools/install_ucx.sh.j2     | 61 ++++++++++---
 .../templates/hpc_tools/setup_nvhpc_sdk.sh.j2 | 33 ++-----
 9 files changed, 183 insertions(+), 146 deletions(-)

diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
index a1f8a55f50..79e50eb774 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
@@ -254,6 +254,39 @@
         - mount -a
         - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
         - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf
+
+{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %}
+        # Add NFS entry and mount
+        - mkdir -p {{ client_mount_path }}
+        - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab
+        - mount -a
+{% endif %}
+
+{% if hostvars['localhost']['ucx_support'] %}
+        - echo "===== UCX Setup ====="
+        - echo "UCX support is enabled."
+        - /usr/local/bin/install_ucx.sh
+        # - echo "Build script available at"
+        # - echo "  /usr/local/bin/install_ucx.sh"
+        # - echo "NFS must be mounted at {{ client_mount_path }} before running."
+{% endif %}
+
+{% if hostvars['localhost']['openmpi_support'] %}
+        - echo "===== OpenMPI Setup ====="
+        - echo "OpenMPI support is enabled."
+        - /usr/local/bin/install_openmpi.sh
+        # - echo "Build script available at"
+        # - echo "  /usr/local/bin/install_openmpi.sh"
+        # - echo "Run UCX installation first if UCX support is enabled."
+        # - echo "NFS must be mounted at {{ client_mount_path }} before running."
+{% endif %}
+
+{% if hostvars['localhost']['ldms_support'] %}
+        - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log
+
+        - /root/ldms_sampler.sh
+{% endif %}
+
         - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh
         - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/
         - /usr/local/bin/check_slurm_controller_status.sh
@@ -315,37 +348,6 @@
 
 {% endif %}
 
-{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %}
-        # Add NFS entry and mount
-        - mkdir -p {{ client_mount_path }}
-        - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab
-        - mount -a
-{% endif %}
-
-{% if hostvars['localhost']['ucx_support'] %}
-        - echo "===== UCX Setup ====="
-        - echo "UCX support is enabled."
-        - /usr/local/bin/install_ucx.sh
-        # - echo "Build script available at"
-        # - echo "  /usr/local/bin/install_ucx.sh"
-        # - echo "NFS must be mounted at {{ client_mount_path }} before running."
-{% endif %}
-
-{% if hostvars['localhost']['openmpi_support'] %}
-        - echo "===== OpenMPI Setup ====="
-        - echo "OpenMPI support is enabled."
-        - /usr/local/bin/install_openmpi.sh
-        # - echo "Build script available at"
-        # - echo "  /usr/local/bin/install_openmpi.sh"
-        # - echo "Run UCX installation first if UCX support is enabled."
-        # - echo "NFS must be mounted at {{ client_mount_path }} before running."
-{% endif %}
-
-{% if hostvars['localhost']['ldms_support'] %}
-        - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log
-
-        - /root/ldms_sampler.sh
-{% endif %}
 
         # nvidia sdk install
         - /usr/local/bin/install_nvhpc_sdk.sh
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
index 64315adf38..84440bbdec 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
@@ -474,7 +474,7 @@
         - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab
         - mount -a
         - echo "One or more shared components (UCX / OpenMPI / LDMS) are enabled."
-        - echo "Shared NFS mount is available at: {{ client_mount_path }}"
+        # - echo "Shared NFS mount is available at: {{ client_mount_path }}"
         - /usr/local/bin/configure_ucx_openmpi_env.sh
         # - echo ""
         # - echo "IMPORTANT:"
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2
index 3c7efbc88b..dfc30520b3 100644
--- a/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2
@@ -50,8 +50,6 @@ if [ -f "$PROFILE_FILE" ]; then
     grep -q "nvhpc.sh" /etc/bashrc || echo "source $PROFILE_FILE" >> /etc/bashrc
 fi
 
-# NVHPC marker file path
-MARKER_TARGET="{{ nvhpc_local_mount | default('/shared-nvhpc-sdk/nvhpc') }}/.nvhpc_env_ready"
 
 if ! grep -q "{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk/nvhpc" /etc/fstab; then
     echo "[ERROR] NVHPC NFS path not found in /etc/fstab"
@@ -60,12 +58,4 @@ fi
 
 echo "[INFO] NVHPC NFS entry found in /etc/fstab"
 
-if [ ! -d "{{ nvhpc_local_mount | default('/shared-nvhpc-sdk/nvhpc') }}" ]; then
-    echo "[ERROR] Marker directory missing: {{ nvhpc_local_mount | default('/shared-nvhpc-sdk/nvhpc') }}"
-    exit 1
-fi
-
-touch "$MARKER_TARGET"
-echo "[SUCCESS] NVHPC marker created: $MARKER_TARGET"
-
-echo "===== NVHPC environment configuration completed successfully ====="
\ No newline at end of file
+echo "===== NVHPC environment configuration completed successfully ====="
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/configure_ucx_openmpi_env.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/configure_ucx_openmpi_env.sh.j2
index 4064eddbb1..0fa20205c5 100644
--- a/discovery/roles/configure_ochami/templates/hpc_tools/configure_ucx_openmpi_env.sh.j2
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/configure_ucx_openmpi_env.sh.j2
@@ -17,8 +17,6 @@ if ! mountpoint -q "$CLIENT_MOUNT"; then
 fi
 
 # ---------------- UCX ----------------
-if [ -d "$UCX_PREFIX/bin" ]; then
-    echo "[INFO] UCX detected at $UCX_PREFIX"
 
     cat > "$PROFILE_DIR/ucx.sh" <<EOF
 # UCX environment
@@ -29,15 +27,8 @@ EOF
 
     chmod 644 "$PROFILE_DIR/ucx.sh"
     echo "[SUCCESS] UCX environment enabled"
-else
-    echo "[INFO] UCX not found at $UCX_PREFIX — skipping"
-    rm -f "$PROFILE_DIR/ucx.sh"
-fi
 
 # ---------------- OpenMPI ----------------
-if [ -d "$OPENMPI_PREFIX/bin" ]; then
-    echo "[INFO] OpenMPI detected at $OPENMPI_PREFIX"
-
     cat > "$PROFILE_DIR/openmpi.sh" <<EOF
 # OpenMPI environment
 export OPENMPI_HOME="$OPENMPI_PREFIX"
@@ -48,9 +39,5 @@ EOF
 
     chmod 644 "$PROFILE_DIR/openmpi.sh"
     echo "[SUCCESS] OpenMPI environment enabled"
-else
-    echo "[INFO] OpenMPI not found at $OPENMPI_PREFIX — skipping"
-    rm -f "$PROFILE_DIR/openmpi.sh"
-fi
 
-echo "===== UCX / OpenMPI environment configuration complete ====="
\ No newline at end of file
+echo "===== UCX / OpenMPI environment configuration complete ====="
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2
index 20e3bb0e5f..a0cfdfdbe8 100644
--- a/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2
@@ -23,12 +23,6 @@ fi
 
 echo "===== NVHPC environment export started ====="
 
-# Validate compilers directory exists
-if [ ! -d "$NVHPC_BASE/compilers/bin" ]; then
-    echo "[ERROR] NVHPC compilers not found at:"
-    echo "        $NVHPC_BASE/compilers/bin"
-    exit 1
-fi
 
 echo "[INFO] Writing persistent NVHPC profile at $PROFILE_FILE"
 
@@ -54,20 +48,7 @@ EOF
 
 chmod 644 "$PROFILE_FILE"
 
-echo "[INFO] Verifying NVHPC compilers using login shell"
-
-# Verify nvc
-if ! bash -lc "command -v nvc && nvc --version >/dev/null"; then
-    echo "[ERROR] nvc verification failed"
-    exit 1
-fi
-
-# Verify nvfortran
-if ! bash -lc "command -v nvfortran && nvfortran --version >/dev/null"; then
-    echo "[ERROR] nvfortran verification failed"
-    exit 1
-fi
 
 echo "[SUCCESS] NVHPC environment exported successfully"
 echo "[INFO] Environment file configured in $PROFILE_FILE"
-echo "===== NVHPC export completed ====="
\ No newline at end of file
+echo "===== NVHPC export completed ====="
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2
index 26f3fd1775..75478a470e 100644
--- a/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2
@@ -2,9 +2,8 @@
 set -e
 
 LOGFILE="/var/log/nvhpc_sdk_install.log"
-exec > >(tee -a "$LOGFILE") 2>&1
 
-echo "===== Starting NVIDIA HPC SDK installation ====="
+echo "===== Starting NVIDIA HPC SDK installation =====" | tee -a "$LOGFILE"
 
 NVHPC_PKG_NAME="{{ nvhpc_pkg_name | default('nvhpc_2025_2511_Linux_x86_64_cuda_13.0') }}"
 NVHPC_EXPORT="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk"
@@ -16,49 +15,49 @@ NVHPC_EXTRACT_DIR="$NVHPC_MOUNT/${NVHPC_PKG_NAME}"
 
 # Skip if already mounted
 if mountpoint -q "$NVHPC_LOCAL_MOUNT"; then
-    echo "[INFO] $NVHPC_LOCAL_MOUNT already mounted. Skipping installation."
+    echo "[INFO] $NVHPC_LOCAL_MOUNT already mounted. Skipping installation." | tee -a "$LOGFILE"
     exit 0
 fi
 
 # Skip if local directory exists
 if [ -d "$NVHPC_LOCAL_MOUNT" ]; then
-    echo "[INFO] $NVHPC_LOCAL_MOUNT exists. Assuming installed. Skipping."
+    echo "[INFO] $NVHPC_LOCAL_MOUNT exists. Assuming installed. Skipping." | tee -a "$LOGFILE"
     exit 0
 fi
 
 mkdir -p "$NVHPC_MOUNT"
-mount -t nfs "$NVHPC_EXPORT" "$NVHPC_MOUNT"
+mount -t nfs "$NVHPC_EXPORT" "$NVHPC_MOUNT" >> "$LOGFILE" 2>&1
 
 # Check tarball
-echo "[INFO] Checking NVIDIA HPC SDK tarball at $NVHPC_TARBALL..."
+echo "[INFO] Checking NVIDIA HPC SDK tarball at $NVHPC_TARBALL..." | tee -a "$LOGFILE"
 if [ ! -f "$NVHPC_TARBALL" ]; then
-    echo "[ERROR] NVIDIA HPC SDK tarball not found. Skipping installation."
+    echo "[ERROR] NVIDIA HPC SDK tarball not found. Skipping installation." | tee -a "$LOGFILE"
     exit 0
 fi
 
 # Extract if needed
 EXTRACT_SIZE_GB=$(du -sBG "$NVHPC_EXTRACT_DIR" 2>/dev/null | cut -f1 | tr -d 'G')
 if [ -d "$NVHPC_EXTRACT_DIR" ] && [ "$EXTRACT_SIZE_GB" -ge 13 ] && [ -f "$NVHPC_EXTRACT_DIR/install" ]; then
-    echo "[INFO] NVHPC already extracted. Skipping."
+    echo "[INFO] NVHPC already extracted. Skipping." | tee -a "$LOGFILE"
 else
-    echo "[INFO] Extracting NVIDIA HPC SDK tarball..."
+    echo "[INFO] Extracting NVIDIA HPC SDK tarball..." | tee -a "$LOGFILE"
     tar -xzf "$NVHPC_TARBALL" -C "$NVHPC_MOUNT" \
         --checkpoint=2000 \
-        --checkpoint-action=echo="[INFO] Extracting NVHPC... please wait"
+        --checkpoint-action=echo="[INFO] Extracting NVHPC... please wait" >> "$LOGFILE" 2>&1
 fi
 
 mkdir -p "$NVHPC_INSTALL_DIR_NFS"
 INSTALL_BIN_DIR="$NVHPC_INSTALL_DIR_NFS/Linux_x86_64/25.11/compilers/bin"
 
 if [ -x "$INSTALL_BIN_DIR/nvc" ]; then
-    echo "[INFO] NVHPC already installed. Skipping installer."
+    echo "[INFO] NVHPC already installed. Skipping installer." | tee -a "$LOGFILE"
 else
-    echo "[INFO] Running NVIDIA HPC SDK installer..."
+    echo "[INFO] Running NVIDIA HPC SDK installer..." | tee -a "$LOGFILE"
     cd "$NVHPC_EXTRACT_DIR"
-    NVHPC_SILENT=true NVHPC_INSTALL_DIR="$NVHPC_INSTALL_DIR_NFS" NVHPC_INSTALL_TYPE=auto ./install
+    NVHPC_SILENT=true NVHPC_INSTALL_DIR="$NVHPC_INSTALL_DIR_NFS" NVHPC_INSTALL_TYPE=auto ./install >> "$LOGFILE" 2>&1
 fi
 
-echo "[SUCCESS] NVIDIA HPC SDK installation completed."
+echo "[SUCCESS] NVIDIA HPC SDK installation completed." | tee -a "$LOGFILE"
 
 # Mount NVHPC locally
 mkdir -p "$NVHPC_LOCAL_MOUNT"
@@ -66,10 +65,11 @@ NVHPC_INSTALL_EXPORT="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk/nvhpc"
 FSTAB_ENTRY="$NVHPC_INSTALL_EXPORT $NVHPC_LOCAL_MOUNT nfs defaults,_netdev 0 0"
 
 if ! grep -qE "^[^#].*$NVHPC_INSTALL_EXPORT[[:space:]]+$NVHPC_LOCAL_MOUNT[[:space:]]+nfs" /etc/fstab; then
-    echo "[INFO] Adding NVHPC mount to /etc/fstab"
+    echo "[INFO] Adding NVHPC mount to /etc/fstab" | tee -a "$LOGFILE"
     echo "$FSTAB_ENTRY" >> /etc/fstab
 fi
 
-echo "[INFO] Mounting $NVHPC_LOCAL_MOUNT..."
-mount "$NVHPC_LOCAL_MOUNT"
-echo "[INFO] NVHPC successfully mounted at $NVHPC_LOCAL_MOUNT"
\ No newline at end of file
+echo "[INFO] Mounting $NVHPC_LOCAL_MOUNT..." | tee -a "$LOGFILE"
+mount "$NVHPC_LOCAL_MOUNT" >> "$LOGFILE" 2>&1
+echo "[INFO] NVHPC successfully mounted at $NVHPC_LOCAL_MOUNT" | tee -a "$LOGFILE"
+echo "CLOUD-INIT: NVIDIA HPC SDK installation completed successfully" | tee -a "$LOGFILE"
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2
index 44e1a786b7..9adde78472 100644
--- a/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2
@@ -5,6 +5,18 @@ CLIENT_MOUNT="{{ client_mount_path }}"
 OPENMPI_PREFIX="{{ client_mount_path }}/slurm/hpc_tools/benchmarks/openmpi"
 OPENMPI_BUILD="{{ client_mount_path }}/slurm/hpc_tools/compile/openmpi"
 
+# Comprehensive logging
+LOGFILE="/var/log/openmpi_installation.log"
+
+# Mirror all stdout/stderr to the log file; piping to tee again would log each line twice
+exec > >(tee -a "$LOGFILE") 2>&1
+
+echo "===== OpenMPI Installation Started ====="
+echo "Timestamp: $(date '+%Y-%m-%d %H:%M:%S')"
+echo "Installation Prefix: $OPENMPI_PREFIX"
+echo "Build Directory: $OPENMPI_BUILD"
+echo "Log File: $LOGFILE"
+
 # Check that NFS is mounted
 if ! mountpoint -q "$CLIENT_MOUNT"; then
     echo "[ERROR] $CLIENT_MOUNT is not mounted."
@@ -14,49 +26,65 @@ fi
 
 echo "===== OpenMPI build started ====="
 
-mkdir -p "$OPENMPI_BUILD" "$OPENMPI_PREFIX"
+mkdir -p "$OPENMPI_BUILD"
 cd "$OPENMPI_BUILD"
 
 if [ ! -f openmpi.tar.gz ]; then
+    echo "[INFO] Downloading OpenMPI source code..."
     wget --no-check-certificate \
       https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/x86_64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/openmpi/openmpi.tar.gz \
-      -O openmpi.tar.gz \
-      >> "$OPENMPI_PREFIX/openmpi_tar_output.log" 2>&1
+      -O openmpi.tar.gz >> "$LOGFILE" 2>&1
+    echo "[INFO] OpenMPI download completed"
 else
-    echo "openmpi.tar.gz already exists, skipping download." \
-      >> "$OPENMPI_PREFIX/openmpi_tar_output.log"
+    echo "[INFO] openmpi.tar.gz already exists, skipping download."
 fi
 
-tar xzf openmpi.tar.gz
+echo "[INFO] Extracting OpenMPI source code..."
+tar xzf openmpi.tar.gz >> "$LOGFILE" 2>&1
 cd openmpi-*
+echo "[INFO] OpenMPI source extracted to $(pwd)"
+
+echo "[INFO] Creating build directory..."
 mkdir -p build
 
 # Slurm detection
+echo "[INFO] Detecting Slurm integration..."
 if sinfo >/dev/null 2>&1; then
   SLURM_FLAG="--with-slurm=yes --with-munge=/usr"
+  echo "[INFO] Slurm detected - enabling Slurm integration"
 else
   SLURM_FLAG="--with-slurm=no"
+  echo "[INFO] Slurm not detected - disabling Slurm integration"
 fi
 
 # UCX detection
+echo "[INFO] Detecting UCX integration..."
 if [ -x "{{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx/bin/ucx_info" ]; then
   UCX_FLAG="--with-ucx={{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx"
+  echo "[INFO] UCX detected - enabling UCX integration"
 else
   UCX_FLAG=""
+  echo "[INFO] UCX not detected - proceeding without UCX"
 fi
 
 cd build
+echo "[INFO] Configuring OpenMPI build..."
+echo "[INFO] Configure flags: --prefix=$OPENMPI_PREFIX --enable-mpi1-compatibility --enable-prte-prefix-by-default $SLURM_FLAG $UCX_FLAG"
 ../configure --prefix="$OPENMPI_PREFIX" \
   --enable-mpi1-compatibility \
   --enable-prte-prefix-by-default \
-  $SLURM_FLAG $UCX_FLAG
+  $SLURM_FLAG $UCX_FLAG >> "$LOGFILE" 2>&1
 
-make -j {{ openmpi_build_threads | default(8) }}
-make install
+echo "[INFO] Building OpenMPI with {{ openmpi_build_threads | default(8) }} threads..."
+make -j {{ openmpi_build_threads | default(8) }} >> "$LOGFILE" 2>&1
+
+echo "[INFO] Installing OpenMPI..."
+make install >> "$LOGFILE" 2>&1
 
 # Configure OpenMPI environment variables system-wide
 OPENMPI_ENV_FILE="/etc/profile.d/openmpi.sh"
 
+echo "[INFO] Setting up OpenMPI environment variables in $OPENMPI_ENV_FILE..."
 cat > "$OPENMPI_ENV_FILE" <<EOF
 # OpenMPI environment
 export OPENMPI_HOME="{{ client_mount_path }}/slurm/hpc_tools/benchmarks/openmpi"
@@ -67,7 +95,40 @@ EOF
 
 chmod 644 "$OPENMPI_ENV_FILE"
 
-echo "[INFO] OpenMPI installed under {{ client_mount_path }}/slurm/hpc_tools/benchmarks/openmpi"
-echo "[INFO] OpenMPI environment configured in $OPENMPI_ENV_FILE"
+# Verify installation (stdout/stderr are already mirrored to $LOGFILE by the exec redirection, so no extra tee)
+echo "[INFO] Verifying OpenMPI installation..."
+if [ -f "$OPENMPI_PREFIX/bin/ompi_info" ]; then
+    OPENMPI_VERSION=$("$OPENMPI_PREFIX/bin/ompi_info" --version | head -1)
+    echo "[SUCCESS] OpenMPI installation verified - Version: $OPENMPI_VERSION"
+else
+    echo "[ERROR] OpenMPI installation verification failed - ompi_info not found"
+    exit 1
+fi
+
+# Create installation summary
+echo ""
+echo "===== OpenMPI Installation Summary ====="
+echo "Installation Status: SUCCESS"
+echo "OpenMPI Version: $OPENMPI_VERSION"
+
+echo "Integration Status:"
+if [ "$SLURM_FLAG" = "--with-slurm=yes --with-munge=/usr" ]; then
+    echo "  - Slurm Integration: ENABLED"
+else
+    echo "  - Slurm Integration: DISABLED"
+fi
+if [ -n "$UCX_FLAG" ]; then
+    echo "  - UCX Integration: ENABLED"
+else
+    echo "  - UCX Integration: DISABLED"
+fi
+echo ""
+echo "Log File Created:"
+echo "  - Installation Log: $LOGFILE"
+
+echo "[INFO] OpenMPI installed under {{ client_mount_path }}/slurm/hpc_tools/benchmarks/openmpi"
+echo "[INFO] OpenMPI environment configured in $OPENMPI_ENV_FILE"
 
-echo "===== OpenMPI build completed ====="
\ No newline at end of file
+echo "===== OpenMPI Installation Completed ====="
+echo "Completion Timestamp: $(date '+%Y-%m-%d %H:%M:%S')"
+echo "CLOUD-INIT: OpenMPI installation completed successfully"
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_ucx.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_ucx.sh.j2
index 73d13d82a8..0231d77683 100644
--- a/discovery/roles/configure_ochami/templates/hpc_tools/install_ucx.sh.j2
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_ucx.sh.j2
@@ -5,6 +5,15 @@ CLIENT_MOUNT="{{ client_mount_path }}"
 UCX_PREFIX="{{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx"
 UCX_BUILD="{{ client_mount_path }}/slurm/hpc_tools/compile/ucx"
 
+# Comprehensive logging
+LOGFILE="/var/log/ucx_installation.log"
+
+echo "===== UCX Installation Started =====" | tee -a "$LOGFILE"
+echo "Timestamp: $(date '+%Y-%m-%d %H:%M:%S')" | tee -a "$LOGFILE"
+echo "Installation Prefix: $UCX_PREFIX" | tee -a "$LOGFILE"
+echo "Build Directory: $UCX_BUILD" | tee -a "$LOGFILE"
+echo "Log File: $LOGFILE" | tee -a "$LOGFILE"
+
 # Check that NFS is mounted
 if ! mountpoint -q "$CLIENT_MOUNT"; then
     echo "[ERROR] $CLIENT_MOUNT is not mounted."
@@ -14,31 +23,41 @@ fi
 
 echo "===== UCX build started ====="
 
-mkdir -p "$UCX_BUILD" "$UCX_PREFIX"
+mkdir -p "$UCX_BUILD"
 cd "$UCX_BUILD"
 
 if [ ! -f ucx.tar.gz ]; then
+    echo "[INFO] Downloading UCX source code..."
     wget --no-check-certificate \
       https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/x86_64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/ucx/ucx.tar.gz \
-      -O ucx.tar.gz \
-      >> "$UCX_PREFIX/ucx_tar_output.log" 2>&1
+      -O ucx.tar.gz >> "$LOGFILE" 2>&1
+    echo "[INFO] UCX download completed"
 else
-    echo "ucx.tar.gz already exists, skipping download." \
-      >> "$UCX_PREFIX/ucx_tar_output.log"
+    echo "[INFO] ucx.tar.gz already exists, skipping download."
 fi
 
-tar xzf ucx.tar.gz
+echo "[INFO] Extracting UCX source code..."
+tar xzf ucx.tar.gz >> "$LOGFILE" 2>&1
 cd ucx-*
+echo "[INFO] UCX source extracted to $(pwd)"
+
+echo "[INFO] Creating build directory..."
 mkdir -p build
 cd build
 
-../contrib/configure-release --prefix="$UCX_PREFIX"
-make -j {{ ucx_build_threads | default(8) }}
-make install
+echo "[INFO] Configuring UCX build..."
+../contrib/configure-release --prefix="$UCX_PREFIX" >> "$LOGFILE" 2>&1
+
+echo "[INFO] Building UCX with {{ ucx_build_threads | default(8) }} threads..."
+make -j {{ ucx_build_threads | default(8) }} >> "$LOGFILE" 2>&1
+
+echo "[INFO] Installing UCX..."
+make install >> "$LOGFILE" 2>&1
 
 # Configure UCX environment variables system-wide
 UCX_ENV_FILE="/etc/profile.d/ucx.sh"
 
+echo "[INFO] Setting up UCX environment variables in $UCX_ENV_FILE..."
 cat > "$UCX_ENV_FILE" <<EOF
 # UCX environment
 export UCX_HOME="{{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx"
@@ -48,8 +67,24 @@ EOF
 
 chmod 644 "$UCX_ENV_FILE"
 
-echo "[INFO] UCX installed under {{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx"
-echo "[INFO] UCX environment configured in $UCX_ENV_FILE"
-echo "[INFO] Run 'source $UCX_ENV_FILE' or re-login to use ucx_info"
+# Verify installation
+echo "[INFO] Verifying UCX installation..."
+if [ -f "$UCX_PREFIX/bin/ucx_info" ]; then
+    UCX_VERSION=$("$UCX_PREFIX/bin/ucx_info" -v | head -1)
+    echo "[SUCCESS] UCX installation verified - Version: $UCX_VERSION" | tee -a "$LOGFILE"
+else
+    echo "[ERROR] UCX installation verification failed - ucx_info not found" | tee -a "$LOGFILE"
+    exit 1
+fi
+
+echo "Log File Created:"
+echo "  - Installation Log: $LOGFILE" | tee -a "$LOGFILE"
+
+echo "[INFO] UCX installed under {{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx" | tee -a "$LOGFILE"
+echo "[INFO] UCX environment configured in $UCX_ENV_FILE" | tee -a "$LOGFILE"
+echo "[INFO] Run 'source $UCX_ENV_FILE' or re-login to use ucx_info" | tee -a "$LOGFILE"
+
+echo "===== UCX Installation Completed =====" | tee -a "$LOGFILE"
+echo "Completion Timestamp: $(date '+%Y-%m-%d %H:%M:%S')" | tee -a "$LOGFILE"
+echo "CLOUD-INIT: UCX installation completed successfully" | tee -a "$LOGFILE"
 
-echo "===== UCX build completed ====="
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2
index b57061cd08..8169d1f5a6 100644
--- a/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2
@@ -7,17 +7,11 @@ echo "===== NVHPC SDK setup (mount + wait) ====="
 PARENT_NFS="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk"
 PARENT_MOUNT="/shared-nvhpc-sdk"
 
-NVHPC_NFS_SHARE="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk/nvhpc"
+NVHPC_NFS_SHARE="$PARENT_MOUNT/nvhpc"
 NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc"
 
-NVHPC_MARKER="$PARENT_MOUNT/nvhpc/.nvhpc_env_ready"
-
-WAIT_TIMEOUT=3600
-SLEEP_INTERVAL=20
-ELAPSED=0
-
-# 1. Mount parent export
 mkdir -p "$PARENT_MOUNT"
+mkdir -p "$NVHPC_NFS_SHARE"
 
 if ! mountpoint -q "$PARENT_MOUNT"; then
     mount -t nfs "$PARENT_NFS" "$PARENT_MOUNT"
@@ -30,24 +24,11 @@ fi
 
 echo "[INFO] Parent NVHPC export mounted"
 
-# 2. Wait for readiness marker
-echo "[INFO] Waiting for NVHPC readiness marker..."
-
-while [ ! -f "$NVHPC_MARKER" ]; do
-    if [ "$ELAPSED" -ge "$WAIT_TIMEOUT" ]; then
-        echo "[ERROR] Timeout waiting for NVHPC readiness marker"
-        exit 1
-    fi
-    sleep "$SLEEP_INTERVAL"
-    ELAPSED=$((ELAPSED + SLEEP_INTERVAL))
-done
-
-echo "[SUCCESS] NVHPC readiness marker detected"
 
-# 3. Ensure fstab entry exists
+# 3. Ensure fstab entry exists (bind mount, NOT NFS)
 if ! grep -qF "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT" /etc/fstab; then
-    echo "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT nfs defaults,_netdev 0 0" >> /etc/fstab
-    echo "[INFO] NVHPC fstab entry added"
+    echo "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT none bind,_netdev 0 0" >> /etc/fstab
+    echo "[INFO] NVHPC bind-mount fstab entry added"
 else
     echo "[INFO] NVHPC fstab entry already present"
 fi
@@ -56,7 +37,7 @@ fi
 mkdir -p "$NVHPC_LOCAL_MOUNT"
 
 if ! mountpoint -q "$NVHPC_LOCAL_MOUNT"; then
-    mount "$NVHPC_LOCAL_MOUNT"
+    mount --bind "$NVHPC_NFS_SHARE" "$NVHPC_LOCAL_MOUNT"
 fi
 
 if ! mountpoint -q "$NVHPC_LOCAL_MOUNT"; then
@@ -65,4 +46,4 @@ if ! mountpoint -q "$NVHPC_LOCAL_MOUNT"; then
 fi
 
 echo "[SUCCESS] NVHPC SDK mounted at $NVHPC_LOCAL_MOUNT"
-echo "===== NVHPC setup completed ====="
\ No newline at end of file
+echo "===== NVHPC setup completed ====="

From ee524cd4fe42064ee8d97486fb822ca3087be51a Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Mon, 9 Feb 2026 19:01:11 +0530
Subject: [PATCH 091/172] Added validation for all confs Supported
 configuration files are:   slurm.conf   slurmdbd.conf   cgroup.conf  
 gres.conf   acct_gather.conf   helpers.conf   job_container.conf   mpi.conf  
 oci.conf   topology.conf   burst_buffer.conf

---
 .../common_utils/slurm_conf_utils.py          | 13 +++++--
 .../validation_flows/common_validation.py     | 16 ++++++---
 common/library/modules/slurm_conf.py          |  8 +++--
 .../roles/slurm_config/defaults/main.yml      |  7 ++++
 discovery/roles/slurm_config/tasks/confs.yml  | 14 +++++++-
 .../slurm_config/tasks/handle_extra_confs.yml | 35 +++++++++++++++++++
 discovery/roles/slurm_config/vars/main.yml    | 28 +++++++++++++++
 7 files changed, 110 insertions(+), 11 deletions(-)
 create mode 100644 discovery/roles/slurm_config/tasks/handle_extra_confs.yml

diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
index 20d61afc98..26f24762aa 100644
--- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
+++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
@@ -763,6 +763,10 @@ class SlurmParserEnum(str, Enum):
 def validate_config_types(conf_dict, conf_name, module):
     """Validate configuration keys and value types based on SlurmParserEnum."""
     current_conf = all_confs.get(conf_name, {})
+    if not current_conf:
+        # No schema is registered for this conf name, so there is nothing to validate
+        # against; report no errors instead of flagging every key as invalid.
+        return {'invalid_keys': [], 'type_errors': []}
     invalid_keys = list(
         set(conf_dict.keys()).difference(set(current_conf.keys())))
     type_errors = []
@@ -839,6 +843,7 @@ def parse_slurm_conf(file_path, conf_name, validate):
     """Parses the slurm.conf file and returns it as a dictionary."""
     current_conf = all_confs.get(conf_name, {})
     slurm_dict = OrderedDict()
+    dup_keys = []
 
     if not os.path.exists(file_path):
         raise FileNotFoundError(f"{file_path} not found.")
@@ -878,9 +883,11 @@ def parse_slurm_conf(file_path, conf_name, validate):
                 slurm_dict[skey] = list(slurm_dict.get(
                     skey, [])) + list(tmp_dict.values())
             else:
-                slurm_dict.update(tmp_dict)
-
-    return slurm_dict
+                if skey in slurm_dict:
+                    dup_keys.append(skey)
+                else:
+                    slurm_dict.update(tmp_dict)
+    return slurm_dict, dup_keys
 
 
 def expand_hostlist(expr):
diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py
index ae4e693b9e..198c527440 100644
--- a/common/library/module_utils/input_validation/validation_flows/common_validation.py
+++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py
@@ -1062,24 +1062,30 @@ def validate_omnia_config(
                     f"NFS name {', '.join(diff_set)} required for slurm is not defined in {storage_config}"
                     ))
         cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')]
+        skip_conf_validation = os.path.exists("/opt/omnia/input/.skip_slurm_conf_validation")
         for cfg_path_dict in cnfg_src:
             for k,v in cfg_path_dict.items():
                 conf_dict = None
                 if isinstance(v, str):
                     if not os.path.exists(v):
                         errors.append(
-                            create_error_msg(input_file_path, "slurm_cluster config_sources",
+                            create_error_msg('omnia_config.yml', "slurm_cluster config_sources",
                                 f"provided conf path for {k} - {v} does not exist"))
                         continue
                     else: # path exists
-                        conf_dict = parse_slurm_conf(v, k, False)
+                        if not skip_conf_validation:
+                            conf_dict, duplicate_keys = parse_slurm_conf(v, k, False)
+                            if duplicate_keys:
+                                errors.append(
+                                    create_error_msg('omnia_config.yml', "slurm_cluster->config_sources",
+                                        f"duplicate keys found in {k}.conf - {','.join(duplicate_keys)}"))
                 else:
                     conf_dict = v
-                if conf_dict:
+                if conf_dict and not skip_conf_validation:
                     validation_result = validate_config_types(conf_dict, k, module)
-                    if validation_result['type_errors']:
+                    if validation_result.get('type_errors'):
                         errors.extend(validation_result['type_errors'])
-                    if validation_result['invalid_keys']:
+                    if validation_result.get('invalid_keys'):
                         errors.append(
                             create_error_msg('omnia_config.yml', "slurm_cluster->config_sources",
                                 f"{k}.conf invalid keys found - {','.join(validation_result['invalid_keys'])}"))
diff --git a/common/library/modules/slurm_conf.py b/common/library/modules/slurm_conf.py
index dcacbcae2f..78a4315244 100644
--- a/common/library/modules/slurm_conf.py
+++ b/common/library/modules/slurm_conf.py
@@ -234,7 +234,9 @@ def run_module():
         replace = module.params['replace']
         # Parse the slurm.conf file
         if module.params['op'] == 'parse':
-            s_dict = parse_slurm_conf(module.params['path'], conf_name, validate)
+            s_dict, dup_keys = parse_slurm_conf(module.params['path'], conf_name, validate)
+            if dup_keys:
+                module.fail_json(msg=f"Duplicate keys found in {module.params['path']}: {dup_keys}")
             result['conf_dict'] = s_dict
         elif module.params['op'] == 'render':
             s_list = read_dict2ini(module.params['conf_map'])
@@ -247,7 +249,9 @@ def run_module():
                 elif isinstance(conf_source, str):
                     if not os.path.exists(conf_source):
                         raise FileNotFoundError(f"File {conf_source} does not exist")
-                    s_dict = parse_slurm_conf(conf_source, conf_name, validate)
+                    s_dict, dup_keys = parse_slurm_conf(conf_source, conf_name, validate)
+                    if dup_keys:
+                        module.fail_json(msg=f"Duplicate keys found in {conf_source}: {dup_keys}")
                     conf_dict_list.append(OrderedDict(s_dict))
                 else:
                     raise TypeError(f"Invalid type for conf_source: {type(conf_source)}")
diff --git a/discovery/roles/slurm_config/defaults/main.yml b/discovery/roles/slurm_config/defaults/main.yml
index a8fbc8e9c8..ad7ab09058 100644
--- a/discovery/roles/slurm_config/defaults/main.yml
+++ b/discovery/roles/slurm_config/defaults/main.yml
@@ -87,3 +87,10 @@ __default_config:
     DbdPort: "{{ slurm_dbd_port }}"
   gres:
     AutoDetect: nvml
+  acct_gather: {}
+  helpers: {}
+  job_container: {}
+  mpi: {}
+  oci: {}
+  topology: {}
+  burst_buffer: {}
diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml
index fdf461f88c..91de036c3f 100644
--- a/discovery/roles/slurm_config/tasks/confs.yml
+++ b/discovery/roles/slurm_config/tasks/confs.yml
@@ -50,6 +50,7 @@
     - configs_input is defined
     - configs_input
     - item.value is string
+    - item.key in conf_files
 
 - name: Build parsed_configs_input dictionary from parsed files
   ansible.builtin.set_fact:
@@ -148,7 +149,7 @@
   loop_control:
     loop_var: product
 
-- name: Generate slurmd opts for Configless # TODO: Move to $SLURMD_OPTIONS
+- name: Generate slurmd opts for Configless # TODO: Move to $SLURMD_OPTIONS (set in /etc/default/slurmd)
   ansible.builtin.set_fact:
     conf_server: "--conf-server {{ ctld_list | map('regex_replace', '$', ':' ~ (slurm_conf_dict.get('SlurmctldPort', '6817') | string)) | join(',') }}"
 
@@ -162,6 +163,17 @@
     remote_src: "{{ copy_from_oim }}"
   loop: "{{ merged_conf.results }}"
   register: ctld_conf_files
+  when:
+    - item.ini_lines
+
+- name: Add extra confs which are not handled
+  ansible.builtin.include_tasks: handle_extra_confs.yml
+  # Ansible templates `loop` before evaluating `when`, so the loop expression
+  # itself must tolerate an undefined or empty configs_input. An empty list
+  # simply skips the include, which makes a separate length check unnecessary.
+  loop: "{{ (configs_input | default({}, true)).keys() | difference(conf_files) }}"
+  loop_control:
+    loop_var: extra_conf
 
 - name: Check if cluster running
   ansible.builtin.include_tasks: check_ctld_running.yml
diff --git a/discovery/roles/slurm_config/tasks/handle_extra_confs.yml b/discovery/roles/slurm_config/tasks/handle_extra_confs.yml
new file mode 100644
index 0000000000..c7f1ae5bd5
--- /dev/null
+++ b/discovery/roles/slurm_config/tasks/handle_extra_confs.yml
@@ -0,0 +1,35 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+- name: Add extra confs which are not handled
+  slurm_conf:
+    op: merge
+    conf_sources: "{{ [configs_input[extra_conf]] }}"
+    conf_name: "{{ extra_conf }}"
+  register: ex_conf
+  delegate_to: localhost
+  when: 
+    - "'.' not in extra_conf"
+
+- name: Write merged .conf
+  ansible.builtin.copy:
+    content: "{{ ex_conf.ini_lines | join('\n') }}\n"
+    dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/{{ extra_conf }}.conf"
+    mode: "{{ conf_file_mode }}"
+    owner: "{{ slurm_user }}"
+    group: "{{ slurm_user_group }}"
+    remote_src: "{{ copy_from_oim }}"
+  when: 
+    - "'.' not in extra_conf"
+    - ex_conf is success
diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml
index 939e3ac204..89166b1f12 100644
--- a/discovery/roles/slurm_config/vars/main.yml
+++ b/discovery/roles/slurm_config/vars/main.yml
@@ -21,6 +21,34 @@ conf_files: # Must match this MASTER list
   - slurmdbd
   - cgroup
   - gres
+  - acct_gather
+  - helpers
+  - job_container
+  - mpi
+  - oci
+  - topology
+  - burst_buffer
+
+# Supported configuration files are:
+  # slurm.conf
+  # slurmdbd.conf
+  # cgroup.conf
+  # gres.conf
+  # acct_gather.conf
+  # helpers.conf
+  # job_container.conf
+  # mpi.conf
+  # oci.conf
+  # topology.conf
+  # burst_buffer.conf
+
+# Non-conf files (not in the conf_files list above)
+  # topology.yaml
+  # namespace.yaml
+  # plugstack.conf
+  # scrun.lua
+  # cli_filter.lua
+
 copy_from_oim: false
 common_dir:
   - /etc/munge

From 7db818aa3a7619cdf074335008a081ab7fe18972 Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Mon, 9 Feb 2026 19:06:57 +0530
Subject: [PATCH 092/172] Lint fix

---
 discovery/roles/slurm_config/tasks/handle_extra_confs.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/discovery/roles/slurm_config/tasks/handle_extra_confs.yml b/discovery/roles/slurm_config/tasks/handle_extra_confs.yml
index c7f1ae5bd5..307ca01723 100644
--- a/discovery/roles/slurm_config/tasks/handle_extra_confs.yml
+++ b/discovery/roles/slurm_config/tasks/handle_extra_confs.yml
@@ -19,7 +19,7 @@
     conf_name: "{{ extra_conf }}"
   register: ex_conf
   delegate_to: localhost
-  when: 
+  when:
     - "'.' not in extra_conf"
 
 - name: Write merged .conf
@@ -30,6 +30,6 @@
     owner: "{{ slurm_user }}"
     group: "{{ slurm_user_group }}"
     remote_src: "{{ copy_from_oim }}"
-  when: 
+  when:
     - "'.' not in extra_conf"
     - ex_conf is success

From 919b5d379bf4dab07a7962f82b9bfa231d4ec8fc Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Mon, 9 Feb 2026 19:33:26 +0530
Subject: [PATCH 093/172] fix for variable scope

---
 .../roles/validate_input/tasks/main.yml            | 14 +++++++-------
 .../roles/validate_input/vars/main.yml             |  9 ++++++---
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/input_validation/roles/validate_input/tasks/main.yml b/input_validation/roles/validate_input/tasks/main.yml
index 6a1c773ee5..de6e9f48e9 100644
--- a/input_validation/roles/validate_input/tasks/main.yml
+++ b/input_validation/roles/validate_input/tasks/main.yml
@@ -17,12 +17,12 @@
     omnia_run_tags: "{{ ansible_run_tags | default([]) }}"
   when: omnia_run_tags is not defined
 
+- name: Set validation messages
+  ansible.builtin.set_fact:
+    validation_success_msg: "{{ messages.validation_success }}"
+    validation_error_msg: "{{ messages.validation_error }}"
+
 - name: Validate omnia input config
-  vars:
-    # Note: When running a specific playbook without tags ansible run tags will default to ["all"], thus if two or more tags are present
-    # then the "all" tag should be removed so that only the config files related to that playbook are validated.
-    input_validate_tags: "{{ omnia_run_tags | default([]) | difference(['all']) if (omnia_run_tags | length) >= 2
-      else omnia_run_tags | default([]) }}"
   block:
     - name: Run validation
       validate_input:
@@ -35,8 +35,8 @@
 
     - name: Debug validation status
       ansible.builtin.debug:
-        msg: "{{ messages.validation_success }}"
+        msg: "{{ validation_success_msg }}"
   rescue:
     - name: Failed due to validation failure
       ansible.builtin.fail:
-        msg: "{{ messages.validation_error }}"
+        msg: "{{ validation_error_msg }}"
diff --git a/input_validation/roles/validate_input/vars/main.yml b/input_validation/roles/validate_input/vars/main.yml
index 4655e7b25a..698eb4da29 100644
--- a/input_validation/roles/validate_input/vars/main.yml
+++ b/input_validation/roles/validate_input/vars/main.yml
@@ -16,8 +16,11 @@
 input_dir: "{{ hostvars['localhost']['input_project_dir'] }}"
 project_name: "{{ hostvars['localhost']['project_name'] }}"
 
+# Note: When a playbook is run without tags, ansible_run_tags defaults to ["all"]. If two or more tags are present,
+# the "all" tag is removed so that only the config files related to that playbook are validated.
+input_validate_tags: "{{ omnia_run_tags | default([]) | difference(['all']) if (omnia_run_tags | length) >= 2
+  else omnia_run_tags | default([]) }}"
+
 messages:
   validation_success: "Successfully validated Omnia input config file(s)"
-  validation_error: >
-    Input validation failed.
-    For detailed validation errors, see: {{ ansible_failed_result.log_file }}
+  validation_error: "Input validation failed. Please check the validation output above for detailed error information."

From eb0ce8e93f6145e084769224403c8a2e3d0503a3 Mon Sep 17 00:00:00 2001
From: Abhishek S A <abhishek.sa3@dell.com>
Date: Mon, 9 Feb 2026 20:27:31 +0530
Subject: [PATCH 094/172] Update main.yml

---
 utils/roles/external_kafka_connect_details/tasks/main.yml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/utils/roles/external_kafka_connect_details/tasks/main.yml b/utils/roles/external_kafka_connect_details/tasks/main.yml
index 207d93bfe6..6a387b1b46 100644
--- a/utils/roles/external_kafka_connect_details/tasks/main.yml
+++ b/utils/roles/external_kafka_connect_details/tasks/main.yml
@@ -19,6 +19,14 @@
   changed_when: false
   failed_when: kubectl_check.rc != 0
 
+- name: Delete Kafka output directory (clean start)
+  ansible.builtin.file:
+    path: "{{ kafka_output_dir }}"
+    state: absent
+  delegate_to: localhost
+  connection: local
+  run_once: true
+
 - name: Get Kafka pod status
   ansible.builtin.command: >-
     kubectl get pods -n {{ kafka_namespace }}

From 42e552c6e61e1076231b4ba6ecd8a9b20717ecd5 Mon Sep 17 00:00:00 2001
From: pullan1 <sudha.pullalaravu@dell.com>
Date: Mon, 9 Feb 2026 20:36:52 +0530
Subject: [PATCH 095/172] pylint fixes

Signed-off-by: pullan1 <sudha.pullalaravu@dell.com>
---
 common/library/modules/pulp_cleanup.py | 192 ++++++++++++-------------
 1 file changed, 96 insertions(+), 96 deletions(-)

diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py
index 217ca9b308..72fd11d692 100644
--- a/common/library/modules/pulp_cleanup.py
+++ b/common/library/modules/pulp_cleanup.py
@@ -28,8 +28,8 @@
 import glob
 import json
 import subprocess
-import time
-from datetime import datetime
+#import time
+#from datetime import datetime
 from typing import Dict, List, Any, Tuple
 
 from ansible.module_utils.basic import AnsibleModule
@@ -61,7 +61,7 @@ def format_pretty_table(results: List[Dict[str, Any]]) -> str:
         return "No cleanup results to display"
 
     headers = ["Name", "Type", "Status", "Message"]
-    
+
     # Calculate column widths
     widths = [len(h) for h in headers]
     for r in results:
@@ -73,9 +73,9 @@ def format_pretty_table(results: List[Dict[str, Any]]) -> str:
     # Build table
     border = "+" + "+".join("-" * (w + 2) for w in widths) + "+"
     header_row = "|" + "|".join(f" {h.ljust(w)} " for h, w in zip(headers, widths)) + "|"
-    
+
     lines = [border, header_row, border]
-    
+
     for r in results:
         msg = str(r.get('message', ''))[:40]
         row = "|" + "|".join([
@@ -86,7 +86,7 @@ def format_pretty_table(results: List[Dict[str, Any]]) -> str:
             f" {msg.ljust(widths[3])} "
         ]) + "|"
         lines.append(row)
-    
+
     lines.append(border)
     return "\n".join(lines)
 
@@ -100,7 +100,7 @@ def run_cmd(cmd: str, logger) -> Dict[str, Any]:
     try:
         result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=300)
         return {"rc": result.returncode, "stdout": result.stdout, "stderr": result.stderr}
-    except Exception as e:
+    except (subprocess.SubprocessError, OSError) as e:
         logger.error(f"Command failed: {cmd} - {e}")
         return {"rc": 1, "stdout": "", "stderr": str(e)}
 
@@ -112,7 +112,7 @@ def safe_json_parse(data: str, default: Any = None) -> Any:
     """
     if not data or not isinstance(data, str):
         return default if default is not None else []
-    
+
     try:
         decoder = json.JSONDecoder()
         parsed, _ = decoder.raw_decode(data.strip())
@@ -135,19 +135,19 @@ def validate_container_format(image_name: str) -> Tuple[bool, str]:
     """
     if not image_name:
         return False, "Container image name cannot be empty"
-    
+
     # Must contain at least one '/' to indicate registry/image format
     if '/' not in image_name:
         return False, f"Invalid format '{image_name}'. Must include registry (e.g., registry.k8s.io/pause, docker.io/library/busybox)"
-    
+
     # Must have a registry part (contains '.' or is a known registry)
     parts = image_name.split('/')
     registry = parts[0]
-    
+
     # Check if registry looks valid (contains dot or is localhost)
     if '.' not in registry and registry != 'localhost' and ':' not in registry:
         return False, f"Invalid registry '{registry}' in '{image_name}'. Registry must be a domain (e.g., docker.io, registry.k8s.io)"
-    
+
     return True, ""
 
 
@@ -209,7 +209,7 @@ def file_exists_in_status(name: str, base_path: str, logger) -> bool:
     """Check if file artifact exists in status files."""
     try:
         for status_file in glob.glob(f"{base_path}/x86_64/*/status.csv"):
-            with open(status_file, 'r') as f:
+            with open(status_file, 'r', encoding='utf-8') as f:
                 if name in f.read():
                     return True
         return False
@@ -234,12 +234,12 @@ def get_all_repositories(logger) -> List[str]:
 def cleanup_repository(name: str, base_path: str, logger) -> Dict[str, Any]:
     """Cleanup a single RPM repository."""
     result = {"name": name, "type": "repository", "status": "Failed", "message": ""}
-    
+
     # Check existence
     if not repo_exists(name, logger):
         result["message"] = "Repository not found"
         return result
-    
+
     try:
         # Delete distributions
         dist_list = run_cmd(pulp_rpm_commands["list_distributions"], logger)
@@ -248,20 +248,20 @@ def cleanup_repository(name: str, base_path: str, logger) -> Dict[str, Any]:
             for d in dists:
                 if d.get('name', '') == name or name in d.get('name', ''):
                     run_cmd(pulp_rpm_commands["delete_distribution"] % d.get('name', ''), logger)
-        
+
         # Delete publications
         pub_list = run_cmd(pulp_rpm_commands["list_publications"] % name, logger)
         if pub_list["rc"] == 0:
             pubs = safe_json_parse(pub_list["stdout"])
             for p in pubs:
                 run_cmd(pulp_rpm_commands["delete_publication"] % p.get('pulp_href', ''), logger)
-        
+
         # Delete remote
         run_cmd(pulp_rpm_commands["delete_remote"] % name, logger)
-        
+
         # Delete repository
         del_result = run_cmd(pulp_rpm_commands["delete_repository"] % name, logger)
-        
+
         if del_result["rc"] == 0:
             result["status"] = "Success"
             result["message"] = "Repository deleted"
@@ -271,10 +271,10 @@ def cleanup_repository(name: str, base_path: str, logger) -> Dict[str, Any]:
             mark_software_partial(affected, base_path, logger, 'repository')
         else:
             result["message"] = f"Delete failed: {del_result['stderr']}"
-            
+
     except Exception as e:
         result["message"] = f"Error: {str(e)}"
-    
+
     return result
 
 
@@ -285,21 +285,21 @@ def cleanup_container(user_input: str, base_path: str, logger) -> Dict[str, Any]
         user_input: User-provided image name (e.g., registry.k8s.io/pause)
     """
     result = {"name": user_input, "type": "container", "status": "Failed", "message": ""}
-    
+
     # Validate format
     is_valid, error_msg = validate_container_format(user_input)
     if not is_valid:
         result["message"] = error_msg
         return result
-    
+
     # Convert to Pulp naming convention
     pulp_name = convert_to_pulp_container_name(user_input)
-    
+
     # Check existence
     if not container_exists(pulp_name, logger):
         result["message"] = f"Container not found in Pulp (looked for: {pulp_name})"
         return result
-    
+
     try:
         # Delete distributions
         dist_list = run_cmd(pulp_container_commands["list_distributions"], logger)
@@ -308,10 +308,10 @@ def cleanup_container(user_input: str, base_path: str, logger) -> Dict[str, Any]
             for d in dists:
                 if d.get('name', '') == pulp_name:
                     run_cmd(pulp_container_commands["delete_distribution"] % d.get('name', ''), logger)
-        
+
         # Delete repository
         del_result = run_cmd(pulp_container_commands["delete_repository"] % pulp_name, logger)
-        
+
         if del_result["rc"] == 0:
             result["status"] = "Success"
             result["message"] = "Container deleted"
@@ -320,10 +320,10 @@ def cleanup_container(user_input: str, base_path: str, logger) -> Dict[str, Any]
             mark_software_partial(affected, base_path, logger, 'image')
         else:
             result["message"] = f"Delete failed: {del_result['stderr']}"
-            
+
     except Exception as e:
         result["message"] = f"Error: {str(e)}"
-    
+
     return result
 
 
@@ -338,7 +338,7 @@ def file_exists_in_pulp(name: str, logger) -> Tuple[bool, str, str]:
         repo_list = run_cmd(pulp_file_commands["list_repositories"], logger)
         if repo_list["rc"] != 0:
             return False, "", ""
-        
+
         repos = safe_json_parse(repo_list["stdout"])
         for repo in repos:
             repo_name = repo.get('name', '')
@@ -351,9 +351,9 @@ def file_exists_in_pulp(name: str, logger) -> Tuple[bool, str, str]:
                 contents = safe_json_parse(content_list["stdout"])
                 if contents:
                     return True, repo_name, contents[0].get('pulp_href', '')
-        
+
         return False, "", ""
-    except Exception:
+    except (OSError, ValueError):
         return False, "", ""
 
 
@@ -365,7 +365,7 @@ def delete_file_from_pulp(name: str, repo_name: str, content_href: str, logger)
     """
     try:
         messages = []
-        
+
         # 1. Remove content from repository
         if content_href:
             remove_result = run_cmd(
@@ -380,7 +380,7 @@ def delete_file_from_pulp(name: str, repo_name: str, content_href: str, logger)
                     f"pulp file repository content modify --repository {repo_name} --remove-content '[{{\"pulp_href\": \"{content_href}\"}}]'",
                     logger
                 )
-        
+
         # 2. Delete distribution if exists
         dist_result = run_cmd(pulp_file_commands["list_distributions"], logger)
         if dist_result["rc"] == 0:
@@ -389,14 +389,14 @@ def delete_file_from_pulp(name: str, repo_name: str, content_href: str, logger)
                 if d.get('name', '') == name or name in d.get('name', ''):
                     run_cmd(pulp_file_commands["delete_distribution"] % d.get('name', ''), logger)
                     messages.append("Distribution deleted")
-        
+
         # 3. Try to delete the file repository if it's named after the artifact
         repo_del = run_cmd(pulp_file_commands["delete_repository"] % name, logger)
         if repo_del["rc"] == 0:
             messages.append("Repository deleted")
-        
+
         return True, "; ".join(messages) if messages else "Removed from Pulp"
-        
+
     except Exception as e:
         return False, f"Pulp deletion error: {str(e)}"
 
@@ -410,7 +410,7 @@ def cleanup_pip_module(name: str, base_path: str, logger) -> Dict[str, Any]:
     result = {"name": name, "type": "pip_module", "status": "Failed", "message": ""}
     messages = []
     pulp_deleted = False
-    
+
     try:
         # Pulp Python repo name format: pip_module<name>
         # User input could be "cffi==1.17.1" or "pip_modulecffi==1.17.1"
@@ -418,24 +418,24 @@ def cleanup_pip_module(name: str, base_path: str, logger) -> Dict[str, Any]:
             pulp_repo_name = name
         else:
             pulp_repo_name = f"pip_module{name}"
-        
+
         logger.info(f"Looking for Python repository: {pulp_repo_name}")
-        
+
         # Check if repository exists
         repo_check = run_cmd(pulp_python_commands["show_repository"] % pulp_repo_name, logger)
-        
+
         if repo_check["rc"] == 0:
             # Delete distribution first
             dist_del = run_cmd(pulp_python_commands["delete_distribution"] % pulp_repo_name, logger)
             if dist_del["rc"] == 0:
                 messages.append("Distribution deleted")
-            
+
             # Delete repository
             repo_del = run_cmd(pulp_python_commands["delete_repository"] % pulp_repo_name, logger)
             if repo_del["rc"] == 0:
                 pulp_deleted = True
                 messages.append("Repository deleted")
-            
+
             # Run orphan cleanup
             if pulp_deleted:
                 logger.info("Running orphan cleanup...")
@@ -451,33 +451,33 @@ def cleanup_pip_module(name: str, base_path: str, logger) -> Dict[str, Any]:
                     repo_name = repo.get('name', '')
                     if name in repo_name or repo_name == pulp_repo_name:
                         logger.info(f"Found matching Python repository: {repo_name}")
-                        
+
                         dist_del = run_cmd(pulp_python_commands["delete_distribution"] % repo_name, logger)
                         if dist_del["rc"] == 0:
                             messages.append("Distribution deleted")
-                        
+
                         repo_del = run_cmd(pulp_python_commands["delete_repository"] % repo_name, logger)
                         if repo_del["rc"] == 0:
                             pulp_deleted = True
                             messages.append("Repository deleted")
                         break
-        
+
         # Update status files
         if file_exists_in_status(name, base_path, logger):
             affected = remove_from_status_files(name, 'pip_module', base_path, logger)
             if affected:
                 messages.append("Status files updated")
                 mark_software_partial(affected, base_path, logger, 'pip_module')
-        
+
         if pulp_deleted:
             result["status"] = "Success"
             result["message"] = "; ".join(messages) if messages else "Cleaned up"
         else:
             result["message"] = f"pip_module '{name}' not found in Pulp"
-            
+
     except Exception as e:
         result["message"] = f"Error: {str(e)}"
-    
+
     return result
 
 
@@ -505,21 +505,21 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) -
     messages = []
     pulp_deleted = False
     status_removed = False
-    
+
     try:
         # Get the expected Pulp repository name
         pulp_repo_name = get_pulp_file_repo_name(name, file_type)
         logger.info(f"Looking for {file_type} repository: {pulp_repo_name}")
-        
+
         # Check if repository exists directly
         repo_check = run_cmd(pulp_file_commands["show_repository"] % pulp_repo_name, logger)
-        
+
         if repo_check["rc"] == 0:
             # Found exact match - delete distribution and repository
             dist_del = run_cmd(pulp_file_commands["delete_distribution"] % pulp_repo_name, logger)
             if dist_del["rc"] == 0:
                 messages.append("Distribution deleted")
-            
+
             repo_del = run_cmd(pulp_file_commands["delete_repository"] % pulp_repo_name, logger)
             if repo_del["rc"] == 0:
                 pulp_deleted = True
@@ -533,17 +533,17 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) -
                     repo_name = repo.get('name', '')
                     if name in repo_name or repo_name == pulp_repo_name:
                         logger.info(f"Found matching repository: {repo_name}")
-                        
+
                         dist_del = run_cmd(pulp_file_commands["delete_distribution"] % repo_name, logger)
                         if dist_del["rc"] == 0:
                             messages.append("Distribution deleted")
-                        
+
                         repo_del = run_cmd(pulp_file_commands["delete_repository"] % repo_name, logger)
                         if repo_del["rc"] == 0:
                             pulp_deleted = True
                             messages.append("Repository deleted")
                         break
-        
+
         # Run orphan cleanup to remove actual content files
         if pulp_deleted:
             logger.info("Running orphan cleanup to remove content files...")
@@ -552,7 +552,7 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) -
                 messages.append("Orphan cleanup completed")
             else:
                 logger.warning(f"Orphan cleanup warning: {orphan_result['stderr']}")
-        
+
         # Update status files
         if file_exists_in_status(name, base_path, logger):
             affected = remove_from_status_files(name, file_type, base_path, logger)
@@ -560,17 +560,17 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) -
                 status_removed = True
                 messages.append("Status files updated")
                 mark_software_partial(affected, base_path, logger, file_type)
-        
+
         # Determine overall result
         if pulp_deleted or status_removed:
             result["status"] = "Success"
             result["message"] = "; ".join(messages) if messages else "Cleaned up"
         else:
             result["message"] = f"{file_type} '{name}' not found in Pulp or status files"
-            
+
     except Exception as e:
         result["message"] = f"Error: {str(e)}"
-    
+
     return result
 
 
@@ -582,11 +582,11 @@ def cleanup_file(name: str, base_path: str, logger) -> Dict[str, Any]:
     - tarball, git, manifest, ansible_galaxy_collection: Pulp File repository
     """
     file_type = detect_file_type(name)
-    
+
     # Handle pip modules separately - they use Python repositories
     if file_type == "pip_module":
         return cleanup_pip_module(name, base_path, logger)
-    
+
     # All other file types use Pulp File repository
     return cleanup_file_repository(name, file_type, base_path, logger)
 
@@ -616,20 +616,20 @@ def remove_rpms_from_repository(repo_name: str, base_path: str, logger) -> List[
                 rows = []
                 removed = False
                 has_repo_column = False
-                
+
                 # Check if file has repo_name column
-                with open(status_file, 'r') as f:
+                with open(status_file, 'r', encoding='utf-8') as f:
                     header = f.readline().strip().lower()
                     has_repo_column = "repo_name" in header
-                
-                with open(status_file, 'r') as f:
+
+                with open(status_file, 'r', encoding='utf-8') as f:
                     reader = csv.DictReader(f)
                     fieldnames = reader.fieldnames
                     for row in reader:
                         name = row.get('name', '')
                         row_type = row.get('type', '')
                         rpm_repo = row.get('repo_name', '')
-                        
+
                         logger.info(f"Processing row: {row}")
                         # For RPMs, check if they belong to the deleted repository
                         if row_type == 'rpm':
@@ -640,18 +640,18 @@ def remove_rpms_from_repository(repo_name: str, base_path: str, logger) -> List[
                                 rows.append(row)
                         else:
                             rows.append(row)
-                
+
                 if removed and fieldnames:
-                    with open(status_file, 'w', newline='') as f:
+                    with open(status_file, 'w', newline='', encoding='utf-8') as f:
                         writer = csv.DictWriter(f, fieldnames=fieldnames)
                         writer.writeheader()
                         writer.writerows(rows)
-                    
+
                     # Track affected software
                     software_name = os.path.basename(os.path.dirname(status_file))
                     if software_name not in affected_software:
                         affected_software.append(software_name)
-                    
+
         return affected_software
     except Exception as e:
         logger.error(f"Failed to remove RPMs from repository {repo_name}: {e}")
@@ -676,7 +676,7 @@ def remove_from_status_files(artifact_name: str, artifact_type: str, base_path:
             for status_file in glob.glob(f"{base_path}/{arch}/*/status.csv"):
                 rows = []
                 removed = False
-                with open(status_file, 'r') as f:
+                with open(status_file, 'r', encoding='utf-8') as f:
                     reader = csv.DictReader(f)
                     fieldnames = reader.fieldnames
                     for row in reader:
@@ -696,13 +696,13 @@ def remove_from_status_files(artifact_name: str, artifact_type: str, base_path:
                             logger.info(f"Removing '{name}' from {status_file}")
                         else:
                             rows.append(row)
-                
+
                 if removed and fieldnames:
-                    with open(status_file, 'w', newline='') as f:
+                    with open(status_file, 'w', newline='', encoding='utf-8') as f:
                         writer = csv.DictWriter(f, fieldnames=fieldnames)
                         writer.writeheader()
                         writer.writerows(rows)
-                    
+
                     # Track affected software
                     software_name = os.path.basename(os.path.dirname(status_file))
                     if software_name not in arch_affected:
@@ -710,10 +710,10 @@ def remove_from_status_files(artifact_name: str, artifact_type: str, base_path:
 
             if arch_affected:
                 affected_software[arch] = arch_affected
-                    
-        logger.info(f"remove_from_status_files returning: {affected_software}")        
+
+        logger.info(f"remove_from_status_files returning: {affected_software}")
         return affected_software
-    except Exception as e:
+    except OSError as e:
         logger.error(f"Failed to remove from status files: {e}")
         return {}
 
@@ -738,7 +738,7 @@ def mark_software_partial(affected_software, base_path: str, logger, artifact_ty
         arch_software_map = {arch: affected_software for arch in ARCH_SUFFIXES}
     else:
         arch_software_map = affected_software
-        
+
     try:
         for arch, software_names in arch_software_map.items():
             if not software_names:
@@ -752,7 +752,7 @@ def mark_software_partial(affected_software, base_path: str, logger, artifact_ty
 
             rows = []
             updated = False
-            with open(software_file, 'r') as f:
+            with open(software_file, 'r', encoding='utf-8') as f:
                 reader = csv.DictReader(f)
                 fieldnames = reader.fieldnames
                 for row in reader:
@@ -761,14 +761,14 @@ def mark_software_partial(affected_software, base_path: str, logger, artifact_ty
                         updated = True
                         logger.info(f"Marked '{row.get('name')}' as partial in {arch}/software.csv ({artifact_type} cleanup)")
                     rows.append(row)
-            
+
             if fieldnames and rows and updated:
-                with open(software_file, 'w', newline='') as f:
+                with open(software_file, 'w', newline='', encoding='utf-8') as f:
                     writer = csv.DictWriter(f, fieldnames=fieldnames)
                     writer.writeheader()
                     writer.writerows(rows)
                 logger.info(f"Successfully wrote updated software.csv for {arch}")
-    except Exception as e:
+    except OSError as e:
         logger.error(f"Failed to update software.csv: {e}")
 
 def software_has_rpms(software_name: str, arch: str, base_path: str, logger) -> bool:
@@ -786,15 +786,15 @@ def software_has_rpms(software_name: str, arch: str, base_path: str, logger) ->
     status_file = f"{base_path}/{arch}/{software_name}/status.csv"
     if not os.path.exists(status_file):
         return False
-    
+
     try:
-        with open(status_file, 'r') as f:
+        with open(status_file, 'r', encoding='utf-8') as f:
             reader = csv.DictReader(f)
             for row in reader:
                 if row.get('type', '').lower() == 'rpm':
                     return True
         return False
-    except Exception as e:
+    except OSError as e:
         logger.error(f"Error checking RPMs for {software_name}: {e}")
         return False
 
@@ -815,14 +815,14 @@ def mark_all_software_partial(base_path: str, logger):
         for arch in ARCH_SUFFIXES:
             software_file = f"{base_path}/{arch}/software.csv"
             logger.info(f"Processing software file: {software_file}")
-            
+
             if not os.path.exists(software_file):
                 logger.info(f"Software file not found: {software_file}")
                 continue
-            
+
             rows = []
             updated = False
-            with open(software_file, 'r') as f:
+            with open(software_file, 'r', encoding='utf-8') as f:
                 reader = csv.DictReader(f)
                 fieldnames = reader.fieldnames
                 for row in reader:
@@ -836,26 +836,26 @@ def mark_all_software_partial(base_path: str, logger):
                         else:
                             logger.info(f"Skipping '{software_name}' - no RPM dependencies")
                     rows.append(row)
-            
+
             if fieldnames and rows and updated:
-                with open(software_file, 'w', newline='') as f:
+                with open(software_file, 'w', newline='', encoding='utf-8') as f:
                     writer = csv.DictWriter(f, fieldnames=fieldnames)
                     writer.writeheader()
                     writer.writerows(rows)
                 logger.info(f"Successfully updated {software_file}")
-    except Exception as e:
+    except OSError as e:
         logger.error(f"Failed to mark all software as partial: {e}")
 
 def write_cleanup_status(results: List[Dict], base_path: str):
     """Write cleanup results to status file."""
     status_file = f"{base_path}/cleanup_status.csv"
     os.makedirs(os.path.dirname(status_file), exist_ok=True)
-    
-    with open(status_file, 'w', newline='') as f:
+
+    with open(status_file, 'w', newline='', encoding='utf-8') as f:
         writer = csv.DictWriter(f, fieldnames=['name', 'type', 'status', 'message'])
         writer.writeheader()
         writer.writerows(results)
-    
+
     return status_file
 
 
@@ -884,7 +884,7 @@ def run_module():
     log_dir = os.path.join(base_path, "cleanup")
     os.makedirs(base_path, exist_ok=True)
     logger = setup_standard_logger(log_dir)
-    
+
     # Handle 'all' keyword for repositories only
     cleanup_all_repos = cleanup_repos and len(cleanup_repos) == 1 and cleanup_repos[0].lower() == 'all'
     #if cleanup_repos and len(cleanup_repos) == 1 and cleanup_repos[0].lower() == 'all':

From 2f452e9f543e10a8c1abbbb457f87e43f12f1a56 Mon Sep 17 00:00:00 2001
From: Abhishek S A <abhishek.sa3@dell.com>
Date: Mon, 9 Feb 2026 22:02:38 +0530
Subject: [PATCH 096/172] Rewrite Kafka (OME) and Victoria (SFM) connection notes as step-by-step instructions; reference ca.crt for Victoria TLS

---
 .../tasks/main.yml                            | 30 +++++++-----
 .../vars/main.yml                             | 12 +++++
 .../tasks/main.yml                            | 46 ++++++++++++++-----
 .../vars/main.yml                             | 19 ++++++++
 4 files changed, 83 insertions(+), 24 deletions(-)

diff --git a/utils/roles/external_kafka_connect_details/tasks/main.yml b/utils/roles/external_kafka_connect_details/tasks/main.yml
index 6a387b1b46..a99fba33b3 100644
--- a/utils/roles/external_kafka_connect_details/tasks/main.yml
+++ b/utils/roles/external_kafka_connect_details/tasks/main.yml
@@ -181,21 +181,27 @@
         [
           'Kafka connection details written to: ' ~ kafka_output_file,
           '',
-          'Kafka external endpoint: ' ~ kafka_external_ip ~ ':' ~ kafka_external_port,
+          '[IMPORTANT] Kafka external endpoint: ' ~ kafka_external_ip ~ ':' ~ kafka_external_port,
           '',
-          'TLS:',
-          '  CA: ' ~ kafka_output_dir ~ '/ca.crt',
+          '[IMPORTANT] TLS files (on OIM host):',
+          '  CA (server certificate for OME): ' ~ kafka_output_dir ~ '/ca.crt',
           '  client cert: ' ~ kafka_output_dir ~ '/user.crt',
-          '  client key: ' ~ kafka_output_dir ~ '/user.key',
+          '  client key:  ' ~ kafka_output_dir ~ '/user.key',
           '',
-          'OME note (mTLS):',
-          '  Use ca.crt as the server certificate in OME.',
-          '  Create a client certificate in .pfx format (provide a passphrase when prompted):',
-          '    cd ' ~ kafka_output_dir,
-          '    openssl pkcs12 -export -out user.pfx -inkey user.key -in user.crt',
-          '  Use user.pfx as the client certificate in OME.',
-          '  If you are using the OME UI from a different system than the OIM host,',
-          '  copy ca.crt and user.pfx from the OIM host to that system before selecting/uploading them in the UI.',
+          'OME steps (mTLS):',
+          '  [STEP 1] Create client certificate in .pfx format (passphrase required):',
+          '           cd ' ~ kafka_output_dir,
+          '           openssl pkcs12 -export -out user.pfx -inkey user.key -in user.crt',
+          '  [STEP 2] ' ~ kafka_ome_cross_machine_note_line1,
+          '           ' ~ kafka_ome_cross_machine_note_line2,
+          '  [STEP 3] In the OME UI, navigate to:',
+          '           ' ~ kafka_ome_ui_navigation_line1,
+          '           ' ~ kafka_ome_ui_navigation_line2,
+          '  [STEP 4] Click: ' ~ kafka_ome_ui_enable_label,
+          '  [STEP 5] Set Kafka Bootstrap Server to: ' ~ kafka_external_ip ~ ':' ~ kafka_external_port,
+          '  [STEP 6] Set Authentication Mode to: ' ~ kafka_ome_auth_mode_value,
+          '  [STEP 7] ' ~ kafka_ome_server_cert_note,
+          '  [STEP 8] ' ~ kafka_ome_client_cert_note,
           ''
         ]
       }}
diff --git a/utils/roles/external_kafka_connect_details/vars/main.yml b/utils/roles/external_kafka_connect_details/vars/main.yml
index d0bd070d47..295a9ad5df 100644
--- a/utils/roles/external_kafka_connect_details/vars/main.yml
+++ b/utils/roles/external_kafka_connect_details/vars/main.yml
@@ -28,3 +28,15 @@ kafka_err_pods_not_ready: "One or more Kafka pods are not Ready."
 kafka_err_external_ip_missing: >-
   Failed to fetch Kafka LoadBalancer external IP. Ensure service '{{ kafka_lb_service_name }}'
   exists in namespace '{{ kafka_namespace }}' and has an external IP assigned.
+
+kafka_ome_ui_navigation_line1: "Configuration -> Remote Connectivity"
+kafka_ome_ui_navigation_line2: "Remote Telemetry Configuration -> Kafka Connectivity"
+kafka_ome_ui_enable_label: "Enable Kafka Connectivity"
+kafka_ome_auth_mode_value: "SSL"
+
+kafka_ome_server_cert_note: "Upload ca.crt as the server certificate in OME."
+kafka_ome_client_cert_note: "Upload user.pfx as the client certificate in OME (mTLS)."
+kafka_ome_cross_machine_note_line1: >-
+  If OME UI is accessed from a different system than the OIM host,
+kafka_ome_cross_machine_note_line2: >-
+  copy ca.crt and user.pfx to that system before uploading them in the UI.
diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml
index 38b0ce3045..14e83f7688 100644
--- a/utils/roles/external_victoria_connect_details/tasks/main.yml
+++ b/utils/roles/external_victoria_connect_details/tasks/main.yml
@@ -164,8 +164,7 @@
     vminsert_port: "{{ (vminsert_lb_port.stdout | trim) | default('') }}"
     vmselect_port: "{{ (vmselect_lb_port.stdout | trim) | default('') }}"
     victoria_tls_ca: "{{ victoria_tls_cert_dir }}/ca.crt"
-    victoria_tls_cert: "{{ victoria_tls_cert_dir }}/server.crt"
-    victoria_tls_key: "{{ victoria_tls_cert_dir }}/server.key"
+    
 
 - name: Fail when LoadBalancer IPs are not available
   ansible.builtin.fail:
@@ -238,12 +237,22 @@
             query_endpoint: "https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/prometheus/api/v1/query"
             ui_url: "https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/vmui"
         tls:
-          server_crt: "{{ victoria_tls_cert }}"
+          ca_crt: "{{ victoria_tls_ca }}"
         notes:
           sfm:
             vminsert_write_url: "{{ victoria_vminsert_write_url }}"
             hosts_entry: "{{ victoria_sfm_hosts_entry }}"
             hosts_entry_vmselect: "{{ victoria_sfm_hosts_entry_vmselect }}"
+            ui_navigation: "{{ victoria_sfm_ui_navigation }}"
+            remote_write_target_name: "{{ victoria_sfm_remote_write_target_name }}"
+            remote_write_message_version: "{{ victoria_sfm_remote_write_message_version }}"
+            remote_write_enable_value: "{{ victoria_sfm_remote_write_enable_value }}"
+            tls_server_cert_file_name: "{{ victoria_sfm_tls_server_cert_file_name }}"
+            tls_server_cert_file_path: "{{ victoria_tls_ca }}"
+            ssh_note: "{{ victoria_sfm_ssh_note }}"
+            hosts_scope_note: "{{ victoria_sfm_hosts_scope_note }}"
+            pod_shell_command_example: "{{ victoria_sfm_pod_shell_command_example }}"
+            hosts_restart_note: "{{ victoria_sfm_hosts_restart_note }}"
 
 - name: Ensure output directory exists
   ansible.builtin.file:
@@ -273,20 +282,33 @@
           'Mode: ' ~ victoria_deployment_mode,
           '',
           'Endpoints:',
-          '  vminsert write: ' ~ victoria_vminsert_write_url,
+          '  [IMPORTANT] vminsert write: ' ~ victoria_vminsert_write_url,
           '  vmselect query: ' ~ victoria_vmselect_query_url,
           '  vmselect UI:    ' ~ victoria_vmselect_ui_url,
           '',
           'TLS:',
-          '  server.crt: ' ~ victoria_tls_cert,
+          '  ca.crt:     ' ~ victoria_tls_ca,
           '',
-          'SFM note:',
-          '  Use vminsert write URL for SFM: ' ~ victoria_vminsert_write_url,
-          '  Add these entries to /etc/hosts on the SFM server:',
-          '    ' ~ victoria_sfm_hosts_entry_vminsert_display,
-          '    ' ~ victoria_sfm_hosts_entry_vmselect_display,
-          '  If you are using the SFM UI from a different system than the OIM host,',
-          '  copy server.crt from the OIM host to that system before selecting/uploading it in the UI.'
+          'SFM steps (TLS):',
+          '  [STEP 1] ' ~ victoria_sfm_cross_machine_tls_note_line1,
+          '           ' ~ victoria_sfm_cross_machine_tls_note_line2,
+          '  [STEP 2] In the SFM UI, update the vminsert URL:',
+          '           ' ~ victoria_sfm_ui_navigation,
+          '           Edit target: ' ~ victoria_sfm_remote_write_target_name,
+          '           Set Enable to: ' ~ victoria_sfm_remote_write_enable_value,
+          '           Set URL to: ' ~ victoria_vminsert_write_url,
+          '           Set Message Version to: ' ~ victoria_sfm_remote_write_message_version,
+          '           TLS Config: Upload ' ~ victoria_sfm_tls_server_cert_file_name,
+          '                      as ' ~ victoria_sfm_tls_server_cert_file_label ~ ': ' ~ victoria_tls_ca,
+          '  [STEP 3] ' ~ victoria_sfm_ssh_note,
+          '  [STEP 4] Update /etc/hosts only inside the SFM Prometheus pod:',
+          '           ' ~ victoria_sfm_hosts_scope_note,
+          '           ' ~ victoria_sfm_pod_shell_command_example,
+          '           Add these entries inside the pod:',
+          '             ' ~ victoria_sfm_hosts_entry_vminsert_display,
+          '             ' ~ victoria_sfm_hosts_entry_vmselect_display,
+          '  [NOTE] ' ~ victoria_sfm_hosts_restart_note,
+          ''
         ]
       }}
   delegate_to: localhost
diff --git a/utils/roles/external_victoria_connect_details/vars/main.yml b/utils/roles/external_victoria_connect_details/vars/main.yml
index ea1c083deb..c033adaa1c 100644
--- a/utils/roles/external_victoria_connect_details/vars/main.yml
+++ b/utils/roles/external_victoria_connect_details/vars/main.yml
@@ -29,3 +29,22 @@ victoria_err_pods_not_ready: "One or more Victoria pods are not Ready."
 victoria_err_lb_missing: >-
   Failed to fetch Victoria LoadBalancer IP(s). Ensure services 'vminsert' and 'vmselect'
   exist in namespace '{{ victoria_namespace }}' and have external IPs assigned.
+
+victoria_sfm_ui_navigation: "Observability -> Settings -> Prometheus Remote Write"
+victoria_sfm_remote_write_target_name: "victoria"
+victoria_sfm_remote_write_message_version: "v1"
+victoria_sfm_remote_write_enable_value: "ON"
+
+victoria_sfm_ssh_note: "SSH to the SFM IP with admin credentials."
+victoria_sfm_hosts_scope_note: >-
+  /etc/hosts update is required only inside the SFM Prometheus pod (not on the SFM server host).
+victoria_sfm_pod_shell_command_example: >-
+  kubectl exec -it sfm-prometheus-deployment-xxxxx-xx -n sfm-1 -- /bin/sh
+victoria_sfm_hosts_restart_note: "Repeat /etc/hosts update if the SFM pod restarts."
+victoria_sfm_cross_machine_tls_note_line1: >-
+  If using the SFM UI from a different system than the OIM host,
+victoria_sfm_cross_machine_tls_note_line2: >-
+  copy ca.crt to that system before uploading it in the UI.
+
+victoria_sfm_tls_server_cert_file_label: "Server Certificate File"
+victoria_sfm_tls_server_cert_file_name: "ca.crt"

From 24734898f1b9987dcff6ab8ef2f370a3f03c7cba Mon Sep 17 00:00:00 2001
From: Abhishek S A <abhishek.sa3@dell.com>
Date: Mon, 9 Feb 2026 22:05:03 +0530
Subject: [PATCH 097/172] Remove second OME UI navigation line from Kafka connection details

---
 utils/roles/external_kafka_connect_details/tasks/main.yml | 1 -
 utils/roles/external_kafka_connect_details/vars/main.yml  | 1 -
 2 files changed, 2 deletions(-)

diff --git a/utils/roles/external_kafka_connect_details/tasks/main.yml b/utils/roles/external_kafka_connect_details/tasks/main.yml
index a99fba33b3..96c6d0ca5f 100644
--- a/utils/roles/external_kafka_connect_details/tasks/main.yml
+++ b/utils/roles/external_kafka_connect_details/tasks/main.yml
@@ -196,7 +196,6 @@
           '           ' ~ kafka_ome_cross_machine_note_line2,
           '  [STEP 3] In the OME UI, navigate to:',
           '           ' ~ kafka_ome_ui_navigation_line1,
-          '           ' ~ kafka_ome_ui_navigation_line2,
           '  [STEP 4] Click: ' ~ kafka_ome_ui_enable_label,
           '  [STEP 5] Set Kafka Bootstrap Server to: ' ~ kafka_external_ip ~ ':' ~ kafka_external_port,
           '  [STEP 6] Set Authentication Mode to: ' ~ kafka_ome_auth_mode_value,
diff --git a/utils/roles/external_kafka_connect_details/vars/main.yml b/utils/roles/external_kafka_connect_details/vars/main.yml
index 295a9ad5df..ff257328a8 100644
--- a/utils/roles/external_kafka_connect_details/vars/main.yml
+++ b/utils/roles/external_kafka_connect_details/vars/main.yml
@@ -30,7 +30,6 @@ kafka_err_external_ip_missing: >-
   exists in namespace '{{ kafka_namespace }}' and has an external IP assigned.
 
 kafka_ome_ui_navigation_line1: "Configuration -> Remote Connectivity"
-kafka_ome_ui_navigation_line2: "Remote Telemetry Configuration -> Kafka Connectivity"
 kafka_ome_ui_enable_label: "Enable Kafka Connectivity"
 kafka_ome_auth_mode_value: "SSL"
 

From 3db2b320e90abebd38fdc6d46fa77a363d7d4a13 Mon Sep 17 00:00:00 2001
From: Abhishek S A <abhishek.sa3@dell.com>
Date: Mon, 9 Feb 2026 22:10:52 +0530
Subject: [PATCH 098/172] Drop double quotes from Victoria SFM /etc/hosts echo commands

---
 utils/roles/external_victoria_connect_details/tasks/main.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml
index 14e83f7688..e06b061828 100644
--- a/utils/roles/external_victoria_connect_details/tasks/main.yml
+++ b/utils/roles/external_victoria_connect_details/tasks/main.yml
@@ -176,7 +176,7 @@
   ansible.builtin.set_fact:
     victoria_sfm_hosts_entry: >-
       {{
-        'echo "' ~ (vminsert_lb_ip.stdout | trim) ~ ' vminsert.' ~ victoria_namespace ~ '.svc.cluster.local" >> /etc/hosts'
+        'echo ' ~ (vminsert_lb_ip.stdout | trim) ~ ' vminsert.' ~ victoria_namespace ~ '.svc.cluster.local >> /etc/hosts'
         if (vminsert_lb_ip.stdout | trim | length) > 0
         else ''
       }}
@@ -185,7 +185,7 @@
   ansible.builtin.set_fact:
     victoria_sfm_hosts_entry_vmselect: >-
       {{
-        'echo "' ~ (vmselect_lb_ip.stdout | trim) ~ ' vmselect.' ~ victoria_namespace ~ '.svc.cluster.local" >> /etc/hosts'
+        'echo ' ~ (vmselect_lb_ip.stdout | trim) ~ ' vmselect.' ~ victoria_namespace ~ '.svc.cluster.local >> /etc/hosts'
         if (vmselect_lb_ip.stdout | trim | length) > 0
         else ''
       }}

From 6cfcb7b7fd67f8633480bd359b19e95c59b77532 Mon Sep 17 00:00:00 2001
From: pullan1 <sudha.pullalaravu@dell.com>
Date: Tue, 10 Feb 2026 09:56:35 +0530
Subject: [PATCH 099/172] copyright info updated

Signed-off-by: pullan1 <sudha.pullalaravu@dell.com>
---
 common/library/modules/process_rpm_config.py | 2 +-
 common/library/modules/pulp_cleanup.py       | 4 +---
 local_repo/pulp_cleanup.yml                  | 2 +-
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/common/library/modules/process_rpm_config.py b/common/library/modules/process_rpm_config.py
index 89a8f0e1ca..550d0c078f 100644
--- a/common/library/modules/process_rpm_config.py
+++ b/common/library/modules/process_rpm_config.py
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py
index 72fd11d692..6f80e82f83 100644
--- a/common/library/modules/pulp_cleanup.py
+++ b/common/library/modules/pulp_cleanup.py
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -28,8 +28,6 @@
 import glob
 import json
 import subprocess
-#import time
-#from datetime import datetime
 from typing import Dict, List, Any, Tuple
 
 from ansible.module_utils.basic import AnsibleModule
diff --git a/local_repo/pulp_cleanup.yml b/local_repo/pulp_cleanup.yml
index f999b3a2dc..5d409bbc1f 100644
--- a/local_repo/pulp_cleanup.yml
+++ b/local_repo/pulp_cleanup.yml
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From 7790c2d1b238464db4fb41a62a7fcd61172df965 Mon Sep 17 00:00:00 2001
From: Nagachandan-P <Nagachandan.p@dell.com>
Date: Tue, 10 Feb 2026 05:39:01 +0000
Subject: [PATCH 100/172] Wrap long Jinja2 expressions in extract_path_overrides.yml to fix lint issues

---
 .../tasks/extract_path_overrides.yml          | 118 ++++++++++++++----
 1 file changed, 96 insertions(+), 22 deletions(-)

diff --git a/discovery/roles/slurm_config/tasks/extract_path_overrides.yml b/discovery/roles/slurm_config/tasks/extract_path_overrides.yml
index 45565dc4e7..ab1bf17aa6 100644
--- a/discovery/roles/slurm_config/tasks/extract_path_overrides.yml
+++ b/discovery/roles/slurm_config/tasks/extract_path_overrides.yml
@@ -34,21 +34,64 @@
 
 - name: Extract effective controller directories from slurm.conf
   ansible.builtin.set_fact:
-    slurm_ctld_log_dir_effective: "{{ (slurm_merged_dict.get('SlurmctldLogFile', ['/var/log/slurm/slurmctld.log']) | first if slurm_merged_dict.get('SlurmctldLogFile') is iterable and slurm_merged_dict.get('SlurmctldLogFile') is not string else slurm_merged_dict.get('SlurmctldLogFile', '/var/log/slurm/slurmctld.log')) | dirname }}"
-    slurm_state_save_location_effective: "{{ (slurm_merged_dict.get('StateSaveLocation', ['/var/spool/slurmctld']) | first if slurm_merged_dict.get('StateSaveLocation') is iterable and slurm_merged_dict.get('StateSaveLocation') is not string else slurm_merged_dict.get('StateSaveLocation', '/var/spool/slurmctld')) }}"
-    slurm_ctld_pid_dir_effective: "{{ (slurm_merged_dict.get('SlurmctldPidFile', ['/var/run/slurmctld.pid']) | first if slurm_merged_dict.get('SlurmctldPidFile') is iterable and slurm_merged_dict.get('SlurmctldPidFile') is not string else slurm_merged_dict.get('SlurmctldPidFile', '/var/run/slurmctld.pid')) | dirname }}"
-    slurm_sched_log_dir_effective: "{{ ((slurm_merged_dict.get('SlurmSchedLogFile', ['']) | first if slurm_merged_dict.get('SlurmSchedLogFile') is iterable and slurm_merged_dict.get('SlurmSchedLogFile') is not string else slurm_merged_dict.get('SlurmSchedLogFile', '')) | default('', true) | dirname | default('', true)) }}"
+    slurm_ctld_log_dir_effective: >-
+      {{ (slurm_merged_dict.get('SlurmctldLogFile', ['/var/log/slurm/slurmctld.log'])
+         | first if slurm_merged_dict.get('SlurmctldLogFile') is iterable
+         and slurm_merged_dict.get('SlurmctldLogFile') is not string
+         else slurm_merged_dict.get('SlurmctldLogFile', '/var/log/slurm/slurmctld.log'))
+         | dirname }}
+    slurm_state_save_location_effective: >-
+      {{ (slurm_merged_dict.get('StateSaveLocation', ['/var/spool/slurmctld'])
+         | first if slurm_merged_dict.get('StateSaveLocation') is iterable
+         and slurm_merged_dict.get('StateSaveLocation') is not string
+         else slurm_merged_dict.get('StateSaveLocation', '/var/spool/slurmctld')) }}
+    slurm_ctld_pid_dir_effective: >-
+      {{ (slurm_merged_dict.get('SlurmctldPidFile', ['/var/run/slurmctld.pid'])
+         | first if slurm_merged_dict.get('SlurmctldPidFile') is iterable
+         and slurm_merged_dict.get('SlurmctldPidFile') is not string
+         else slurm_merged_dict.get('SlurmctldPidFile', '/var/run/slurmctld.pid'))
+         | dirname }}
+    slurm_sched_log_dir_effective: >-
+      {{ ((slurm_merged_dict.get('SlurmSchedLogFile', [''])
+         | first if slurm_merged_dict.get('SlurmSchedLogFile') is iterable
+         and slurm_merged_dict.get('SlurmSchedLogFile') is not string
+         else slurm_merged_dict.get('SlurmSchedLogFile', ''))
+         | default('', true) | dirname | default('', true)) }}
   when: slurm_merged_dict is defined
 
 # ── slurm.conf: compute path params ──────────────────────────────────
 
 - name: Extract effective compute directories from slurm.conf
   ansible.builtin.set_fact:
-    slurm_slurmd_log_dir_effective: "{{ (slurm_merged_dict.get('SlurmdLogFile', ['/var/log/slurm/slurmd.log']) | first if slurm_merged_dict.get('SlurmdLogFile') is iterable and slurm_merged_dict.get('SlurmdLogFile') is not string else slurm_merged_dict.get('SlurmdLogFile', '/var/log/slurm/slurmd.log')) | dirname }}"
-    slurm_slurmd_spool_dir_effective: "{{ (slurm_merged_dict.get('SlurmdSpoolDir', ['/var/spool/slurmd']) | first if slurm_merged_dict.get('SlurmdSpoolDir') is iterable and slurm_merged_dict.get('SlurmdSpoolDir') is not string else slurm_merged_dict.get('SlurmdSpoolDir', '/var/spool/slurmd')) }}"
-    slurm_slurmd_pid_dir_effective: "{{ (slurm_merged_dict.get('SlurmdPidFile', ['/var/run/slurmd.pid']) | first if slurm_merged_dict.get('SlurmdPidFile') is iterable and slurm_merged_dict.get('SlurmdPidFile') is not string else slurm_merged_dict.get('SlurmdPidFile', '/var/run/slurmd.pid')) | dirname }}"
-    slurm_epilog_dir_effective: "{{ (slurm_merged_dict.get('Epilog', ['/etc/slurm/epilog.d/logout_user.sh']) | first if slurm_merged_dict.get('Epilog') is iterable and slurm_merged_dict.get('Epilog') is not string else slurm_merged_dict.get('Epilog', '/etc/slurm/epilog.d/logout_user.sh')) | dirname }}"
-    slurm_prolog_dir_effective: "{{ ((slurm_merged_dict.get('Prolog', ['']) | first if slurm_merged_dict.get('Prolog') is iterable and slurm_merged_dict.get('Prolog') is not string else slurm_merged_dict.get('Prolog', '')) | default('', true) | dirname | default('', true)) }}"
+    slurm_slurmd_log_dir_effective: >-
+      {{ (slurm_merged_dict.get('SlurmdLogFile', ['/var/log/slurm/slurmd.log'])
+         | first if slurm_merged_dict.get('SlurmdLogFile') is iterable
+         and slurm_merged_dict.get('SlurmdLogFile') is not string
+         else slurm_merged_dict.get('SlurmdLogFile', '/var/log/slurm/slurmd.log'))
+         | dirname }}
+    slurm_slurmd_spool_dir_effective: >-
+      {{ (slurm_merged_dict.get('SlurmdSpoolDir', ['/var/spool/slurmd'])
+         | first if slurm_merged_dict.get('SlurmdSpoolDir') is iterable
+         and slurm_merged_dict.get('SlurmdSpoolDir') is not string
+         else slurm_merged_dict.get('SlurmdSpoolDir', '/var/spool/slurmd')) }}
+    slurm_slurmd_pid_dir_effective: >-
+      {{ (slurm_merged_dict.get('SlurmdPidFile', ['/var/run/slurmd.pid'])
+         | first if slurm_merged_dict.get('SlurmdPidFile') is iterable
+         and slurm_merged_dict.get('SlurmdPidFile') is not string
+         else slurm_merged_dict.get('SlurmdPidFile', '/var/run/slurmd.pid'))
+         | dirname }}
+    slurm_epilog_dir_effective: >-
+      {{ (slurm_merged_dict.get('Epilog', ['/etc/slurm/epilog.d/logout_user.sh'])
+         | first if slurm_merged_dict.get('Epilog') is iterable
+         and slurm_merged_dict.get('Epilog') is not string
+         else slurm_merged_dict.get('Epilog', '/etc/slurm/epilog.d/logout_user.sh'))
+         | dirname }}
+    slurm_prolog_dir_effective: >-
+      {{ ((slurm_merged_dict.get('Prolog', [''])
+         | first if slurm_merged_dict.get('Prolog') is iterable
+         and slurm_merged_dict.get('Prolog') is not string
+         else slurm_merged_dict.get('Prolog', ''))
+         | default('', true) | dirname | default('', true)) }}
   when: slurm_merged_dict is defined
 
 # ── slurm.conf: all epilog/prolog dirs and custom file paths ─────────
@@ -56,12 +99,16 @@
 - name: Extract all epilog paths from merged Epilog list
   ansible.builtin.set_fact:
     slurm_epilog_paths_all: >-
-      {{ (slurm_merged_dict.get('Epilog', []) if slurm_merged_dict.get('Epilog') is iterable and slurm_merged_dict.get('Epilog') is not string
-          else [slurm_merged_dict.get('Epilog', '')])
+      {{ (slurm_merged_dict.get('Epilog', [])
+         if slurm_merged_dict.get('Epilog') is iterable
+         and slurm_merged_dict.get('Epilog') is not string
+         else [slurm_merged_dict.get('Epilog', '')])
          | reject('equalto', '') | list }}
     slurm_epilog_dirs_all: >-
-      {{ (slurm_merged_dict.get('Epilog', []) if slurm_merged_dict.get('Epilog') is iterable and slurm_merged_dict.get('Epilog') is not string
-          else [slurm_merged_dict.get('Epilog', '')])
+      {{ (slurm_merged_dict.get('Epilog', [])
+         if slurm_merged_dict.get('Epilog') is iterable
+         and slurm_merged_dict.get('Epilog') is not string
+         else [slurm_merged_dict.get('Epilog', '')])
          | map('dirname') | unique | reject('equalto', '') | list }}
   when: slurm_merged_dict is defined
 
@@ -74,12 +121,16 @@
 - name: Extract all prolog paths from merged Prolog list
   ansible.builtin.set_fact:
     slurm_prolog_paths_all: >-
-      {{ (slurm_merged_dict.get('Prolog', []) if slurm_merged_dict.get('Prolog') is iterable and slurm_merged_dict.get('Prolog') is not string
-          else [slurm_merged_dict.get('Prolog', '')])
+      {{ (slurm_merged_dict.get('Prolog', [])
+         if slurm_merged_dict.get('Prolog') is iterable
+         and slurm_merged_dict.get('Prolog') is not string
+         else [slurm_merged_dict.get('Prolog', '')])
          | reject('equalto', '') | list }}
     slurm_prolog_dirs_all: >-
-      {{ (slurm_merged_dict.get('Prolog', []) if slurm_merged_dict.get('Prolog') is iterable and slurm_merged_dict.get('Prolog') is not string
-          else [slurm_merged_dict.get('Prolog', '')])
+      {{ (slurm_merged_dict.get('Prolog', [])
+         if slurm_merged_dict.get('Prolog') is iterable
+         and slurm_merged_dict.get('Prolog') is not string
+         else [slurm_merged_dict.get('Prolog', '')])
          | map('dirname') | unique | reject('equalto', '') | list }}
   when: slurm_merged_dict is defined
 
@@ -93,23 +144,46 @@
 
 - name: Extract effective plugin directory from slurm.conf
   ansible.builtin.set_fact:
-    slurm_plugin_dir_effective: "{{ (slurm_merged_dict.get('PluginDir', ['/usr/lib64/slurm']) | first if slurm_merged_dict.get('PluginDir') is iterable and slurm_merged_dict.get('PluginDir') is not string else slurm_merged_dict.get('PluginDir', '/usr/lib64/slurm')) }}"
+    slurm_plugin_dir_effective: >-
+      {{ (slurm_merged_dict.get('PluginDir', ['/usr/lib64/slurm'])
+         | first if slurm_merged_dict.get('PluginDir') is iterable
+         and slurm_merged_dict.get('PluginDir') is not string
+         else slurm_merged_dict.get('PluginDir', '/usr/lib64/slurm')) }}
   when: slurm_merged_dict is defined
 
 # ── slurmdbd.conf path params ────────────────────────────────────────
 
 - name: Extract effective directories from slurmdbd.conf
   ansible.builtin.set_fact:
-    slurmdbd_log_dir_effective: "{{ (slurmdbd_merged_dict.get('LogFile', ['/var/log/slurm/slurmdbd.log']) | first if slurmdbd_merged_dict.get('LogFile') is iterable and slurmdbd_merged_dict.get('LogFile') is not string else slurmdbd_merged_dict.get('LogFile', '/var/log/slurm/slurmdbd.log')) | dirname }}"
-    slurmdbd_pid_dir_effective: "{{ (slurmdbd_merged_dict.get('PidFile', ['/var/run/slurmdbd.pid']) | first if slurmdbd_merged_dict.get('PidFile') is iterable and slurmdbd_merged_dict.get('PidFile') is not string else slurmdbd_merged_dict.get('PidFile', '/var/run/slurmdbd.pid')) | dirname }}"
-    slurmdbd_plugin_dir_effective: "{{ (slurmdbd_merged_dict.get('PluginDir', ['/usr/lib64/slurm']) | first if slurmdbd_merged_dict.get('PluginDir') is iterable and slurmdbd_merged_dict.get('PluginDir') is not string else slurmdbd_merged_dict.get('PluginDir', '/usr/lib64/slurm')) }}"
+    slurmdbd_log_dir_effective: >-
+      {{ (slurmdbd_merged_dict.get('LogFile', ['/var/log/slurm/slurmdbd.log'])
+         | first if slurmdbd_merged_dict.get('LogFile') is iterable
+         and slurmdbd_merged_dict.get('LogFile') is not string
+         else slurmdbd_merged_dict.get('LogFile', '/var/log/slurm/slurmdbd.log'))
+         | dirname }}
+    slurmdbd_pid_dir_effective: >-
+      {{ (slurmdbd_merged_dict.get('PidFile', ['/var/run/slurmdbd.pid'])
+         | first if slurmdbd_merged_dict.get('PidFile') is iterable
+         and slurmdbd_merged_dict.get('PidFile') is not string
+         else slurmdbd_merged_dict.get('PidFile', '/var/run/slurmdbd.pid'))
+         | dirname }}
+    slurmdbd_plugin_dir_effective: >-
+      {{ (slurmdbd_merged_dict.get('PluginDir', ['/usr/lib64/slurm'])
+         | first if slurmdbd_merged_dict.get('PluginDir') is iterable
+         and slurmdbd_merged_dict.get('PluginDir') is not string
+         else slurmdbd_merged_dict.get('PluginDir', '/usr/lib64/slurm')) }}
   when: slurmdbd_merged_dict is defined
 
 # ── cgroup.conf path params ──────────────────────────────────────────
 
 - name: Extract effective cgroup mountpoint from cgroup.conf
   ansible.builtin.set_fact:
-    slurm_cgroup_mountpoint_effective: "{{ ((cgroup_merged_dict.get('CgroupMountpoint', ['']) | first if cgroup_merged_dict.get('CgroupMountpoint') is iterable and cgroup_merged_dict.get('CgroupMountpoint') is not string else cgroup_merged_dict.get('CgroupMountpoint', '')) | default('', true)) }}"
+    slurm_cgroup_mountpoint_effective: >-
+      {{ ((cgroup_merged_dict.get('CgroupMountpoint', [''])
+         | first if cgroup_merged_dict.get('CgroupMountpoint') is iterable
+         and cgroup_merged_dict.get('CgroupMountpoint') is not string
+         else cgroup_merged_dict.get('CgroupMountpoint', ''))
+         | default('', true)) }}
   when: cgroup_merged_dict is defined
 
 # ── Defaults when confs are not merged ────────────────────────────────

From cde00165a54d541fde3f9c545a1746b44f74ad24 Mon Sep 17 00:00:00 2001
From: Abhishek S A <abhishek.sa3@dell.com>
Date: Tue, 10 Feb 2026 11:57:07 +0530
Subject: [PATCH 101/172] update kafka and victoria utility

---
 utils/external_kafka_connect_details.yml      | 43 ++++++++++++++++---
 utils/external_victoria_connect_details.yml   | 43 ++++++++++++++++---
 .../vars/main.yml                             |  8 ++++
 .../vars/main.yml                             |  8 ++++
 4 files changed, 88 insertions(+), 14 deletions(-)

diff --git a/utils/external_kafka_connect_details.yml b/utils/external_kafka_connect_details.yml
index 1f2093e54e..a55c54ad3b 100644
--- a/utils/external_kafka_connect_details.yml
+++ b/utils/external_kafka_connect_details.yml
@@ -18,14 +18,43 @@
   connection: local
   gather_facts: false
   tasks:
-    - name: Fail if service_kube_control_plane group is missing or empty
+    - name: Load Kafka utility role variables
+      ansible.builtin.include_vars:
+        file: "{{ playbook_dir }}/roles/external_kafka_connect_details/vars/main.yml"
+
+    - name: Include input directory
+      ansible.builtin.include_role:
+        name: include_input_dir
+
+    - name: Set HA config path
+      ansible.builtin.set_fact:
+        k8s_ha_config_path: "{{ input_project_dir }}/high_availability_config.yml"
+
+    - name: Load High Availability config
+      ansible.builtin.include_vars:
+        file: "{{ k8s_ha_config_path }}"
+        name: ha_config
+      ignore_errors: true  # failed_when: false would force ha_config_load.failed to false and hide a load error
+      register: ha_config_load
+
+    - name: Fail when High Availability config cannot be loaded
       ansible.builtin.fail:
-        msg: >-
-          Inventory must define a 'service_kube_control_plane' group with exactly one host.
-          Provide either the service kube control plane VIP or one of the service kube control plane node IPs.
-          Run with '-i <inventory>' and ensure exactly one host is in that group.
-      when:
-        - groups['service_kube_control_plane'] is not defined or (groups['service_kube_control_plane'] | length) != 1
+        msg: "{{ kafka_preflight_err_ha_config_missing }}"
+      when: ha_config_load.failed
+
+    - name: Set service kube control plane VIP from HA config
+      ansible.builtin.set_fact:
+        kube_vip: "{{ ha_config.service_k8s_cluster_ha[0].virtual_ip_address | default('') }}"
+
+    - name: Fail when service kube control plane VIP is not available
+      ansible.builtin.fail:
+        msg: "{{ kafka_preflight_err_ha_vip_missing }}"
+      when: (kube_vip | trim | length) == 0
+
+    - name: Create service_kube_control_plane group from VIP
+      ansible.builtin.add_host:
+        name: "{{ kube_vip }}"
+        groups: service_kube_control_plane
 
 - name: Fetch external Kafka connection details
   hosts: service_kube_control_plane
diff --git a/utils/external_victoria_connect_details.yml b/utils/external_victoria_connect_details.yml
index f955bbbc78..23e388baf6 100644
--- a/utils/external_victoria_connect_details.yml
+++ b/utils/external_victoria_connect_details.yml
@@ -18,14 +18,43 @@
   connection: local
   gather_facts: false
   tasks:
-    - name: Fail if service_kube_control_plane group is missing or empty
+    - name: Load Victoria utility role variables
+      ansible.builtin.include_vars:
+        file: "{{ playbook_dir }}/roles/external_victoria_connect_details/vars/main.yml"
+
+    - name: Include input directory
+      ansible.builtin.include_role:
+        name: include_input_dir
+
+    - name: Set HA config path
+      ansible.builtin.set_fact:
+        k8s_ha_config_path: "{{ input_project_dir }}/high_availability_config.yml"
+
+    - name: Load High Availability config
+      ansible.builtin.include_vars:
+        file: "{{ k8s_ha_config_path }}"
+        name: ha_config
+      ignore_errors: true  # failed_when: false would force ha_config_load.failed to false and hide a load error
+      register: ha_config_load
+
+    - name: Fail when High Availability config cannot be loaded
       ansible.builtin.fail:
-        msg: >-
-          Inventory must define a 'service_kube_control_plane' group with exactly one host.
-          Provide either the service kube control plane VIP or one of the service kube control plane node IPs.
-          Run with '-i <inventory>' and ensure exactly one host is in that group.
-      when:
-        - groups['service_kube_control_plane'] is not defined or (groups['service_kube_control_plane'] | length) != 1
+        msg: "{{ victoria_preflight_err_ha_config_missing }}"
+      when: ha_config_load.failed
+
+    - name: Set service kube control plane VIP from HA config
+      ansible.builtin.set_fact:
+        kube_vip: "{{ ha_config.service_k8s_cluster_ha[0].virtual_ip_address | default('') }}"
+
+    - name: Fail when service kube control plane VIP is not available
+      ansible.builtin.fail:
+        msg: "{{ victoria_preflight_err_ha_vip_missing }}"
+      when: (kube_vip | trim | length) == 0
+
+    - name: Create service_kube_control_plane group from VIP
+      ansible.builtin.add_host:
+        name: "{{ kube_vip }}"
+        groups: service_kube_control_plane
 
 - name: Fetch external Victoria connection details
   hosts: service_kube_control_plane
diff --git a/utils/roles/external_kafka_connect_details/vars/main.yml b/utils/roles/external_kafka_connect_details/vars/main.yml
index ff257328a8..7a7d831275 100644
--- a/utils/roles/external_kafka_connect_details/vars/main.yml
+++ b/utils/roles/external_kafka_connect_details/vars/main.yml
@@ -29,6 +29,14 @@ kafka_err_external_ip_missing: >-
   Failed to fetch Kafka LoadBalancer external IP. Ensure service '{{ kafka_lb_service_name }}'
   exists in namespace '{{ kafka_namespace }}' and has an external IP assigned.
 
+kafka_preflight_err_ha_config_missing: >-
+  Failed to load High Availability config file: {{ k8s_ha_config_path }}.
+  Provide a valid HA config so the service Kubernetes VIP can be used.
+
+kafka_preflight_err_ha_vip_missing: >-
+  Failed to determine the service Kubernetes control plane VIP from High Availability config.
+  Ensure service_k8s_cluster_ha[0].virtual_ip_address is set in: {{ k8s_ha_config_path }}.
+
 kafka_ome_ui_navigation_line1: "Configuration -> Remote Connectivity"
 kafka_ome_ui_enable_label: "Enable Kafka Connectivity"
 kafka_ome_auth_mode_value: "SSL"
diff --git a/utils/roles/external_victoria_connect_details/vars/main.yml b/utils/roles/external_victoria_connect_details/vars/main.yml
index c033adaa1c..c2de781fbf 100644
--- a/utils/roles/external_victoria_connect_details/vars/main.yml
+++ b/utils/roles/external_victoria_connect_details/vars/main.yml
@@ -30,6 +30,14 @@ victoria_err_lb_missing: >-
   Failed to fetch Victoria LoadBalancer IP(s). Ensure services 'vminsert' and 'vmselect'
   exist in namespace '{{ victoria_namespace }}' and have external IPs assigned.
 
+victoria_preflight_err_ha_config_missing: >-
+  Failed to load High Availability config file: {{ k8s_ha_config_path }}.
+  Provide a valid HA config so the service Kubernetes VIP can be used.
+
+victoria_preflight_err_ha_vip_missing: >-
+  Failed to determine the service Kubernetes control plane VIP from High Availability config.
+  Ensure service_k8s_cluster_ha[0].virtual_ip_address is set in: {{ k8s_ha_config_path }}.
+
 victoria_sfm_ui_navigation: "Observability -> Settings -> Prometheus Remote Write"
 victoria_sfm_remote_write_target_name: "victoria"
 victoria_sfm_remote_write_message_version: "v1"

From 160e9ec91564ec4f6e6ca2d1cffe583d7484e170 Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Tue, 10 Feb 2026 11:58:08 +0530
Subject: [PATCH 102/172] Added upgrade logic for local_repo_config.yml,
 omnia_config.yml, provision_config.yml, storage_config.yml,
 security_config.yml and telemetry_config.yml

---
 .../import_input_parameters/tasks/main.yml    |  15 ++
 .../tasks/transform_local_repo_config.yml     | 121 +++++++++
 .../tasks/transform_omnia_config.yml          | 103 ++++++++
 .../tasks/transform_provision_config.yml      | 100 ++++++++
 .../tasks/transform_storage_config.yml        | 130 ++++++++++
 .../tasks/transform_telemetry_config.yml      | 148 +++++++++++
 .../templates/local_repo_config.j2            | 199 ++++++++++++++
 .../templates/omnia_config.j2                 | 160 ++++++++++++
 .../templates/provision_config.j2             |  40 +++
 .../templates/storage_config.j2               |  95 +++++++
 .../templates/telemetry_config.j2             | 242 ++++++++++++++++++
 .../import_input_parameters/vars/main.yml     |  78 +++++-
 12 files changed, 1430 insertions(+), 1 deletion(-)
 create mode 100644 upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml
 create mode 100644 upgrade/roles/import_input_parameters/tasks/transform_omnia_config.yml
 create mode 100644 upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml
 create mode 100644 upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml
 create mode 100644 upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml
 create mode 100644 upgrade/roles/import_input_parameters/templates/local_repo_config.j2
 create mode 100644 upgrade/roles/import_input_parameters/templates/omnia_config.j2
 create mode 100644 upgrade/roles/import_input_parameters/templates/provision_config.j2
 create mode 100644 upgrade/roles/import_input_parameters/templates/storage_config.j2
 create mode 100644 upgrade/roles/import_input_parameters/templates/telemetry_config.j2

diff --git a/upgrade/roles/import_input_parameters/tasks/main.yml b/upgrade/roles/import_input_parameters/tasks/main.yml
index 7687f852bb..ff77cf2c0e 100644
--- a/upgrade/roles/import_input_parameters/tasks/main.yml
+++ b/upgrade/roles/import_input_parameters/tasks/main.yml
@@ -22,5 +22,20 @@
 - name: Transform high_availability_config.yml from Omnia 2.0 to 2.1
   ansible.builtin.include_tasks: transform_high_availability_config.yml
 
+- name: Transform local_repo_config.yml from Omnia 2.0 to 2.1
+  ansible.builtin.include_tasks: transform_local_repo_config.yml
+
+- name: Transform provision_config.yml from Omnia 2.0 to 2.1
+  ansible.builtin.include_tasks: transform_provision_config.yml
+
+- name: Transform storage_config.yml from Omnia 2.0 to 2.1
+  ansible.builtin.include_tasks: transform_storage_config.yml
+
+- name: Transform omnia_config.yml from Omnia 2.0 to 2.1
+  ansible.builtin.include_tasks: transform_omnia_config.yml
+
+- name: Transform telemetry_config.yml from Omnia 2.0 to 2.1
+  ansible.builtin.include_tasks: transform_telemetry_config.yml
+
 - name: Restore input files from backup
   ansible.builtin.include_tasks: restore_input_files.yml
diff --git a/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml
new file mode 100644
index 0000000000..20c95798b1
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml
@@ -0,0 +1,121 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Check if backup local_repo_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/local_repo_config.yml"
+  register: backup_local_repo_config_stat
+
+- name: Fail if backup local_repo_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_backup_local_repo_config_missing }}"
+  when: not backup_local_repo_config_stat.stat.exists
+
+- name: Check if local_repo_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ input_project_dir }}/local_repo_config.yml"
+  register: local_repo_config_stat
+
+- name: Fail if local_repo_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_local_repo_config_missing }}"
+  when: not local_repo_config_stat.stat.exists
+
+- name: Read backup local_repo_config.yml (source of truth)
+  ansible.builtin.slurp:
+    src: "{{ backup_location }}/local_repo_config.yml"
+  register: backup_local_repo_config_slurp
+
+- name: Parse backup local_repo_config.yml
+  ansible.builtin.set_fact:
+    backup_local_repo_config: "{{ backup_local_repo_config_slurp.content | b64decode | from_yaml }}"
+
+- name: Normalize user_registry
+  ansible.builtin.set_fact:
+    local_repo_user_registry: >-
+      {{
+        (
+          backup_local_repo_config.user_registry
+          if (backup_local_repo_config.user_registry is defined)
+          else
+            (
+              (
+                (backup_local_repo_config.omnia_registry | default([]))
+                | select('string')
+                | map('regex_replace', '^(.*)$', '{"host": "\\1", "cert_path": "", "key_path": ""}')
+                | map('from_json')
+                | list
+              )
+            )
+        )
+      }}
+
+- name: Normalize repo url keys to 2.1 schema
+  ansible.builtin.set_fact:
+    local_repo_user_repo_url_x86_64: "{{ backup_local_repo_config.user_repo_url_x86_64 | default(backup_local_repo_config.user_repo_url | default([])) }}"
+    local_repo_user_repo_url_aarch64: "{{ backup_local_repo_config.user_repo_url_aarch64 | default([]) }}"
+    local_repo_rhel_os_url_x86_64: "{{ backup_local_repo_config.rhel_os_url_x86_64 | default(backup_local_repo_config.rhel_os_url | default([])) }}"
+    local_repo_rhel_os_url_aarch64: "{{ backup_local_repo_config.rhel_os_url_aarch64 | default([]) }}"
+    local_repo_omnia_repo_url_rhel_x86_64: "{{ backup_local_repo_config.omnia_repo_url_rhel_x86_64 | default(backup_local_repo_config.omnia_repo_url_rhel | default([])) }}"
+    local_repo_omnia_repo_url_rhel_aarch64: "{{ backup_local_repo_config.omnia_repo_url_rhel_aarch64 | default(backup_local_repo_config.omnia_repo_url_rhel | default([])) }}"
+    local_repo_additional_repos_x86_64: "{{ backup_local_repo_config.additional_repos_x86_64 | default(backup_local_repo_config.additional_repos | default([])) }}"
+    local_repo_additional_repos_aarch64: "{{ backup_local_repo_config.additional_repos_aarch64 | default([]) }}"
+
+- name: Fail if omnia_repo_url_rhel_x86_64 is missing
+  ansible.builtin.fail:
+    msg: "{{ msg_omnia_repo_url_rhel_x86_64_missing }}"
+  when: (local_repo_omnia_repo_url_rhel_x86_64 | default([]) | length) == 0
+
+- name: Fail if omnia_repo_url_rhel_aarch64 is missing
+  ansible.builtin.fail:
+    msg: "{{ msg_omnia_repo_url_rhel_aarch64_missing }}"
+  when: (local_repo_omnia_repo_url_rhel_aarch64 | default([]) | length) == 0
+
+- name: Write local_repo_config.yml in Omnia 2.1 format
+  ansible.builtin.template:
+    src: local_repo_config.j2
+    dest: "{{ input_project_dir }}/local_repo_config.yml"
+    mode: "{{ default_file_mode }}"
+  vars:
+    local_repo_user_registry: "{{ local_repo_user_registry }}"
+    local_repo_user_repo_url_x86_64: "{{ local_repo_user_repo_url_x86_64 }}"
+    local_repo_user_repo_url_aarch64: "{{ local_repo_user_repo_url_aarch64 }}"
+    local_repo_rhel_os_url_x86_64: "{{ local_repo_rhel_os_url_x86_64 }}"
+    local_repo_rhel_os_url_aarch64: "{{ local_repo_rhel_os_url_aarch64 }}"
+    local_repo_omnia_repo_url_rhel_x86_64: "{{ local_repo_omnia_repo_url_rhel_x86_64 }}"
+    local_repo_omnia_repo_url_rhel_aarch64: "{{ local_repo_omnia_repo_url_rhel_aarch64 }}"
+    local_repo_additional_repos_x86_64: "{{ local_repo_additional_repos_x86_64 }}"
+    local_repo_additional_repos_aarch64: "{{ local_repo_additional_repos_aarch64 }}"
+
+- name: Validate YAML syntax of transformed local_repo_config.yml
+  ansible.builtin.command:
+    cmd: python3 -c "import yaml; yaml.safe_load(open('{{ input_project_dir }}/local_repo_config.yml','r'))"
+  register: local_repo_yaml_validation
+  changed_when: false
+  failed_when: false  # a non-zero rc is reported by the next task with the project's own message
+
+- name: Fail if YAML validation fails
+  ansible.builtin.fail:
+    msg: "{{ msg_yaml_validation_failed }}"
+  when: local_repo_yaml_validation.rc != 0
+
+- name: Display backup path (no-op when skipped)
+  ansible.builtin.debug:
+    msg: "{{ msg_using_backup_local_repo_config }}"
+  when: true
+
+- name: Display transformation summary
+  ansible.builtin.debug:
+    msg: "{{ msg_local_repo_config_transform_summary }}"
diff --git a/upgrade/roles/import_input_parameters/tasks/transform_omnia_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_omnia_config.yml
new file mode 100644
index 0000000000..ab62c3ff28
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/transform_omnia_config.yml
@@ -0,0 +1,103 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Check if backup omnia_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/omnia_config.yml"
+  register: backup_omnia_config_stat
+
+- name: Fail if backup omnia_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_backup_omnia_config_missing }}"
+  when: not backup_omnia_config_stat.stat.exists
+
+- name: Check if omnia_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ input_project_dir }}/omnia_config.yml"
+  register: omnia_config_stat
+
+- name: Fail if omnia_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_omnia_config_missing }}"
+  when: not omnia_config_stat.stat.exists
+
+- name: Read backup omnia_config.yml (source of truth)
+  ansible.builtin.slurp:
+    src: "{{ backup_location }}/omnia_config.yml"
+  register: backup_omnia_config_slurp
+
+- name: Parse backup omnia_config.yml
+  ansible.builtin.set_fact:
+    backup_omnia_config: "{{ backup_omnia_config_slurp.content | b64decode | from_yaml }}"
+
+- name: Normalize omnia_config.yml values
+  ansible.builtin.set_fact:
+    omnia_slurm_cluster_raw: "{{ backup_omnia_config.slurm_cluster | default([]) }}"
+    omnia_service_k8s_cluster_raw: "{{ backup_omnia_config.service_k8s_cluster | default([]) }}"
+
+- name: Ensure slurm_cluster and service_k8s_cluster are lists
+  ansible.builtin.set_fact:
+    omnia_slurm_cluster: >-
+      {{
+        [omnia_slurm_cluster_raw]
+        if (omnia_slurm_cluster_raw is mapping)
+        else omnia_slurm_cluster_raw
+      }}
+    omnia_service_k8s_cluster: >-
+      {{
+        [omnia_service_k8s_cluster_raw]
+        if (omnia_service_k8s_cluster_raw is mapping)
+        else omnia_service_k8s_cluster_raw
+      }}
+
+- name: Fail if slurm_cluster is missing
+  ansible.builtin.fail:
+    msg: "{{ msg_slurm_cluster_missing }}"
+  when: (omnia_slurm_cluster | default([]) | length) == 0
+
+- name: Fail if service_k8s_cluster is missing
+  ansible.builtin.fail:
+    msg: "{{ msg_service_k8s_cluster_missing }}"
+  when: (omnia_service_k8s_cluster | default([]) | length) == 0
+
+- name: Write omnia_config.yml in Omnia 2.1 format
+  ansible.builtin.template:
+    src: omnia_config.j2
+    dest: "{{ input_project_dir }}/omnia_config.yml"
+    mode: "{{ default_file_mode }}"
+  vars:
+    omnia_slurm_cluster: "{{ omnia_slurm_cluster }}"
+    omnia_service_k8s_cluster: "{{ omnia_service_k8s_cluster }}"
+
+- name: Validate YAML syntax of transformed omnia_config.yml
+  ansible.builtin.command:
+    cmd: python3 -c "import yaml; yaml.safe_load(open('{{ input_project_dir }}/omnia_config.yml','r'))"
+  register: omnia_yaml_validation
+  changed_when: false
+  failed_when: false  # a non-zero rc is reported by the next task with the project's own message
+
+- name: Fail if YAML validation fails
+  ansible.builtin.fail:
+    msg: "{{ msg_yaml_validation_failed }}"
+  when: omnia_yaml_validation.rc != 0
+
+- name: Display backup path (no-op when skipped)
+  ansible.builtin.debug:
+    msg: "{{ msg_using_backup_omnia_config }}"
+  when: true
+
+- name: Display transformation summary
+  ansible.builtin.debug:
+    msg: "{{ msg_omnia_config_transform_summary }}"
diff --git a/upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml
new file mode 100644
index 0000000000..71e9ee0dc2
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml
@@ -0,0 +1,100 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Check if backup provision_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/provision_config.yml"
+  register: backup_provision_config_stat
+
+- name: Fail if backup provision_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_backup_provision_config_missing }}"
+  when: not backup_provision_config_stat.stat.exists
+
+- name: Check if provision_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ input_project_dir }}/provision_config.yml"
+  register: provision_config_stat
+
+- name: Fail if provision_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_provision_config_missing }}"
+  when: not provision_config_stat.stat.exists
+
+- name: Read backup provision_config.yml (source of truth)
+  ansible.builtin.slurp:
+    src: "{{ backup_location }}/provision_config.yml"
+  register: backup_provision_config_slurp
+
+- name: Parse backup provision_config.yml
+  ansible.builtin.set_fact:
+    backup_provision_config: "{{ backup_provision_config_slurp.content | b64decode | from_yaml }}"
+
+- name: Normalize provision_config.yml values
+  ansible.builtin.set_fact:
+    provision_pxe_mapping_file_path_raw: >-
+      {{
+        backup_provision_config.pxe_mapping_file_path
+        | default('/opt/omnia/input/project_default/pxe_mapping_file.csv')
+      }}
+    provision_language: "{{ backup_provision_config.language | default('en_US.UTF-8') }}"
+    provision_default_lease_time: "{{ backup_provision_config.default_lease_time | default('86400') }}"
+
+- name: Rewrite legacy pxe_mapping_file_path to current project input directory
+  ansible.builtin.set_fact:
+    provision_pxe_mapping_file_path: >-
+      {{
+        (
+          provision_pxe_mapping_file_path_raw
+          | string
+          | regex_replace('^/opt/omnia/input/project_default/', input_project_dir ~ '/')
+        )
+      }}
+
+- name: Fail if pxe_mapping_file_path is missing
+  ansible.builtin.fail:
+    msg: "{{ msg_pxe_mapping_file_path_missing }}"
+  when: (provision_pxe_mapping_file_path | string | trim) == ''
+
+- name: Write provision_config.yml in Omnia 2.1 format
+  ansible.builtin.template:
+    src: provision_config.j2
+    dest: "{{ input_project_dir }}/provision_config.yml"
+    mode: "{{ default_file_mode }}"
+  vars:
+    provision_pxe_mapping_file_path: "{{ provision_pxe_mapping_file_path }}"
+    provision_language: "{{ provision_language }}"
+    provision_default_lease_time: "{{ provision_default_lease_time }}"
+
+- name: Validate YAML syntax of transformed provision_config.yml
+  ansible.builtin.command:
+    cmd: python3 -c "import yaml; yaml.safe_load(open('{{ input_project_dir }}/provision_config.yml','r'))"
+  register: provision_yaml_validation
+  changed_when: false
+  failed_when: false  # a non-zero rc is reported by the next task with the project's own message
+
+- name: Fail if YAML validation fails
+  ansible.builtin.fail:
+    msg: "{{ msg_yaml_validation_failed }}"
+  when: provision_yaml_validation.rc != 0
+
+- name: Display backup path (no-op when skipped)
+  ansible.builtin.debug:
+    msg: "{{ msg_using_backup_provision_config }}"
+  when: true
+
+- name: Display transformation summary
+  ansible.builtin.debug:
+    msg: "{{ msg_provision_config_transform_summary }}"
diff --git a/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml
new file mode 100644
index 0000000000..72b82aa7f8
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml
@@ -0,0 +1,130 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Check if backup storage_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/storage_config.yml"
+  register: backup_storage_config_stat
+
+- name: Fail if backup storage_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_backup_storage_config_missing }}"
+  when: not backup_storage_config_stat.stat.exists
+
+- name: Check if storage_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ input_project_dir }}/storage_config.yml"
+  register: storage_config_stat
+
+- name: Fail if storage_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_storage_config_missing }}"
+  when: not storage_config_stat.stat.exists
+
+- name: Read backup storage_config.yml (source of truth)
+  ansible.builtin.slurp:
+    src: "{{ backup_location }}/storage_config.yml"
+  register: backup_storage_config_slurp
+
+- name: Parse backup storage_config.yml
+  ansible.builtin.set_fact:
+    backup_storage_config: "{{ backup_storage_config_slurp.content | b64decode | from_yaml }}"
+
+- name: Normalize storage_config.yml values
+  ansible.builtin.set_fact:
+    storage_nfs_client_params: "{{ backup_storage_config.nfs_client_params | default([]) }}"
+    storage_powervault_config: "{{ backup_storage_config.powervault_config | default(none) }}"
+    storage_has_powervault: "{{ backup_storage_config.powervault_config is defined }}"
+
+- name: Fail if powervault_config is present but missing mandatory keys
+  ansible.builtin.fail:
+    msg: "{{ msg_powervault_missing_keys }}"
+  when:
+    - storage_has_powervault
+    - storage_powervault_config.ip is not defined or (storage_powervault_config.ip | default([]) | length) == 0
+      or storage_powervault_config.isci_initiators is not defined
+      or (storage_powervault_config.isci_initiators | string | trim) == ''
+      or storage_powervault_config.volume_id is not defined
+      or (storage_powervault_config.volume_id | string | trim) == ''
+
+- name: Fail if nfs_client_params is missing
+  ansible.builtin.fail:
+    msg: "{{ msg_nfs_client_params_missing }}"
+  when: (storage_nfs_client_params | default([]) | length) == 0
+
+- name: Fail if any NFS client entry is missing required keys
+  ansible.builtin.fail:
+    msg: "{{ msg_nfs_client_param_entry_missing_keys }}"
+  when: >-  # bare expression: 'when' is already evaluated as Jinja2, so no template delimiters
+    (
+      (
+        storage_nfs_client_params
+        | selectattr('server_ip', 'undefined')
+        | list
+        | length
+      ) > 0
+      or
+      (
+        storage_nfs_client_params
+        | selectattr('server_share_path', 'undefined')
+        | list
+        | length
+      ) > 0
+      or
+      (
+        storage_nfs_client_params
+        | selectattr('client_share_path', 'undefined')
+        | list
+        | length
+      ) > 0
+      or
+      (
+        storage_nfs_client_params
+        | selectattr('client_mount_options', 'undefined')
+        | list
+        | length
+      ) > 0
+    )
+
+- name: Write storage_config.yml in Omnia 2.1 format
+  ansible.builtin.template:
+    src: storage_config.j2
+    dest: "{{ input_project_dir }}/storage_config.yml"
+    mode: "{{ default_file_mode }}"
+  vars:
+    storage_nfs_client_params: "{{ storage_nfs_client_params }}"
+    storage_powervault_config: "{{ storage_powervault_config }}"
+    storage_has_powervault: "{{ storage_has_powervault }}"
+
+- name: Validate YAML syntax of transformed storage_config.yml
+  ansible.builtin.command:
+    cmd: python3 -c "import yaml; yaml.safe_load(open('{{ input_project_dir }}/storage_config.yml','r'))"
+  register: storage_yaml_validation
+  changed_when: false
+  failed_when: false  # a non-zero rc is reported by the next task with the project's own message
+
+- name: Fail if YAML validation fails
+  ansible.builtin.fail:
+    msg: "{{ msg_yaml_validation_failed }}"
+  when: storage_yaml_validation.rc != 0
+
+- name: Display backup path (no-op when skipped)
+  ansible.builtin.debug:
+    msg: "{{ msg_using_backup_storage_config }}"
+  when: true
+
+- name: Display transformation summary
+  ansible.builtin.debug:
+    msg: "{{ msg_storage_config_transform_summary }}"
diff --git a/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml
new file mode 100644
index 0000000000..1aa095e66b
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml
@@ -0,0 +1,148 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Check if backup telemetry_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/telemetry_config.yml"
+  register: backup_telemetry_config_stat  # read by the fail task directly below
+
+- name: Fail if backup telemetry_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_backup_telemetry_config_missing }}"
+  when: not backup_telemetry_config_stat.stat.exists  # stop before anything is read or written
+
+- name: Check if telemetry_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ input_project_dir }}/telemetry_config.yml"  # the file that is rewritten later in this task file
+  register: telemetry_config_stat
+
+- name: Fail if telemetry_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_telemetry_config_missing }}"
+  when: not telemetry_config_stat.stat.exists
+
+- name: Read backup telemetry_config.yml (source of truth)
+  ansible.builtin.slurp:  # content comes back base64-encoded, hence the b64decode in the next task
+    src: "{{ backup_location }}/telemetry_config.yml"
+  register: backup_telemetry_config_slurp
+
+- name: Parse backup telemetry_config.yml
+  ansible.builtin.set_fact:  # decoded text is parsed as YAML and kept as a fact for the tasks below
+    backup_telemetry_config: "{{ backup_telemetry_config_slurp.content | b64decode | from_yaml }}"
+
+- name: Normalize nested backup telemetry sections
+  ansible.builtin.set_fact:  # an absent section becomes {} so the per-key default() lookups in the next task still resolve
+    backup_telemetry_victoria_config: "{{ backup_telemetry_config.victoria_configurations | default({}) }}"
+    backup_telemetry_kafka_config: "{{ backup_telemetry_config.kafka_configurations | default({}) }}"
+
+- name: Normalize telemetry_config.yml values
+  ansible.builtin.set_fact:  # one fact per template variable; default() supplies the literal only when the backup omits the key
+    telemetry_idrac_telemetry_support: "{{ backup_telemetry_config.idrac_telemetry_support | default(true) }}"
+    telemetry_idrac_telemetry_collection_type: >-
+      {{
+        backup_telemetry_config.idrac_telemetry_collection_type
+        | default('victoria,kafka')
+      }}
+    telemetry_victoria_deployment_mode: "{{ backup_telemetry_victoria_config.deployment_mode | default('cluster') }}"  # victoria_* values come from the victoria_configurations section
+    telemetry_victoria_persistence_size: "{{ backup_telemetry_victoria_config.persistence_size | default('8Gi') }}"
+    telemetry_victoria_retention_period: "{{ backup_telemetry_victoria_config.retention_period | default(168) }}"
+    telemetry_kafka_persistence_size: "{{ backup_telemetry_kafka_config.persistence_size | default('8Gi') }}"  # kafka_* values come from the kafka_configurations section
+    telemetry_kafka_log_retention_hours: "{{ backup_telemetry_kafka_config.log_retention_hours | default(168) }}"
+    telemetry_kafka_log_retention_bytes: "{{ backup_telemetry_kafka_config.log_retention_bytes | default(-1) }}"
+    telemetry_kafka_log_segment_bytes: "{{ backup_telemetry_kafka_config.log_segment_bytes | default(1073741824) }}"
+    telemetry_kafka_topic_partitions: >-
+      {{
+        backup_telemetry_kafka_config.topic_partitions
+        | default([
+          {'name': 'idrac', 'partitions': 1},
+          {'name': 'ldms', 'partitions': 2}
+        ])
+      }}
+    telemetry_ldms_agg_port: "{{ backup_telemetry_config.ldms_agg_port | default(6001) }}"  # ldms_* keys are read from the top level of the backup
+    telemetry_ldms_store_port: "{{ backup_telemetry_config.ldms_store_port | default(6001) }}"  # NOTE(review): same fallback as ldms_agg_port; confirm intended
+    telemetry_ldms_sampler_port: "{{ backup_telemetry_config.ldms_sampler_port | default(10001) }}"
+    telemetry_ldms_sampler_configurations: >-
+      {{
+        backup_telemetry_config.ldms_sampler_configurations
+        | default([
+          {
+            'plugin_name': 'meminfo',
+            'config_parameters': '',
+            'activation_parameters': 'interval=1000000'
+          },
+          {
+            'plugin_name': 'procstat2',
+            'config_parameters': '',
+            'activation_parameters': 'interval=1000000'
+          },
+          {
+            'plugin_name': 'vmstat',
+            'config_parameters': '',
+            'activation_parameters': 'interval=1000000'
+          },
+          {
+            'plugin_name': 'loadavg',
+            'config_parameters': '',
+            'activation_parameters': 'interval=1000000'
+          },
+          {
+            'plugin_name': 'procnetdev2',
+            'config_parameters': '',
+            'activation_parameters': 'interval=1000000 offset=0'
+          }
+        ])
+      }}
+
+- name: Write telemetry_config.yml in Omnia 2.1 format
+  ansible.builtin.template:
+    src: telemetry_config.j2
+    dest: "{{ input_project_dir }}/telemetry_config.yml"  # replaces the file whose presence was checked at the top of this task file
+    mode: "{{ default_file_mode }}"
+  vars:  # NOTE(review): every entry re-binds a name to itself; the set_fact task earlier in this file already defines all of them, so confirm this block is needed
+    telemetry_idrac_telemetry_support: "{{ telemetry_idrac_telemetry_support }}"
+    telemetry_idrac_telemetry_collection_type: "{{ telemetry_idrac_telemetry_collection_type }}"
+    telemetry_victoria_deployment_mode: "{{ telemetry_victoria_deployment_mode }}"
+    telemetry_victoria_persistence_size: "{{ telemetry_victoria_persistence_size }}"
+    telemetry_victoria_retention_period: "{{ telemetry_victoria_retention_period }}"
+    telemetry_kafka_persistence_size: "{{ telemetry_kafka_persistence_size }}"
+    telemetry_kafka_log_retention_hours: "{{ telemetry_kafka_log_retention_hours }}"
+    telemetry_kafka_log_retention_bytes: "{{ telemetry_kafka_log_retention_bytes }}"
+    telemetry_kafka_log_segment_bytes: "{{ telemetry_kafka_log_segment_bytes }}"
+    telemetry_kafka_topic_partitions: "{{ telemetry_kafka_topic_partitions }}"
+    telemetry_ldms_agg_port: "{{ telemetry_ldms_agg_port }}"
+    telemetry_ldms_store_port: "{{ telemetry_ldms_store_port }}"
+    telemetry_ldms_sampler_port: "{{ telemetry_ldms_sampler_port }}"
+    telemetry_ldms_sampler_configurations: "{{ telemetry_ldms_sampler_configurations }}"
+
+- name: Validate YAML syntax of transformed telemetry_config.yml
+  ansible.builtin.command:  # parses the rendered file with PyYAML; a parse error gives a non-zero rc
+    cmd: python3 -c "import yaml; yaml.safe_load(open('{{ input_project_dir }}/telemetry_config.yml','r'))"
+  register: telemetry_yaml_validation
+  changed_when: false  # read-only check
+  failed_when: false  # do not stop here; the next task reports the failure with the role's own message
+
+- name: Fail if YAML validation fails
+  ansible.builtin.fail:
+    msg: "{{ msg_yaml_validation_failed }}"
+  when: telemetry_yaml_validation.rc != 0
+
+- name: Display backup path (no-op when skipped)
+  ansible.builtin.debug:
+    msg: "{{ msg_using_backup_telemetry_config }}"
+  when: true  # NOTE(review): constant condition, so this task is never skipped despite its name
+
+- name: Display transformation summary
+  ansible.builtin.debug:  # informational only; prints the summary message variable
+    msg: "{{ msg_telemetry_config_transform_summary }}"
diff --git a/upgrade/roles/import_input_parameters/templates/local_repo_config.j2 b/upgrade/roles/import_input_parameters/templates/local_repo_config.j2
new file mode 100644
index 0000000000..dbe38d70ad
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/templates/local_repo_config.j2
@@ -0,0 +1,199 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ***********************************************************************
+# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE.
+# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE.
+# ***********************************************************************
+
+# ================================
+# VARIABLE DETAILS
+# ================================
+# 1. user_registry
+#--------------------------
+# Configuration for user registry to configure additional images in Pulp
+# Fields:
+#   host       : Registry IP and port in format "IP:port"
+#   cert_path  : Path to SSL certificate file (.crt) - Required only if host is using HTTPS
+#   key_path   : Path to SSL private key file (.key) - Required only if host is using HTTPS
+# Notes:
+#   - If host is HTTPS, cert_path and key_path are required
+#   - If host is HTTP, cert_path and key_path can be left empty
+#   - cert_path should point to .crt files only
+#   - key_path should point to .key files only
+#   - cert and key paths are accessed from within the omnia_core container
+# 2. user_repo_url_x86_64
+#--------------------------
+#    Optional list of user-defined repository URLs for x86_64 architecture.
+#    Each entry can include: url, gpgkey, sslcacert, sslclientkey, sslclientcert, name, policy.
+#    Used for custom cluster packages like <arch>_slurm_custom.
+# Fields:
+#   url         : Base URL of the repository
+#   gpgkey      : GPG key URL (leave empty to disable gpgcheck; Omnia will trust this repo and user is responsible for its security)
+#   name        : Name of the repository
+#   sslcacert   : Path to SSL CA certificate (if using SSL)
+#   sslclientkey: Path to SSL client key (if using SSL)
+#   sslclientcert: Path to SSL client certificate (if using SSL)
+#   policy      : Repository policy (always, partial)
+# Notes:
+#   - Do not use Jinja variables in this configuration.
+#   - Omit SSL fields entirely if SSL is not in use.
+#   - It is a mandatory field in case of slurm_custom, with name as '<arch>_slurm_custom'
+#
+# 3. user_repo_url_aarch64
+#---------------------------
+#    Same as above but for aarch64 architecture.
+#
+# 4. rhel_os_url_x86_64
+#-----------------------------
+#    Mandatory when RHEL subscription is not registered.
+#    Contains repository URLs for codeready-builder, baseos, and appstream for x86_64.
+# Fields:
+#   url         : Base URL of the repository
+#   gpgkey      : GPG key URL (leave empty to disable gpgcheck; Omnia will trust this repo and user is responsible for its security)
+#   sslcacert   : Path to SSL CA certificate (if using SSL)
+#   sslclientkey: Path to SSL client key (if using SSL)
+#   sslclientcert: Path to SSL client certificate (if using SSL)
+#   policy      : Repository policy if mentioned allowed values (always, partial). IF not mentioned will consider from software_config.json
+#   name        : Name of the repository [ Allowed repo names <arch>_codeready-builder, <arch>_appstream, <arch>_baseos ]
+# Notes:
+#   - Do not use Jinja variables in this configuration.
+#   - Omit SSL fields entirely if SSL is not in use.
+#   - If RHEL subscription is not registered, entries for all 3 repositories [ <arch>_codeready-builder, <arch>_appstream, <arch>_baseos ]
+#      are mandatory.
+#
+# 5. rhel_os_url_aarch64
+#----------------------------
+#    Same as above but for aarch64 architecture.
+#
+#### ADVANCE CONFIGURATIONS FOR LOCAL REPO ###
+# 6. omnia_repo_url_rhel_x86_64
+#-------------------------------
+#    Mandatory repository URLs for downloading RPMS for Omnia features on RHEL x86_64.
+#    Each entry includes url, gpgkey, and name.
+#
+# This variable defines all the repo urls from where rpms will be downloaded for omnia features when cluster_os_type is rhel and arch x86_64
+# Making incorrect changes to this variable can cause omnia failure. Please edit cautiously.
+# Fields:
+#  url        : Base URL of the repository.
+#  gpgkey     : URL of the GPG key for the repository.
+#                   If left empty, gpgcheck=0 for that repository.
+#  name       : A unique identifier for the repository or registry.
+#
+# 7. omnia_repo_url_rhel_aarch64
+#--------------------------------
+#    Same as above but for RHEL aarch64.
+#
+# 8. additional_repos_x86_64
+#----------------------------
+#    Optional list of additional repository URLs for x86_64 architecture.
+#    These repos are aggregated into a single Pulp repository, allowing dynamic
+#    addition/removal without changing compute node configurations.
+# Fields:
+#   url           : Base URL of the repository (required)
+#   gpgkey        : GPG key URL (required, can be empty - disables gpgcheck)
+#   name          : Unique name for the repository (required)
+#   sslcacert     : Path to SSL CA certificate (optional)
+#   sslclientkey  : Path to SSL client key (optional)
+#   sslclientcert : Path to SSL client certificate (optional)
+# Notes:
+#   - All repos are synced into a single aggregated Pulp repository
+#   - Compute nodes are configured once with a fixed URL that never changes
+#   - Policy is controlled globally via repo_config in software_config.json (per-entry policy not supported)
+#   - Name must be unique within this list and must not conflict with names in other repo keys
+#   - Packages from these repos can only be used via additional_packages.json
+#
+# 9. additional_repos_aarch64
+#-----------------------------
+#    Same as above but for aarch64 architecture.
+
+# ================================
+# VARIABLES
+# ================================
+# user_registry:
+#    - { host: "172.16.107.254:4000", cert_path: "/opt/omnia/domain.crt", key_path: "/opt/omnia/domain.key" }
+user_registry:
+{% set _registries = local_repo_user_registry | default([]) %}{# an undefined variable is treated as an empty list #}
+{% if _registries | length %}
+{% for _item in _registries %}
+  - { host: {{ _item.host | default('') | to_json }}, cert_path: {{ _item.cert_path | default('') | to_json }}, key_path: {{ _item.key_path | default('') | to_json }} }
+{% endfor %}
+{% endif %}
+# user_repo_url_x86_64:
+#  - { url: "", gpgkey: "", sslcacert: "", sslclientkey: "", sslclientcert: "",  name: "x86_64_slurm_custom" }
+user_repo_url_x86_64:
+{% set _user_repo_url_x86_64 = local_repo_user_repo_url_x86_64 | default([]) %}{# an undefined variable is treated as an empty list #}
+{% if (_user_repo_url_x86_64 | length) > 0 %}
+{% for _repo in _user_repo_url_x86_64 %}{# policy is written only when the entry defines it, so a configured value is not lost #}
+  - { url: {{ (_repo.url | default('')) | to_json }}, gpgkey: {{ (_repo.gpgkey | default('')) | to_json }}, sslcacert: {{ (_repo.sslcacert | default('')) | to_json }}, sslclientkey: {{ (_repo.sslclientkey | default('')) | to_json }}, sslclientcert: {{ (_repo.sslclientcert | default('')) | to_json }}, name: {{ (_repo.name | default('')) | to_json }}{% if _repo.policy is defined %}, policy: {{ _repo.policy | to_json }}{% endif %} }
+{% endfor %}
+{% endif %}
+user_repo_url_aarch64:
+{% set _user_repo_url_aarch64 = local_repo_user_repo_url_aarch64 | default([]) %}{# an undefined variable is treated as an empty list #}
+{% if (_user_repo_url_aarch64 | length) > 0 %}
+{% for _repo in _user_repo_url_aarch64 %}{# policy is written only when the entry defines it, so a configured value is not lost #}
+  - { url: {{ (_repo.url | default('')) | to_json }}, gpgkey: {{ (_repo.gpgkey | default('')) | to_json }}, sslcacert: {{ (_repo.sslcacert | default('')) | to_json }}, sslclientkey: {{ (_repo.sslclientkey | default('')) | to_json }}, sslclientcert: {{ (_repo.sslclientcert | default('')) | to_json }}, name: {{ (_repo.name | default('')) | to_json }}{% if _repo.policy is defined %}, policy: {{ _repo.policy | to_json }}{% endif %} }
+{% endfor %}
+{% endif %}
+#Example:
+# rhel_os_url_x86_64:
+#  - { url: "http://crb.com/CRB/x86_64/os/", gpgkey: "http://crb.com/CRB/x86_64/os/RPM-GPG-KEY", sslcacert: "", sslclientkey: "", sslclientcert: "", name: "x86_64_codeready-builder"}
+#  - { url: "http://BaseOS.com/BaseOS/x86_64/os/", gpgkey: "http://BaseOS.com/BaseOS/x86_64/os/RPM-GPG-KEY", sslcacert: "", sslclientkey: "", sslclientcert: "", name: "x86_64_baseos"}
+#  - { url: "http://AppStream.com/AppStream/x86_64/os/", gpgkey: "http://AppStream.com/AppStream/x86_64/os/RPM-GPG-KEY", sslcacert: "", sslclientkey: "", sslclientcert: "", name: "x86_64_appstream" }
+rhel_os_url_x86_64:
+{% set _rhel_x86_repos = local_repo_rhel_os_url_x86_64 | default([]) %}{# an undefined variable is treated as an empty list #}
+{% if _rhel_x86_repos | length %}
+{% for _src in _rhel_x86_repos %}
+  - { url: {{ _src.url | default('') | to_json }}, gpgkey: {{ _src.gpgkey | default('') | to_json }}, sslcacert: {{ _src.sslcacert | default('') | to_json }}, sslclientkey: {{ _src.sslclientkey | default('') | to_json }}, sslclientcert: {{ _src.sslclientcert | default('') | to_json }}, name: {{ _src.name | default('') | to_json }}, policy: {{ _src.policy | default('') | to_json }} }
+{% endfor %}
+{% endif %}
+rhel_os_url_aarch64:
+{% set _rhel_arm_repos = local_repo_rhel_os_url_aarch64 | default([]) %}{# an undefined variable is treated as an empty list #}
+{% if _rhel_arm_repos | length %}
+{% for _src in _rhel_arm_repos %}
+  - { url: {{ _src.url | default('') | to_json }}, gpgkey: {{ _src.gpgkey | default('') | to_json }}, sslcacert: {{ _src.sslcacert | default('') | to_json }}, sslclientkey: {{ _src.sslclientkey | default('') | to_json }}, sslclientcert: {{ _src.sslclientcert | default('') | to_json }}, name: {{ _src.name | default('') | to_json }}, policy: {{ _src.policy | default('') | to_json }} }
+{% endfor %}
+{% endif %}
+# Making incorrect changes to this variable can cause omnia failure. Please edit cautiously.
+omnia_repo_url_rhel_x86_64:
+{% set _omnia_x86_repos = local_repo_omnia_repo_url_rhel_x86_64 | default([]) %}{# an undefined variable is treated as an empty list #}
+{% if _omnia_x86_repos | length %}
+{% for _src in _omnia_x86_repos %}
+  - { url: {{ _src.url | default('') | to_json }}, gpgkey: {{ _src.gpgkey | default('') | to_json }}, name: {{ _src.name | default('') | to_json }} }
+{% endfor %}
+{% endif %}
+omnia_repo_url_rhel_aarch64:
+{% set _omnia_arm_repos = local_repo_omnia_repo_url_rhel_aarch64 | default([]) %}{# an undefined variable is treated as an empty list #}
+{% if _omnia_arm_repos | length %}
+{% for _src in _omnia_arm_repos %}
+  - { url: {{ _src.url | default('') | to_json }}, gpgkey: {{ _src.gpgkey | default('') | to_json }}, name: {{ _src.name | default('') | to_json }} }
+{% endfor %}
+{% endif %}
+# Example:
+# additional_repos_x86_64:
+#  - { url: "https://rpm.grafana.com/", gpgkey: "", name: "grafana" }
+#  - { url: "https://repo.example.com/x86_64/", gpgkey: "", name: "custom-repo", sslcacert: "/path/ca.crt", sslclientkey: "/path/client.key", sslclientcert: "/path/client.crt" }
+additional_repos_x86_64:
+{% set _extra_x86_repos = local_repo_additional_repos_x86_64 | default([]) %}{# an undefined variable is treated as an empty list #}
+{% if _extra_x86_repos | length %}
+{% for _src in _extra_x86_repos %}
+  - { url: {{ _src.url | default('') | to_json }}, gpgkey: {{ _src.gpgkey | default('') | to_json }}, name: {{ _src.name | default('') | to_json }}, sslcacert: {{ _src.sslcacert | default('') | to_json }}, sslclientkey: {{ _src.sslclientkey | default('') | to_json }}, sslclientcert: {{ _src.sslclientcert | default('') | to_json }} }
+{% endfor %}
+{% endif %}
+additional_repos_aarch64:
+{% set _extra_arm_repos = local_repo_additional_repos_aarch64 | default([]) %}{# an undefined variable is treated as an empty list #}
+{% if _extra_arm_repos | length %}
+{% for _src in _extra_arm_repos %}
+  - { url: {{ _src.url | default('') | to_json }}, gpgkey: {{ _src.gpgkey | default('') | to_json }}, name: {{ _src.name | default('') | to_json }}, sslcacert: {{ _src.sslcacert | default('') | to_json }}, sslclientkey: {{ _src.sslclientkey | default('') | to_json }}, sslclientcert: {{ _src.sslclientcert | default('') | to_json }} }
+{% endfor %}
+{% endif %}
diff --git a/upgrade/roles/import_input_parameters/templates/omnia_config.j2 b/upgrade/roles/import_input_parameters/templates/omnia_config.j2
new file mode 100644
index 0000000000..aec7a05ab7
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/templates/omnia_config.j2
@@ -0,0 +1,160 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# ***********************************************************************
+# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE.
+# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE.
+# ***********************************************************************
+
+# -----------------------------SLURM------------------------------------------------
+# slurm_cluster
+# List of slurm clusters
+# cluster_name is required field
+
+# nfs_storage_name
+# Storage name corresponding to the NFS share to be used by slurm cluster 
+# This should match exactly with an entry in storage_config.yml
+
+# config_sources
+# defines how the Slurm configuration files are provided to the cluster.
+# <conf name>: 
+#    <mapping> or <filepath>
+# <mapping> Supply the configuration values directly as a key–value map
+# <filepath> Supply the absolute path to a custom configuration file
+# The conf files supported by slurm are
+# slurm
+# cgroup
+# slurmdbd
+# gres
+# These files will be written into the slurm_config directory with .conf suffix
+
+slurm_cluster:
+{% set _slurm_cluster = omnia_slurm_cluster | default([]) %}{# an undefined variable is treated as an empty list #}
+{% if (_slurm_cluster | length) > 0 %}
+{% for _cluster in _slurm_cluster %}
+  - cluster_name: {{ _cluster.cluster_name | default('') }}
+    nfs_storage_name: {{ _cluster.nfs_storage_name | default('') }}
+{% if _cluster.config_sources is defined and (_cluster.config_sources | length > 0) %}
+    config_sources:
+{% set _supported = ['slurm', 'cgroup', 'slurmdbd', 'gres'] %}{# conf names outside this list are not carried over #}
+{% for _conf_name, _conf_val in _cluster.config_sources.items() %}
+{% if _conf_name in _supported %}
+{% if _conf_name == 'cgroup' and (_conf_val is mapping) %}
+      cgroup:
+        CgroupPlugin: {{ _conf_val.CgroupPlugin | default('autodetect') }}
+{% for _k, _v in _conf_val.items() %}{# pass-through values are JSON-encoded so strings such as YES or NO are not re-read as booleans #}
+{% if _k not in ['AllowedRAMSpace', 'CgroupPlugin', 'ConstrainCores', 'ConstrainDevices', 'ConstrainRAMSpace', 'ConstrainSwapSpace'] %}{# these keys are written explicitly around this loop #}
+        {{ _k }}: {{ _v | to_json }}
+{% endif %}
+{% endfor %}
+        ConstrainCores: {{ _conf_val.ConstrainCores | default(true) }}
+        ConstrainDevices: {{ _conf_val.ConstrainDevices | default(true) }}
+        ConstrainRAMSpace: {{ _conf_val.ConstrainRAMSpace | default(true) }}
+        ConstrainSwapSpace: {{ _conf_val.ConstrainSwapSpace | default(true) }}
+{% if _conf_val.AllowedRAMSpace is defined %}
+        ### AllowedRAMSpace: {{ _conf_val.AllowedRAMSpace }}       This is not supported in 2.1, just attached for reference
+{% endif %}
+{% elif _conf_val is mapping %}
+      {{ _conf_name }}:
+{% for _k, _v in _conf_val.items() %}{# JSON-encoded for the same reason as the cgroup pass-through values #}
+        {{ _k }}: {{ _v | to_json }}
+{% endfor %}
+{% else %}{# non-mapping value, e.g. a path to a custom conf file #}
+      {{ _conf_name }}: {{ _conf_val | to_json }}
+{% endif %}
+{% endif %}
+{% endfor %}
+    #   OR
+
+    # config_sources:
+    #   slurm: /path/to/custom_slurm.conf
+    #   cgroup: /path/to/custom_cgroup.conf
+    #   slurmdbd: /path/to/custom_slurmdbd.conf
+    #   gres: /path/to/custom_gres.conf
+{% else %}{# no config_sources in the input: emit only the commented-out examples #}
+    # config_sources:
+    #   slurm:
+    #     SlurmctldTimeout: 60
+    #     SlurmdTimeout: 150
+    #   cgroup:
+    #     CgroupPlugin: autodetect
+    #     ConstrainCores: True
+    #     ConstrainDevices: True
+    #     ConstrainRAMSpace: True
+    #     ConstrainSwapSpace: True
+
+    #   OR
+
+    # config_sources:
+    #   slurm: /path/to/custom_slurm.conf
+    #   cgroup: /path/to/custom_cgroup.conf
+    #   slurmdbd: /path/to/custom_slurmdbd.conf
+    #   gres: /path/to/custom_gres.conf
+{% endif %}
+{% endfor %}
+{% endif %}
+
+# ----------------------------SERVICE K8S------------------------------------------------------
+# For service k8s cluster below parameters are required,(List)
+# - cluster_name is required field
+
+# - deployment: Exactly one entry in both the service_k8s_cluster lists must have deployment set to true to indicate where Kubernetes should be deployed.
+# Please ensure corresponding cluster entry is added to high_availability_config.yml if deployment is set to true. 
+
+# - Kubernetes SDN network.K8s_cni (Mandatory) - It can either be "calico" or "flannel".Default value assigned is "calico".
+# While setting up Kubernetes plugin for RoCE NIC, ensure that this value is set to "flannel"
+
+# - pod_external_ip_range: (Mandatory) These addresses will be used by Loadbalancer for assigning External IPs to K8s services
+# Make sure the IP range is not assigned to any node in the cluster.
+# Acceptable formats: "10.11.0.100-10.11.0.150" , "10.11.0.0/16"
+
+# - k8s_service_addresses: Kubernetes internal network for services.This network must be unused in your network infrastructure.
+# Default value is "10.233.0.0/18"
+
+# - k8s_pod_network_cidr: Kubernetes pod network CIDR for internal network. When used, it will assign IP addresses from this range to individual pods.
+# This network must be unused in your network infrastructure.
+# Default value is "10.233.64.0/18"
+
+# nfs_storage_name : The nfs name should be same as one of the nfs name defined in storage_config.yml to configure the server.
+# ----------------------------CSI Driver------------------------------------------------------
+# Following csi powerscale driver input variables are mandatory only if csi_driver_powerscale entry is present in software_config.json
+# csi_powerscale_driver_secret_file_path: Absolute file path for the secret.yaml file.
+# User needs to download the secret.yaml file and fill in the required data. Provide the path of the secret file here.
+# File path for the values.yml file which will contain the Powerscale driver configuration parameters.
+# csi_powerscale_driver_values_file_path: User need to download values.yaml file and fill required data in values.yaml file. 
+# Provided the path of the values.yaml file here. mention configurable values
+
+# - k8s_crio_storage_size: Specifies the disk size allocated for CRI-O container storage.
+# This storage is used to store container images, writable layers, and runtime data.
+# Acceptable formats: "10G", "15G", "50G" (Only positive values in Gigabytes are allowed)
+# Default value is "20G"
+
+
+service_k8s_cluster:
+{% set _k8s_clusters = omnia_service_k8s_cluster | default([]) %}{# an undefined variable is treated as an empty list #}
+{% if _k8s_clusters | length %}
+{% for _k8s in _k8s_clusters %}
+  - cluster_name: {{ _k8s.cluster_name | default('') }}
+    deployment: {{ _k8s.deployment | default(false) }}
+    k8s_cni: {{ _k8s.k8s_cni | default('calico') }}
+    pod_external_ip_range: "{{ _k8s.pod_external_ip_range | default('') }}"
+    k8s_service_addresses: "{{ _k8s.k8s_service_addresses | default('') }}"
+    k8s_pod_network_cidr: "{{ _k8s.k8s_pod_network_cidr | default('') }}"
+    nfs_storage_name: "{{ _k8s.nfs_storage_name | default('') }}"
+    csi_powerscale_driver_secret_file_path: "{{ _k8s.csi_powerscale_driver_secret_file_path | default('') }}"
+    csi_powerscale_driver_values_file_path: "{{ _k8s.csi_powerscale_driver_values_file_path | default('') }}"
+    k8s_crio_storage_size: {{ _k8s.k8s_crio_storage_size | default('20G') }}
+{% endfor %}
+{% endif %}
diff --git a/upgrade/roles/import_input_parameters/templates/provision_config.j2 b/upgrade/roles/import_input_parameters/templates/provision_config.j2
new file mode 100644
index 0000000000..01fd84b2cf
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/templates/provision_config.j2
@@ -0,0 +1,40 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+# ***********************************************************************
+# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE.
+# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE.
+# ***********************************************************************
+
+#### Mandatory
+# This depicts the path where user has kept the PXE mapping file.
+# The mapping file consists of the Service tag, Admin MAC,Hostname and its respective admin IP address and/or BMC IP.
+# Ensure that admin IPs given in mapping file are within the network defined in the network_spec.yml
+# A template for the mapping file exists in omnia/examples, namely, pxe_mapping_file.csv
+# Format of csv: FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP
+pxe_mapping_file_path: "{{ provision_pxe_mapping_file_path }}"
+
+#### Mandatory
+# Language that needs to be set during OS provisioning.
+# Only language supported is "en_US.UTF-8"
+language: "{{ provision_language }}"
+
+#### Mandatory
+# Default lease time needs to be used by DHCP
+# Unit: seconds
+# Min: 21600
+# Default: 86400
+# Max: 31536000
+default_lease_time: "{{ provision_default_lease_time }}"
diff --git a/upgrade/roles/import_input_parameters/templates/storage_config.j2 b/upgrade/roles/import_input_parameters/templates/storage_config.j2
new file mode 100644
index 0000000000..1c695a19a5
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/templates/storage_config.j2
@@ -0,0 +1,95 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+# ***********************************************************************
+# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE.
+# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE.
+# ***********************************************************************
+
+# -----------------------------Powervault-------------------------------------------
+# powervault_config
+# ip: ipv4
+# A list of PowerVault controller IP addresses used for iSCSI target discovery and login.
+# In this configuration, a single controller portal is provided.
+
+# port:
+# Defines the TCP port for the iSCSI target service.
+# Port 3260 is the standard port for iSCSI communication.
+
+# isci_initiators:
+# Specifies the InitiatorName used by the host when connecting to the iSCSI target.
+# This IQN uniquely identifies the host to the storage array.
+
+# volume_id:
+# This is the unique WWN/identifier for the
+# specific volume that should be used for persistent storage.
+# The script uses this value during multipath scanning to select the correct mapped device
+
+#powervault_config:
+#  ip:
+#    - 172.1.2.3
+#  port: 3260
+#  isci_initiators: iqn.initiator.com.example:7d7d7d7d7d7
+#  volume_id: 00c0ff4343f1f1f1001c8c4e6901000000
+
+{% if storage_has_powervault %}{# the whole section is omitted when no PowerVault config is being carried over #}
+powervault_config:
+{% if storage_powervault_config.ip is defined %}
+  ip:
+{% for _ip in (storage_powervault_config.ip | default([])) %}
+    - {{ _ip }}
+{% endfor %}
+{% else %}
+  ip: []
+{% endif %}
+{% if storage_powervault_config.port is defined %}
+  port: {{ storage_powervault_config.port }}
+{% endif %}{# the two identifiers below are emitted as quoted strings so a digits-only or leading-zero value is not re-read as a number #}
+  isci_initiators: {{ storage_powervault_config.isci_initiators | default('') | string | to_json }}
+  volume_id: {{ storage_powervault_config.volume_id | default('') | string | to_json }}
+{% endif %}
+
+# -----------------------------NFS------------------------------------------------
+
+# This variable is used for mounting NFS share on slurm_control_node, slurm_node, login_node
+# This takes a list of dicts with possible keys server_ip, server_share_path, client_share_path, client_mount_options
+# In both the cases, the USER must manually update 'server_ip' and 'server_share_path' below with the correct values.
+# If mount_option values are empty, NFS client will be mounted with these values "nosuid,rw,sync,hard,intr"
+# It is mandatory to provide at least one entry in nfs_client_params
+# Example for single mount file system:
+# nfs_client_params:
+# nfs_name : str ,Name of the NFS storage resource. The default is "nfs_storage_default".
+#     The user can assign any custom string to specify a different NFS storage resource.
+# - { server_ip: 10.5.0.101, server_share_path: "/mnt/share", client_share_path: "/home", client_mount_options: "nosuid,rw,sync,hard"}
+# Example for supporting multiple mount points:
+# nfs_client_params:
+# - { server_ip: 198.168.0.1,server_share_path: "/mnt/share1", client_share_path: "/home", client_mount_options: "nosuid,rw,sync,hard"}
+# - { server_ip: 198.168.0.2, server_share_path: "/mnt/share2", client_share_path: "/mnt/mount2", client_mount_options: "nosuid,rw,sync,hard"}
+# Example for multiple mount file system:
+# nfs_client_params:
+# - { server_ip: 198.168.0.1, server_share_path: "/mnt/share1", client_share_path: "/mnt/mount1", client_mount_options: "nosuid,rw,sync,hard"}
+# - { server_ip: 198.168.0.2, server_share_path: "/mnt/share2", client_share_path: "/mnt/mount2", client_mount_options: "nosuid,rw,sync,hard"}
+
+nfs_client_params:
+{% set _nfs = storage_nfs_client_params | default([]) %}{# an undefined variable is treated as an empty list #}
+{% for _entry in _nfs %}{# every value is quoted so an empty or numeric-looking entry stays a string #}
+  - server_ip: "{{ _entry.server_ip | default('') }}" # Provide the IP of the NFS server
+    server_share_path: "{{ _entry.server_share_path | default('') }}" # Provide server share path of the NFS Server
+    client_share_path: "{{ _entry.client_share_path | default('') }}"
+    client_mount_options: "{{ _entry.client_mount_options | default('nosuid,rw,sync,hard,intr') }}"
+{% if _entry.nfs_name is defined %}
+    nfs_name: "{{ _entry.nfs_name }}"
+{% endif %}
+
+{% endfor %}
diff --git a/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 b/upgrade/roles/import_input_parameters/templates/telemetry_config.j2
new file mode 100644
index 0000000000..cb89944e1c
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/templates/telemetry_config.j2
@@ -0,0 +1,242 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# ***********************************************************************
+# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE.
+# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE.
+# ***********************************************************************
+
+# ============================================================================
+# TELEMETRY CONFIGURATION OVERVIEW
+# ============================================================================
+# This file configures telemetry data collection and storage for Dell Omnia.
+#
+# SECTIONS:
+#   1. iDRAC Telemetry    : Hardware metrics from Dell PowerEdge servers
+#   2. VictoriaMetrics    : Time-series database for metric storage
+#   3. Kafka              : Distributed streaming platform for telemetry data
+#   4. LDMS               : Lightweight Distributed Metric Service for compute nodes
+#
+# ============================================================================
+# STORAGE REQUIREMENTS SUMMARY
+# ============================================================================
+# 
+# VICTORIAMETRICS STORAGE:
+# ┌─────────────────┬──────────────────┬─────────────────┬──────────────────┐
+# │ Deployment Mode │ Per-Pod Storage  │ Number of Pods  │ Total Storage    │
+# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤
+# │ Single-node     │ persistence_size │ 1 pod           │ 1× storage       │
+# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤
+# │ Cluster         │ persistence_size │ 3 vmstorage     │ 3× storage       │
+# └─────────────────┴──────────────────┴─────────────────┴──────────────────┘
+# Example: 8Gi per pod → Single-node: 8Gi total, Cluster: 24Gi total
+#
+# KAFKA STORAGE:
+# ┌─────────────────┬──────────────────┬─────────────────┬──────────────────┐
+# │ Component       │ Per-Pod Storage  │ Number of Pods  │ Total Storage    │
+# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤
+# │ Kafka Broker    │ persistence_size │ 3 pods          │ 3× storage       │
+# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤
+# │ Kafka Controller│ persistence_size │ 3 pods          │ 3× storage       │
+# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤
+# │ TOTAL KAFKA     │ persistence_size │ 6 pods          │ 6× storage       │
+# └─────────────────┴──────────────────┴─────────────────┴──────────────────┘
+# Example: 8Gi per pod → 48Gi total Kafka storage
+#
+# COMBINED STORAGE EXAMPLES:
+#   Default (8Gi each): VictoriaMetrics Cluster (24Gi) + Kafka (48Gi) = 72Gi total
+#   Single-node mode:   VictoriaMetrics Single (8Gi) + Kafka (48Gi) = 56Gi total
+#
+# STORAGE OPTIONS:
+#   - VictoriaMetrics: Store iDRAC telemetry in time-series database
+#   - Kafka: Stream iDRAC and LDMS telemetry to Kafka topics
+#   - Both: Store iDRAC in both Victoria and Kafka (recommended)
+# ============================================================================
+
+# ============================================================================
+# iDRAC TELEMETRY CONFIGURATION
+# ============================================================================
+# iDRAC telemetry collects hardware metrics from Dell PowerEdge servers.
+# Telemetry data can be stored in VictoriaMetrics, Kafka, or both.
+
+# Enable or disable iDRAC telemetry support
+# Accepted values: true or false
+# Default: true
+idrac_telemetry_support: {{ telemetry_idrac_telemetry_support | default(true) | bool | ternary('true', 'false') }}
+
+# Specify where to store iDRAC telemetry data
+# Supported values:
+#   - "victoria"        : Store in VictoriaMetrics only
+#   - "kafka"           : Store in Kafka only
+#   - "victoria,kafka"  : Store in both (recommended)
+# Default: "victoria,kafka"
+idrac_telemetry_collection_type: {{ telemetry_idrac_telemetry_collection_type | default('victoria,kafka') | to_json }}
+
+# ============================================================================
+# VICTORIAMETRICS CONFIGURATION
+# ============================================================================
+# VictoriaMetrics is a time-series database for storing telemetry metrics.
+# Used for iDRAC telemetry when 'victoria' is enabled in idrac_telemetry_collection_type.
+#
+# DEPLOYMENT MODES:
+#   - single-node: Simple deployment with one pod (suitable for small deployments)
+#   - cluster: High-availability deployment with multiple components
+#               (recommended for production and large-scale deployments)
+victoria_configurations:
+  # VictoriaMetrics deployment mode
+  # Supported values:
+  #   - "single-node" : Simple deployment (1 pod, suitable for dev/test)
+  #   - "cluster"     : High-availability deployment (7 pods, recommended for production)
+  # Default: "cluster"
+  #
+  # Cluster Mode Benefits:
+  #   - High availability (no single point of failure)
+  #   - Horizontal scalability (scale components independently)
+  #   - Better performance (4x ingestion, 2x query speed)
+  #   - Production-ready architecture
+  #
+  # Single-Node Benefits:
+  #   - Simple setup (fewer resources)
+  #   - Suitable for small deployments (<10 nodes)
+  #   - Lower resource usage (~4Gi memory vs ~10Gi for cluster)
+  deployment_mode: {{ telemetry_victoria_deployment_mode | default('cluster') | to_json }}
+
+  # The amount of storage allocated for EACH VictoriaMetrics persistent volume.
+  # IMPORTANT: Total VictoriaMetrics storage depends on deployment mode:
+  #   - Single-node mode: Total storage = persistence_size × 1 pod
+  #   - Cluster mode: Total storage = persistence_size × 3 vmstorage pods
+  #   - Example (cluster): 8Gi × 3 = 24Gi total VictoriaMetrics storage
+  # Accepted values: in the form of "X[Ki|Mi|Gi|Ti|Pi|Ei]"
+  # Default: 8Gi (results in 24Gi total storage for cluster mode)
+  persistence_size: {{ telemetry_victoria_persistence_size | default('8Gi') | to_json }}
+
+  # Duration (in hours) to retain VictoriaMetrics data before it is deleted.
+  # Default: 168 (7 days)
+  retention_period: {{ telemetry_victoria_retention_period | default(168) }}
+
+# ============================================================================
+# KAFKA CONFIGURATION
+# ============================================================================
+# Apache Kafka is a distributed streaming platform for storing telemetry data.
+# Used for iDRAC telemetry when 'kafka' is enabled in idrac_telemetry_collection_type.
+# Also used for LDMS telemetry when LDMS software is configured.
+#
+# NOTE: Kafka topics are auto-generated based on enabled features:
+#   - 'idrac' topic: Required when idrac_telemetry_support=true and 'kafka' is enabled
+#   - 'ldms' topic:  Required when LDMS is configured in software_config.json
+kafka_configurations:
+  # The amount of storage allocated for EACH Kafka persistent volume.
+  # IMPORTANT: Total Kafka storage = persistence_size × 6 pods
+  #   - 3 Kafka brokers (each gets persistence_size storage)
+  #   - 3 Kafka controllers (each gets persistence_size storage)
+  #   - Example: 8Gi × 6 = 48Gi total Kafka storage
+  # Accepted values: in the form of "X[Ki|Mi|Gi|Ti|Pi|Ei]"
+  # Default: 8Gi (results in 48Gi total Kafka storage)
+  persistence_size: {{ telemetry_kafka_persistence_size | default('8Gi') | to_json }}
+
+  # The number of hours to retain Kafka logs before they are deleted.
+  # Default: 168 (7 days)
+  log_retention_hours: {{ telemetry_kafka_log_retention_hours | default(168) }}
+
+  # The maximum size of Kafka logs (in bytes) before they are deleted.
+  # Default: -1 (unlimited)
+  log_retention_bytes: {{ telemetry_kafka_log_retention_bytes | default(-1) }}
+
+  # The maximum size of a single Kafka log segment file (in bytes) before a new segment is rolled.
+  # Default: 1073741824 (1 GB)
+  log_segment_bytes: {{ telemetry_kafka_log_segment_bytes | default(1073741824) }}
+
+  # Kafka Topic Partitions Configuration
+  # ----------------------------------------------------------------------------
+  # Define the number of partitions for each Kafka topic.
+  # Increasing partitions can improve throughput but also increases storage/overhead.
+  #
+  # IMPORTANT: Topic names are FIXED and cannot be changed.
+  #   - Topic names: Only 'idrac' and 'ldms' are allowed
+  #   - Configurable: Only partition counts can be modified
+  #
+  # Topic Requirements (auto-validated):
+  #   - 'idrac': Required when idrac_telemetry_support=true and 'kafka' is enabled
+  #   - 'ldms':  Required when LDMS software is configured in software_config.json
+  #
+  # Default partition counts: idrac=1, ldms=2
+  topic_partitions:
+{% for _topic in (telemetry_kafka_topic_partitions | default([])) %}
+    - name: {{ _topic.name | default('') | to_json }}
+      partitions: {{ _topic.partitions | default(1) }}
+{% endfor %}
+
+# ============================================================================
+# LDMS (Lightweight Distributed Metric Service) CONFIGURATION
+# ============================================================================
+# LDMS collects performance metrics from compute nodes (CPU, memory, network, etc.)
+# and streams them to Kafka for storage and analysis.
+#
+# PREREQUISITE: To enable LDMS support, add the following to software_config.json:
+#   {
+#     "softwares": [
+#       {"name": "ldms", "arch": ["x86_64", "aarch64"]}
+#     ]
+#   }
+#
+# When LDMS software is configured, the 'ldms' topic MUST be defined in
+# kafka_configurations.topic_partitions above.
+#
+# LDMS Port Configurations
+# Aggregator port on service k8s cluster
+# Valid range: 6001-6100
+# Default: 6001
+ldms_agg_port: {{ telemetry_ldms_agg_port | default(6001) }}
+
+# Store daemon port on service k8s cluster
+# Can be the same as ldms_agg_port
+# Valid range: 6001-6100
+# Default: 6001
+ldms_store_port: {{ telemetry_ldms_store_port | default(6001) }}
+
+# Sampler port on compute nodes
+# Valid range: 10001-10100
+# Default: 10001
+ldms_sampler_port: {{ telemetry_ldms_sampler_port | default(10001) }}
+
+# LDMS Sampler Plugin Configurations
+# ----------------------------------------------------------------------------
+# Configure which metrics to collect from compute nodes and collection intervals.
+# Each plugin collects specific system metrics.
+#
+# Parameters:
+#   - plugin_name: Name of the LDMS sampler plugin
+#   - config_parameters: Plugin-specific configuration (as a single string)
+#   - activation_parameters: Collection schedule in MICROSECONDS
+#       Format: "interval=<microseconds> offset=<microseconds>"
+#       Example: "interval=1000000"         (1000000 microseconds = 1 second)
+#                "interval=1000000 offset=0" (1000000 microseconds with no offset)
+#
+# Available Plugins:
+#   - meminfo: Memory usage statistics
+#   - procstat2: Process statistics
+#   - vmstat: Virtual memory statistics
+#   - loadavg: System load average
+#   - procnetdev2: Network interface statistics
+ldms_sampler_configurations:
+{% if telemetry_ldms_sampler_configurations is none %}
+  null
+{% else %}
+{% for _plugin in (telemetry_ldms_sampler_configurations | default([])) %}
+  - plugin_name: {{ _plugin.plugin_name | default('') | to_json }}
+    config_parameters: {{ _plugin.config_parameters | default('') | to_json }}
+    activation_parameters: {{ _plugin.activation_parameters | default('interval=1000000') | to_json }}
+{% endfor %}
+{% endif %}
diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml
index bc4ca7430a..722399b7d0 100644
--- a/upgrade/roles/import_input_parameters/vars/main.yml
+++ b/upgrade/roles/import_input_parameters/vars/main.yml
@@ -43,6 +43,39 @@ msg_ha_config_already_21: "high_availability_config.yml already in 2.1 format -
 msg_ha_virtual_ip_missing: "service_k8s_cluster_ha.virtual_ip_address is mandatory"
 msg_using_backup_ha_config: "Using backup high_availability_config.yml (backup not modified)"
 
+# Local repo config transformation messages
+msg_backup_local_repo_config_missing: "Backup local_repo_config.yml missing"
+msg_local_repo_config_missing: "local_repo_config.yml missing"
+msg_using_backup_local_repo_config: "Using backup local_repo_config.yml (backup not modified)"
+msg_omnia_repo_url_rhel_x86_64_missing: "omnia_repo_url_rhel_x86_64 is mandatory"
+msg_omnia_repo_url_rhel_aarch64_missing: "omnia_repo_url_rhel_aarch64 is mandatory"
+
+# Provision config transformation messages
+msg_backup_provision_config_missing: "Backup provision_config.yml missing"
+msg_provision_config_missing: "provision_config.yml missing"
+msg_using_backup_provision_config: "Using backup provision_config.yml (backup not modified)"
+msg_pxe_mapping_file_path_missing: "pxe_mapping_file_path is mandatory"
+
+# Storage config transformation messages (status and validation text)
+msg_backup_storage_config_missing: "Backup storage_config.yml missing"
+msg_storage_config_missing: "storage_config.yml missing"
+msg_using_backup_storage_config: "Using backup storage_config.yml (backup not modified)"
+msg_nfs_client_params_missing: "nfs_client_params is mandatory"
+msg_nfs_client_param_entry_missing_keys: "Each entry in nfs_client_params must define server_ip, server_share_path, client_share_path, and client_mount_options"
+msg_powervault_missing_keys: "powervault_config (when present) must define ip (non-empty list), iscsi_initiators, and volume_id"
+
+# Omnia config transformation messages
+msg_backup_omnia_config_missing: "Backup omnia_config.yml missing"
+msg_omnia_config_missing: "omnia_config.yml missing"
+msg_using_backup_omnia_config: "Using backup omnia_config.yml (backup not modified)"
+msg_slurm_cluster_missing: "slurm_cluster is mandatory"
+msg_service_k8s_cluster_missing: "service_k8s_cluster is mandatory"
+
+# Telemetry config transformation messages
+msg_backup_telemetry_config_missing: "Backup telemetry_config.yml missing"
+msg_telemetry_config_missing: "telemetry_config.yml missing"
+msg_using_backup_telemetry_config: "Using backup telemetry_config.yml (backup not modified)"
+
 ### Restore summary messages
 msg_restore_summary: |
   {{ restore_item.name }} restored from backup.
@@ -66,6 +99,45 @@ msg_ha_config_transform_summary: |
   - Ensured service_k8s_cluster_ha is a list
   - Ensured virtual_ip_address is present
 
+# Restore summary message for local repo config transformation
+msg_local_repo_config_transform_summary: |
+  local_repo_config.yml upgraded to 2.1 format.
+  Backup preserved at: {{ backup_location }}/local_repo_config.yml
+  Changes:
+  - Normalized repo URL keys to arch-specific schema
+  - Migrated omnia_registry to user_registry (when present)
+  - Ensured mandatory omnia_repo_url_rhel_* keys are present
+
+# Restore summary message for provision config transformation
+msg_provision_config_transform_summary: |
+  provision_config.yml upgraded to 2.1 format.
+  Backup preserved at: {{ backup_location }}/provision_config.yml
+  Changes:
+  - Ensured pxe_mapping_file_path, language, and default_lease_time are present
+
+# Restore summary message for storage config transformation
+msg_storage_config_transform_summary: |
+  storage_config.yml upgraded to 2.1 format.
+  Backup preserved at: {{ backup_location }}/storage_config.yml
+  Changes:
+  - Ensured nfs_client_params is present and entries contain required keys
+
+# Restore summary message for omnia config transformation
+msg_omnia_config_transform_summary: |
+  omnia_config.yml upgraded to 2.1 format.
+  Backup preserved at: {{ backup_location }}/omnia_config.yml
+  Changes:
+  - Ensured slurm_cluster and service_k8s_cluster are lists
+  - Ensured required sections are present
+
+# Restore summary message for telemetry config transformation
+msg_telemetry_config_transform_summary: |
+  telemetry_config.yml upgraded to 2.1 format.
+  Backup preserved at: {{ backup_location }}/telemetry_config.yml
+  Changes:
+  - Rendered Omnia 2.1 telemetry template with values from 2.0 backup
+  - Applied schema defaults for missing fields
+
 # === Input files to restore from backup ===
 # Add input files here that should be copied from backup_location to input_project_dir
 # Each entry should have:
@@ -78,11 +150,15 @@ msg_ha_config_transform_summary: |
 # - Files that are the same format in 2.0 and 2.1
 # - Files where you want to preserve the backup values exactly
 #
-# DO NOT add files that require transformation (network_spec.yml, high_availability_config.yml)
+# DO NOT add files that require transformation (network_spec.yml, high_availability_config.yml, local_repo_config.yml,
+# provision_config.yml, storage_config.yml, omnia_config.yml, telemetry_config.yml, user_registry_credential.yml)
 restore_input_files:
   - name: software_config.json
     mode: '0644'
     validate_cmd: "python3 -m json.tool '{{ input_project_dir }}/software_config.json'"
+  - name: security_config.yml
+    mode: '0644'
+    validate_cmd: "python3 -c \"import yaml; yaml.safe_load(open('{{ input_project_dir }}/security_config.yml','r'))\""
   - name: pxe_mapping_file.csv
     mode: '0644'
     validate_cmd: ""

From 2cbce8f6ca9769937c3464fe3c8f0df0ea5c8006 Mon Sep 17 00:00:00 2001
From: SOWJANYAJAGADISH123 <Sowjanya.Jagadish@dell.com>
Date: Tue, 10 Feb 2026 12:05:48 +0530
Subject: [PATCH 103/172] Update omnia.sh

---
 omnia.sh | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/omnia.sh b/omnia.sh
index 746fb8fd34..235cc1dbc1 100755
--- a/omnia.sh
+++ b/omnia.sh
@@ -988,6 +988,18 @@ show_help() {
 }
 
 install_omnia_core() {
+    # Detect existing Omnia 2.0 installation
+    if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
+        # Read version from metadata inside container
+        current_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r')
+        if [ "$current_version" = "2.0.0.0" ]; then
+            echo -e "${RED}ERROR: Existing Omnia 2.0 installation detected.${NC}"
+            echo -e "${YELLOW}To upgrade, run: $0 --upgrade${NC}"
+            echo -e "${YELLOW}For a fresh install, first run: $0 --uninstall${NC}"
+            exit 1
+        fi
+    fi
+
     local omnia_core_tag="1.1"
     local omnia_core_registry=""
     

From 0c090cf22961491aaa3aa3c97e949033c839f1e7 Mon Sep 17 00:00:00 2001
From: Abhishek S A <abhishek.sa3@dell.com>
Date: Tue, 10 Feb 2026 12:08:06 +0530
Subject: [PATCH 104/172] update reachability

---
 .../external_kafka_connect_details/tasks/main.yml      | 10 ++++++++++
 .../roles/external_kafka_connect_details/vars/main.yml |  4 ++++
 .../external_victoria_connect_details/tasks/main.yml   | 10 ++++++++++
 .../external_victoria_connect_details/vars/main.yml    |  4 ++++
 4 files changed, 28 insertions(+)

diff --git a/utils/roles/external_kafka_connect_details/tasks/main.yml b/utils/roles/external_kafka_connect_details/tasks/main.yml
index 96c6d0ca5f..3ee17c1c80 100644
--- a/utils/roles/external_kafka_connect_details/tasks/main.yml
+++ b/utils/roles/external_kafka_connect_details/tasks/main.yml
@@ -13,6 +13,16 @@
 #  limitations under the License.
 ---
 
+- name: Validate service k8s controller connectivity
+  block:
+    - name: Wait for service k8s controller connection
+      ansible.builtin.wait_for_connection:
+        timeout: 30
+  rescue:
+    - name: Fail when service k8s controller is not reachable
+      ansible.builtin.fail:
+        msg: "{{ kafka_preflight_err_service_k8s_controller_unreachable }}"
+
 - name: Check kubectl presence
   ansible.builtin.command: kubectl version --client=true
   register: kubectl_check
diff --git a/utils/roles/external_kafka_connect_details/vars/main.yml b/utils/roles/external_kafka_connect_details/vars/main.yml
index 7a7d831275..be23cde089 100644
--- a/utils/roles/external_kafka_connect_details/vars/main.yml
+++ b/utils/roles/external_kafka_connect_details/vars/main.yml
@@ -37,6 +37,10 @@ kafka_preflight_err_ha_vip_missing: >-
   Failed to determine the service Kubernetes control plane VIP from High Availability config.
   Ensure service_k8s_cluster_ha[0].virtual_ip_address is set in: {{ k8s_ha_config_path }}.
 
+kafka_preflight_err_service_k8s_controller_unreachable: >-
+  Service Kubernetes controller is not reachable over SSH: {{ ansible_host | default(inventory_hostname) }}.
+  Ensure the service Kubernetes VIP is reachable and resolvable from the OIM host.
+
 kafka_ome_ui_navigation_line1: "Configuration -> Remote Connectivity"
 kafka_ome_ui_enable_label: "Enable Kafka Connectivity"
 kafka_ome_auth_mode_value: "SSL"
diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml
index e06b061828..1c3e98a516 100644
--- a/utils/roles/external_victoria_connect_details/tasks/main.yml
+++ b/utils/roles/external_victoria_connect_details/tasks/main.yml
@@ -13,6 +13,16 @@
 #  limitations under the License.
 ---
 
+- name: Validate service k8s controller connectivity
+  block:
+    - name: Wait for service k8s controller connection
+      ansible.builtin.wait_for_connection:
+        timeout: 30
+  rescue:
+    - name: Fail when service k8s controller is not reachable
+      ansible.builtin.fail:
+        msg: "{{ victoria_preflight_err_service_k8s_controller_unreachable }}"
+
 - name: Check kubectl presence
   ansible.builtin.command: kubectl version --client=true
   register: kubectl_check
diff --git a/utils/roles/external_victoria_connect_details/vars/main.yml b/utils/roles/external_victoria_connect_details/vars/main.yml
index c2de781fbf..f9a1fb72dd 100644
--- a/utils/roles/external_victoria_connect_details/vars/main.yml
+++ b/utils/roles/external_victoria_connect_details/vars/main.yml
@@ -38,6 +38,10 @@ victoria_preflight_err_ha_vip_missing: >-
   Failed to determine the service Kubernetes control plane VIP from High Availability config.
   Ensure service_k8s_cluster_ha[0].virtual_ip_address is set in: {{ k8s_ha_config_path }}.
 
+victoria_preflight_err_service_k8s_controller_unreachable: >-
+  Service Kubernetes controller is not reachable over SSH: {{ ansible_host | default(inventory_hostname) }}.
+  Ensure the service Kubernetes VIP is reachable and resolvable from the OIM host.
+
 victoria_sfm_ui_navigation: "Observability -> Settings -> Prometheus Remote Write"
 victoria_sfm_remote_write_target_name: "victoria"
 victoria_sfm_remote_write_message_version: "v1"

From 44c3f435a71badb264e84ee5e88d4a30dd7ae4c8 Mon Sep 17 00:00:00 2001
From: pullan1 <sudha.pullalaravu@dell.com>
Date: Tue, 10 Feb 2026 12:08:47 +0530
Subject: [PATCH 105/172] Update main.yml

Signed-off-by: pullan1 <sudha.pullalaravu@dell.com>
---
 utils/roles/external_victoria_connect_details/tasks/main.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml
index e06b061828..0fbf8a9fc5 100644
--- a/utils/roles/external_victoria_connect_details/tasks/main.yml
+++ b/utils/roles/external_victoria_connect_details/tasks/main.yml
@@ -164,7 +164,6 @@
     vminsert_port: "{{ (vminsert_lb_port.stdout | trim) | default('') }}"
     vmselect_port: "{{ (vmselect_lb_port.stdout | trim) | default('') }}"
     victoria_tls_ca: "{{ victoria_tls_cert_dir }}/ca.crt"
-    
 
 - name: Fail when LoadBalancer IPs are not available
   ansible.builtin.fail:

From c007c366a7e5d9ef6fd6b66ea0bd621dd65eb746 Mon Sep 17 00:00:00 2001
From: Vrinda_Marwah <vrinda.marwah@dell.com>
Date: Tue, 10 Feb 2026 06:42:44 +0000
Subject: [PATCH 106/172] change iscsi_initiator field name

Signed-off-by: Vrinda_Marwah <vrinda.marwah@dell.com>
---
 .../module_utils/input_validation/schema/storage_config.json  | 4 ++--
 .../cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2     | 2 +-
 input/storage_config.yml                                      | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/common/library/module_utils/input_validation/schema/storage_config.json b/common/library/module_utils/input_validation/schema/storage_config.json
index 114d88f525..e300410346 100644
--- a/common/library/module_utils/input_validation/schema/storage_config.json
+++ b/common/library/module_utils/input_validation/schema/storage_config.json
@@ -52,7 +52,7 @@
       },
       "powervault_config": {
         "type": "object",
-        "required": ["ip", "iscsi_initiators", "volume_id"],
+        "required": ["ip", "iscsi_initiator", "volume_id"],
         "properties": {
           "ip": {
             "description": "List of target controller IP addresses",
@@ -70,7 +70,7 @@
             "type": "integer"
           },
 
-          "iscsi_initiators": {
+          "iscsi_initiator": {
             "description": "iSCSI initiator IQN",
             "type": "string",
             "pattern": "^iqn\\.[a-zA-Z0-9.-]+(?::[a-zA-Z0-9._:-]+)?$"
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2
index d99d9dc90f..9d8f6c0f38 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2
@@ -92,7 +92,7 @@
 
             PORTALS=({% for ip in powervault_config.ip %}"{{ ip }}" {% endfor %})
             PORT="{{ powervault_config.port | default(3260) }}"
-            INITIATOR_IQN="{{ powervault_config.iscsi_initiators | default('') }}"
+            INITIATOR_IQN="{{ powervault_config.iscsi_initiator | default('') }}"
             VOLUME_ID="{{ powervault_config.volume_id | default('') }}"
             FS_TYPE="{{ powervault_config.fs_type | default('xfs') }}"
             MOUNT_OPTS="{{ powervault_config.mount_options | default('defaults,_netdev,noatime') }}"
diff --git a/input/storage_config.yml b/input/storage_config.yml
index 9492f15558..14ab13e0d1 100644
--- a/input/storage_config.yml
+++ b/input/storage_config.yml
@@ -22,7 +22,7 @@
 # Mandatory when using PowerVault for persistent storage.
 # Below parameters are mandatory when powervault_config is defined
     # ip: A list of PowerVault controller ipv4 addresses used for iSCSI target discovery and login.
-    # iscsi_initiators: Specifies the InitiatorName used by the host when connecting to the iSCSI target. This IQN uniquely identifies the host to the storage array.
+    # iscsi_initiator: Specifies the InitiatorName used by the host when connecting to the iSCSI target. This IQN uniquely identifies the host to the storage array.
     # volume_id: This is the unique WWN/identifier for the specific volume that should be used for persistent storage. This value is used for multipath scanning to select the correct mapped device.
 
 # Below are the optional parameters when powervault_config is defined
@@ -35,7 +35,7 @@
 #  ip:
 #    - 172.1.2.3
 #  port: 3260
-#  iscsi_initiators: iqn.initiator.com.example:7d7d7d7d7d7
+#  iscsi_initiator: iqn.initiator.com.example:7d7d7d7d7d7
 #  volume_id: 00c0ff4343f1f1f1001c8c4e6901000000
 
 

From 979dbd08402773f67e5c981ab479eefc6f102939 Mon Sep 17 00:00:00 2001
From: Abhishek S A <abhishek.sa3@dell.com>
Date: Tue, 10 Feb 2026 12:13:56 +0530
Subject: [PATCH 107/172] Update main.yml

---
 utils/roles/external_victoria_connect_details/tasks/main.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml
index 1c3e98a516..260c8376fd 100644
--- a/utils/roles/external_victoria_connect_details/tasks/main.yml
+++ b/utils/roles/external_victoria_connect_details/tasks/main.yml
@@ -174,7 +174,6 @@
     vminsert_port: "{{ (vminsert_lb_port.stdout | trim) | default('') }}"
     vmselect_port: "{{ (vmselect_lb_port.stdout | trim) | default('') }}"
     victoria_tls_ca: "{{ victoria_tls_cert_dir }}/ca.crt"
-    
 
 - name: Fail when LoadBalancer IPs are not available
   ansible.builtin.fail:

From a4fc5b8552c278c47d596d695197bc0a0b58590c Mon Sep 17 00:00:00 2001
From: Vrinda_Marwah <vrinda.marwah@dell.com>
Date: Tue, 10 Feb 2026 06:48:38 +0000
Subject: [PATCH 108/172] change iscsi_initiator field value

Signed-off-by: Vrinda_Marwah <vrinda.marwah@dell.com>
---
 input/storage_config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/input/storage_config.yml b/input/storage_config.yml
index 14ab13e0d1..399bf42fd6 100644
--- a/input/storage_config.yml
+++ b/input/storage_config.yml
@@ -35,7 +35,7 @@
 #  ip:
 #    - 172.1.2.3
 #  port: 3260
-#  iscsi_initiator: iqn.initiator.com.example:7d7d7d7d7d7
+#  iscsi_initiator: iqn.2025-01.com.dell:scontrol-node
 #  volume_id: 00c0ff4343f1f1f1001c8c4e6901000000
 
 

From 783e4f9dce1cce56dac56f4b11d731aa55962039 Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Tue, 10 Feb 2026 12:32:56 +0530
Subject: [PATCH 109/172] Updated to take care of ansible lint issues

---
 .../tasks/transform_local_repo_config.yml     |  9 ++-
 .../tasks/transform_storage_config.yml        | 56 +++++++++----------
 2 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml
index 20c95798b1..2b513e36ae 100644
--- a/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml
+++ b/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml
@@ -68,9 +68,12 @@
     local_repo_user_repo_url_aarch64: "{{ backup_local_repo_config.user_repo_url_aarch64 | default([]) }}"
     local_repo_rhel_os_url_x86_64: "{{ backup_local_repo_config.rhel_os_url_x86_64 | default(backup_local_repo_config.rhel_os_url | default([])) }}"
     local_repo_rhel_os_url_aarch64: "{{ backup_local_repo_config.rhel_os_url_aarch64 | default([]) }}"
-    local_repo_omnia_repo_url_rhel_x86_64: "{{ backup_local_repo_config.omnia_repo_url_rhel_x86_64 | default(backup_local_repo_config.omnia_repo_url_rhel | default([])) }}"
-    local_repo_omnia_repo_url_rhel_aarch64: "{{ backup_local_repo_config.omnia_repo_url_rhel_aarch64 | default(backup_local_repo_config.omnia_repo_url_rhel | default([])) }}"
-    local_repo_additional_repos_x86_64: "{{ backup_local_repo_config.additional_repos_x86_64 | default(backup_local_repo_config.additional_repos | default([])) }}"
+    local_repo_omnia_repo_url_rhel_x86_64: >-
+      {{ backup_local_repo_config.omnia_repo_url_rhel_x86_64 | default(backup_local_repo_config.omnia_repo_url_rhel | default([])) }}
+    local_repo_omnia_repo_url_rhel_aarch64: >-
+      {{ backup_local_repo_config.omnia_repo_url_rhel_aarch64 | default(backup_local_repo_config.omnia_repo_url_rhel | default([])) }}
+    local_repo_additional_repos_x86_64: >-
+      {{ backup_local_repo_config.additional_repos_x86_64 | default(backup_local_repo_config.additional_repos | default([])) }}
     local_repo_additional_repos_aarch64: "{{ backup_local_repo_config.additional_repos_aarch64 | default([]) }}"
 
 - name: Fail if omnia_repo_url_rhel_x86_64 is missing
diff --git a/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml
index 72b82aa7f8..3590f55995 100644
--- a/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml
+++ b/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml
@@ -68,35 +68,33 @@
   ansible.builtin.fail:
     msg: "{{ msg_nfs_client_param_entry_missing_keys }}"
   when: >-
-    {{
-      (
-        storage_nfs_client_params
-        | selectattr('server_ip', 'undefined')
-        | list
-        | length
-      ) > 0
-      or
-      (
-        storage_nfs_client_params
-        | selectattr('server_share_path', 'undefined')
-        | list
-        | length
-      ) > 0
-      or
-      (
-        storage_nfs_client_params
-        | selectattr('client_share_path', 'undefined')
-        | list
-        | length
-      ) > 0
-      or
-      (
-        storage_nfs_client_params
-        | selectattr('client_mount_options', 'undefined')
-        | list
-        | length
-      ) > 0
-    }}
+    (
+      storage_nfs_client_params
+      | selectattr('server_ip', 'undefined')
+      | list
+      | length
+    ) > 0
+    or
+    (
+      storage_nfs_client_params
+      | selectattr('server_share_path', 'undefined')
+      | list
+      | length
+    ) > 0
+    or
+    (
+      storage_nfs_client_params
+      | selectattr('client_share_path', 'undefined')
+      | list
+      | length
+    ) > 0
+    or
+    (
+      storage_nfs_client_params
+      | selectattr('client_mount_options', 'undefined')
+      | list
+      | length
+    ) > 0
 
 - name: Write storage_config.yml in Omnia 2.1 format
   ansible.builtin.template:

From 717820cca7efd5d149a62354d4fcd7ee61bca689 Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Tue, 10 Feb 2026 12:38:35 +0530
Subject: [PATCH 110/172] Updating to take care of ansible lint issues

---
 .../tasks/transform_local_repo_config.yml     | 33 ++++++++++++++-----
 .../tasks/transform_storage_config.yml        | 31 +++--------------
 2 files changed, 29 insertions(+), 35 deletions(-)

diff --git a/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml
index 2b513e36ae..4b8ac8e3ec 100644
--- a/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml
+++ b/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml
@@ -64,16 +64,33 @@
 
 - name: Normalize repo url keys to 2.1 schema
   ansible.builtin.set_fact:
-    local_repo_user_repo_url_x86_64: "{{ backup_local_repo_config.user_repo_url_x86_64 | default(backup_local_repo_config.user_repo_url | default([])) }}"
+    local_repo_user_repo_url_x86_64: "{{
+      backup_local_repo_config.user_repo_url_x86_64 |
+      default(backup_local_repo_config.user_repo_url |
+      default([]))
+    }}"
     local_repo_user_repo_url_aarch64: "{{ backup_local_repo_config.user_repo_url_aarch64 | default([]) }}"
-    local_repo_rhel_os_url_x86_64: "{{ backup_local_repo_config.rhel_os_url_x86_64 | default(backup_local_repo_config.rhel_os_url | default([])) }}"
+    local_repo_rhel_os_url_x86_64: "{{
+      backup_local_repo_config.rhel_os_url_x86_64 |
+      default(backup_local_repo_config.rhel_os_url |
+      default([]))
+    }}"
     local_repo_rhel_os_url_aarch64: "{{ backup_local_repo_config.rhel_os_url_aarch64 | default([]) }}"
-    local_repo_omnia_repo_url_rhel_x86_64: >-
-      {{ backup_local_repo_config.omnia_repo_url_rhel_x86_64 | default(backup_local_repo_config.omnia_repo_url_rhel | default([])) }}
-    local_repo_omnia_repo_url_rhel_aarch64: >-
-      {{ backup_local_repo_config.omnia_repo_url_rhel_aarch64 | default(backup_local_repo_config.omnia_repo_url_rhel | default([])) }}
-    local_repo_additional_repos_x86_64: >-
-      {{ backup_local_repo_config.additional_repos_x86_64 | default(backup_local_repo_config.additional_repos | default([])) }}
+    local_repo_omnia_repo_url_rhel_x86_64: "{{
+      backup_local_repo_config.omnia_repo_url_rhel_x86_64 |
+      default(backup_local_repo_config.omnia_repo_url_rhel |
+      default([]))
+    }}"
+    local_repo_omnia_repo_url_rhel_aarch64: "{{
+      backup_local_repo_config.omnia_repo_url_rhel_aarch64 |
+      default(backup_local_repo_config.omnia_repo_url_rhel |
+      default([]))
+    }}"
+    local_repo_additional_repos_x86_64: "{{
+      backup_local_repo_config.additional_repos_x86_64 |
+      default(backup_local_repo_config.additional_repos |
+      default([]))
+    }}"
     local_repo_additional_repos_aarch64: "{{ backup_local_repo_config.additional_repos_aarch64 | default([]) }}"
 
 - name: Fail if omnia_repo_url_rhel_x86_64 is missing
diff --git a/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml
index 3590f55995..54dbf07bc0 100644
--- a/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml
+++ b/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml
@@ -68,33 +68,10 @@
   ansible.builtin.fail:
     msg: "{{ msg_nfs_client_param_entry_missing_keys }}"
   when: >-
-    (
-      storage_nfs_client_params
-      | selectattr('server_ip', 'undefined')
-      | list
-      | length
-    ) > 0
-    or
-    (
-      storage_nfs_client_params
-      | selectattr('server_share_path', 'undefined')
-      | list
-      | length
-    ) > 0
-    or
-    (
-      storage_nfs_client_params
-      | selectattr('client_share_path', 'undefined')
-      | list
-      | length
-    ) > 0
-    or
-    (
-      storage_nfs_client_params
-      | selectattr('client_mount_options', 'undefined')
-      | list
-      | length
-    ) > 0
+    (storage_nfs_client_params | selectattr('server_ip', 'undefined') | list | length) > 0 or
+    (storage_nfs_client_params | selectattr('server_share_path', 'undefined') | list | length) > 0 or
+    (storage_nfs_client_params | selectattr('client_share_path', 'undefined') | list | length) > 0 or
+    (storage_nfs_client_params | selectattr('client_mount_options', 'undefined') | list | length) > 0
 
 - name: Write storage_config.yml in Omnia 2.1 format
   ansible.builtin.template:

From 573c5ef6b84b72ded8a742cf6751c4873dc49b90 Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Tue, 10 Feb 2026 13:17:23 +0530
Subject: [PATCH 111/172] Added /etc/hosts check

---
 .../common_utils/slurm_conf_utils.py          |  2 +-
 .../slurm_config/tasks/check_ctld_running.yml | 34 ++++++++++++++++---
 2 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
index 26f24762aa..3d5b259c2d 100644
--- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
+++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
@@ -865,7 +865,7 @@ def parse_slurm_conf(file_path, conf_name, validate):
             if validate and skey not in current_conf:
                 raise ValueError(
                     f"Invalid key while parsing {file_path}: {skey}")
-            if current_conf.get(skey) == SlurmParserEnum.S_P_ARRAY:
+            if current_conf.get(skey) == SlurmParserEnum.S_P_ARRAY or len(tmp_dict) > 1:
                 slurm_dict[list(tmp_dict.keys())[0]] = list(
                     slurm_dict.get(list(tmp_dict.keys())[0], [])) + [tmp_dict]
             elif current_conf.get(skey) == SlurmParserEnum.S_P_CSV:
diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml
index 52984c2afb..126261dfcd 100644
--- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml
+++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml
@@ -43,16 +43,40 @@
         - ansible_facts.services['slurmctld.service'] is defined
         - ansible_facts.services['slurmctld.service'].state == 'running'
 
+    - name: Check reachability of hosts in ip_name_map
+      ansible.builtin.wait_for:
+        host: "{{ host }}"
+        port: 22
+        timeout: 10
+        state: started
+      delegate_to: localhost
+      loop: "{{ ip_name_map.values() | list }}"
+      loop_control:
+        loop_var: host
+      register: ip_map_ssh_check
+      ignore_errors: true
+      ignore_unreachable: true
+
+    - name: Build list of reachable hosts from ip_name_map
+      ansible.builtin.set_fact:
+        reachable_hosts: "{{ ip_map_ssh_check.results | rejectattr('failed', 'true') | map(attribute='host') | list }}"
+
     - name: Update /etc/hosts with controller hostname and IP
       ansible.builtin.lineinfile:
         path: /etc/hosts
-        regexp: '^{{ ip.value }}\s+{{ ip.key }}'
-        line: "{{ ip.value }} {{ ip.key }}"
+        regexp: '^{{ host_entry.value }}\s+{{ host_entry.key }}'
+        line: "{{ host_entry.value }} {{ host_entry.key }}"
         state: present
-      loop: "{{ ip_name_map | dict2items }}"
+      loop: "{{ reachable_hosts | product(ip_name_map | dict2items) | list }}"
       loop_control:
-        loop_var: ip
-      delegate_to: "{{ item }}"
+        loop_var: host_combo
+      vars:
+        target_host: "{{ host_combo[0] }}"
+        host_entry: "{{ host_combo[1] }}"
+      delegate_to: "{{ target_host }}"
+      when: reachable_hosts | length > 0
+      ignore_errors: true
+      ignore_unreachable: true
 
     - name: Trigger the scontrol reconfigure
       ansible.builtin.command: scontrol reconfigure

From 711817d8e89151744f965515505a4c9c2d90a070 Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Tue, 10 Feb 2026 13:46:46 +0530
Subject: [PATCH 112/172] Lint fixes

---
 .../input_validation/common_utils/slurm_conf_utils.py           | 2 +-
 discovery/roles/slurm_config/tasks/check_ctld_running.yml       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
index 3d5b259c2d..0c98f64e6c 100644
--- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
+++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
@@ -766,7 +766,7 @@ def validate_config_types(conf_dict, conf_name, module):
     if not current_conf:
         return {'invalid_keys': [], 'type_errors': []}
     # module.fail_json(msg=f"Invalid configuration name: {conf_name}", conf_dict=conf_dict, current_conf=current_conf)
-    module.warn(conf_name)
+    # module.warn(conf_name)
     invalid_keys = list(
         set(conf_dict.keys()).difference(set(current_conf.keys())))
     type_errors = []
diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml
index 126261dfcd..5af73f984c 100644
--- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml
+++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml
@@ -75,8 +75,8 @@
         host_entry: "{{ host_combo[1] }}"
       delegate_to: "{{ target_host }}"
       when: reachable_hosts | length > 0
-      ignore_errors: true
       ignore_unreachable: true
+      failed_when: false
 
     - name: Trigger the scontrol reconfigure
       ansible.builtin.command: scontrol reconfigure

From 851bc46f3faeea0d1da96a70abf3adfa11017c43 Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Tue, 10 Feb 2026 13:53:06 +0530
Subject: [PATCH 113/172] input doc update

---
 input/omnia_config.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/input/omnia_config.yml b/input/omnia_config.yml
index 032fa77ce0..a5b448fc0d 100644
--- a/input/omnia_config.yml
+++ b/input/omnia_config.yml
@@ -38,6 +38,13 @@
 # cgroup
 # slurmdbd
 # gres
+# acct_gather
+# helpers
+# job_container
+# mpi
+# oci
+# topology
+# burst_buffer
 # Thes files will be written into the slurm_config directory with .conf suffix
 
 slurm_cluster:

From 3226780a605caf0f71d1820db0cdd047f183d854 Mon Sep 17 00:00:00 2001
From: Nagachandan-P <Nagachandan.p@dell.com>
Date: Tue, 10 Feb 2026 10:01:47 +0000
Subject: [PATCH 114/172] lint issues fixed

---
 .../tasks/validate_path_overrides.yml         | 36 +++++++++++++++----
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/discovery/roles/slurm_config/tasks/validate_path_overrides.yml b/discovery/roles/slurm_config/tasks/validate_path_overrides.yml
index 140b1d4bda..c4a1783b02 100644
--- a/discovery/roles/slurm_config/tasks/validate_path_overrides.yml
+++ b/discovery/roles/slurm_config/tasks/validate_path_overrides.yml
@@ -22,8 +22,16 @@
     - slurm_merged_dict is defined
     - slurm_merged_dict.get(item) is defined
     - slurm_merged_dict.get(item) is not none
-    - (slurm_merged_dict.get(item) is string and slurm_merged_dict.get(item) | length > 0) or (slurm_merged_dict.get(item) is iterable and slurm_merged_dict.get(item) | list | length > 0)
-    - not ((slurm_merged_dict.get(item) is string and slurm_merged_dict.get(item) | regex_search('^/')) or (slurm_merged_dict.get(item) is iterable and (slurm_merged_dict.get(item) | first) | regex_search('^/')))
+    - >-
+      (slurm_merged_dict.get(item) is string
+       and slurm_merged_dict.get(item) | length > 0)
+      or (slurm_merged_dict.get(item) is iterable
+          and slurm_merged_dict.get(item) | list | length > 0)
+    - >-
+      not ((slurm_merged_dict.get(item) is string
+            and slurm_merged_dict.get(item) | regex_search('^/'))
+           or (slurm_merged_dict.get(item) is iterable
+               and (slurm_merged_dict.get(item) | first) | regex_search('^/')))
   loop:
     - SlurmctldLogFile
     - SlurmdLogFile
@@ -61,8 +69,16 @@
     - slurmdbd_merged_dict is defined
     - slurmdbd_merged_dict.get(item) is defined
     - slurmdbd_merged_dict.get(item) is not none
-    - (slurmdbd_merged_dict.get(item) is string and slurmdbd_merged_dict.get(item) | length > 0) or (slurmdbd_merged_dict.get(item) is iterable and slurmdbd_merged_dict.get(item) | list | length > 0)
-    - not ((slurmdbd_merged_dict.get(item) is string and slurmdbd_merged_dict.get(item) | regex_search('^/')) or (slurmdbd_merged_dict.get(item) is iterable and (slurmdbd_merged_dict.get(item) | first) | regex_search('^/')))
+    - >-
+      (slurmdbd_merged_dict.get(item) is string
+       and slurmdbd_merged_dict.get(item) | length > 0)
+      or (slurmdbd_merged_dict.get(item) is iterable
+          and slurmdbd_merged_dict.get(item) | list | length > 0)
+    - >-
+      not ((slurmdbd_merged_dict.get(item) is string
+            and slurmdbd_merged_dict.get(item) | regex_search('^/'))
+           or (slurmdbd_merged_dict.get(item) is iterable
+               and (slurmdbd_merged_dict.get(item) | first) | regex_search('^/')))
   loop:
     - LogFile
     - PidFile
@@ -77,7 +93,15 @@
     - cgroup_merged_dict is defined
     - cgroup_merged_dict.get(item) is defined
     - cgroup_merged_dict.get(item) is not none
-    - (cgroup_merged_dict.get(item) is string and cgroup_merged_dict.get(item) | length > 0) or (cgroup_merged_dict.get(item) is iterable and cgroup_merged_dict.get(item) | list | length > 0)
-    - not ((cgroup_merged_dict.get(item) is string and cgroup_merged_dict.get(item) | regex_search('^/')) or (cgroup_merged_dict.get(item) is iterable and (cgroup_merged_dict.get(item) | first) | regex_search('^/')))
+    - >-
+      (cgroup_merged_dict.get(item) is string
+       and cgroup_merged_dict.get(item) | length > 0)
+      or (cgroup_merged_dict.get(item) is iterable
+          and cgroup_merged_dict.get(item) | list | length > 0)
+    - >-
+      not ((cgroup_merged_dict.get(item) is string
+            and cgroup_merged_dict.get(item) | regex_search('^/'))
+           or (cgroup_merged_dict.get(item) is iterable
+               and (cgroup_merged_dict.get(item) | first) | regex_search('^/')))
   loop:
     - CgroupMountpoint

From 47f028a58d513bbdd40abb53d14d35fc809f997b Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Tue, 10 Feb 2026 15:48:03 +0530
Subject: [PATCH 115/172] Enhanced example

---
 input/omnia_config.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/input/omnia_config.yml b/input/omnia_config.yml
index a5b448fc0d..bb5a4f06fa 100644
--- a/input/omnia_config.yml
+++ b/input/omnia_config.yml
@@ -54,6 +54,13 @@ slurm_cluster:
     #   slurm:
     #     SlurmctldTimeout: 60
     #     SlurmdTimeout: 150
+    #     NodeName:
+    #       - NodeName: newnode1
+    #         CPUs: 16
+    #         RealMemory: 64000
+    #       - NodeName: newnode2
+    #         CPUs: 16
+    #         RealMemory: 64000
     #   cgroup:
     #     CgroupPlugin: autodetect
     #     ConstrainCores: True

From 720facebb98c286fc913d6bb6441e1e4783e43c6 Mon Sep 17 00:00:00 2001
From: Nagachandan-P <Nagachandan.p@dell.com>
Date: Tue, 10 Feb 2026 10:39:55 +0000
Subject: [PATCH 116/172] slurmd fix

---
 .../templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2    | 2 ++
 .../templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2     | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
index a81d564ba6..3dc8f65514 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
@@ -468,4 +468,6 @@
 
         - /root/ldms_sampler.sh
 {% endif %}
+        - systemctl restart slurmd
+
         - echo "Cloud-Init has completed successfully."
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
index 5d930bef47..62a4e9e063 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
@@ -483,4 +483,6 @@
 
         - /root/ldms_sampler.sh
 {% endif %}
+        - systemctl restart slurmd
+
         - echo "Cloud-Init has completed successfully."

From b6e5b8b6185496f0dffb202d30468376174bafdb Mon Sep 17 00:00:00 2001
From: pullan1 <sudha.pullalaravu@dell.com>
Date: Tue, 10 Feb 2026 16:43:26 +0530
Subject: [PATCH 117/172] rpm_file type handling in pulp

Signed-off-by: pullan1 <sudha.pullalaravu@dell.com>
---
 .../input_validation/common_utils/config.py   |   1 +
 .../library/module_utils/local_repo/config.py |   7 +-
 .../local_repo/download_common.py             | 154 ++++++++++++++++++
 common/library/modules/parallel_tasks.py      |  11 +-
 common/library/modules/pulp_cleanup.py        |   2 +-
 5 files changed, 168 insertions(+), 7 deletions(-)

diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py
index e0b5b0ea46..e6e8a09042 100644
--- a/common/library/module_utils/input_validation/common_utils/config.py
+++ b/common/library/module_utils/input_validation/common_utils/config.py
@@ -146,6 +146,7 @@
 TYPE_REQUIREMENTS = {
     "rpm": ["package", "repo_name"],
     "rpm_list": ["package_list", "repo_name"],
+    "rpm_file": ["package", "url"],
     "ansible_galaxy_collection": ["package", "version"],
     "git": ["package", "version", "url"],
     "image": ["package", ["tag", "digest"]],  # Special: one of tag or digest
diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py
index e26e8a6e71..0518e2bb01 100644
--- a/common/library/module_utils/local_repo/config.py
+++ b/common/library/module_utils/local_repo/config.py
@@ -51,7 +51,7 @@
 # Used by software_utils.py
 # ----------------------------
 PACKAGE_TYPES = ['rpm', 'deb', 'tarball', 'image', 'manifest', 'git',
-                 'pip_module', 'deb', 'shell', 'ansible_galaxy_collection', 'iso', 'rpm_list']
+                 'pip_module', 'deb', 'shell', 'ansible_galaxy_collection', 'iso', 'rpm_list', 'rpm_file']
 CSV_COLUMNS = {"column1": "name", "column2": "status"}
 SOFTWARE_CONFIG_SUBDIR = "config"
 RPM_LABEL_TEMPLATE = "RPMs for {key}"
@@ -183,7 +183,10 @@
     "list_repositories": "pulp rpm repository list",
     "list_remotes": "pulp rpm remote list",
     "list_distributions": "pulp rpm distribution list",
-    "orphan_cleanup": "pulp orphan cleanup --protection-time 0"
+    "orphan_cleanup": "pulp orphan cleanup --protection-time 0",
+    "list_all_publications": "pulp rpm publication list",
+    "upload_content": "pulp rpm content upload --repository %s --file %s",
+    "update_distribution_repo_config": "pulp rpm distribution update --name %s --generate-repo-config"
 }
 
 # ----------------------------
diff --git a/common/library/module_utils/local_repo/download_common.py b/common/library/module_utils/local_repo/download_common.py
index f139384b23..892725b207 100644
--- a/common/library/module_utils/local_repo/download_common.py
+++ b/common/library/module_utils/local_repo/download_common.py
@@ -35,6 +35,7 @@
 from ansible.module_utils.local_repo.common_functions import load_pulp_config
 from ansible.module_utils.local_repo.config import (
     pulp_file_commands,
+    pulp_rpm_commands,
     CLI_FILE_PATH,
     POST_TIMEOUT,
     ISO_POLL_VAL,
@@ -1023,3 +1024,156 @@ def process_pip(package, repo_store_path, status_file_path,  cluster_os_type, cl
 
         logger.info("#" * 30 + f" {process_pip.__name__} end " + "#" * 30)
         return status
+
+def process_rpm_file(package, repo_store_path, status_file_path, cluster_os_type, cluster_os_version, arc, logger):
+    """
+    Process an RPM file package by downloading it and setting up a Pulp RPM repository.
+
+    Steps: download the RPM (skipped when the file is already on disk), create
+    the per-package Pulp repository if missing, upload the RPM, publish the
+    repository, create the distribution if missing, then enable generation of
+    the .repo config (a failure of that last step is only logged as a warning).
+
+    The outcome is written to the status file once, from the ``finally`` block,
+    unless the package name itself could not be read.
+
+    Args:
+        package (dict): A dictionary containing the package information.
+        repo_store_path (str): The path to the repository store.
+        status_file_path (str): The path to the status file.
+        cluster_os_type (str): The type of the cluster operating system.
+        cluster_os_version (str): The version of the cluster operating system.
+        arc (str): The architecture (x86_64 or aarch64).
+        logger (logging.Logger): The logger instance.
+
+    Returns:
+        str: "Success" if every mandatory step succeeded, otherwise "Failed".
+    """
+    logger.info("#" * 30 + f" {process_rpm_file.__name__} start " + "#" * 30)
+
+    # Bind everything the ``finally`` block reads up front so that a malformed
+    # package entry cannot turn into a NameError while reporting the failure.
+    status = "Failed"
+    package_name = package_type = repo_name = None
+
+    try:
+        package_name = package['package']
+        package_type = package['type']
+        repo_name = arc.lower() + "_" + package_name
+        url = package.get('url')
+
+        if not url:
+            logger.error(f"No URL provided for RPM file package: {package_name}")
+            return status
+
+        logger.info(f"Processing RPM File Package: {package_name}, URL: {url}")
+
+        # Create rpm_file directory structure
+        rpm_file_directory = os.path.join(
+            repo_store_path, "offline_repo", "cluster", arc.lower(),
+            cluster_os_type, cluster_os_version, "rpm_file", package_name
+        )
+        os.makedirs(rpm_file_directory, exist_ok=True)
+
+        # Extract filename from URL; a URL ending in "/" yields no file name
+        download_file_name = url.split('/')[-1]
+        if not download_file_name:
+            logger.error(f"Cannot derive RPM file name from URL: {url}")
+            return status
+        rpm_file_path = os.path.join(rpm_file_directory, download_file_name)
+
+        # Step 1: Download the RPM file
+        logger.info("Step 1: Downloading RPM file...")
+        if os.path.exists(rpm_file_path):
+            logger.info(f"RPM file already exists: {rpm_file_path}")
+        else:
+            # Verify URL exists (argument list, no shell, so the raw URL is passed)
+            subprocess.run(['wget', '-q', '--spider', '--tries=1', url], check=True)
+
+            # Download the file; the command is one string, so quote both operands
+            download_command = f"wget -O {shlex.quote(rpm_file_path)} {shlex.quote(url)}"
+            if not execute_command(download_command, logger):
+                logger.error(f"Failed to download RPM file from: {url}")
+                # wget -O creates the target even on failure; remove it so a
+                # later run does not mistake the leftover for a finished download
+                if os.path.exists(rpm_file_path):
+                    os.remove(rpm_file_path)
+                return status
+
+        # Step 2: CREATE A NEW RPM REPOSITORY IN PULP (if it doesn't exist)
+        logger.info("Step 2: Creating RPM repository in Pulp...")
+        # Check if repository already exists
+        if execute_command(pulp_rpm_commands["show_repository"] % repo_name, logger):
+            logger.info(f"RPM repository {repo_name} already exists. Skipping creation.")
+        else:
+            logger.info(f"Creating RPM repository: {repo_name}")
+            if not execute_command(pulp_rpm_commands["create_repository"] % repo_name, logger):
+                logger.error(f"Failed to create RPM repository: {repo_name}")
+                return status
+
+        # Step 3: UPLOAD THE RPM INTO THE REPO
+        logger.info("Step 3: Uploading RPM to repository...")
+        upload_command = pulp_rpm_commands["upload_content"] % (repo_name, shlex.quote(rpm_file_path))
+        if not execute_command(upload_command, logger):
+            logger.error(f"Failed to upload RPM to repository: {repo_name}")
+            return status
+
+        # Step 4: PUBLISH THE REPOSITORY
+        logger.info("Step 4: Publishing repository...")
+        if not execute_command(pulp_rpm_commands["publish_repository"] % repo_name, logger):
+            logger.error(f"Failed to publish repository: {repo_name}")
+            return status
+
+        # Step 5: CREATE A DISTRIBUTION FOR THE REPO (if it doesn't exist)
+        logger.info("Step 5: Creating distribution...")
+
+        # Check if distribution already exists
+        if execute_command(pulp_rpm_commands["check_distribution"] % repo_name, logger):
+            logger.info(f"Distribution {repo_name} already exists. Skipping creation.")
+        else:
+            logger.info(f"Creating distribution: {repo_name}")
+            # Get the publication href
+            pub_result = execute_command(pulp_rpm_commands["list_all_publications"], logger, type_json=True)
+            if not pub_result or not pub_result.get("stdout"):
+                logger.error("Failed to get publication list")
+                return status
+
+            # NOTE(review): the href is only checked for presence; the command
+            # below is built from repo_name and base_path alone - confirm intent.
+            publication_href = pub_result["stdout"][0].get("pulp_href")
+            if not publication_href:
+                logger.error("No publication href found")
+                return status
+
+            # NOTE(review): rhel/10.0 is hard-coded although cluster_os_type and
+            # cluster_os_version are available - confirm before parameterizing.
+            base_path = f"opt/omnia/offline_repo/cluster/{arc}/rhel/10.0/rpms/{repo_name}"
+            dist_create_command = pulp_rpm_commands["distribute_repository"] % (repo_name, base_path, repo_name)
+            if not execute_command(dist_create_command, logger):
+                logger.error(f"Failed to create distribution: {repo_name}")
+                return status
+
+        # Step 6: ENABLE AUTO-GENERATION OF .repo FILES
+        logger.info("Step 6: Enabling auto-generation of .repo files...")
+        update_command = pulp_rpm_commands["update_distribution_repo_config"] % repo_name
+        if not execute_command(update_command, logger):
+            logger.warning(f"Failed to enable repo config generation for: {repo_name}")
+            # Not a critical failure, continue
+
+        logger.info(f"RPM file package {package_name} processed successfully!")
+        status = "Success"
+
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Error executing RPM file commands: {e}")
+    except Exception as e:
+        logger.error(f"Error processing RPM file package: {e}")
+
+    finally:
+        # Single status write for every exit path; skipped only when the
+        # package name itself could not be read (nothing to record against)
+        if package_name is not None:
+            write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock, repo_name)
+        logger.info("#" * 30 + f" {process_rpm_file.__name__} end " + "#" * 30)
+
+    # Returning after ``finally`` (not inside it) lets non-Exception signals propagate
+    return status
\ No newline at end of file
diff --git a/common/library/modules/parallel_tasks.py b/common/library/modules/parallel_tasks.py
index 4fd910d027..5951a525b2 100644
--- a/common/library/modules/parallel_tasks.py
+++ b/common/library/modules/parallel_tasks.py
@@ -28,7 +28,8 @@
     process_shell,
     process_ansible_galaxy_collection,
     process_iso,
-    process_pip
+    process_pip,
+    process_rpm_file
 )
 from ansible.module_utils.local_repo.download_image import process_image
 from ansible.module_utils.local_repo.download_rpm import process_rpm
@@ -175,6 +176,8 @@ def determine_function(task, repo_store_path, csv_file_path, user_data, version_
             return process_pip, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc]
         if task_type == "image":
             return process_image, [task, status_file, version_variables, user_registries, docker_username, docker_password]
+        if task_type == "rpm_file":
+            return process_rpm_file, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc]
         if task_type == "rpm":
             return process_rpm, [task, repo_store_path, status_file,
                                  cluster_os_type, cluster_os_version, repo_config_value, arc]
@@ -251,13 +254,13 @@ def generate_software_status_table(status_dict,slogger):
             table.field_names = ["Name", "Status"]
             for name, status in items:
                 table.add_row([name, status.lower()])
-            
+
             tables.append(table.get_string())
             slogger.info(f"Completed table for {arch}")
-            
+
         slogger.info("Software status table generation completed successfully")
         return "\n\n".join(tables)
-    
+
     except Exception as e:
         slogger.error(f"Error occurred while generating software status table: {e}")
         return f"Error: {e}"
diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py
index 6f80e82f83..00ed27d0dd 100644
--- a/common/library/modules/pulp_cleanup.py
+++ b/common/library/modules/pulp_cleanup.py
@@ -630,7 +630,7 @@ def remove_rpms_from_repository(repo_name: str, base_path: str, logger) -> List[
 
                         logger.info(f"Processing row: {row}")
                         # For RPMs, check if they belong to the deleted repository
-                        if row_type == 'rpm':
+                        if row_type == 'rpm' or row_type == 'rpm_file':
                             if has_repo_column and rpm_repo == repo_name:
                                 removed = True
                                 logger.info(f"Removing RPM '{name}' from {status_file} (repo {repo_name} deleted)")

From bd71162f9a6d6740b1c369b3e69cd9685405c50a Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Tue, 10 Feb 2026 18:46:34 +0530
Subject: [PATCH 118/172] Removing unneeded logic for storage_config and
 provision_config

---
 .../tasks/transform_provision_config.yml        | 17 +----------------
 .../tasks/transform_storage_config.yml          | 15 ---------------
 .../templates/storage_config.j2                 | 17 -----------------
 .../roles/import_input_parameters/vars/main.yml | 15 +++++++++------
 4 files changed, 10 insertions(+), 54 deletions(-)

diff --git a/upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml
index 71e9ee0dc2..42598d59bc 100644
--- a/upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml
+++ b/upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml
@@ -44,25 +44,10 @@
 
 - name: Normalize provision_config.yml values
   ansible.builtin.set_fact:
-    provision_pxe_mapping_file_path_raw: >-
-      {{
-        backup_provision_config.pxe_mapping_file_path
-        | default('/opt/omnia/input/project_default/pxe_mapping_file.csv')
-      }}
+    provision_pxe_mapping_file_path: "{{ backup_provision_config.pxe_mapping_file_path | default('pxe_mapping_file.csv') }}"
     provision_language: "{{ backup_provision_config.language | default('en_US.UTF-8') }}"
     provision_default_lease_time: "{{ backup_provision_config.default_lease_time | default('86400') }}"
 
-- name: Rewrite legacy pxe_mapping_file_path to current project input directory
-  ansible.builtin.set_fact:
-    provision_pxe_mapping_file_path: >-
-      {{
-        (
-          provision_pxe_mapping_file_path_raw
-          | string
-          | regex_replace('^/opt/omnia/input/project_default/', input_project_dir ~ '/')
-        )
-      }}
-
 - name: Fail if pxe_mapping_file_path is missing
   ansible.builtin.fail:
     msg: "{{ msg_pxe_mapping_file_path_missing }}"
diff --git a/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml
index 54dbf07bc0..8a167df6fb 100644
--- a/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml
+++ b/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml
@@ -45,19 +45,6 @@
 - name: Normalize storage_config.yml values
   ansible.builtin.set_fact:
     storage_nfs_client_params: "{{ backup_storage_config.nfs_client_params | default([]) }}"
-    storage_powervault_config: "{{ backup_storage_config.powervault_config | default(none) }}"
-    storage_has_powervault: "{{ backup_storage_config.powervault_config is defined }}"
-
-- name: Fail if powervault_config is present but missing mandatory keys
-  ansible.builtin.fail:
-    msg: "{{ msg_powervault_missing_keys }}"
-  when:
-    - storage_has_powervault
-    - storage_powervault_config.ip is not defined or (storage_powervault_config.ip | default([]) | length) == 0
-      or storage_powervault_config.isci_initiators is not defined
-      or (storage_powervault_config.isci_initiators | string | trim) == ''
-      or storage_powervault_config.volume_id is not defined
-      or (storage_powervault_config.volume_id | string | trim) == ''
 
 - name: Fail if nfs_client_params is missing
   ansible.builtin.fail:
@@ -80,8 +67,6 @@
     mode: "{{ default_file_mode }}"
   vars:
     storage_nfs_client_params: "{{ storage_nfs_client_params }}"
-    storage_powervault_config: "{{ storage_powervault_config }}"
-    storage_has_powervault: "{{ storage_has_powervault }}"
 
 - name: Validate YAML syntax of transformed storage_config.yml
   ansible.builtin.command:
diff --git a/upgrade/roles/import_input_parameters/templates/storage_config.j2 b/upgrade/roles/import_input_parameters/templates/storage_config.j2
index 1c695a19a5..f6be3642c4 100644
--- a/upgrade/roles/import_input_parameters/templates/storage_config.j2
+++ b/upgrade/roles/import_input_parameters/templates/storage_config.j2
@@ -43,23 +43,6 @@
 #  isci_initiators: iqn.initiator.com.example:7d7d7d7d7d7
 #  volume_id: 00c0ff4343f1f1f1001c8c4e6901000000
 
-{% if storage_has_powervault %}
-powervault_config:
-{% if storage_powervault_config.ip is defined %}
-  ip:
-{% for _ip in (storage_powervault_config.ip | default([])) %}
-    - {{ _ip }}
-{% endfor %}
-{% else %}
-  ip: []
-{% endif %}
-{% if storage_powervault_config.port is defined %}
-  port: {{ storage_powervault_config.port }}
-{% endif %}
-  isci_initiators: {{ storage_powervault_config.isci_initiators | default('') }}
-  volume_id: {{ storage_powervault_config.volume_id | default('') }}
-{% endif %}
-
 # -----------------------------NFS------------------------------------------------
 
 # This variable is used for mounting NFS share on slurm_control_node, slurm_node, login_node
diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml
index 722399b7d0..65e8b65f38 100644
--- a/upgrade/roles/import_input_parameters/vars/main.yml
+++ b/upgrade/roles/import_input_parameters/vars/main.yml
@@ -57,12 +57,15 @@ msg_using_backup_provision_config: "Using backup provision_config.yml (backup no
 msg_pxe_mapping_file_path_missing: "pxe_mapping_file_path is mandatory"
 
 # Storage config transformation messages
-msg_backup_storage_config_missing: "Backup storage_config.yml missing"
-msg_storage_config_missing: "storage_config.yml missing"
-msg_using_backup_storage_config: "Using backup storage_config.yml (backup not modified)"
-msg_nfs_client_params_missing: "nfs_client_params is mandatory"
-msg_nfs_client_param_entry_missing_keys: "Each entry in nfs_client_params must define server_ip, server_share_path, client_share_path, and client_mount_options"
-msg_powervault_missing_keys: "powervault_config (when present) must define ip (non-empty list), isci_initiators, and volume_id"
+msg_backup_storage_config_missing: "storage_config.yml not found in backup at {{ backup_location }}/storage_config.yml"
+msg_storage_config_missing: "storage_config.yml not found at {{ input_project_dir }}/storage_config.yml"
+msg_nfs_client_params_missing: "storage_config.yml must define nfs_client_params with at least one entry"
+msg_nfs_client_param_entry_missing_keys: "Each nfs_client_params entry must define server_ip, server_share_path, and client_share_path"
+msg_using_backup_storage_config: "Transforming storage_config.yml from backup at {{ backup_location }}/storage_config.yml"
+msg_storage_config_transform_summary: |-
+  Transformed storage_config.yml from Omnia 2.0 to 2.1:
+  - Preserved nfs_client_params from backup
+  - Applied schema defaults for missing fields
 
 # Omnia config transformation messages
 msg_backup_omnia_config_missing: "Backup omnia_config.yml missing"

From 8b26ad8102716c2833330b56f8c5df6f2fa2368e Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Tue, 10 Feb 2026 19:52:08 +0530
Subject: [PATCH 119/172] Update main.yml

---
 upgrade/roles/import_input_parameters/vars/main.yml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml
index 65e8b65f38..c27f111cde 100644
--- a/upgrade/roles/import_input_parameters/vars/main.yml
+++ b/upgrade/roles/import_input_parameters/vars/main.yml
@@ -62,10 +62,6 @@ msg_storage_config_missing: "storage_config.yml not found at {{ input_project_di
 msg_nfs_client_params_missing: "storage_config.yml must define nfs_client_params with at least one entry"
 msg_nfs_client_param_entry_missing_keys: "Each nfs_client_params entry must define server_ip, server_share_path, and client_share_path"
 msg_using_backup_storage_config: "Transforming storage_config.yml from backup at {{ backup_location }}/storage_config.yml"
-msg_storage_config_transform_summary: |-
-  Transformed storage_config.yml from Omnia 2.0 to 2.1:
-  - Preserved nfs_client_params from backup
-  - Applied schema defaults for missing fields
 
 # Omnia config transformation messages
 msg_backup_omnia_config_missing: "Backup omnia_config.yml missing"

From c4a559fbe1b42394be88f545b6993636030b0e6b Mon Sep 17 00:00:00 2001
From: mcas <sakshi.s@dell.com>
Date: Wed, 11 Feb 2026 11:16:26 +0530
Subject: [PATCH 120/172] aarch64 changes

---
 ...-group-login_compiler_node_aarch64.yaml.j2 | 135 ++++++++----------
 .../ci-group-slurm_node_aarch64.yaml.j2       |  27 ++++
 .../ci-group-slurm_node_x86_64.yaml.j2        |  17 +--
 .../hpc_tools/configure_nvhpc_env.sh.j2       |   4 +-
 .../hpc_tools/export_nvhpc_env.sh.j2          |   4 +-
 .../hpc_tools/install_nvhpc_sdk.sh.j2         |  22 ++-
 .../templates/hpc_tools/install_openmpi.sh.j2 |  12 +-
 .../templates/hpc_tools/install_ucx.sh.j2     |  12 +-
 discovery/roles/slurm_config/vars/main.yml    |   4 +
 .../aarch64/rhel/10.0/slurm_custom.json       |   6 +
 10 files changed, 152 insertions(+), 91 deletions(-)

diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2
index bc3068843a..efef2715c6 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2
@@ -183,6 +183,18 @@
             {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }}
 {% endif %}
 
+        - path: /usr/local/bin/install_openmpi.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/install_openmpi.sh.j2') | indent(12) }}
+
+        - path: /usr/local/bin/install_ucx.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/install_ucx.sh.j2') | indent(12) }}
+
         - path: /etc/hosts
           append: true
           content: |
@@ -200,6 +212,18 @@
           permissions: '0644'
           content: |
             {{ lookup('template', 'templates/nodes/apptainer_mirror.conf.j2') | indent(12) }}
+        
+        - path: /usr/local/bin/install_nvhpc_sdk.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/install_nvhpc_sdk.sh.j2') | indent(12) }}
+        
+        - path: /usr/local/bin/configure_nvhpc_env.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/configure_nvhpc_env.sh.j2') | indent(12) }}
 
       runcmd:
         - /usr/local/bin/set-ssh.sh
@@ -218,6 +242,39 @@
         - mount -a
         - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
         - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf
+
+{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %}
+        # Add NFS entry and mount
+        - mkdir -p {{ client_mount_path }}
+        - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab
+        - mount -a
+{% endif %}
+
+{% if hostvars['localhost']['ucx_support'] %}
+        - echo "===== UCX Setup ====="
+        - echo "UCX support is enabled."
+        - /usr/local/bin/install_ucx.sh
+        # - echo "Build script available at"
+        # - echo "  /usr/local/bin/install_ucx.sh"
+        # - echo "NFS must be mounted at {{ client_mount_path }} before running."
+{% endif %}
+
+{% if hostvars['localhost']['openmpi_support'] %}
+        - echo "===== OpenMPI Setup ====="
+        - echo "OpenMPI support is enabled."
+        - /usr/local/bin/install_openmpi.sh
+        # - echo "Build script available at"
+        # - echo "  /usr/local/bin/install_openmpi.sh"
+        # - echo "Run UCX installation first if UCX support is enabled."
+        # - echo "NFS must be mounted at {{ client_mount_path }} before running."
+{% endif %}
+
+{% if hostvars['localhost']['ldms_support'] %}
+        - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log
+
+        - /root/ldms_sampler.sh
+{% endif %}
+
         - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh
         - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/
         - /usr/local/bin/check_slurm_controller_status.sh
@@ -279,79 +336,7 @@
 
 {% endif %}
 
-{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %}
-        # Add NFS entry and mount
-        - mkdir -p {{ client_mount_path }}
-        - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab
-        - mount -a
-{% endif %}
-
-{% if hostvars['localhost']['ucx_support'] %}
-        # UCX build and install
-        - |
-          UCX_BIN={{ client_mount_path }}/benchmarks/ucx
-          mkdir -p {{ client_mount_path }}/compile/ucx
-          mkdir -p {{ client_mount_path }}/benchmarks/ucx
-          cd {{ client_mount_path }}/compile/ucx
-          wget --no-check-certificate https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/aarch64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/ucx/ucx.tar.gz -O ucx.tar.gz
-          tar xzf ucx.tar.gz
-          cd ucx-*
-          mkdir -p build
-          cd build
-          ../contrib/configure-release --prefix={{ client_mount_path }}/benchmarks/ucx
-          make -j 8
-          make install
-{% endif %}
-
-{% if hostvars['localhost']['openmpi_support'] %}
-        # OpenMPI build and install with UCX + Slurm detection
-        - |
-          OPENMPI_INSTALL_PREFIX="{{ client_mount_path }}/benchmarks/openmpi"
-          OPENMPI_SRC="{{ client_mount_path }}/compile/openmpi"
-          mkdir -p $OPENMPI_SRC
-          mkdir -p $OPENMPI_INSTALL_PREFIX
-
-          cd $OPENMPI_SRC
-          wget --no-check-certificate https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/aarch64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/openmpi/openmpi.tar.gz -O openmpi.tar.gz
-
-          tar xzf openmpi.tar.gz
-          cd openmpi-*
-          mkdir -p build
-
-          # Check Slurm
-          if sinfo >/dev/null 2>&1; then
-            SLURM_FLAG="--with-slurm=yes --with-munge=/usr"
-          else
-            SLURM_FLAG="--with-slurm=no"
-          fi
-
-          # Check UCX
-          if [ -x "{{ client_mount_path }}/benchmarks/ucx/bin/ucx_info" ]; then
-            {{ client_mount_path }}/benchmarks/ucx/bin/ucx_info -v
-            if [ $? -eq 0 ]; then
-              UCX_FLAG="--with-ucx={{ client_mount_path }}/benchmarks/ucx"
-            else
-              echo "ucx_info failed, disabling UCX"
-              UCX_FLAG=""
-            fi
-          else
-            echo "ucx_info not found, disabling UCX"
-            UCX_FLAG=""
-          fi
-
-          cd build
-          ../configure --prefix=$OPENMPI_INSTALL_PREFIX \
-            --enable-mpi1-compatibility \
-            --enable-prte-prefix-by-default \
-            $SLURM_FLAG $UCX_FLAG 2>&1 | tee config.out
-
-          make -j 8
-          make install
-{% endif %}
-
-{% if hostvars['localhost']['ldms_support'] %}
-        - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log
-
-        - /root/ldms_sampler.sh
-{% endif %}
+        # nvidia sdk install
+        - /usr/local/bin/install_nvhpc_sdk.sh
+        - /usr/local/bin/configure_nvhpc_env.sh
         - echo "Cloud-Init has completed successfully."
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
index 9b3ac1a501..59d5520440 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
@@ -396,6 +396,24 @@
           permissions: '0644'
           content: |
             {{ lookup('template', 'templates/nodes/apptainer_mirror.conf.j2') | indent(12) }}
+        
+        - path: /usr/local/bin/configure_ucx_openmpi_env.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/configure_ucx_openmpi_env.sh.j2') | indent(12) }}
+
+        - path: /usr/local/bin/setup_nvhpc_sdk.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/setup_nvhpc_sdk.sh.j2') | indent(12) }}
+
+        - path: /usr/local/bin/export_nvhpc_env.sh
+          owner: root:root
+          permissions: '{{ file_mode_755 }}'
+          content: |
+            {{ lookup('template', 'templates/hpc_tools/export_nvhpc_env.sh.j2') | indent(12) }}
 
       runcmd:
         - /usr/local/bin/set-ssh.sh
@@ -443,9 +461,18 @@
         - mount -a
 {% endif %}
 
+{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] %}
+        - echo "One or more shared components (UCX / OpenMPI / LDMS) are enabled."
+        - /usr/local/bin/configure_ucx_openmpi_env.sh
+
+{% endif %}
+
 {% if hostvars['localhost']['ldms_support'] %}
         - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log
 
         - /root/ldms_sampler.sh
 {% endif %}
+
+        - /usr/local/bin/setup_nvhpc_sdk.sh
+        - /usr/local/bin/export_nvhpc_env.sh
         - echo "Cloud-Init has completed successfully."
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
index 84440bbdec..1768bc8941 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
@@ -474,17 +474,14 @@
         - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab
         - mount -a
         - echo "One or more shared components (UCX / OpenMPI / LDMS) are enabled."
-        # - echo "Shared NFS mount is available at: {{ client_mount_path }}"
         - /usr/local/bin/configure_ucx_openmpi_env.sh
-        # - echo ""
-        # - echo "IMPORTANT:"
-        # - echo "1. Install UCX and/or OpenMPI on the LOGIN / COMPILER node first."
-        # - echo "2. Ensure they are installed under the shared mount:"
-        # - echo "   {{ client_mount_path }}/hpc_tools/benchmarks/"
-        # - echo "3. On this node, run the environment setup script when ready:"
-        # - echo ""
-        # - echo "This step is intentionally NOT run automatically."
-        - echo "=================================================="
+
+{% endif %}
+
+{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] %}
+        - echo "One or more shared components (UCX / OpenMPI / LDMS) are enabled."
+        - /usr/local/bin/configure_ucx_openmpi_env.sh
+
 {% endif %}
 
 {% if hostvars['localhost']['ldms_support'] %}
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2
index dfc30520b3..d0f788a986 100644
--- a/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2
@@ -11,7 +11,9 @@ export HOME=/root
 
 NVCOMPILERS="{{ nvhpc_local_mount | default('/opt/nvidia/nvhpc') }}"
 NVARCH="$(uname -s)_$(uname -m)"
-NVHPC_VERSION="{{ nvhpc_version | default('25.11') }}"
+NVHPC_LONG_VERSION="{{ nvhpc_version_long | default('2025_2511') }}"
+NVHPC_VERSION="$(echo ${NVHPC_LONG_VERSION} | cut -d'_' -f2 | sed 's/\(..\)\(..\)/\1.\2/')"
+
 
 NVHPC_BASE="$NVCOMPILERS/$NVARCH/$NVHPC_VERSION"
 PROFILE_FILE="/etc/profile.d/nvhpc.sh"
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2
index a0cfdfdbe8..1ff49968b4 100644
--- a/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2
@@ -5,7 +5,9 @@ CLIENT_MOUNT="{{ client_mount_path }}"
 
 NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc"
 NVARCH="$(uname -s)_$(uname -m)"
-NVHPC_VERSION="25.11"
+
+NVHPC_LONG_VERSION="{{ nvhpc_version_long | default('2025_2511') }}"
+NVHPC_VERSION="$(echo ${NVHPC_LONG_VERSION} | cut -d'_' -f2 | sed 's/\(..\)\(..\)/\1.\2/')"
 
 NVHPC_BASE="$NVHPC_LOCAL_MOUNT/$NVARCH/$NVHPC_VERSION"
 PROFILE_FILE="/etc/profile.d/nvhpc.sh"
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2
index 75478a470e..8ff149fca3 100644
--- a/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2
@@ -5,7 +5,25 @@ LOGFILE="/var/log/nvhpc_sdk_install.log"
 
 echo "===== Starting NVIDIA HPC SDK installation =====" | tee -a "$LOGFILE"
 
-NVHPC_PKG_NAME="{{ nvhpc_pkg_name | default('nvhpc_2025_2511_Linux_x86_64_cuda_13.0') }}"
+sys_arch="$(uname -m)"
+case "${sys_arch}" in
+    x86_64|amd64) arch="x86_64" ;;
+    aarch64|arm64) arch="aarch64" ;;
+    *)
+        echo "Unsupported architecture: ${sys_arch}"
+        exit 1
+        ;;
+esac
+
+NVHPC_VERSION="2025_2511"
+NVHPC_SHORT_VERSION="$(echo ${NVHPC_VERSION} | cut -d'_' -f2 | sed 's/\(..\)\(..\)/\1.\2/')"
+CUDA_VERSION="13.0"
+
+NVHPC_PKG_NAME="{{ nvhpc_pkg_name | default('') }}"
+if [ -z "${NVHPC_PKG_NAME}" ]; then
+    NVHPC_PKG_NAME="nvhpc_${NVHPC_VERSION}_Linux_${arch}_cuda_${CUDA_VERSION}"
+fi
+
 NVHPC_EXPORT="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk"
 NVHPC_MOUNT="/shared-nvhpc-sdk"
 NVHPC_TARBALL="$NVHPC_MOUNT/${NVHPC_PKG_NAME}.tar.gz"
@@ -47,7 +65,7 @@ else
 fi
 
 mkdir -p "$NVHPC_INSTALL_DIR_NFS"
-INSTALL_BIN_DIR="$NVHPC_INSTALL_DIR_NFS/Linux_x86_64/25.11/compilers/bin"
+INSTALL_BIN_DIR="$NVHPC_INSTALL_DIR_NFS/Linux_${arch}/${NVHPC_SHORT_VERSION}/compilers/bin"
 
 if [ -x "$INSTALL_BIN_DIR/nvc" ]; then
     echo "[INFO] NVHPC already installed. Skipping installer." | tee -a "$LOGFILE"
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2
index 9adde78472..5758b20094 100644
--- a/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2
@@ -29,10 +29,20 @@ echo "===== OpenMPI build started ====="
 mkdir -p "$OPENMPI_BUILD"
 cd "$OPENMPI_BUILD"
 
+sys_arch="$(uname -m)"
+case "${sys_arch}" in
+    x86_64|amd64) arch="x86_64" ;;
+    aarch64|arm64) arch="aarch64" ;;
+    *)
+        echo "Unsupported architecture: ${sys_arch}"
+        exit 1
+        ;;
+esac
+
 if [ ! -f openmpi.tar.gz ]; then
     echo "[INFO] Downloading OpenMPI source code..."
     wget --no-check-certificate \
-      https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/x86_64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/openmpi/openmpi.tar.gz \
+      https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/${arch}/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/openmpi/openmpi.tar.gz \
       -O openmpi.tar.gz >> "$LOGFILE" 2>&1
     echo "[INFO] OpenMPI download completed"
 else
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_ucx.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_ucx.sh.j2
index 0231d77683..55b7483c68 100644
--- a/discovery/roles/configure_ochami/templates/hpc_tools/install_ucx.sh.j2
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_ucx.sh.j2
@@ -26,10 +26,20 @@ echo "===== UCX build started ====="
 mkdir -p "$UCX_BUILD"
 cd "$UCX_BUILD"
 
+sys_arch="$(uname -m)"
+case "${sys_arch}" in
+    x86_64|amd64) arch="x86_64" ;;
+    aarch64|arm64) arch="aarch64" ;;
+    *)
+        echo "Unsupported architecture: ${sys_arch}"
+        exit 1
+        ;;
+esac
+
 if [ ! -f ucx.tar.gz ]; then
     echo "[INFO] Downloading UCX source code..."
     wget --no-check-certificate \
-      https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/x86_64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/ucx/ucx.tar.gz \
+      https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/${arch}/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/ucx/ucx.tar.gz \
       -O ucx.tar.gz >> "$LOGFILE" 2>&1
     echo "[INFO] UCX download completed"
 else
diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml
index f911ce975e..454661fc7e 100644
--- a/discovery/roles/slurm_config/vars/main.yml
+++ b/discovery/roles/slurm_config/vars/main.yml
@@ -158,5 +158,9 @@ parallel_copy_candidates:
   - name: nvhpc_sdk_x86_64
     src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_x86_64_relpath | dirname }}/"
     dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/"
+  
+  - name: nvhpc_sdk_aarch64
+    src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_aarch64_relpath | dirname }}/"
+    dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/"
 
 backup_dir: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/backup_{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}"
diff --git a/input/config/aarch64/rhel/10.0/slurm_custom.json b/input/config/aarch64/rhel/10.0/slurm_custom.json
index 2483775495..e1ba8926a2 100644
--- a/input/config/aarch64/rhel/10.0/slurm_custom.json
+++ b/input/config/aarch64/rhel/10.0/slurm_custom.json
@@ -32,7 +32,13 @@
             {"package": "cuda-run",
              "type": "iso",
              "url": "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/cuda_13.0.2_580.95.05_linux_sbsa.run"
+            },
+            {
+            "package": "nvhpc_2025_2511_Linux_aarch64_cuda_13.0",
+            "type": "tarball",
+            "url": "https://developer.download.nvidia.com/hpc-sdk/25.11/nvhpc_2025_2511_Linux_aarch64_cuda_13.0.tar.gz"
             }
+             
         ]
     },
     "login_node":{

From 9dbb5587a0945903830b07f70626651f691da23b Mon Sep 17 00:00:00 2001
From: mcas <sakshi.s@dell.com>
Date: Wed, 11 Feb 2026 12:03:04 +0530
Subject: [PATCH 121/172] file path name change

---
 discovery/roles/slurm_config/vars/main.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml
index 454661fc7e..8ec9f5b2cb 100644
--- a/discovery/roles/slurm_config/vars/main.yml
+++ b/discovery/roles/slurm_config/vars/main.yml
@@ -131,8 +131,10 @@ offline_path_aarch64:
 ssh_private_key_path: /root/.ssh/oim_rsa
 
 # nvidia sdk vars
-nvhpc_package_name: "nvhpc_2025_2511_Linux_x86_64_cuda_13.0"
-nvhpc_tarball_x86_64_relpath: "offline_repo/cluster/x86_64/rhel/10.0/tarball/{{ nvhpc_package_name }}/{{ nvhpc_package_name }}.tar.gz"
+# Fully resolved tarball relative paths (no nested Jinja2)
+nvhpc_tarball_x86_64_relpath: "offline_repo/cluster/x86_64/rhel/10.0/tarball/nvhpc_2025_2511_Linux_x86_64_cuda_13.0/nvhpc_2025_2511_Linux_x86_64_cuda_13.0.tar.gz"
+nvhpc_tarball_aarch64_relpath: "offline_repo/cluster/aarch64/rhel/10.0/tarball/nvhpc_2025_2511_Linux_aarch64_cuda_13.0/nvhpc_2025_2511_Linux_aarch64_cuda_13.0.tar.gz"
+
 nvhpc_nfs_rel_dir: "hpc_tools/nvidia_sdk"
 
 # parallel file copy

From 8b7e4e7719d9dbb4e081fcb3bb5329dafa1ecb72 Mon Sep 17 00:00:00 2001
From: mcas <sakshi.s@dell.com>
Date: Wed, 11 Feb 2026 12:26:26 +0530
Subject: [PATCH 122/172] lint fix

---
 discovery/roles/slurm_config/vars/main.yml | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml
index 8ec9f5b2cb..981a113610 100644
--- a/discovery/roles/slurm_config/vars/main.yml
+++ b/discovery/roles/slurm_config/vars/main.yml
@@ -132,8 +132,13 @@ ssh_private_key_path: /root/.ssh/oim_rsa
 
 # nvidia sdk vars
 # Fully resolved tarball relative paths (no nested Jinja2)
-nvhpc_tarball_x86_64_relpath: "offline_repo/cluster/x86_64/rhel/10.0/tarball/nvhpc_2025_2511_Linux_x86_64_cuda_13.0/nvhpc_2025_2511_Linux_x86_64_cuda_13.0.tar.gz"
-nvhpc_tarball_aarch64_relpath: "offline_repo/cluster/aarch64/rhel/10.0/tarball/nvhpc_2025_2511_Linux_aarch64_cuda_13.0/nvhpc_2025_2511_Linux_aarch64_cuda_13.0.tar.gz"
+nvhpc_tarball_x86_64_relpath: >
+  offline_repo/cluster/x86_64/rhel/10.0/tarball/nvhpc_2025_2511_Linux_x86_64_cuda_13.0/
+  nvhpc_2025_2511_Linux_x86_64_cuda_13.0.tar.gz
+
+nvhpc_tarball_aarch64_relpath: >
+  offline_repo/cluster/aarch64/rhel/10.0/tarball/nvhpc_2025_2511_Linux_aarch64_cuda_13.0/
+  nvhpc_2025_2511_Linux_aarch64_cuda_13.0.tar.gz
 
 nvhpc_nfs_rel_dir: "hpc_tools/nvidia_sdk"
 
@@ -160,9 +165,9 @@ parallel_copy_candidates:
   - name: nvhpc_sdk_x86_64
     src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_x86_64_relpath | dirname }}/"
     dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/"
-  
+
   - name: nvhpc_sdk_aarch64
     src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_aarch64_relpath | dirname }}/"
     dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/"
 
-backup_dir: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/backup_{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}"
+backup_dir: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/backup_{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}"
\ No newline at end of file

From fa7a1d17b0bf7ccab519c36092452bdda52cdf65 Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Wed, 11 Feb 2026 13:19:53 +0530
Subject: [PATCH 123/172] Partition normal made default

---
 discovery/roles/slurm_config/defaults/main.yml | 1 -
 discovery/roles/slurm_config/vars/main.yml     | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/discovery/roles/slurm_config/defaults/main.yml b/discovery/roles/slurm_config/defaults/main.yml
index ad7ab09058..955e4c2a37 100644
--- a/discovery/roles/slurm_config/defaults/main.yml
+++ b/discovery/roles/slurm_config/defaults/main.yml
@@ -67,7 +67,6 @@ __default_config:
     PartitionName:
       - PartitionName: DEFAULT
         Nodes: ALL
-        Default: true
         MaxTime: INFINITE
         State: UP
     # S_P_ARRAY type paramater to be provided this way
diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml
index 89166b1f12..43ee995e5a 100644
--- a/discovery/roles/slurm_config/vars/main.yml
+++ b/discovery/roles/slurm_config/vars/main.yml
@@ -125,6 +125,7 @@ partition_params:
   Nodes: "{{ cmpt_list | join(',') }}"
   MaxTime: "INFINITE"
   State: "UP"
+  Default: "YES"
 openldap_dir_name: "openldap/"
 software_config_file: "{{ input_project_dir }}/software_config.json"
 omnia_run_tags: "{{ hostvars['localhost']['omnia_run_tags'] }}"

From 6a9642dba625314bde1d5554fd33787d5f9ffa62 Mon Sep 17 00:00:00 2001
From: mcas <sakshi.s@dell.com>
Date: Wed, 11 Feb 2026 13:32:28 +0530
Subject: [PATCH 124/172] commit

---
 .../templates/hpc_tools/install_openmpi.sh.j2         | 11 -----------
 .../templates/hpc_tools/setup_nvhpc_sdk.sh.j2         |  4 ++--
 2 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2
index 5758b20094..9cd0d8d1a4 100644
--- a/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2
@@ -105,22 +105,11 @@ EOF
 
 chmod 644 "$OPENMPI_ENV_FILE"
 
-# Verify installation
-echo "[INFO] Verifying OpenMPI installation..."
-if [ -f "$OPENMPI_PREFIX/bin/ompi_info" ]; then
-    OPENMPI_VERSION=$("$OPENMPI_PREFIX/bin/ompi_info" --version | head -1)
-    echo "[SUCCESS] OpenMPI installation verified - Version: $OPENMPI_VERSION" | tee -a "$LOGFILE"
-else
-    echo "[ERROR] OpenMPI installation verification failed - ompi_info not found" | tee -a "$LOGFILE"
-    exit 1
-fi
 
 # Create installation summary
 echo ""
 echo "===== OpenMPI Installation Summary ====="
 echo "Installation Status: SUCCESS"
-echo "OpenMPI Version: $OPENMPI_VERSION"
-
 echo "Integration Status:"
 if [ "$SLURM_FLAG" = "--with-slurm=yes --with-munge=/usr" ]; then
     echo "  - Slurm Integration: ENABLED"
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2
index 8169d1f5a6..a101852ea2 100644
--- a/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2
@@ -11,7 +11,7 @@ NVHPC_NFS_SHARE="$PARENT_MOUNT/nvhpc"
 NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc"
 
 mkdir -p "$PARENT_MOUNT"
-mkdir -p "$NVHPC_NFS_SHARE"
+
 
 if ! mountpoint -q "$PARENT_MOUNT"; then
     mount -t nfs "$PARENT_NFS" "$PARENT_MOUNT"
@@ -24,7 +24,7 @@ fi
 
 echo "[INFO] Parent NVHPC export mounted"
 
-
+mkdir -p "$NVHPC_NFS_SHARE"
 # 3. Ensure fstab entry exists (bind mount, NOT NFS)
 if ! grep -qF "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT" /etc/fstab; then
     echo "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT none bind,_netdev 0 0" >> /etc/fstab

From 12b39befc3316b479c86bc676170674e12f63a99 Mon Sep 17 00:00:00 2001
From: sakshi-singla-1735 <sakshi.s@dell.com>
Date: Wed, 11 Feb 2026 08:58:56 +0000
Subject: [PATCH 125/172] lint fix

Signed-off-by: sakshi-singla-1735 <sakshi.s@dell.com>
---
 discovery/roles/slurm_config/vars/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml
index f7af9c09a4..8406816341 100644
--- a/discovery/roles/slurm_config/vars/main.yml
+++ b/discovery/roles/slurm_config/vars/main.yml
@@ -197,4 +197,4 @@ parallel_copy_candidates:
     src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_aarch64_relpath | dirname }}/"
     dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/"
 
-backup_dir: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/backup_{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}"
\ No newline at end of file
+backup_dir: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/backup_{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}"

From 8c5d7291bc9038a13d71ef11f19d78de33f016f5 Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Wed, 11 Feb 2026 14:35:50 +0530
Subject: [PATCH 126/172] Failed when flag fix

---
 discovery/roles/slurm_config/tasks/check_ctld_running.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml
index 5af73f984c..0c7626f3dd 100644
--- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml
+++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml
@@ -76,7 +76,7 @@
       delegate_to: "{{ target_host }}"
       when: reachable_hosts | length > 0
       ignore_unreachable: true
-      failed_when: true
+      failed_when: false
 
     - name: Trigger the scontrol reconfigure
       ansible.builtin.command: scontrol reconfigure

From 9e076ab65db2ca1e74d7c0b3b754e9437cc108fd Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Wed, 11 Feb 2026 14:50:28 +0530
Subject: [PATCH 127/172] PartitionName validation key added

---
 .../input_validation/common_utils/slurm_conf_utils.py           | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
index 0c98f64e6c..53516d2523 100644
--- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
+++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
@@ -109,7 +109,7 @@ class SlurmParserEnum(str, Enum):
 
 
 slurm_partitionname_options = {
-    "Partition": S_P_STRING,
+    "PartitionName": S_P_STRING,
     "AllocNodes": S_P_CSV,
     "AllowAccounts": S_P_CSV,
     "AllowGroups": S_P_CSV,

From 70d7ce92f8f97e422692514031267782bf4d3f7f Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Wed, 11 Feb 2026 15:11:13 +0530
Subject: [PATCH 128/172] Syntax error

---
 .../input_validation/common_utils/slurm_conf_utils.py          | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
index 53516d2523..3ea4d07a70 100644
--- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
+++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
@@ -811,8 +811,7 @@ def validate_config_types(conf_dict, conf_name, module):
 
             elif expected_type == "array":
                 if not isinstance(value, list):
-                    error = f"Expected array (list), got {
-                        type(value).__name__}"
+                    error = f"Expected array (list), got {type(value).__name__}"
                 elif value:
                     if not all(isinstance(item, dict) for item in value):
                         error = "Expected array of dicts, got mixed types"

From 2a1b1907719db5da60d1db3bd9477f54f07fbf83 Mon Sep 17 00:00:00 2001
From: sakshi-singla-1735 <sakshi.s@dell.com>
Date: Wed, 11 Feb 2026 09:52:54 +0000
Subject: [PATCH 129/172] ssh

---
 .../ci-group-login_compiler_node_aarch64.yaml.j2          | 8 ++++++++
 .../cloud_init/ci-group-slurm_node_aarch64.yaml.j2        | 8 ++++++++
 2 files changed, 16 insertions(+)

diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2
index ca29315db6..dc2ddf9dcd 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2
@@ -70,6 +70,13 @@
                 done
             fi
 
+        - path: /root/.ssh/config
+          permissions: '0600'
+          content: |
+            Host {{ slurm_control_ssh_patterns }}
+                IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa
+                IdentitiesOnly yes
+
         - path: /usr/local/bin/install_cuda_toolkit.sh
           permissions: '0755'
           content: |
@@ -235,6 +242,7 @@
         - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge      /etc/munge       nfs defaults,_netdev 0 0" >> /etc/fstab
         - echo "{{ trackfile_nfs_path }}    /var/log/track       nfs defaults,_netdev 0 0" >> /etc/fstab
         - echo "{{ cloud_init_nfs_path}}/hpc_tools  /hpc_tools   nfs defaults,_netdev 0 0" >> /etc/fstab
+        - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab
         - echo "{{ cloud_init_nfs_path }}/cert  /cert   nfs defaults,_netdev 0 0" >> /etc/fstab
         - echo "{{ cloud_init_nfs_path }}/packages  /var/lib/packages   nfs defaults,_netdev 0 0" >> /etc/fstab
         - chmod {{ file_mode }} /etc/fstab
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
index c1a1b225c9..ffa228a769 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
@@ -72,6 +72,13 @@
                 done
             fi
 
+        - path: /root/.ssh/config
+          permissions: '0600'
+          content: |
+            Host {{ slurm_control_ssh_patterns }}
+                IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa
+                IdentitiesOnly yes
+
         - path: /usr/local/bin/install_nvidia_driver.sh
           permissions: '0755'
           content: |
@@ -246,6 +253,7 @@
             echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge      /etc/munge       nfs defaults,_netdev 0 0" >> /etc/fstab
             echo "{{ trackfile_nfs_path }}    /var/log/track       nfs defaults,_netdev 0 0" >> /etc/fstab
             echo "{{ cloud_init_nfs_path}}/hpc_tools/container_images  /hpc_tools/container_images   nfs defaults,_netdev 0 0" >> /etc/fstab
+            echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab
             echo "{{ cloud_init_nfs_path}}/hpc_tools/scripts  /hpc_tools/scripts   nfs defaults,_netdev 0 0" >> /etc/fstab
             echo "{{ cloud_init_nfs_path }}/cert  /cert   nfs defaults,_netdev 0 0" >> /etc/fstab
             echo "{{ cloud_init_nfs_path }}/packages  /var/lib/packages   nfs defaults,_netdev 0 0" >> /etc/fstab

From 66c160a18d030653f3377fac17ce068d9ac034b9 Mon Sep 17 00:00:00 2001
From: Katakam-Rakesh <katakam.rakesh@dell.com>
Date: Wed, 11 Feb 2026 15:40:06 +0530
Subject: [PATCH 130/172] updating warning to fail when user_registry is not
 reachable

Signed-off-by: Katakam-Rakesh <katakam.rakesh@dell.com>
---
 local_repo/roles/validation/tasks/main.yml | 7 +++----
 local_repo/roles/validation/vars/main.yml  | 1 +
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/local_repo/roles/validation/tasks/main.yml b/local_repo/roles/validation/tasks/main.yml
index 41f584dd15..0f578af349 100644
--- a/local_repo/roles/validation/tasks/main.yml
+++ b/local_repo/roles/validation/tasks/main.yml
@@ -44,10 +44,9 @@
     timeout: "{{ time_out }}"
   register: registry_check_result
 
-- name: Warning - Display unreachable registries
-  ansible.builtin.pause:
-    prompt: "{{ registry_check_result.unreachable_registries | join(', ') }}\n{{ user_registry_msg }}"
-    seconds: "{{ warning_wait_time_warning }}"
+- name: Fail - Unreachable registries detected
+  ansible.builtin.fail:
+    msg: "{{ unreachable_registries_fail_msg }}"
   when:
     - registry_check_result.unreachable_registries is defined
     - registry_check_result.unreachable_registries | length > 0
diff --git a/local_repo/roles/validation/vars/main.yml b/local_repo/roles/validation/vars/main.yml
index 08a082ded7..87e8733498 100644
--- a/local_repo/roles/validation/vars/main.yml
+++ b/local_repo/roles/validation/vars/main.yml
@@ -145,6 +145,7 @@ user_registry_fail_msg: "Failed. Please ensure user_registry is non empty list a
 user_registry_fail_host_cert_path_msg: "Failed. Each item in user_registry should have 'host' and 'cert_path' keys defined"
 time_out: 30
 user_registry_msg: "Above user registries is/are not reachable. Please make sure the user registry is accessible from the Omnia Infrastructure Manager."   # noqa: yaml[line-length]
+unreachable_registries_fail_msg: "Unreachable registries detected: {{ registry_check_result.unreachable_registries | join(', ') }}. {{ user_registry_msg }} Please check registry connectivity and configuration before proceeding."  # noqa: yaml[line-length]
 cert_path_failure_msg: "Certificate file path {{ item.item.cert_path }} does not exist on the Omnia Infrastructure Manager for host {{ item.item.host }}. Please verify that correct cert_path is given in {{ project_input_path }}/local_repo_config.yml"  # noqa: yaml[line-length]
 additional_packages_image_warning_msg: |
   WARNING: additional_packages.json contains packages of type 'image', but 'user_registry' is not defined in local_repo_config.yml.

From 889213466485c6b69aa47949a5be5d4daa0688e0 Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Wed, 11 Feb 2026 15:58:51 +0530
Subject: [PATCH 131/172] Added no_log wherever StoragePass was displayed in
 log

---
 .../input_validation/common_utils/slurm_conf_utils.py  |  2 --
 .../roles/slurm_config/tasks/build_slurm_conf.yml      |  6 ++++++
 discovery/roles/slurm_config/tasks/confs.yml           | 10 ++++++++++
 .../slurm_config/tasks/extract_path_overrides.yml      |  1 +
 .../roles/slurm_config/tasks/handle_extra_confs.yml    |  2 ++
 5 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
index 3ea4d07a70..a8c50266a0 100644
--- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
+++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py
@@ -765,8 +765,6 @@ def validate_config_types(conf_dict, conf_name, module):
     current_conf = all_confs.get(conf_name, {})
     if not current_conf:
         return {'invalid_keys': [], 'type_errors': []}
-    # module.fail_json(msg=f"Invalid configuration name: {conf_name}", conf_dict=conf_dict, current_conf=current_conf)
-    # module.warn(conf_name)
     invalid_keys = list(
         set(conf_dict.keys()).difference(set(current_conf.keys())))
     type_errors = []
diff --git a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml
index cd72cf33f0..9d5d0f0944 100644
--- a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml
+++ b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml
@@ -18,6 +18,7 @@
      | combine({'slurm': (apply_config['slurm']
      | combine({'NodeName': (apply_config['slurm'].NodeName | default([])) + (node_params | default([]))}))}) }}"
   when: node_params is defined and node_params
+  no_log: true
 
 - name: Append login nodes to NodeName list
   ansible.builtin.set_fact:
@@ -26,6 +27,7 @@
      | combine({'NodeName': (apply_config['slurm'].NodeName | default([])) + [{'NodeName': item}]}))}) }}"
   loop: "{{ login_list }}"
   when: login_list is defined and login_list
+  no_log: true
 
 - name: Append compiler login nodes to NodeName list
   ansible.builtin.set_fact:
@@ -34,6 +36,7 @@
      | combine({'NodeName': (apply_config['slurm'].NodeName | default([])) + [{'NodeName': item}]}))}) }}"
   loop: "{{ compiler_login_list }}"
   when: compiler_login_list is defined and compiler_login_list
+  no_log: true
 
 - name: Append Partition
   ansible.builtin.set_fact:
@@ -41,13 +44,16 @@
      | combine({'slurm': (apply_config['slurm']
      | combine({'PartitionName': (apply_config['slurm'].PartitionName | default([])) + [partition_params]}))}) }}"
   when: node_params is defined and node_params
+  no_log: true
 
 - name: Add gpu parameters to slurm conf
   ansible.builtin.set_fact:
     apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(gpu_slurm_conf))}) }}"
   when: gpu_params is defined and gpu_params
+  no_log: true
 
 - name: Add dbd parameters to slurm conf
   ansible.builtin.set_fact:
     apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(dbd_slurm_conf))}) }}"
   when: dbd_list is defined and dbd_list
+  no_log: true
diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml
index 5e1e59376a..12236d6ed8 100644
--- a/discovery/roles/slurm_config/tasks/confs.yml
+++ b/discovery/roles/slurm_config/tasks/confs.yml
@@ -15,6 +15,7 @@
 - name: Slurm dict ops
   ansible.builtin.set_fact:
     apply_config: "{{ __default_config }}"
+  no_log: true
 
 - name: Read NodeName parameters
   ansible.builtin.include_tasks: read_node_idrac.yml
@@ -30,6 +31,7 @@
      | combine({'slurmdbd': (apply_config['slurmdbd']
      | combine({'DbdHost': ctld_list[0], 'StorageHost': ctld_list[0]}))}) }}"
   when: ctld_list
+  no_log: true
 
 - name: Check .conf files existence
   ansible.builtin.stat:
@@ -46,6 +48,7 @@
   delegate_to: localhost
   loop: "{{ configs_input | default({}) | dict2items }}"
   register: parsed_configs_input_results
+  no_log: true
   when:
     - configs_input is defined
     - configs_input
@@ -56,6 +59,7 @@
   ansible.builtin.set_fact:
     parsed_configs_input: "{{ parsed_configs_input | default({}) | combine({item.item.key: item.conf_dict}) }}"
   loop: "{{ parsed_configs_input_results.results }}"
+  no_log: true
   when:
     - parsed_configs_input_results is defined
     - not item.skipped | default(false)
@@ -64,6 +68,7 @@
   ansible.builtin.set_fact:
     parsed_configs_input: "{{ parsed_configs_input | default({}) | combine({item.key: item.value}) }}"
   loop: "{{ configs_input | default({}) | dict2items }}"
+  no_log: true
   when:
     - configs_input is defined
     - configs_input
@@ -86,6 +91,7 @@
   loop_control:
     loop_var: existing_conf_set
   register: prepared_conf_lists
+  no_log: true
 
 # All the updates to the confs follow after this point before merge
 - name: Prepend ClusterName and SlurmctldHost to slurm conf sources
@@ -93,12 +99,14 @@
     conf_merge_dict: "{{ conf_merge_dict
      | combine({'slurm': [{'ClusterName': cluster_name, 'AccountingStorageHost': dbd_list[0], 'SlurmctldHost': ctld_list}] + conf_merge_dict['slurm']}) }}"
   when: "'slurm' in conf_merge_dict"
+  no_log: true
 
 - name: Slurm dbd - DbdHost and StorageHost
   ansible.builtin.set_fact:
     conf_merge_dict: "{{ conf_merge_dict
      | combine({'slurmdbd': [{'DbdHost': ctld_list[0], 'StorageHost': ctld_list[0]}] + conf_merge_dict['slurmdbd']}) }}"
   when: "'slurmdbd' in conf_merge_dict"
+  no_log: true
 
 - name: Merge the confs
   slurm_conf:
@@ -107,6 +115,7 @@
     conf_name: "{{ item.key }}"
   loop: "{{ conf_merge_dict | dict2items }}"
   register: merged_conf
+  no_log: true
 
 - name: Update slurm_conf_dict with merged configuration for cloud_init read. # TODO: Remove cloud init dependency
   ansible.builtin.set_fact:
@@ -169,6 +178,7 @@
     remote_src: "{{ copy_from_oim }}"
   loop: "{{ merged_conf.results }}"
   register: ctld_conf_files
+  no_log: true
   when:
     - item.ini_lines
 
diff --git a/discovery/roles/slurm_config/tasks/extract_path_overrides.yml b/discovery/roles/slurm_config/tasks/extract_path_overrides.yml
index ab1bf17aa6..0efcf18962 100644
--- a/discovery/roles/slurm_config/tasks/extract_path_overrides.yml
+++ b/discovery/roles/slurm_config/tasks/extract_path_overrides.yml
@@ -24,6 +24,7 @@
   ansible.builtin.set_fact:
     slurmdbd_merged_dict: "{{ (merged_conf.results | selectattr('item.key', 'equalto', 'slurmdbd') | first).conf_dict }}"
   when: "'slurmdbd' in conf_merge_dict"
+  no_log: true
 
 - name: Extract cgroup.conf merged dict
   ansible.builtin.set_fact:
diff --git a/discovery/roles/slurm_config/tasks/handle_extra_confs.yml b/discovery/roles/slurm_config/tasks/handle_extra_confs.yml
index 307ca01723..544822ec28 100644
--- a/discovery/roles/slurm_config/tasks/handle_extra_confs.yml
+++ b/discovery/roles/slurm_config/tasks/handle_extra_confs.yml
@@ -19,6 +19,7 @@
     conf_name: "{{ extra_conf }}"
   register: ex_conf
   delegate_to: localhost
+  no_log: true
   when:
     - "'.' not in extra_conf"
 
@@ -30,6 +31,7 @@
     owner: "{{ slurm_user }}"
     group: "{{ slurm_user_group }}"
     remote_src: "{{ copy_from_oim }}"
+  no_log: true
   when:
     - "'.' not in extra_conf"
     - ex_conf is success

From ca14a61c68fd71392a872b994c2ad00a88214518 Mon Sep 17 00:00:00 2001
From: SOWJANYAJAGADISH123 <Sowjanya.Jagadish@dell.com>
Date: Wed, 11 Feb 2026 17:46:22 +0530
Subject: [PATCH 132/172] Update omnia.sh

---
 omnia.sh | 331 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 320 insertions(+), 11 deletions(-)

diff --git a/omnia.sh b/omnia.sh
index 235cc1dbc1..9c46a04dc9 100755
--- a/omnia.sh
+++ b/omnia.sh
@@ -979,10 +979,11 @@ start_container_session() {
 }
 
 show_help() {
-    echo "Usage: $0 [--install | --uninstall | --upgrade | --version | --help]"
+    echo "Usage: $0 [--install | --uninstall | --upgrade | --rollback | --version | --help]"
     echo "  -i, --install     Install and start the Omnia core container"
     echo "  -u, --uninstall   Uninstall the Omnia core container and clean up configuration"
-    echo "      --upgrade     Upgrade the Omnia core container from image tag 1.0 to 1.1"
+    echo "      --upgrade     Upgrade the Omnia core container to newer version"
+    echo "      --rollback    Rollback the Omnia core container to previous version"
     echo "  -v, --version     Display Omnia version information"
     echo "  -h, --help        More information about usage"
 }
@@ -1248,15 +1249,6 @@ phase1_validate() {
         return 1
     fi
 
-    if ! echo "$current_image" | grep -qE '(:|@)1\.0(\b|$)'; then
-        echo "[ERROR] [ORCHESTRATOR] Container version mismatch: expected 1.0, got: $current_image"
-        return 1
-    fi
-
-    echo "[INFO] [ORCHESTRATOR] Container version validated: 1.0 (Omnia 2.0.0.0)"
-
-   
-
     if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then
         echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1"
         echo "[ERROR] [ORCHESTRATOR] Omnia does not pull from Docker Hub. Build/load the image locally and retry."
@@ -1372,6 +1364,9 @@ phase4_container_swap() {
 
     if [ ! -f "$quadlet_file" ]; then
         echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Quadlet file not found: $quadlet_file"
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Quadlet configuration file missing"
+        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..."
+        rollback_omnia_core
         return 1
     fi
 
@@ -1385,27 +1380,42 @@ phase4_container_swap() {
 
     if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
         echo "[ERROR] [ORCHESTRATOR] Failed to stop omnia_core container"
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not stop 1.0 container"
+        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..."
+        rollback_omnia_core
         return 1
     fi
 
     echo "[INFO] [ORCHESTRATOR] Starting omnia_core 1.1 Quadlet unit"
     if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then
         echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1"
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 1.1 image not available"
+        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..."
+        rollback_omnia_core
         return 1
     fi
 
     if ! sed -i 's/^Image=omnia_core:.*/Image=omnia_core:1.1/' "$quadlet_file"; then
         echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to 1.1 in quadlet file"
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update container image tag"
+        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..."
+        rollback_omnia_core
         return 1
     fi
 
     systemctl daemon-reload || {
         echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: systemctl daemon-reload failed"
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: System daemon reload failed"
+        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..."
+        rollback_omnia_core
         return 1
     }
 
     systemctl start omnia_core.service || {
         echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to start omnia_core.service"
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not start 1.1 container"
+        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..."
+        rollback_omnia_core
         return 1
     }
 
@@ -1419,6 +1429,9 @@ phase4_container_swap() {
 
     if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
         echo "[ERROR] [ORCHESTRATOR] Phase 4.4 failed: Container failed health check after swap"
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 1.1 container failed health check"
+        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..."
+        rollback_omnia_core
         return 1
     fi
 
@@ -1436,6 +1449,9 @@ phase4_container_swap() {
         fi
     "; then
         echo "[ERROR] [ORCHESTRATOR] Phase 4.5 failed: Failed to update metadata version"
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update version metadata"
+        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..."
+        rollback_omnia_core
         return 1
     fi
 
@@ -1490,6 +1506,296 @@ upgrade_omnia_core() {
     exit 0
 }
 
+# Validate the backup at $1 (a path inside the omnia_core container); returns 0 when complete, 1 on the first missing item
+validate_backup_directory() {
+    local backup_path="$1"
+    
+    echo "[INFO] [ROLLBACK] Validating backup directory: $backup_path"
+    
+    # The backup root itself must exist (all checks run via podman exec inside omnia_core)
+    if ! podman exec -u root omnia_core test -d "$backup_path"; then
+        echo "[ERROR] [ROLLBACK] Backup directory does not exist: $backup_path"
+        return 1
+    fi
+    
+    # input/, metadata/ and configs/ must all be present under the backup root
+    for subdir in input metadata configs; do
+        if ! podman exec -u root omnia_core test -d "$backup_path/$subdir"; then
+            echo "[ERROR] [ROLLBACK] Missing required subdirectory: $backup_path/$subdir"
+            return 1
+        fi
+    done
+    
+    # The saved oim_metadata.yml and omnia_core.container files must both be present
+    if ! podman exec -u root omnia_core test -f "$backup_path/metadata/oim_metadata.yml"; then
+        echo "[ERROR] [ROLLBACK] Missing metadata file: $backup_path/metadata/oim_metadata.yml"
+        return 1
+    fi
+    
+    if ! podman exec -u root omnia_core test -f "$backup_path/configs/omnia_core.container"; then
+        echo "[ERROR] [ROLLBACK] Missing container config: $backup_path/configs/omnia_core.container"
+        return 1
+    fi
+    
+    # The metadata must contain a line starting with "omnia_version:"
+    if ! podman exec -u root omnia_core grep -q "^omnia_version:" "$backup_path/metadata/oim_metadata.yml"; then
+        echo "[ERROR] [ROLLBACK] Metadata file does not contain version information"
+        return 1
+    fi
+    
+    echo "[INFO] [ROLLBACK] Backup validation successful"
+    return 0
+}
+
+# Stop container $1, allowing $2 seconds (default 30) for a graceful stop; returns 1 only if the forced stop also fails
+stop_container_gracefully() {
+    local container_name="$1"
+    local timeout="${2:-30}"
+    
+    echo "[INFO] [ROLLBACK] Stopping $container_name container gracefully..."
+    
+    # First attempt: podman stop with the caller's grace period
+    if podman stop -t "$timeout" "$container_name" >/dev/null 2>&1; then
+        echo "[INFO] [ROLLBACK] Container stopped gracefully"
+        return 0
+    fi
+    
+    # Stop failed: if the container is still running, retry with -t 0 so podman kills it without a grace period
+    if podman ps --format '{{.Names}}' | grep -qw "$container_name"; then
+        echo "[WARN] [ROLLBACK] Graceful stop failed, force stopping container..."
+        if podman stop -t 0 "$container_name" >/dev/null 2>&1; then
+            echo "[INFO] [ROLLBACK] Container force stopped"
+            return 0
+        else
+            echo "[ERROR] [ROLLBACK] Failed to stop container"
+            return 1
+        fi
+    fi
+    
+    return 0
+}
+
+# Restore input files, metadata and container config from the backup at $1; returns 1 on the first failed step, else 0
+restore_from_backup() {
+    local backup_path="$1"
+    
+    echo "[INFO] [ROLLBACK] Restoring from backup: $backup_path"
+    
+    # Replace /opt/omnia/input inside the container with the backed-up copy (set -e aborts on the first failing command)
+    if ! podman exec -u root omnia_core bash -c "
+        set -e
+        rm -rf /opt/omnia/input
+        cp -a '$backup_path/input' /opt/omnia/
+    "; then
+        echo "[ERROR] [ROLLBACK] Failed to restore input files"
+        return 1
+    fi
+    
+    # Copy the backed-up oim_metadata.yml into /opt/omnia/.data/ inside the container
+    if ! podman exec -u root omnia_core cp -a "$backup_path/metadata/oim_metadata.yml" /opt/omnia/.data/; then
+        echo "[ERROR] [ROLLBACK] Failed to restore metadata"
+        return 1
+    fi
+    
+    # Copy omnia_core.container out of the container into /etc/containers/systemd/ on the host
+    if ! podman cp "omnia_core:$backup_path/configs/omnia_core.container" /etc/containers/systemd/; then
+        echo "[ERROR] [ROLLBACK] Failed to restore container config"
+        return 1
+    fi
+    
+    echo "[INFO] [ROLLBACK] Files restored successfully"
+    return 0
+}
+
+# Interactively roll the Omnia core container back from 2.1.0.0 to a backed-up version.
+# Exits the script (1 on any failure, 0 on user cancel) instead of returning.
+rollback_omnia_core() {
+    echo -e "${GREEN}================================================================================${NC}"
+    echo -e "${GREEN}                         OMNIA CORE ROLLBACK${NC}"
+    echo -e "${GREEN}================================================================================${NC}"
+    echo ""
+
+    # Audit log start
+    local rollback_start=$(date -Iseconds)
+    echo "[AUDIT] Rollback operation started at: $rollback_start"
+
+    # Check if omnia_core container is running
+    if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
+        echo -e "${RED}ERROR: Omnia core container is not running.${NC}"
+        exit 1
+    fi
+
+    # Get current version
+    if ! podman exec -u root omnia_core test -f "/opt/omnia/.data/oim_metadata.yml"; then
+        echo -e "${RED}ERROR: Metadata file not found: /opt/omnia/.data/oim_metadata.yml${NC}"
+        exit 1
+    fi
+
+    local current_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r')
+    if [ "$current_version" != "2.1.0.0" ]; then
+        echo -e "${RED}ERROR: Cannot rollback from version $current_version. Rollback is only supported from version 2.1.0.0.${NC}"
+        exit 1
+    fi
+
+    # List available backups
+    echo "[INFO] [ROLLBACK] Scanning for available backups..."
+    local backup_dirs=()
+    while IFS= read -r line; do
+        backup_dirs+=("$line")
+    done < <(podman exec -u root omnia_core find /opt/omnia/backups/upgrade -maxdepth 1 -type d -name "version_*" 2>/dev/null | sort -r)
+
+    if [ ${#backup_dirs[@]} -eq 0 ]; then
+        echo -e "${RED}ERROR: No backup directories found.${NC}"
+        exit 1
+    fi
+
+    echo ""
+    echo "Available backup versions:"
+    for i in "${!backup_dirs[@]}"; do
+        local version=$(basename "${backup_dirs[$i]}" | sed 's/version_//')
+        local backup_date=$(podman exec -u root omnia_core stat -c '%y' "${backup_dirs[$i]}" 2>/dev/null | cut -d' ' -f1,2 | cut -d'.' -f1)
+        echo "  $((i+1)). Version $version (created: $backup_date)"
+    done
+
+    # Prompt for backup selection
+    echo ""
+    echo -n "Select backup to restore from (1-${#backup_dirs[@]}): "
+    read -r selection
+
+    # Validate selection: cap the digit count (no integer overflow) and force base 10 so "08" is not read as octal
+    if ! [[ "$selection" =~ ^[0-9]{1,9}$ ]] || (( 10#$selection < 1 || 10#$selection > ${#backup_dirs[@]} )); then
+        echo -e "${RED}ERROR: Invalid selection.${NC}"
+        exit 1
+    fi
+
+    local selected_backup="${backup_dirs[$((10#$selection - 1))]}"
+    local backup_version=$(basename "$selected_backup" | sed 's/version_//')
+
+    echo ""
+    echo "Selected backup: Version $backup_version"
+    echo -n "Are you sure you want to rollback to version $backup_version? [y/N]: "
+    read -r confirm
+
+    if [[ ! "$confirm" =~ ^[yY] ]]; then
+        echo "Rollback cancelled by user."
+        exit 0
+    fi
+
+    # Re-check that the selected backup directory still exists (inside the container first)
+    if ! podman exec -u root omnia_core test -d "$selected_backup" 2>/dev/null; then
+        # Try to check on host if container check fails
+        # Get shared path from metadata to check on host
+        local shared_path=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r')
+        local host_backup_path="${selected_backup#/opt/omnia}"
+        if [ -z "$shared_path" ] || [ ! -d "$shared_path$host_backup_path" ]; then
+            echo -e "${RED}ERROR: Backup directory does not exist: $selected_backup${NC}"
+            exit 1
+        fi
+    fi
+
+    echo ""
+    echo "[INFO] [ROLLBACK] Starting rollback process..."
+
+    # Step 1: Stop 1.1 container gracefully
+    echo ""
+    echo "[INFO] [ROLLBACK] Step 1: Stopping Omnia core 1.1 container..."
+    if ! stop_container_gracefully "omnia_core" 30; then
+        echo -e "${RED}ERROR: Failed to stop container.${NC}"
+        exit 1
+    fi
+
+    # Step 2: Check for 1.0 image
+    echo ""
+    echo "[INFO] [ROLLBACK] Step 2: Checking for Omnia core 1.0 image..."
+    if ! podman inspect omnia_core:1.0 >/dev/null 2>&1; then
+        echo -e "${YELLOW}WARNING: Omnia core 1.0 image not found locally.${NC}"
+        echo -e "${YELLOW}Attempting to tag image...${NC}"
+        # NOTE(review): 'latest' may be the very image being rolled back from - confirm this fallback is intended
+        # Try to tag latest as 1.0 if available
+        if podman inspect omnia_core:latest >/dev/null 2>&1; then
+            podman tag omnia_core:latest omnia_core:1.0
+        else
+            echo -e "${RED}ERROR: Omnia core 1.0 image not available. Please load the image first.${NC}"
+            exit 1
+        fi
+    fi
+
+    # Step 3: Start 1.0 container
+    echo ""
+    echo "[INFO] [ROLLBACK] Step 3: Starting Omnia core 1.0 container..."
+    systemctl daemon-reload
+    if ! systemctl start omnia_core.service; then
+        echo -e "${RED}ERROR: Failed to start container service.${NC}"
+        exit 1
+    fi
+
+    # Step 4: Wait for container to be healthy
+    echo ""
+    echo "[INFO] [ROLLBACK] Step 4: Waiting for container to be healthy..."
+    local health_timeout=60 health_count=0
+    # NOTE: only an "Up" status is awaited (polled once per second); podman's health state is not inspected
+    while [ $health_count -lt $health_timeout ]; do
+        if podman ps --format '{{.Names}} {{.Status}}' | grep -qE "omnia_core.*Up"; then
+            echo "[INFO] [ROLLBACK] Container is healthy"
+            break
+        fi
+        sleep 1
+        health_count=$((health_count + 1))
+        echo -n "."
+    done
+
+    if [ $health_count -ge $health_timeout ]; then
+        echo ""
+        echo -e "${RED}ERROR: Container failed to become healthy within ${health_timeout} seconds.${NC}"
+        exit 1
+    fi
+
+    # Step 5: Validate backup directory structure
+    # NOTE(review): the container was already stopped and restarted above, so a failure from here on exits mid-rollback; consider validating before Step 1
+    echo ""
+    echo "[INFO] [ROLLBACK] Step 5: Validating backup directory structure..."
+    if ! validate_backup_directory "$selected_backup"; then
+        echo -e "${RED}ERROR: Backup validation failed.${NC}"
+        exit 1
+    fi
+
+    # Step 6: Restore files from backup
+    echo ""
+    echo "[INFO] [ROLLBACK] Step 6: Restoring files from backup..."
+    if ! restore_from_backup "$selected_backup"; then
+        echo -e "${RED}ERROR: Failed to restore from backup.${NC}"
+        exit 1
+    fi
+
+    # Step 7: Verify container version
+    echo ""
+    echo "[INFO] [ROLLBACK] Step 7: Verifying container version..."
+    local verify_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r')
+    if [ "$verify_version" != "$backup_version" ]; then
+        echo -e "${RED}ERROR: Version verification failed. Expected: $backup_version, Found: $verify_version${NC}"
+        exit 1
+    fi
+
+    # Audit log end
+    local rollback_end=$(date -Iseconds)
+    echo "[AUDIT] Rollback operation completed at: $rollback_end"
+    echo "[AUDIT] Rolled back from version $current_version to $backup_version"
+
+    echo ""
+    echo -e "${GREEN}================================================================================${NC}"
+    echo -e "${GREEN}                    ROLLBACK COMPLETED SUCCESSFULLY${NC}"
+    echo -e "${GREEN}================================================================================${NC}"
+    echo ""
+    echo -e "${GREEN}✓ Omnia core has been rolled back to version $backup_version${NC}"
+    echo -e "${GREEN}✓ Container is running and healthy${NC}"
+    echo -e "${GREEN}✓ Configuration restored from backup${NC}"
+    echo ""
+
+    # Initialize SSH config and start container session
+    init_ssh_config
+    start_container_session
+}
+
 # Main function to check if omnia_core container is already running.
 # If yes, ask the user if they want to enter the container or reinstall.
 # If no, set it up.
@@ -1504,6 +1810,9 @@ main() {
         --upgrade)
             upgrade_omnia_core
             ;;
+        --rollback)
+            rollback_omnia_core
+            ;;
         --version|-v)
             display_version
             ;;

From 46c63c095c51a3f2df5097a3b9739e61e7b8b6ad Mon Sep 17 00:00:00 2001
From: pullan1 <sudha.pullalaravu@dell.com>
Date: Wed, 11 Feb 2026 18:06:48 +0530
Subject: [PATCH 133/172] cleanup of files under offline_repo dir during pulp
 cleanup

Signed-off-by: pullan1 <sudha.pullalaravu@dell.com>
---
 common/library/modules/pulp_cleanup.py | 104 ++++++++++++++++++++++---
 local_repo/pulp_cleanup.yml            |   2 +
 2 files changed, 95 insertions(+), 11 deletions(-)

diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py
index 00ed27d0dd..f3da3e2004 100644
--- a/common/library/modules/pulp_cleanup.py
+++ b/common/library/modules/pulp_cleanup.py
@@ -27,6 +27,7 @@
 import csv
 import glob
 import json
+import shutil
 import subprocess
 from typing import Dict, List, Any, Tuple
 
@@ -399,7 +400,7 @@ def delete_file_from_pulp(name: str, repo_name: str, content_href: str, logger)
         return False, f"Pulp deletion error: {str(e)}"
 
 
-def cleanup_pip_module(name: str, base_path: str, logger) -> Dict[str, Any]:
+def cleanup_pip_module(name: str, base_path: str, repo_store_path: str, logger) -> Dict[str, Any]:
     """Cleanup a pip module from Pulp Python repository.
     
     Pip modules are stored as: pip_module<package_name>==<version>
@@ -408,6 +409,7 @@ def cleanup_pip_module(name: str, base_path: str, logger) -> Dict[str, Any]:
     result = {"name": name, "type": "pip_module", "status": "Failed", "message": ""}
     messages = []
     pulp_deleted = False
+    content_removed = False
 
     try:
         # Pulp Python repo name format: pip_module<name>
@@ -467,11 +469,17 @@ def cleanup_pip_module(name: str, base_path: str, logger) -> Dict[str, Any]:
                 messages.append("Status files updated")
                 mark_software_partial(affected, base_path, logger, 'pip_module')
 
-        if pulp_deleted:
+        # Clean up uploaded content from filesystem
+        fs_result = cleanup_content_directory(name, 'pip_module', repo_store_path, logger)
+        if fs_result["status"] == "Success":
+            content_removed = True
+            messages.append(fs_result["message"])
+
+        if pulp_deleted or content_removed:
             result["status"] = "Success"
             result["message"] = "; ".join(messages) if messages else "Cleaned up"
         else:
-            result["message"] = f"pip_module '{name}' not found in Pulp"
+            result["message"] = f"pip_module '{name}' not found in Pulp or filesystem"
 
     except Exception as e:
         result["message"] = f"Error: {str(e)}"
@@ -493,7 +501,7 @@ def get_pulp_file_repo_name(name: str, file_type: str) -> str:
     return name
 
 
-def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) -> Dict[str, Any]:
+def cleanup_file_repository(name: str, file_type: str, base_path: str, repo_store_path: str, logger) -> Dict[str, Any]:
     """Cleanup artifact from Pulp File repository.
     
     Handles: tarball, git, manifest, ansible_galaxy_collection
@@ -503,6 +511,7 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) -
     messages = []
     pulp_deleted = False
     status_removed = False
+    content_removed = False
 
     try:
         # Get the expected Pulp repository name
@@ -559,12 +568,18 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) -
                 messages.append("Status files updated")
                 mark_software_partial(affected, base_path, logger, file_type)
 
+        # Clean up uploaded content from filesystem
+        fs_result = cleanup_content_directory(name, file_type, repo_store_path, logger)
+        if fs_result["status"] == "Success":
+            content_removed = True
+            messages.append(fs_result["message"])
+
         # Determine overall result
-        if pulp_deleted or status_removed:
+        if pulp_deleted or status_removed or content_removed:
             result["status"] = "Success"
             result["message"] = "; ".join(messages) if messages else "Cleaned up"
         else:
-            result["message"] = f"{file_type} '{name}' not found in Pulp or status files"
+            result["message"] = f"{file_type} '{name}' not found in Pulp, status files, or filesystem"
 
     except Exception as e:
         result["message"] = f"Error: {str(e)}"
@@ -572,7 +587,7 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) -
     return result
 
 
-def cleanup_file(name: str, base_path: str, logger) -> Dict[str, Any]:
+def cleanup_file(name: str, base_path: str, repo_store_path: str, logger) -> Dict[str, Any]:
     """Cleanup a file artifact.
     
     Routes to appropriate handler:
@@ -583,10 +598,75 @@ def cleanup_file(name: str, base_path: str, logger) -> Dict[str, Any]:
 
     # Handle pip modules separately - they use Python repositories
     if file_type == "pip_module":
-        return cleanup_pip_module(name, base_path, logger)
+        return cleanup_pip_module(name, base_path, repo_store_path, logger)
 
     # All other file types use Pulp File repository
-    return cleanup_file_repository(name, file_type, base_path, logger)
+    return cleanup_file_repository(name, file_type, base_path, repo_store_path, logger)
+
+
+# =============================================================================
+# FILESYSTEM CONTENT CLEANUP
+# =============================================================================
+
+def cleanup_content_directory(content_name: str, content_type: str, repo_store_path: str, logger) -> Dict[str, Any]:
+    """Remove uploaded content for one artifact from the filesystem.
+
+    Deletes, for each arch in ARCH_SUFFIXES and each version directory found:
+        <repo_store_path>/offline_repo/cluster/<arch>/rhel/<version>/<content_type>/<content_name>
+
+    Args:
+        content_name: Name of the content item (e.g., 'helm-v3.19.0-amd64');
+                      rejected if empty, absolute, or escaping via '..'
+        content_type: Directory category (tarball, git, pip_module, manifest,
+                      ansible_galaxy_collection, rpm_file)
+        repo_store_path: Root store path (e.g., '/opt/omnia')
+        logger: Logger instance
+
+    Returns:
+        Dict with name, type, status, and message keys; status is "Success" only if something was removed
+    """
+    result = {"name": content_name, "type": f"filesystem_{content_type}", "status": "Failed", "message": ""}
+    removed_dirs = []
+
+    # An absolute name makes os.path.join() discard the base path, and an empty or
+    # '..' name resolves to a parent directory, so refuse those before deleting.
+    rel_name = os.path.normpath(content_name) if isinstance(content_name, str) and content_name else "."
+    if os.path.isabs(rel_name) or rel_name.split(os.sep)[0] in (".", ".."):
+        result["message"] = f"Unsafe content name rejected: {content_name!r}"
+        logger.error(result["message"])
+        return result
+
+    cluster_path = os.path.join(repo_store_path, "offline_repo", "cluster")
+    if not os.path.exists(cluster_path):
+        result["message"] = f"Content store path not found: {cluster_path}"
+        logger.warning(result["message"])
+        return result
+
+    try:
+        for arch in ARCH_SUFFIXES:
+            for version_dir in glob.glob(os.path.join(cluster_path, arch, "rhel", "*", "")):
+                content_dir = os.path.join(version_dir, content_type, content_name)
+                if os.path.lexists(content_dir):  # lexists: dangling symlinks count too
+                    logger.info(f"Removing content directory: {content_dir}")
+                    # shutil.rmtree() refuses symlinks; unlink them like regular files
+                    if os.path.isdir(content_dir) and not os.path.islink(content_dir):
+                        shutil.rmtree(content_dir)
+                    else:
+                        os.remove(content_dir)
+                    removed_dirs.append(content_dir)
+
+        if removed_dirs:
+            result["status"] = "Success"
+            result["message"] = f"Removed content: {', '.join(removed_dirs)}"
+        else:
+            result["message"] = f"No filesystem content found for '{content_name}' under {content_type}"
+            logger.info(result["message"])
+
+    except Exception as e:
+        result["message"] = f"Filesystem cleanup error: {str(e)}"
+        logger.error(f"Failed to cleanup content {content_name}: {e}")
+
+    return result
 
 
 # =============================================================================
@@ -868,7 +948,8 @@ def run_module():
             cleanup_repos=dict(type='list', elements='str', default=[]),
             cleanup_containers=dict(type='list', elements='str', default=[]),
             cleanup_files=dict(type='list', elements='str', default=[]),
-            base_path=dict(type='str', default=CLEANUP_BASE_PATH_DEFAULT)
+            base_path=dict(type='str', default=CLEANUP_BASE_PATH_DEFAULT),
+            repo_store_path=dict(type='str', default='/opt/omnia')
         ),
         supports_check_mode=True
     )
@@ -877,6 +958,7 @@ def run_module():
     cleanup_containers = module.params['cleanup_containers']
     cleanup_files = module.params['cleanup_files']
     base_path = module.params['base_path']
+    repo_store_path = module.params['repo_store_path']
 
     # Setup logger - setup_standard_logger expects a directory, creates standard.log inside
     log_dir = os.path.join(base_path, "cleanup")
@@ -915,7 +997,7 @@ def run_module():
 
     # Process files
     for file in cleanup_files:
-        result = cleanup_file(file, base_path, logger)
+        result = cleanup_file(file, base_path, repo_store_path, logger)
         all_results.append(result)
         logger.info(f"File {file}: {result['status']} - {result['message']}")
 
diff --git a/local_repo/pulp_cleanup.yml b/local_repo/pulp_cleanup.yml
index 5d409bbc1f..93e379833b 100644
--- a/local_repo/pulp_cleanup.yml
+++ b/local_repo/pulp_cleanup.yml
@@ -77,6 +77,8 @@
         cleanup_repos: "{{ repo_list | default([]) }}"
         cleanup_containers: "{{ container_list | default([]) }}"
         cleanup_files: "{{ file_list | default([]) }}"
+        base_path: "{{ base_path | default('/opt/omnia/log/local_repo') }}"
+        repo_store_path: "{{ repo_store_path | default('/opt/omnia') }}"
       register: cleanup_result
 
   post_tasks:

From e4441cf2cf085afd1a866a10779205cc39ca7795 Mon Sep 17 00:00:00 2001
From: sakshi-singla-1735 <sakshi.s@dell.com>
Date: Wed, 11 Feb 2026 12:37:33 +0000
Subject: [PATCH 134/172] making version and name dynamic

---
 .../hpc_tools/configure_nvhpc_env.sh.j2        | 18 +++++++++++++++---
 .../templates/hpc_tools/export_nvhpc_env.sh.j2 | 17 ++++++++++++++---
 .../hpc_tools/install_nvhpc_sdk.sh.j2          | 15 ++++++++-------
 discovery/roles/slurm_config/vars/main.yml     | 11 +++++------
 4 files changed, 42 insertions(+), 19 deletions(-)

diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2
index d0f788a986..958ac6e27c 100644
--- a/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2
@@ -9,10 +9,22 @@ echo "===== Configuring NVIDIA HPC SDK environment ====="
 # Cloud-init safe defaults
 export HOME=/root
 
-NVCOMPILERS="{{ nvhpc_local_mount | default('/opt/nvidia/nvhpc') }}"
+NVCOMPILERS="/opt/nvidia/nvhpc"
 NVARCH="$(uname -s)_$(uname -m)"
-NVHPC_LONG_VERSION="{{ nvhpc_version_long | default('2025_2511') }}"
-NVHPC_VERSION="$(echo ${NVHPC_LONG_VERSION} | cut -d'_' -f2 | sed 's/\(..\)\(..\)/\1.\2/')"
+sys_arch="$(uname -m)"
+case "${sys_arch}" in
+    x86_64|amd64) arch="x86_64" ;;
+    aarch64|arm64) arch="aarch64" ;;
+esac
+
+# Select package name based on detected architecture (rendered from slurm_config vars)
+case "${arch}" in
+    x86_64)  NVHPC_PKG_NAME="{{ nvhpc_pkg_name_x86_64 }}" ;;
+    aarch64) NVHPC_PKG_NAME="{{ nvhpc_pkg_name_aarch64 }}" ;;
+esac
+
+# Derive version from package name
+NVHPC_VERSION=$(echo "$NVHPC_PKG_NAME" | sed 's/nvhpc_\([0-9]*_[0-9]*\)_Linux_.*/\1/' | cut -d'_' -f2 | sed 's/\(..\)\(..\)/\1.\2/')
 
 
 NVHPC_BASE="$NVCOMPILERS/$NVARCH/$NVHPC_VERSION"
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2
index 1ff49968b4..db2a35df60 100644
--- a/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2
@@ -5,9 +5,20 @@ CLIENT_MOUNT="{{ client_mount_path }}"
 
 NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc"
 NVARCH="$(uname -s)_$(uname -m)"
-
-NVHPC_LONG_VERSION="{{ nvhpc_version_long | default('2025_2511') }}"
-NVHPC_VERSION="$(echo ${NVHPC_LONG_VERSION} | cut -d'_' -f2 | sed 's/\(..\)\(..\)/\1.\2/')"
+sys_arch="$(uname -m)"
+case "${sys_arch}" in
+    x86_64|amd64) arch="x86_64" ;;
+    aarch64|arm64) arch="aarch64" ;;
+esac
+
+# Select package name based on detected architecture (rendered from slurm_config vars)
+case "${arch}" in
+    x86_64)  NVHPC_PKG_NAME="{{ nvhpc_pkg_name_x86_64 }}" ;;
+    aarch64) NVHPC_PKG_NAME="{{ nvhpc_pkg_name_aarch64 }}" ;;
+esac
+
+# Derive version from package name
+NVHPC_VERSION=$(echo "$NVHPC_PKG_NAME" | sed 's/nvhpc_\([0-9]*_[0-9]*\)_Linux_.*/\1/' | cut -d'_' -f2 | sed 's/\(..\)\(..\)/\1.\2/')
 
 NVHPC_BASE="$NVHPC_LOCAL_MOUNT/$NVARCH/$NVHPC_VERSION"
 PROFILE_FILE="/etc/profile.d/nvhpc.sh"
diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2
index 8ff149fca3..dd6a55f3ea 100644
--- a/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2
@@ -15,14 +15,15 @@ case "${sys_arch}" in
         ;;
 esac
 
-NVHPC_VERSION="2025_2511"
-NVHPC_SHORT_VERSION="$(echo ${NVHPC_VERSION} | cut -d'_' -f2 | sed 's/\(..\)\(..\)/\1.\2/')"
-CUDA_VERSION="13.0"
+# Select package name based on detected architecture (rendered from slurm_config vars)
+case "${arch}" in
+    x86_64)  NVHPC_PKG_NAME="{{ nvhpc_pkg_name_x86_64 }}" ;;
+    aarch64) NVHPC_PKG_NAME="{{ nvhpc_pkg_name_aarch64 }}" ;;
+esac
 
-NVHPC_PKG_NAME="{{ nvhpc_pkg_name | default('') }}"
-if [ -z "${NVHPC_PKG_NAME}" ]; then
-    NVHPC_PKG_NAME="nvhpc_${NVHPC_VERSION}_Linux_${arch}_cuda_${CUDA_VERSION}"
-fi
+# Derive version from package name: nvhpc_YYYY_YYMM_Linux_<arch>_cuda_X.Y
+NVHPC_VERSION=$(echo "$NVHPC_PKG_NAME" | sed 's/nvhpc_\([0-9]*_[0-9]*\)_Linux_.*/\1/')
+NVHPC_SHORT_VERSION=$(echo "$NVHPC_VERSION" | cut -d'_' -f2 | sed 's/\(..\)\(..\)/\1.\2/')
 
 NVHPC_EXPORT="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk"
 NVHPC_MOUNT="/shared-nvhpc-sdk"
diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml
index 8406816341..22c0d7b11b 100644
--- a/discovery/roles/slurm_config/vars/main.yml
+++ b/discovery/roles/slurm_config/vars/main.yml
@@ -159,13 +159,12 @@ ssh_private_key_path: /root/.ssh/oim_rsa
 
 # nvidia sdk vars
 # Fully resolved tarball relative paths (no nested Jinja2)
-nvhpc_tarball_x86_64_relpath: >
-  offline_repo/cluster/x86_64/rhel/10.0/tarball/nvhpc_2025_2511_Linux_x86_64_cuda_13.0/
-  nvhpc_2025_2511_Linux_x86_64_cuda_13.0.tar.gz
+# Package names below feed the tarball relpaths and the version parsing in the hpc_tools templates
+nvhpc_pkg_name_x86_64: "nvhpc_2025_2511_Linux_x86_64_cuda_13.0"
+nvhpc_pkg_name_aarch64: "nvhpc_2025_2511_Linux_aarch64_cuda_13.0"
 
-nvhpc_tarball_aarch64_relpath: >
-  offline_repo/cluster/aarch64/rhel/10.0/tarball/nvhpc_2025_2511_Linux_aarch64_cuda_13.0/
-  nvhpc_2025_2511_Linux_aarch64_cuda_13.0.tar.gz
+nvhpc_tarball_x86_64_relpath: "offline_repo/cluster/x86_64/rhel/10.0/tarball/{{ nvhpc_pkg_name_x86_64 }}/{{ nvhpc_pkg_name_x86_64 }}.tar.gz"
+nvhpc_tarball_aarch64_relpath: "offline_repo/cluster/aarch64/rhel/10.0/tarball/{{ nvhpc_pkg_name_aarch64 }}/{{ nvhpc_pkg_name_aarch64 }}.tar.gz"
 
 nvhpc_nfs_rel_dir: "hpc_tools/nvidia_sdk"
 

From e005855e702316bc1942848e4dd7dc8a600b1437 Mon Sep 17 00:00:00 2001
From: sakshi-singla-1735 <sakshi.s@dell.com>
Date: Wed, 11 Feb 2026 12:39:51 +0000
Subject: [PATCH 135/172] adding the min and sec

---
 common/library/module_utils/local_repo/config.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py
index 6196e8c7e6..6997233fe6 100644
--- a/common/library/module_utils/local_repo/config.py
+++ b/common/library/module_utils/local_repo/config.py
@@ -102,10 +102,10 @@
 }
 
 CLI_FILE_PATH = "/root/.config/pulp/cli.toml"
-POST_TIMEOUT = 3600
-TAR_POLL_VAL = 25
-FILE_POLL_VAL = 1
-ISO_POLL_VAL = 15
+POST_TIMEOUT = 3600  # seconds
+TAR_POLL_VAL = 25    # minutes
+FILE_POLL_VAL = 1    # minutes
+ISO_POLL_VAL = 15    # minutes
 FILE_URI = "/pulp/api/v3/content/file/files/"
 PULP_SSL_CA_CERT = "/etc/pki/ca-trust/source/anchors/pulp_webserver.crt"
 # ----------------------------

From 1af974feb43bc62f3427ebabdfd0ae2bda959180 Mon Sep 17 00:00:00 2001
From: sakshi-singla-1735 <sakshi.s@dell.com>
Date: Wed, 11 Feb 2026 18:16:15 +0530
Subject: [PATCH 136/172] Update ci-group-slurm_node_x86_64.yaml.j2

Signed-off-by: sakshi-singla-1735 <sakshi.s@dell.com>
---
 .../templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
index 88eeb45e16..7ee17aa10d 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
@@ -494,8 +494,8 @@
         - mkdir -p {{ client_mount_path }}
         - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab
         - mount -a
-        - echo "One or more shared components (UCX / OpenMPI / LDMS) are enabled."
-        - /usr/local/bin/configure_ucx_openmpi_env.sh
+        # - echo "One or more shared components (UCX / OpenMPI / LDMS) are enabled."
+        # - /usr/local/bin/configure_ucx_openmpi_env.sh
 
 {% endif %}
 

From 5be8766e5f45816a78add59a81db98a8e0704746 Mon Sep 17 00:00:00 2001
From: Nagachandan-P <Nagachandan.p@dell.com>
Date: Thu, 12 Feb 2026 05:08:14 +0000
Subject: [PATCH 137/172] slurmdbd innodb fix

---
 .../cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2     | 4 +++-
 .../templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2  | 1 +
 .../templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2   | 1 +
 discovery/roles/slurm_config/vars/main.yml                    | 4 ++--
 4 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2
index 49d2f635e9..4d8aa716cd 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2
@@ -340,7 +340,9 @@
             chown -R {{ mysql_user }}:{{ mysql_group }} /var/lib/mysql
             chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/mariadb
             chown -R {{ slurm_user }}:{{ slurm_user }} /etc/my.cnf.d # Required? why slurm user for my.cnf?? 
-            chmod {{ file_mode_755 }} /etc/my.cnf.d /var/lib/mysql /var/log/mariadb
+            chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm
+            chmod {{ file_mode_755 }} /etc/my.cnf.d /var/lib/mysql /var/log/mariadb /var/log/slurm
+
             #firewall
             systemctl enable firewalld
             systemctl start firewalld
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
index 3dc8f65514..7931fc70c4 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
@@ -108,6 +108,7 @@
                     bash /gpu-runfile/{{ cuda_runfile_aarch64 }} --silent --driver --no-opengl-libs --kernel-source-path=/lib/modules/$(uname -r)/build
                     if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then
                         echo "[SUCCESS] NVIDIA driver installed successfully."
+                        nvidia-smi -pm 1
                     else
                         echo "[ERROR] NVIDIA driver installation failed."
                     fi
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
index 62a4e9e063..04cec708e1 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
@@ -116,6 +116,7 @@
                     bash /gpu-runfile/{{ cuda_runfile_x86_64 }} --silent --driver --no-opengl-libs --kernel-source-path=/lib/modules/$(uname -r)/build
                     if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then
                         echo "[SUCCESS] NVIDIA driver installed successfully."
+                        nvidia-smi -pm 1
                     else
                         echo "[ERROR] NVIDIA driver installation failed."
                     fi
diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml
index 43ee995e5a..3afa329923 100644
--- a/discovery/roles/slurm_config/vars/main.yml
+++ b/discovery/roles/slurm_config/vars/main.yml
@@ -66,8 +66,8 @@ gpu_slurm_conf:
   SelectType: select/cons_tres
   SelectTypeParameters: CR_Core_Memory
   SlurmdParameters: l3cache_as_socket
-innodb_buffer_pool_size: 1G
-innodb_lock_wait_timeout: 120
+innodb_buffer_pool_size: 4G
+innodb_lock_wait_timeout: 900
 # TODO tmp
 nodes_yaml: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir/nodes/nodes.yaml"
 bmc_username: "{{ hostvars['localhost']['bmc_username'] }}"

From 4a31bc3d2e2217d049643f9cef1b1286b4caada7 Mon Sep 17 00:00:00 2001
From: Nagachandan-P <Nagachandan.p@dell.com>
Date: Thu, 12 Feb 2026 05:32:29 +0000
Subject: [PATCH 138/172] variablize the dir

---
 .../cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2
index 4d8aa716cd..b523eeb297 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2
@@ -340,8 +340,8 @@
             chown -R {{ mysql_user }}:{{ mysql_group }} /var/lib/mysql
             chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/mariadb
             chown -R {{ slurm_user }}:{{ slurm_user }} /etc/my.cnf.d # Required? why slurm user for my.cnf?? 
-            chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm
-            chmod {{ file_mode_755 }} /etc/my.cnf.d /var/lib/mysql /var/log/mariadb /var/log/slurm
+            chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_ctld_log_dir_effective }} {{ slurmdbd_log_dir_effective }}
+            chmod {{ file_mode_755 }} /etc/my.cnf.d /var/lib/mysql /var/log/mariadb {{ slurm_ctld_log_dir_effective }} {{ slurmdbd_log_dir_effective }}
 
             #firewall
             systemctl enable firewalld

From 7ef0c3153135cfdd1d82b59f09ceb9bcc30da584 Mon Sep 17 00:00:00 2001
From: Katakam-Rakesh <katakam.rakesh@dell.com>
Date: Thu, 12 Feb 2026 11:44:15 +0530
Subject: [PATCH 139/172] removing doca-ofed from nfs share

Signed-off-by: Katakam-Rakesh <katakam.rakesh@dell.com>
---
 .../templates/doca-ofed/doca-install.sh.j2          |  3 ---
 discovery/roles/k8s_config/vars/main.yml            | 13 ++-----------
 discovery/roles/slurm_config/vars/main.yml          | 12 ++----------
 3 files changed, 4 insertions(+), 24 deletions(-)

diff --git a/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 b/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2
index 111abcb3a1..db8a7cb9cc 100644
--- a/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2
+++ b/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2
@@ -44,9 +44,6 @@ else
     dnf install -y kernel-headers-$(uname -r)
 fi
 
-echo "Bootstrap doca-ofed package..."
-rpm -i "/var/lib/packages/${arch}/doca-ofed/doca-host-3.2.1-044000_25.10_rhel10.${arch}.rpm"
-
 echo "Installing doca-ofed..."
 if rpm -q doca-ofed >/dev/null 2>&1; then
     echo "doca-ofed package is already installed."
diff --git a/discovery/roles/k8s_config/vars/main.yml b/discovery/roles/k8s_config/vars/main.yml
index 433b8e9f76..a80fb9b257 100644
--- a/discovery/roles/k8s_config/vars/main.yml
+++ b/discovery/roles/k8s_config/vars/main.yml
@@ -78,19 +78,10 @@ packages_base_dir_aarch64: "{{ k8s_client_mount_path }}/packages/aarch64"
 offline_repo_basepath_x86_64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso"
 offline_repo_basepath_aarch64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso"
 packages_layout_x86_64:
-  - doca-ofed
   - cuda
 packages_layout_aarch64:
-  - doca-ofed
   - cuda
 print_copy_msg: "Copying {{ item.name }} from {{ item.source_path }} to {{ item.dest_path }}"
-offline_path_x86_64:
-  - name: doca-ofed
-    source_path: "{{ offline_repo_basepath_x86_64 }}/doca-ofed"
-    dest_path: "{{ packages_base_dir_x86_64 }}/doca-ofed"
-offline_path_aarch64:
-  - name: doca-ofed
-    source_path: "{{ offline_repo_basepath_aarch64 }}/doca-ofed"
-    dest_path: "{{ packages_base_dir_aarch64 }}/doca-ofed"
-
+offline_path_x86_64: []
+offline_path_aarch64: []
 ssh_private_key_path: /root/.ssh/oim_rsa
diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml
index 43ee995e5a..3616b55068 100644
--- a/discovery/roles/slurm_config/vars/main.yml
+++ b/discovery/roles/slurm_config/vars/main.yml
@@ -141,19 +141,11 @@ packages_base_dir_aarch64: "{{ slurm_config_path }}/packages/aarch64"
 offline_repo_basepath_x86_64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso"
 offline_repo_basepath_aarch64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso"
 packages_layout_x86_64:
-  - doca-ofed
   - cuda
 packages_layout_aarch64:
-  - doca-ofed
   - cuda
 print_copy_msg: "Copying {{ item.name }} from {{ item.source_path }} to {{ item.dest_path }}"
-offline_path_x86_64:
-  - name: doca-ofed
-    source_path: "{{ offline_repo_basepath_x86_64 }}/doca-ofed"
-    dest_path: "{{ packages_base_dir_x86_64 }}/doca-ofed"
-offline_path_aarch64:
-  - name: doca-ofed
-    source_path: "{{ offline_repo_basepath_aarch64 }}/doca-ofed"
-    dest_path: "{{ packages_base_dir_aarch64 }}/doca-ofed"
+offline_path_x86_64: []
+offline_path_aarch64: []
 
 ssh_private_key_path: /root/.ssh/oim_rsa

From 346667afa1e87aafb623bc66fcf0d41b3959d67a Mon Sep 17 00:00:00 2001
From: Nagachandan-P <Nagachandan.p@dell.com>
Date: Thu, 12 Feb 2026 06:26:30 +0000
Subject: [PATCH 140/172] slurmdbd restart in controller

---
 .../cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2       | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2
index b523eeb297..d5f9ef9ba6 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2
@@ -558,4 +558,6 @@
 
          - /root/ldms_sampler.sh
 {% endif %}
+         - systemctl restart slurmdbd
+         - systemctl restart slurmctld
          - echo "Cloud-Init has completed successfully."

From b4f064ee0d7feed5bf0b3bd6233e992a5bd133e1 Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Thu, 12 Feb 2026 15:52:26 +0530
Subject: [PATCH 141/172] Upgrade of input credential files to 2.1

---
 .../tasks/display_warnings.yml                |  53 ++++++
 .../import_input_parameters/tasks/main.yml    |  12 ++
 .../restore_omnia_config_credentials.yml      | 171 ++++++++++++++++++
 .../restore_user_registry_credential.yml      | 130 +++++++++++++
 .../tasks/set_backup_location.yml             |  33 ++++
 .../templates/omnia_config_credentials.yml.j2 |  48 +++++
 .../import_input_parameters/vars/main.yml     |  66 ++++++-
 7 files changed, 512 insertions(+), 1 deletion(-)
 create mode 100644 upgrade/roles/import_input_parameters/tasks/display_warnings.yml
 create mode 100644 upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml
 create mode 100644 upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml
 create mode 100644 upgrade/roles/import_input_parameters/tasks/set_backup_location.yml
 create mode 100644 upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2

diff --git a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml
new file mode 100644
index 0000000000..ac1eb69998
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml
@@ -0,0 +1,53 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Display collected warnings
+  ansible.builtin.debug:  # prints only the warning count; the itemised list is shown by the pause task that follows
+    msg: |
+      =================================
+           UPGRADE WARNINGS SUMMARY
+      =================================
+
+      {% if upgrade_warnings | length > 0 %}
+      {{ upgrade_warnings | length }} warning{{ 's' if upgrade_warnings | length > 1 else '' }} detected.
+      You will now be shown the detailed list.
+      {% else %}
+      No warnings detected. Upgrade completed successfully!
+      {% endif %}
+  when: upgrade_warnings is defined  # skipped entirely when the list was never initialised
+
+
+- name: Pause for user to review warnings
+  ansible.builtin.pause:
+    prompt: |
+      ╔════════════════════════════════════════════╗
+      ║       ⚠️  UPGRADE WARNINGS REVIEW  ⚠️        ║
+      ╚════════════════════════════════════════════╝
+
+      {% if upgrade_warnings | length > 0 %}
+      {{ upgrade_warnings | length }} warning{{ 's' if upgrade_warnings | length > 1 else '' }} detected:
+
+      {% for warning in upgrade_warnings %}
+      {{ loop.index }}. {{ warning }}
+      {% endfor %}
+
+      Please review these warnings carefully.
+      Press ENTER to continue or CTRL+C to abort.
+      {% else %}
+      No warnings detected. Upgrade completed successfully!
+
+      Press ENTER to continue...
+      {% endif %}
+  when: upgrade_warnings is defined
diff --git a/upgrade/roles/import_input_parameters/tasks/main.yml b/upgrade/roles/import_input_parameters/tasks/main.yml
index ff77cf2c0e..2aacba7451 100644
--- a/upgrade/roles/import_input_parameters/tasks/main.yml
+++ b/upgrade/roles/import_input_parameters/tasks/main.yml
@@ -13,6 +13,9 @@
 # limitations under the License.
 ---
 
+- name: Set backup location based on oim_metadata.yml
+  ansible.builtin.include_tasks: set_backup_location.yml
+
 - name: Validate backup location for upgrade input processing
   ansible.builtin.include_tasks: precheck_backup_location.yml
 
@@ -39,3 +42,12 @@
 
 - name: Restore input files from backup
   ansible.builtin.include_tasks: restore_input_files.yml
+
+- name: Restore user_registry_credential.yml from backup
+  ansible.builtin.include_tasks: restore_user_registry_credential.yml
+
+- name: Restore omnia_config_credentials.yml from backup
+  ansible.builtin.include_tasks: restore_omnia_config_credentials.yml
+
+- name: Display upgrade warnings summary
+  ansible.builtin.include_tasks: display_warnings.yml
diff --git a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml
new file mode 100644
index 0000000000..0abafee26b
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml
@@ -0,0 +1,171 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Check if backup omnia_config_credentials.yml exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/omnia_config_credentials.yml"
+  register: backup_omnia_config_credentials_stat  # gates both the missing-file warning and the processing block below
+
+- name: Check if backup omnia_config_credentials_key exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/.omnia_config_credentials_key"
+  register: backup_omnia_config_credentials_key_stat  # read by the Case 1.1 / Case 2 / Case 3 conditions
+
+- name: Add warning for missing omnia_config_credentials.yml to list
+  ansible.builtin.set_fact:
+    upgrade_warnings: >-
+      {{ upgrade_warnings + [msg_omnia_config_credentials_missing] }}
+  when: 
+    - not backup_omnia_config_credentials_stat.stat.exists
+    - "'WARNING: omnia_config_credentials.yml not found in backup at' not in (upgrade_warnings | join(' '))"
+
+- name: Process omnia_config_credentials.yml when present in backup
+  block:
+    - name: Check if backup file is encrypted
+      ansible.builtin.command:
+        cmd: cat "{{ backup_location }}/omnia_config_credentials.yml"
+      register: backup_omnia_config_credentials_content
+      changed_when: false
+      failed_when: false
+      no_log: true
+
+    - name: "Case 1: Key present and file encrypted - Process and update"
+      block:
+        - name: Copy encrypted omnia_config_credentials.yml from backup to temp location
+          ansible.builtin.copy:
+            src: "{{ backup_location }}/omnia_config_credentials.yml"
+            dest: "{{ input_project_dir }}/omnia_config_credentials.yml.tmp"
+            mode: '0600'
+            remote_src: true
+
+        - name: Copy omnia_config_credentials_key from backup
+          ansible.builtin.copy:
+            src: "{{ backup_location }}/.omnia_config_credentials_key"
+            dest: "{{ input_project_dir }}/.omnia_config_credentials_key"
+            mode: '0600'
+            remote_src: true
+
+        - name: Decrypt omnia_config_credentials.yml using the key
+          ansible.builtin.shell:
+            cmd: |
+              ansible-vault decrypt "{{ input_project_dir }}/omnia_config_credentials.yml.tmp" \
+                --vault-password-file "{{ input_project_dir }}/.omnia_config_credentials_key" \
+                --output "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted"
+          args:
+            executable: /bin/bash
+          no_log: true  # a non-zero rc already fails the task and triggers the rescue below
+
+        - name: Read decrypted content
+          ansible.builtin.slurp:
+            src: "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted"
+          register: decrypted_content
+          no_log: true
+
+        - name: Parse YAML content and extract credentials
+          ansible.builtin.set_fact:
+            credentials_dict: >-
+              {{ decrypted_content.content | b64decode | from_yaml }}
+          no_log: true
+
+      rescue:
+        - name: Fail with decryption error message
+          ansible.builtin.fail:
+            msg: "{{ msg_omnia_config_decrypt_error }}"
+      # Same gate as Case 1.1: ungated, a missing key or plaintext backup fails here and never reaches Case 2/3.
+      when: backup_omnia_config_credentials_key_stat.stat.exists and '$ANSIBLE_VAULT;' in (backup_omnia_config_credentials_content.stdout | default(''))
+
+    - name: "Case 1.1: Apply template and encrypt"
+      block:
+        - name: Set template variables from credentials
+          ansible.builtin.set_fact:
+            provision_password: "{{ credentials_dict.provision_password | default('') }}"
+            bmc_username: "{{ credentials_dict.bmc_username | default('') }}"
+            bmc_password: "{{ credentials_dict.bmc_password | default('') }}"
+            minio_s3_password: "{{ credentials_dict.minio_s3_password | default('') }}"
+            pulp_password: "{{ credentials_dict.pulp_password | default('') }}"
+            docker_username: "{{ credentials_dict.docker_username | default('') }}"
+            docker_password: "{{ credentials_dict.docker_password | default('') }}"
+            slurm_db_password: "{{ credentials_dict.slurm_db_password | default('') }}"
+            openldap_db_username: "{{ credentials_dict.openldap_db_username | default('') }}"
+            openldap_db_password: "{{ credentials_dict.openldap_db_password | default('') }}"
+            mysqldb_user: "{{ credentials_dict.mysqldb_user | default('') }}"
+            mysqldb_password: "{{ credentials_dict.mysqldb_password | default('') }}"
+            mysqldb_root_password: "{{ credentials_dict.mysqldb_root_password | default('') }}"
+            csi_username: "{{ credentials_dict.csi_username | default('') }}"
+            csi_password: "{{ credentials_dict.csi_password | default('') }}"
+            ldms_sampler_password: "{{ credentials_dict.ldms_sampler_password | default('') }}"
+          no_log: true
+
+        - name: Write updated content using template
+          ansible.builtin.template:
+            src: omnia_config_credentials.yml.j2
+            dest: "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted"
+            mode: '0600'
+          no_log: true
+
+        - name: Encrypt updated file using the same key
+          ansible.builtin.shell:
+            cmd: |
+              ansible-vault encrypt "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted" \
+                --vault-password-file "{{ input_project_dir }}/.omnia_config_credentials_key" \
+                --output "{{ input_project_dir }}/omnia_config_credentials.yml"
+          args:
+            executable: /bin/bash
+          no_log: true
+
+        - name: Display success message
+          ansible.builtin.debug:
+            msg: "{{ msg_omnia_config_credentials_success }}"
+
+      rescue:
+        - name: Fail with template/encryption error message
+          ansible.builtin.fail:
+            msg: "{{ msg_omnia_config_template_error }}\n{{ msg_omnia_config_encrypt_error }}"
+      # Runs on success and failure alike so decrypted credentials never
+      # linger on disk when templating or encryption fails.
+      always:
+        - name: Clean up temporary files
+          ansible.builtin.file:
+            path: "{{ item }}"
+            state: absent
+          loop:
+            - "{{ input_project_dir }}/omnia_config_credentials.yml.tmp"
+            - "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted"
+      when: >-
+        backup_omnia_config_credentials_key_stat.stat.exists and
+        backup_omnia_config_credentials_content.stdout is defined and
+        '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout
+
+    - name: "Case 2: Both key and file missing - Add info warning"
+      ansible.builtin.set_fact:
+        upgrade_warnings: "{{ upgrade_warnings + [msg_omnia_config_credentials_info_missing] }}"
+      # The de-duplication clause must be a bare Jinja test; wrapped in quotes it is an always-true string literal.
+      when: >-
+        not backup_omnia_config_credentials_key_stat.stat.exists and
+        (backup_omnia_config_credentials_content.stdout is not defined or
+         '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout) and
+        'INFO: Both omnia_config_credentials.yml and .omnia_config_credentials_key' not in (upgrade_warnings | join(' '))
+
+    - name: "Case 3: Error - Mismatched state"
+      ansible.builtin.fail:
+        msg: "{{ msg_omnia_config_credentials_error }}"
+      when: >-
+        (not backup_omnia_config_credentials_key_stat.stat.exists and 
+         backup_omnia_config_credentials_content.stdout is defined and 
+         '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout) or
+        (backup_omnia_config_credentials_key_stat.stat.exists and 
+         backup_omnia_config_credentials_content.stdout is defined and 
+         '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout)
+  when: backup_omnia_config_credentials_stat.stat.exists
diff --git a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml
new file mode 100644
index 0000000000..de337310b8
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml
@@ -0,0 +1,130 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Check if backup user_registry_credential.yml exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/user_registry_credential.yml"
+  register: backup_user_registry_credential_stat
+
+- name: Check if user_registry_credential.yml exists in current directory
+  ansible.builtin.stat:
+    path: "{{ input_project_dir }}/user_registry_credential.yml"
+  register: user_registry_credential_stat
+
+- name: Check if backup local_repo_credentials_key exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/.local_repo_credentials_key"
+  register: backup_local_repo_credentials_key_stat
+
+- name: Add warning for missing user_registry_credential.yml to list
+  ansible.builtin.set_fact:
+    upgrade_warnings: >-
+      {{ upgrade_warnings + [
+        "WARNING: user_registry_credential.yml not found in backup at " +
+        backup_location + "/user_registry_credential.yml. " +
+        "This might be due to complete Omnia execution not being completed. " +
+        "Skipping restoration of this file."
+      ] }}
+  when: 
+    - not backup_user_registry_credential_stat.stat.exists
+    - "'WARNING: user_registry_credential.yml not found in backup at' not in (upgrade_warnings | join(' '))"
+
+- name: Process user_registry_credential.yml when present in backup
+  block:
+    - name: Check if backup file is encrypted
+      ansible.builtin.command:
+        cmd: cat "{{ backup_location }}/user_registry_credential.yml"
+      register: backup_user_registry_content
+      changed_when: false
+      failed_when: false
+      no_log: true
+
+    - name: "Case 1: Key present and file encrypted - Copy both"
+      block:  # validate first so an undecryptable backup never overwrites the current files
+        - name: Verify backup user_registry_credential.yml decrypts with the backup key
+          ansible.builtin.shell:
+            cmd: |
+              ansible-vault view "{{ backup_location }}/user_registry_credential.yml" \
+                --vault-password-file "{{ backup_location }}/.local_repo_credentials_key" \
+                > /dev/null
+          args:
+            executable: /bin/bash
+          no_log: true
+          register: vault_decrypt_result
+          changed_when: false
+
+        - name: Copy encrypted user_registry_credential.yml from backup
+          ansible.builtin.copy:
+            src: "{{ backup_location }}/user_registry_credential.yml"
+            dest: "{{ input_project_dir }}/user_registry_credential.yml"
+            mode: '0600'
+            remote_src: true
+
+        - name: Copy local_repo_credentials_key from backup
+          ansible.builtin.copy:
+            src: "{{ backup_location }}/.local_repo_credentials_key"
+            dest: "{{ input_project_dir }}/.local_repo_credentials_key"
+            mode: '0600'
+            remote_src: true
+
+        - name: Display success message for encrypted file restoration
+          ansible.builtin.debug:
+            msg: |
+              user_registry_credential.yml restored from backup.
+              Backup: {{ backup_location }}/user_registry_credential.yml
+              Target: {{ input_project_dir }}/user_registry_credential.yml
+              Status: Encrypted (key file also restored)
+      rescue:
+        - name: Fail with decryption error message
+          ansible.builtin.fail:
+            msg: "{{ msg_user_registry_decrypt_error }}"
+      when: >-
+        backup_local_repo_credentials_key_stat.stat.exists and
+        backup_user_registry_content.stdout is defined and
+        '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout
+
+    - name: "Case 2: Both key and file missing - Add info warning"
+      ansible.builtin.set_fact:
+        upgrade_warnings: >-
+          {{ upgrade_warnings + [
+            "INFO: Both user_registry_credential.yml and .local_repo_credentials_key " +
+            "are not present in backup. This is expected if registry credentials " +
+            "were not configured in the source installation."] }}
+      # The de-duplication clause must be a bare Jinja test; wrapped in quotes it is an always-true string literal.
+      when: >-
+        not backup_local_repo_credentials_key_stat.stat.exists and
+        (backup_user_registry_content.stdout is not defined or
+         '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout) and
+        'INFO: Both user_registry_credential.yml and .local_repo_credentials_key' not in (upgrade_warnings | join(' '))
+
+    - name: "Case 3: Error - Mismatched state"
+      ansible.builtin.fail:
+        msg: |
+          ERROR: Inconsistent state detected for user_registry_credential.yml:
+          {% if not backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout %}
+          - File is encrypted but key file (.local_repo_credentials_key) is missing
+          {% elif backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout %}
+          - Key file exists but file is not encrypted
+          {% endif %}
+          Please check the backup integrity and ensure both files are present
+          in consistent states.
+      when: >-
+        (not backup_local_repo_credentials_key_stat.stat.exists and 
+         backup_user_registry_content.stdout is defined and 
+         '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout) or
+        (backup_local_repo_credentials_key_stat.stat.exists and 
+         backup_user_registry_content.stdout is defined and 
+         '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout)
+  when: backup_user_registry_credential_stat.stat.exists
diff --git a/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml b/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml
new file mode 100644
index 0000000000..4f6a96e83f
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml
@@ -0,0 +1,33 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Read oim_metadata.yml to get upgrade_backup_dir
+  ansible.builtin.slurp:
+    src: /opt/omnia/.data/oim_metadata.yml
+  register: oim_metadata_slurp
+
+- name: Parse oim_metadata.yml
+  ansible.builtin.set_fact:
+    oim_metadata: "{{ oim_metadata_slurp.content | b64decode | from_yaml }}"
+
+# Guard first: a missing, null or blank upgrade_backup_dir must never produce a bogus backup path.
+- name: Fail if upgrade_backup_dir is not defined in metadata
+  ansible.builtin.fail:
+    msg: "{{ msg_upgrade_backup_dir_missing }}"
+  when: oim_metadata.upgrade_backup_dir | default('', true) | string | trim | length == 0
+
+- name: Set backup_location from metadata
+  ansible.builtin.set_fact:
+    backup_location: "{{ oim_metadata.upgrade_backup_dir }}/input/project_default"
diff --git a/upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2 b/upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2
new file mode 100644
index 0000000000..4b3b63d8c7
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2
@@ -0,0 +1,48 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Values are emitted with to_json so quotes, backslashes and non-ASCII characters survive the YAML round trip.
+# Provision credentials
+provision_password: {{ provision_password | default('', true) | string | to_json(ensure_ascii=False) }}
+bmc_username: {{ bmc_username | default('', true) | string | to_json(ensure_ascii=False) }}
+bmc_password: {{ bmc_password | default('', true) | string | to_json(ensure_ascii=False) }}
+
+# Prepare_oim credentials
+minio_s3_password: {{ minio_s3_password | default('', true) | string | to_json(ensure_ascii=False) }}
+pulp_password: {{ pulp_password | default('', true) | string | to_json(ensure_ascii=False) }}
+docker_username: {{ docker_username | default('', true) | string | to_json(ensure_ascii=False) }}
+docker_password: {{ docker_password | default('', true) | string | to_json(ensure_ascii=False) }}
+
+# Omnia credentials
+slurm_db_password: {{ slurm_db_password | default('', true) | string | to_json(ensure_ascii=False) }}
+
+# Security credentials
+openldap_db_username: {{ openldap_db_username | default('', true) | string | to_json(ensure_ascii=False) }}
+openldap_db_password: {{ openldap_db_password | default('', true) | string | to_json(ensure_ascii=False) }}
+
+# iDrac Telemetry credentials
+mysqldb_user: {{ mysqldb_user | default('', true) | string | to_json(ensure_ascii=False) }}
+mysqldb_password: {{ mysqldb_password | default('', true) | string | to_json(ensure_ascii=False) }}
+mysqldb_root_password: {{ mysqldb_root_password | default('', true) | string | to_json(ensure_ascii=False) }}
+
+# csi powerscale credentials
+csi_username: {{ csi_username | default('', true) | string | to_json(ensure_ascii=False) }}
+csi_password: {{ csi_password | default('', true) | string | to_json(ensure_ascii=False) }}
+
+# LDMS sampler
+ldms_sampler_password: {{ ldms_sampler_password | default('', true) | string | to_json(ensure_ascii=False) }}
+
+# postgres credentials
+postgres_user: {{ postgres_user | default('', true) | string | to_json(ensure_ascii=False) }}
+postgres_password: {{ postgres_password | default('', true) | string | to_json(ensure_ascii=False) }}
diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml
index c27f111cde..5eee4a2f50 100644
--- a/upgrade/roles/import_input_parameters/vars/main.yml
+++ b/upgrade/roles/import_input_parameters/vars/main.yml
@@ -13,18 +13,82 @@
 # limitations under the License.
 ---
 
-backup_location: /opt/omnia/backups/upgrade/input/project_default
+# backup_location will be set from oim_metadata.yml upgrade_backup_dir
+# Format: /opt/omnia/backups/upgrade/version_2.0.0.0/input/project_default
+backup_location: ""
 
 backup_dir_mode: '0755'
 default_file_mode: '0644'
 
+# List to collect warnings during execution
+upgrade_warnings: []
+
 # Precheck backup location messages
 msg_backup_location_missing: "backup_location must be provided"
+msg_upgrade_backup_dir_missing: "upgrade_backup_dir not found in /opt/omnia/.data/oim_metadata.yml"
 
 # Restore input files messages
 msg_restore_item_name_missing: "restore_item must define 'name'"
 msg_validation_failed: "Validation failed for {{ restore_item.name }}"
 msg_backup_file_missing: "Backup file missing: {{ restore_item.name }}"
+msg_user_registry_credential_missing: |-
+  WARNING: user_registry_credential.yml not found in backup at {{ backup_location }}/user_registry_credential.yml.
+  This might be due to complete Omnia execution not being completed.
+  Skipping restoration of this file.
+
+# Omnia config credentials messages
+msg_omnia_config_credentials_missing: |- 
+  WARNING: omnia_config_credentials.yml not found in backup at {{ backup_location }}/omnia_config_credentials.yml.
+  This might be due to complete Omnia execution not being completed.
+  Skipping restoration of this file.
+
+msg_omnia_config_credentials_info_missing: |- 
+  INFO: Both omnia_config_credentials.yml and .omnia_config_credentials_key 
+  are not present in backup. This is expected if credentials 
+  were not configured in the source installation.
+
+msg_omnia_config_credentials_success: |- 
+  omnia_config_credentials.yml restored and updated from backup.
+  Backup: {{ backup_location }}/omnia_config_credentials.yml
+  Target: {{ input_project_dir }}/omnia_config_credentials.yml
+  Status: Updated with postgres credentials and re-encrypted (key file also restored)
+
+msg_omnia_config_credentials_error: |- 
+  ERROR: Inconsistent state detected for omnia_config_credentials.yml:
+  {% if not backup_omnia_config_credentials_key_stat.stat.exists and backup_omnia_config_credentials_content.stdout is defined and '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout %}
+  - File is encrypted but key file (.omnia_config_credentials_key) is missing
+  {% elif backup_omnia_config_credentials_key_stat.stat.exists and backup_omnia_config_credentials_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout %}
+  - Key file exists but file is not encrypted
+  {% endif %}
+  Please check the backup integrity and ensure both files are present
+  in consistent states.
+
+# Rescue warning messages
+msg_user_registry_decrypt_error: |- 
+  ERROR: Failed to decrypt user_registry_credential.yml. 
+  The backup key file may be corrupted or incompatible. 
+  Please check the backup integrity and ensure the key file 
+  matches the encrypted file.
+
+msg_omnia_config_decrypt_error: |- 
+  ERROR: Failed to decrypt omnia_config_credentials.yml. 
+  The backup key file may be corrupted or incompatible. 
+  Please check the backup integrity and ensure the key file 
+  matches the encrypted file.
+
+msg_omnia_config_template_error: |- 
+  ERROR: Failed to generate updated omnia_config_credentials.yml. 
+  Template processing may have failed due to invalid data format. 
+  Please check the backup file format and ensure it contains valid YAML.
+
+msg_omnia_config_encrypt_error: |- 
+  ERROR: Failed to encrypt updated omnia_config_credentials.yml. 
+  The key file may be corrupted or there may be permission issues. 
+  Please check the key file integrity and file permissions.
+
+msg_decryption_failed: "Decryption failed. Check warnings for details."
+msg_template_failed: "Template processing failed. Check warnings for details."
+msg_encryption_failed: "Encryption failed. Check warnings for details."
 
 # Network spec transformation messages
 msg_backup_network_spec_missing: "Backup network_spec.yml missing"

From d3b9c749b5096eaa4ca708def872e51ad38e1ed4 Mon Sep 17 00:00:00 2001
From: pullan1 <sudha.pullalaravu@dell.com>
Date: Thu, 12 Feb 2026 16:16:44 +0530
Subject: [PATCH 142/172] Added new package type rpm_repo

Signed-off-by: pullan1 <sudha.pullalaravu@dell.com>
---
 .../input_validation/common_utils/config.py   |   1 +
 .../library/module_utils/local_repo/config.py |   2 +-
 .../local_repo/parse_and_download.py          | 183 ++++++++++++------
 .../module_utils/local_repo/software_utils.py |   6 +-
 common/library/modules/parallel_tasks.py      | 163 ++++++++++------
 common/library/modules/pulp_cleanup.py        | 177 +++++++++++------
 local_repo/pulp_cleanup.yml                   |  13 +-
 7 files changed, 354 insertions(+), 191 deletions(-)

diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py
index e6e8a09042..0f369f3950 100644
--- a/common/library/module_utils/input_validation/common_utils/config.py
+++ b/common/library/module_utils/input_validation/common_utils/config.py
@@ -147,6 +147,7 @@
     "rpm": ["package", "repo_name"],
     "rpm_list": ["package_list", "repo_name"],
     "rpm_file": ["package", "url"],
+    "rpm_repo": ["package", "repo_name"],
     "ansible_galaxy_collection": ["package", "version"],
     "git": ["package", "version", "url"],
     "image": ["package", ["tag", "digest"]],  # Special: one of tag or digest
diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py
index 0518e2bb01..cfc3b20c9d 100644
--- a/common/library/module_utils/local_repo/config.py
+++ b/common/library/module_utils/local_repo/config.py
@@ -51,7 +51,7 @@
 # Used by software_utils.py
 # ----------------------------
 PACKAGE_TYPES = ['rpm', 'deb', 'tarball', 'image', 'manifest', 'git',
-                 'pip_module', 'deb', 'shell', 'ansible_galaxy_collection', 'iso', 'rpm_list', 'rpm_file']
+                 'pip_module', 'deb', 'shell', 'ansible_galaxy_collection', 'iso', 'rpm_list', 'rpm_file', 'rpm_repo']
 CSV_COLUMNS = {"column1": "name", "column2": "status"}
 SOFTWARE_CONFIG_SUBDIR = "config"
 RPM_LABEL_TEMPLATE = "RPMs for {key}"
diff --git a/common/library/module_utils/local_repo/parse_and_download.py b/common/library/module_utils/local_repo/parse_and_download.py
index 367f9561f5..72efd4566b 100644
--- a/common/library/module_utils/local_repo/parse_and_download.py
+++ b/common/library/module_utils/local_repo/parse_and_download.py
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,12 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # pylint: disable=import-error,no-name-in-module
+"""
+Utility functions for parsing and downloading artifacts.
+
+This module provides common functions for command execution, status file management,
+and repository operations used across the local repo management system.
+"""
+
 import os
 import subprocess
 import json
 import re
 from multiprocessing import Lock
-from ansible.module_utils.local_repo.standard_logger import setup_standard_logger
+from ansible.module_utils.local_repo.config import ARCH_SUFFIXES, STATUS_CSV_HEADER
 
 
 def mask_sensitive_data(cmd_string):
@@ -57,35 +64,87 @@ def execute_command(cmd_string, logger, type_json=False):
             stderr=subprocess.PIPE,
             shell=True,
         )
-
-        status["returncode"] = cmd.returncode
-        status["stdout"] = cmd.stdout.strip() if cmd.stdout else None
-        status["stderr"] = cmd.stderr.strip() if cmd.stderr else None
-
-        if cmd.returncode != 0:
-            logger.error(f"Command failed with return code {cmd.returncode}")
-            logger.error(f"Error: {status['stderr']}")
-            return False
-
-        if type_json and status["stdout"]:
-            try:
-                status["stdout"] = json.loads(status["stdout"])
-            except json.JSONDecodeError as error:
-                logger.error(f"Failed to parse JSON output: {error}")
-                return False
-
-        return status
-
-    except Exception as error:
-        logger.error(f"Error executing command: {error}")
+        logger.info(f"Command succeeded: {cmd_string}")
+        return True
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Command failed: {cmd_string} - {e}")
+        return False
+    except subprocess.TimeoutExpired as e:
+        logger.error(f"Command timed out: {cmd_string} - {e}")
+        return False
+    except OSError as e:
+        logger.error(f"OS error during command: {cmd_string} - {e}")
         return False
 
     finally:
         logger.info("#" * 30 + f" {execute_command.__name__} end " + "#" * 30)
 
+def get_arch_from_status_path(status_file_path):
+    """Extract architecture from status file path.
+    
+    Args:
+        status_file_path: Path like '/opt/omnia/log/local_repo/x86_64/software_name/status.csv'
+        
+    Returns:
+        str: Architecture ('x86_64' or 'aarch64') or None if not found
+    """
+    for arch in ARCH_SUFFIXES:
+        if f"/{arch}/" in status_file_path:
+            return arch
+    return None
+
+def _prefix_repo_name_with_arch(repo_name: str, status_file_path: str, logger) -> str:
+    """Add architecture prefix to repo_name if not already present.
+
+    Args:
+        repo_name: Repository name to prefix; falsy values are returned unchanged
+        status_file_path: Path to extract architecture from
+        logger: Logger instance, or None to skip logging (sibling helpers pass None)
+
+    Returns:
+        str: Prefixed name, or repo_name unchanged if no prefix applies
+    """
+    if not repo_name:
+        return repo_name
+
+    arch = get_arch_from_status_path(status_file_path)
+    if arch and not any(repo_name.startswith(f"{prefix}_") for prefix in ARCH_SUFFIXES):
+        if logger is not None:  # guard: callers may pass logger=None
+            logger.info(f"Auto-prefixed repo_name with architecture: {arch}_{repo_name}")
+        return f"{arch}_{repo_name}"
+    return repo_name
+
+
+def _update_existing_line(line: str, package_name: str, package_type: str, status: str, repo_name: str, status_file_path: str) -> str:
+    """Update an existing line in status file.
+    
+    Args:
+        line: Existing line content
+        package_name: Package name to match
+        package_type: Package type
+        status: New status
+        repo_name: Repository name
+        status_file_path: Path for architecture extraction
+        
+    Returns:
+        str: Updated line content
+    """
+    parts = line.strip().split(',')
+    if len(parts) >= 4:
+        final_repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, None)
+        parts[2] = final_repo_name if final_repo_name else ''
+        parts[3] = status
+        return ','.join(parts) + '\n'
+    
+    # Handle short lines
+    final_repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, None)
+    return f"{package_name},{package_type},{final_repo_name if final_repo_name else ''},{status}\n"
+
+
 def write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock: Lock, repo_name=None):
     """
-    Writes or updates the status of a package in the status file, using a lock to ensure safe access across processes.
+    Writes or updates the status of a package in the status file while holding file_lock.
+    
     Args:
         status_file_path: Path to the status file
         package_name: Name of the package
@@ -97,44 +156,56 @@ def write_status_to_file(status_file_path, package_name, package_type, status, l
     """
     logger.info("#" * 30 + f" {write_status_to_file.__name__} start " + "#" * 30)
 
+    # Auto-prefix repo_name with architecture if needed
+    repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, logger)
+
     try:
         with file_lock:  # Ensure only one process can write at a time
             if os.path.exists(status_file_path):
-                with open(status_file_path, "r") as f:
-                    lines = f.readlines()
-
-                updated = False
-                with open(status_file_path, "w") as f:
-                      # Write header (new files always have repo_name column)
-                    if lines:
-                        f.write(lines[0])  # Keep existing header
-
-                    # Write data lines
-                    for line in lines[1:]:  # Skip header
-                        if line.startswith(f"{package_name},"):
-                           # f.write(f"{package_name},{package_type},{status}\n")
-                            # Update existing line with repo_name (order: name,type,repo_name,status)
-                            parts = line.strip().split(',')
-                            if len(parts) >= 4:
-                                parts[2] = repo_name if repo_name else ''
-                                parts[3] = status
-                                f.write(','.join(parts) + '\n')
-                            else:
-                                f.write(f"{package_name},{package_type},{repo_name if repo_name else ''},{status}\n")
-                            updated = True
-                        else:
-                            f.write(line)
-
-                    if not updated:
-                        f.write(f"{package_name},{package_type},{repo_name if repo_name else ''},{status}\n")
+                _update_existing_file(status_file_path, package_name, package_type, status, repo_name)
             else:
-                with open(status_file_path, "w") as f:
-                    f.write(STATUS_CSV_HEADER)
-                    f.write(f"{package_name},{package_type},{repo_name if repo_name else ''},{status}\n")
+                _create_new_file(status_file_path, package_name, package_type, status, repo_name)
 
             logger.info(f"Status written to {status_file_path} for {package_name}.")
-    except Exception as e:
+    except OSError as e:
         logger.error(f"Failed to write to status file: {status_file_path}. Error: {str(e)}")
-        raise RuntimeError(f"Failed to write to status file: {status_file_path}. Error: {str(e)}")
+        raise RuntimeError(
+            f"Failed to write to status file: {status_file_path}. Error: {str(e)}"
+        ) from e
     finally:
         logger.info("#" * 30 + f" {write_status_to_file.__name__} end " + "#" * 30)
+
+
+def _update_existing_file(status_file_path, package_name, package_type, status, repo_name):
+    """Update existing status file with new package status."""
+    with open(status_file_path, "r", encoding='utf-8') as f:
+        lines = f.readlines()
+
+    updated = False
+    with open(status_file_path, "w", encoding='utf-8') as f:
+        # Write header
+        if lines:
+            f.write(lines[0])
+
+        # Write data lines
+        for line in lines[1:]:  # Skip header
+            if line.startswith(f"{package_name},"):
+                updated_line = _update_existing_line(
+                    line, package_name, package_type, status, repo_name, status_file_path
+                )
+                f.write(updated_line)
+                updated = True
+            else:
+                f.write(line)
+
+        if not updated:
+            final_repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, None)
+            f.write(f"{package_name},{package_type},{final_repo_name if final_repo_name else ''},{status}\n")
+
+
+def _create_new_file(status_file_path, package_name, package_type, status, repo_name):
+    """Create new status file with package status."""
+    with open(status_file_path, "w", encoding='utf-8') as f:
+        f.write(STATUS_CSV_HEADER)
+        final_repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, None)
+        f.write(f"{package_name},{package_type},{final_repo_name if final_repo_name else ''},{status}\n")
diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py
index a915f25f8b..3e06ddc7cd 100644
--- a/common/library/module_utils/local_repo/software_utils.py
+++ b/common/library/module_utils/local_repo/software_utils.py
@@ -179,7 +179,7 @@ def transform_package_dict(data, arch_val,logger):
         repo_mapping = {}
 
         for item in items:
-            if item.get("type") == "rpm":
+            if item.get("type") in ("rpm", "rpm_repo"):
                 rpm_packages.append(item["package"])
                 # Preserve repo_name if available
                 if "repo_name" in item:
@@ -832,7 +832,7 @@ def remove_duplicates_from_trans(trans):
 
             if group == "default_packages":  # Handle nested rpm_list case
                 for pkg in items:
-                    if pkg.get("type") == "rpm" and "rpm_list" in pkg:
+                    if pkg.get("type") in ("rpm", "rpm_repo") and "rpm_list" in pkg:
                         pkg["rpm_list"] = list(dict.fromkeys(pkg["rpm_list"]))
                 continue
 
@@ -856,7 +856,7 @@ def remove_duplicates_from_trans(trans):
                 elif type_ == "git":
                     key = (item.get("url"), item.get("version"))
 
-                elif type_ == "rpm" and "rpm_list" in item:
+                elif type_ in ("rpm", "rpm_repo") and "rpm_list" in item:
                     item["rpm_list"] = list(dict.fromkeys(item["rpm_list"]))
                     key = item.get("package")
 
diff --git a/common/library/modules/parallel_tasks.py b/common/library/modules/parallel_tasks.py
index 5951a525b2..17c14cf51f 100644
--- a/common/library/modules/parallel_tasks.py
+++ b/common/library/modules/parallel_tasks.py
@@ -34,7 +34,9 @@
 from ansible.module_utils.local_repo.download_image import process_image
 from ansible.module_utils.local_repo.download_rpm import process_rpm
 from ansible.module_utils.local_repo.standard_logger import setup_standard_logger
-from ansible.module_utils.local_repo.common_functions import generate_vault_key, process_file, is_encrypted
+from ansible.module_utils.local_repo.common_functions import (
+    generate_vault_key, process_file, is_encrypted
+)
 from ansible.module_utils.local_repo.software_utils import (
     load_json,
     set_version_variables,
@@ -125,7 +127,10 @@ def update_status_csv(csv_dir, software, overall_status,slogger):
     slogger.info(f"Successfully updated status CSV at {status_file}")
 
 
-def determine_function(task, repo_store_path, csv_file_path, user_data, version_variables, arc, user_registries, docker_username, docker_password):
+def determine_function(
+    task, repo_store_path, csv_file_path, user_data, version_variables, arc,
+    user_registries, docker_username, docker_password
+):
     """
     Determines the appropriate function and its arguments to process a given task.
 
@@ -160,27 +165,55 @@ def determine_function(task, repo_store_path, csv_file_path, user_data, version_
 
         task_type = task.get("type")
         if task_type == "manifest":
-            return process_manifest, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc]
+            return process_manifest, [
+                task, repo_store_path, status_file, cluster_os_type,
+                cluster_os_version, arc
+            ]
         if task_type == "git":
-            return process_git, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc]
+            return process_git, [
+                task, repo_store_path, status_file, cluster_os_type,
+                cluster_os_version, arc
+            ]
         if task_type == "tarball":
-            return process_tarball, [task, repo_store_path, status_file, version_variables, cluster_os_type, cluster_os_version, arc]
+            return process_tarball, [
+                task, repo_store_path, status_file, version_variables,
+                cluster_os_type, cluster_os_version, arc
+            ]
         if task_type == "shell":
-            return process_shell, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc]
+            return process_shell, [
+                task, repo_store_path, status_file, cluster_os_type,
+                cluster_os_version, arc
+            ]
         if task_type == "ansible_galaxy_collection":
-            return process_ansible_galaxy_collection, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc]
+            return process_ansible_galaxy_collection, [
+                task, repo_store_path, status_file, cluster_os_type,
+                cluster_os_version, arc
+            ]
         if task_type == "iso":
-            return process_iso, [task, repo_store_path, status_file,
-                                 cluster_os_type, cluster_os_version, version_variables, arc]
+            return process_iso, [
+                task, repo_store_path, status_file, cluster_os_type,
+                cluster_os_version, version_variables, arc
+            ]
         if task_type == "pip_module":
-            return process_pip, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc]
+            return process_pip, [
+                task, repo_store_path, status_file, cluster_os_type,
+                cluster_os_version, arc
+            ]
         if task_type == "image":
-            return process_image, [task, status_file, version_variables, user_registries, docker_username, docker_password]
+            return process_image, [
+                task, status_file, version_variables, user_registries,
+                docker_username, docker_password
+            ]
         if task_type == "rpm_file":
-            return process_rpm_file, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc]
-        if task_type == "rpm":
-            return process_rpm, [task, repo_store_path, status_file,
-                                 cluster_os_type, cluster_os_version, repo_config_value, arc]
+            return process_rpm_file, [
+                task, repo_store_path, status_file, cluster_os_type,
+                cluster_os_version, arc
+            ]
+        if task_type in ("rpm", "rpm_repo"):
+            return process_rpm, [
+                task, repo_store_path, status_file, cluster_os_type,
+                cluster_os_version, repo_config_value, arc
+            ]
 
         raise ValueError(f"Unknown task type: {task_type}")
     except Exception as e:
@@ -272,57 +305,43 @@ def main():
     Args:
         tasks (list): A list of tasks (dictionaries) that need to be processed in parallel.
         nthreads (int): The number of worker processes to run in parallel.
-        timeout (int): The maximum time allowed for all tasks to execute. If `None`, no timeout is enforced.
+        timeout (int): The maximum time allowed for all tasks to execute.
+                    If `None`, no timeout is enforced.
         log_dir (str): The directory where log files for the worker processes will be saved.
         log_file (str): The path to the log file for the overall task execution.
         slog_file (str): The path to the log file for the standard logger.
         csv_file_path (str): The path to a CSV file that may be needed for processing some tasks.
         repo_store_path (str): The path to the repository where task-related files are stored.
         software (list): A list of software names.
-        user_json_file (str): The path to the JSON file containing use
-        show_softwares_status (bool): Whether to display the software status; optional, defaults to False.  
-        overall_status_dict (dict): A list containing overall software status information; optional, defaults to an empty dict.
-          Dictionary containing software status information grouped by software names.  
-          Each key (e.g., 'service_k8s') maps to a list of dictionaries,  
-          where each dictionary contains:
-              - 'arch' (str): Architecture name, e.g., 'x86_64' or 'aarch64'.  
-              - 'overall_status' (str): Status of the software on that architecture, e.g., 'SUCCESS'.  
-          Example:
-              {
-                  "service_k8s": [
-                      {"arch": "x86_64", "overall_status": "SUCCESS"},
-                      {"arch": "aarch64", "overall_status": "SUCCESS"}
-                  ]
-              }
-          Defaults to an empty dict if not provided.
+        user_json_file (str): The path to the JSON file containing user data.
+        show_softwares_status (bool): Whether to display the software status;
+                                optional, defaults to False.
+        overall_status_dict (dict): A dictionary containing overall software status
+                                information; optional, defaults to an empty dict.
+            Dictionary containing software status information grouped by software names.
+            Each key (e.g., 'service_k8s') maps to a list of dictionaries,
+            where each dictionary contains:
+                - 'arch' (str): Architecture name, e.g., 'x86_64' or 'aarch64'.
+                - 'overall_status' (str): Status of the software on that architecture,
+                                        e.g., 'SUCCESS'.
+            Example:
+                {
+                    "service_k8s": [
+                        {"arch": "x86_64", "overall_status": "SUCCESS"},
+                        {"arch": "aarch64", "overall_status": "SUCCESS"}
+                    ]
+                }
+            Defaults to an empty dict if not provided.
 
     Returns:
         tuple: A tuple containing:
-            - overall_status (str): The overall status of task execution ("SUCCESS", "FAILED", "PARTIAL", "TIMEOUT").
-            - task_results_data (list): A list of dictionaries, each containing the result of an individual task.
+            - overall_status (str): The overall status of task execution
+                                 ("SUCCESS", "FAILED", "PARTIAL", "TIMEOUT").
+            - task_results_data (list): A list of dictionaries, each containing
+                                    the result of an individual task.
     Raises:
         Exception: If an error occurs during execution.
     """
-    # module_args = {
-    #     "tasks": {"type": "list", "required": True},
-    #     "nthreads": {"type": "int", "required": False, "default": DEFAULT_NTHREADS},
-    #     "timeout": {"type": "int", "required": False, "default": DEFAULT_TIMEOUT},
-    #     "log_dir": {"type": "str", "required": False, "default": LOG_DIR_DEFAULT},
-    #     "log_file": {"type": "str", "required": False, "default": DEFAULT_LOG_FILE},
-    #     "slog_file": {"type": "str", "required": False, "default": DEFAULT_SLOG_FILE},
-    #     "csv_file_path": {"type": "str", "required": False, "default": CSV_FILE_PATH_DEFAULT},
-    #     "repo_store_path": {"type": "str", "required": False, "default": DEFAULT_REPO_STORE_PATH},
-    #     "software": {"type": "list", "elements": "str", "required": True},
-    #     "user_json_file": {"type": "str", "required": False, "default": USER_JSON_FILE_DEFAULT},
-    #     "show_softwares_status": {"type": "bool", "required": False, "default": False},
-    #     "overall_status_dict": {"type": "dict","required": True},
-    #     "local_repo_config_path": {"type": "str", "required": False, "default": LOCAL_REPO_CONFIG_PATH_DEFAULT},
-    #     "arch": {"type": "str", "required": False},
-    #     "user_reg_cred_input": {"type": "str", "required": False, "default": USER_REG_CRED_INPUT},
-    #     "user_reg_key_path": {"type": "str", "required": False, "default": USER_REG_KEY_PATH},
-    #     "omnia_credentials_yaml_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_YAML_PATH},
-    #     "omnia_credentials_vault_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_VAULT_PATH}
-    # }
 
     module_args = {
         "tasks": {"type": "list", "required": True},
@@ -337,10 +356,19 @@ def main():
         "user_json_file": {"type": "str", "required": False, "default": USER_JSON_FILE_DEFAULT},
         "show_softwares_status": {"type": "bool", "required": False, "default": False},
         "overall_status_dict": {"type": "dict","required": True},
-        "local_repo_config_path": {"type": "str", "required": False, "default": LOCAL_REPO_CONFIG_PATH_DEFAULT},
+        "local_repo_config_path": {
+            "type": "str", "required": False,
+            "default": LOCAL_REPO_CONFIG_PATH_DEFAULT
+        },
         "arch": {"type": "str", "required": False},
-        "omnia_credentials_yaml_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_YAML_PATH},
-        "omnia_credentials_vault_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_VAULT_PATH}
+        "omnia_credentials_yaml_path": {
+            "type": "str", "required": False,
+            "default": OMNIA_CREDENTIALS_YAML_PATH
+        },
+        "omnia_credentials_vault_path": {
+            "type": "str", "required": False,
+            "default": OMNIA_CREDENTIALS_VAULT_PATH
+        }
     }
     module = AnsibleModule(argument_spec=module_args, supports_check_mode=True)
     tasks = module.params["tasks"]
@@ -386,24 +414,29 @@ def main():
         cluster_os_type = user_data['cluster_os_type']
         cluster_os_version = user_data['cluster_os_version']
 
-        subgroup_dict, software_names = get_subgroup_dict(user_data,slogger)
-        version_variables = set_version_variables(user_data, software_names, cluster_os_version,slogger)
+        subgroup_dict, software_names = get_subgroup_dict(user_data, slogger)
+        version_variables = set_version_variables(
+            user_data, software_names, cluster_os_version, slogger
+        )
         slogger.info(f"Cluster OS: {cluster_os_type}")
         slogger.info(f"Version Variables: {version_variables}")
         # gen_result = {}
         # if not os.path.isfile(user_reg_key_path):
         #     gen_result = generate_vault_key(user_reg_key_path)
         # if gen_result is None:
-        #     module.fail_json(msg=f"Unable to generate local_repo key at path: {user_reg_key_path}")
+        #     module.fail_json(
+        #         msg=f"Unable to generate local_repo key at path: {user_reg_key_path}"
+        #     )
 
         overall_status, task_results = execute_parallel(
             tasks, determine_function, nthreads, repo_store_path, csv_file_path,
-            log_dir, user_data, version_variables, arc, slogger, local_repo_config_path,
-            omnia_credentials_yaml_path, omnia_credentials_vault_path, timeout
+            log_dir, user_data, version_variables, arc, slogger,
+            local_repo_config_path, omnia_credentials_yaml_path,
+            omnia_credentials_vault_path, timeout
         )
 
         # if not is_encrypted(user_reg_cred_input):
-        #     process_file(user_reg_cred_input,user_reg_key_path,'encrypt')
+        #     process_file(user_reg_cred_input, user_reg_key_path, 'encrypt')
 
         end_time = datetime.now()
         formatted_end_time = end_time.strftime("%I:%M:%S %p")
@@ -442,7 +475,9 @@ def main():
 
 
     except Exception as e:
-        result["table_output"] = table_output if "table_output" in locals() else "No table generated."
+        result["table_output"] = (
+            table_output if "table_output" in locals() else "No table generated."
+        )
         slogger.error(f"Execution failed: {str(e)}")
         module.fail_json(msg=f"Error during execution: {str(e)}", **result)
 
diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py
index f3da3e2004..a3c155ebdb 100644
--- a/common/library/modules/pulp_cleanup.py
+++ b/common/library/modules/pulp_cleanup.py
@@ -137,7 +137,10 @@ def validate_container_format(image_name: str) -> Tuple[bool, str]:
 
     # Must contain at least one '/' to indicate registry/image format
     if '/' not in image_name:
-        return False, f"Invalid format '{image_name}'. Must include registry (e.g., registry.k8s.io/pause, docker.io/library/busybox)"
+        return False, (
+            f"Invalid format '{image_name}'. Must include registry "
+            "(e.g., registry.k8s.io/pause, docker.io/library/busybox)"
+        )
 
     # Must have a registry part (contains '.' or is a known registry)
     parts = image_name.split('/')
@@ -145,7 +148,10 @@ def validate_container_format(image_name: str) -> Tuple[bool, str]:
 
     # Check if registry looks valid (contains dot or is localhost)
     if '.' not in registry and registry != 'localhost' and ':' not in registry:
-        return False, f"Invalid registry '{registry}' in '{image_name}'. Registry must be a domain (e.g., docker.io, registry.k8s.io)"
+        return False, (
+            f"Invalid registry '{registry}' in '{image_name}'. "
+            "Registry must be a domain (e.g., docker.io, registry.k8s.io)"
+        )
 
     return True, ""
 
@@ -173,7 +179,9 @@ def detect_file_type(name: str) -> str:
     if '==' in name:
         return "pip_module"
     # Ansible Galaxy collection: contains . but no / or == (e.g., community.general, ansible.posix)
-    if '.' in name and '/' not in name and '==' not in name and any(x in name.lower() for x in ['ansible', 'community', 'galaxy']):
+    if '.' in name and '/' not in name and '==' not in name and any(
+        x in name.lower() for x in ['ansible', 'community', 'galaxy']
+    ):
         return "ansible_galaxy_collection"
     if name.startswith('ansible_galaxy_collection'):
         return "ansible_galaxy_collection"
@@ -296,7 +304,9 @@ def cleanup_container(user_input: str, base_path: str, logger) -> Dict[str, Any]
 
     # Check existence
     if not container_exists(pulp_name, logger):
-        result["message"] = f"Container not found in Pulp (looked for: {pulp_name})"
+        result["message"] = (
+            f"Container not found in Pulp (looked for: {pulp_name})"
+        )
         return result
 
     try:
@@ -368,7 +378,8 @@ def delete_file_from_pulp(name: str, repo_name: str, content_href: str, logger)
         # 1. Remove content from repository
         if content_href:
             remove_result = run_cmd(
-                f"pulp file repository content remove --repository {repo_name} --href {content_href}",
+                f"pulp file repository content remove --repository {repo_name} "
+                f"--href {content_href}",
                 logger
             )
             if remove_result["rc"] == 0:
@@ -376,7 +387,8 @@ def delete_file_from_pulp(name: str, repo_name: str, content_href: str, logger)
             else:
                 # Try alternative: modify repository to remove content
                 run_cmd(
-                    f"pulp file repository content modify --repository {repo_name} --remove-content '[{{\"pulp_href\": \"{content_href}\"}}]'",
+                    f"pulp file repository content modify --repository {repo_name} "
+                    f"--remove-content '[{{\"pulp_href\": \"{content_href}\"}}]'",
                     logger
                 )
 
@@ -444,7 +456,9 @@ def cleanup_pip_module(name: str, base_path: str, repo_store_path: str, logger)
                     messages.append("Orphan cleanup completed")
         else:
             # Try listing repos to find partial match
-            repo_list = run_cmd(pulp_python_commands["list_repositories"], logger)
+            repo_list = run_cmd(
+                pulp_python_commands["list_repositories"], logger
+            )
             if repo_list["rc"] == 0:
                 repos = safe_json_parse(repo_list["stdout"])
                 for repo in repos:
@@ -533,7 +547,9 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, repo_stor
                 messages.append("Repository deleted")
         else:
             # Try listing repos to find partial match
-            repo_list = run_cmd(pulp_file_commands["list_repositories"], logger)
+            repo_list = run_cmd(
+                pulp_file_commands["list_repositories"], logger
+            )
             if repo_list["rc"] == 0:
                 repos = safe_json_parse(repo_list["stdout"])
                 for repo in repos:
@@ -569,7 +585,9 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, repo_stor
                 mark_software_partial(affected, base_path, logger, file_type)
 
         # Clean up uploaded content from filesystem
-        fs_result = cleanup_content_directory(name, file_type, repo_store_path, logger)
+        fs_result = cleanup_content_directory(
+            name, file_type, repo_store_path, logger
+        )
         if fs_result["status"] == "Success":
             content_removed = True
             messages.append(fs_result["message"])
@@ -673,67 +691,82 @@ def cleanup_content_directory(content_name: str, content_type: str, repo_store_p
 # STATUS FILE UPDATES
 # =============================================================================
 
-def remove_rpms_from_repository(repo_name: str, base_path: str, logger) -> List[str]:
+def remove_rpms_from_repository(repo_name: str, base_path: str, logger) -> Dict[str, List[str]]:
     """Remove RPMs that belong to a specific repository from status files.
-    
+
     Uses the repo_name column in status.csv to accurately identify RPMs from the repository.
-    
+    Now that all repo_names include architecture prefixes, the logic is simplified.
+
     Args:
-        repo_name: Repository name (e.g., 'x86_64_appstream')
+        repo_name: Repository name (e.g., 'x86_64_appstream', 'aarch64_epel')
         base_path: Base path for status files
         logger: Logger instance
-        
+
     Returns:
-        List of software names that were affected
+        Dict mapping architecture to list of affected software names
     """
-    affected_software = []
+    affected_software = {}
     logger.info(f"Removing RPMs from status.csv for repository: {repo_name}")
-    try:
-        for arch in ARCH_SUFFIXES:
-            for status_file in glob.glob(f"{base_path}/{arch}/*/status.csv"):
-                rows = []
-                removed = False
-                has_repo_column = False
 
-                # Check if file has repo_name column
-                with open(status_file, 'r', encoding='utf-8') as f:
-                    header = f.readline().strip().lower()
-                    has_repo_column = "repo_name" in header
+    # Extract architecture from repo_name (all repo_names should now have arch prefixes)
+    target_arch = None
+    for arch in ARCH_SUFFIXES:
+        if repo_name.startswith(f"{arch}_"):
+            target_arch = arch
+            break
+    
+    if not target_arch:
+        logger.error(f"Repository name {repo_name} does not have architecture prefix")
+        return {}
+    
+    logger.info(f"Processing architecture: {target_arch}")
+    affected_software[target_arch] = []
+    
+    try:        
+        for status_file in glob.glob(f"{base_path}/{target_arch}/*/status.csv"):
+            rows = []
+            removed = False
+            has_repo_column = False
 
-                with open(status_file, 'r', encoding='utf-8') as f:
-                    reader = csv.DictReader(f)
-                    fieldnames = reader.fieldnames
-                    for row in reader:
-                        name = row.get('name', '')
-                        row_type = row.get('type', '')
-                        rpm_repo = row.get('repo_name', '')
-
-                        logger.info(f"Processing row: {row}")
-                        # For RPMs, check if they belong to the deleted repository
-                        if row_type == 'rpm' or row_type == 'rpm_file':
-                            if has_repo_column and rpm_repo == repo_name:
-                                removed = True
-                                logger.info(f"Removing RPM '{name}' from {status_file} (repo {repo_name} deleted)")
-                            else:
-                                rows.append(row)
+            # Check if file has repo_name column
+            with open(status_file, 'r', encoding='utf-8') as f:
+                header = f.readline().strip().lower()
+                has_repo_column = "repo_name" in header
+
+            with open(status_file, 'r', encoding='utf-8') as f:
+                reader = csv.DictReader(f)
+                fieldnames = reader.fieldnames
+                for row in reader:
+                    name = row.get('name', '')
+                    row_type = row.get('type', '')
+                    rpm_repo = row.get('repo_name', '')
+
+                    logger.info(f"Processing row: {row}")
+                    # For RPMs, check if they belong to the deleted repository
+                    if row_type in ('rpm', 'rpm_repo', 'rpm_file'):
+                        if has_repo_column and rpm_repo == repo_name:
+                            removed = True
+                            logger.info(f"Removing RPM '{name}' from {status_file} (repo {repo_name} deleted)")
                         else:
                             rows.append(row)
+                    else:
+                        rows.append(row)
 
-                if removed and fieldnames:
-                    with open(status_file, 'w', newline='', encoding='utf-8') as f:
-                        writer = csv.DictWriter(f, fieldnames=fieldnames)
-                        writer.writeheader()
-                        writer.writerows(rows)
+            if removed and fieldnames:
+                with open(status_file, 'w', newline='', encoding='utf-8') as f:
+                    writer = csv.DictWriter(f, fieldnames=fieldnames)
+                    writer.writeheader()
+                    writer.writerows(rows)
 
-                    # Track affected software
-                    software_name = os.path.basename(os.path.dirname(status_file))
-                    if software_name not in affected_software:
-                        affected_software.append(software_name)
+                # Track affected software
+                software_name = os.path.basename(os.path.dirname(status_file))
+                if software_name not in affected_software[target_arch]:
+                    affected_software[target_arch].append(software_name)
 
         return affected_software
     except Exception as e:
         logger.error(f"Failed to remove RPMs from repository {repo_name}: {e}")
-        return []
+        return {}
 
 def remove_from_status_files(artifact_name: str, artifact_type: str, base_path: str, logger) -> Dict[str, List[str]]:
     """Remove artifact from status.csv files and return affected software names by architecture.
@@ -798,10 +831,10 @@ def remove_from_status_files(artifact_name: str, artifact_type: str, base_path:
 
 def mark_software_partial(affected_software, base_path: str, logger, artifact_type: str = None):
     """Mark software entries as partial in software.csv.
-    
+
     Args:
-        affected_software: Either a List[str] of software names (from remove_rpms_from_repository)
-                          or a Dict[str, List[str]] mapping arch to software names (from remove_from_status_files)
+        affected_software: Either a List[str] of software names (legacy support)
+                          or a Dict[str, List[str]] mapping arch to software names
         base_path: Base path for software.csv
         logger: Logger instance
         artifact_type: Type of artifact being removed (for logging purposes)
@@ -811,8 +844,11 @@ def mark_software_partial(affected_software, base_path: str, logger, artifact_ty
         logger.info("No affected software to mark as partial")
         return
 
-    # Normalize input: if a flat list is passed, apply to all architectures
+    # Normalize input: convert to arch_software_map if needed
     if isinstance(affected_software, list):
+        # Legacy list input - this should not happen with new remove_rpms_from_repository
+        # but we keep it for backward compatibility
+        logger.warning("Received list input to mark_software_partial, applying to all architectures (legacy behavior)")
         arch_software_map = {arch: affected_software for arch in ARCH_SUFFIXES}
     else:
         arch_software_map = affected_software
@@ -869,7 +905,7 @@ def software_has_rpms(software_name: str, arch: str, base_path: str, logger) ->
         with open(status_file, 'r', encoding='utf-8') as f:
             reader = csv.DictReader(f)
             for row in reader:
-                if row.get('type', '').lower() == 'rpm':
+                if row.get('type', '').lower() in ('rpm', 'rpm_repo'):
                     return True
         return False
     except OSError as e:
@@ -892,7 +928,9 @@ def mark_all_software_partial(base_path: str, logger):
     try:
         for arch in ARCH_SUFFIXES:
             software_file = f"{base_path}/{arch}/software.csv"
-            logger.info(f"Processing software file: {software_file}")
+            logger.info(
+                f"Processing software file: {software_file}"
+            )
 
             if not os.path.exists(software_file):
                 logger.info(f"Software file not found: {software_file}")
@@ -948,8 +986,12 @@ def run_module():
             cleanup_repos=dict(type='list', elements='str', default=[]),
             cleanup_containers=dict(type='list', elements='str', default=[]),
             cleanup_files=dict(type='list', elements='str', default=[]),
-            base_path=dict(type='str', default=CLEANUP_BASE_PATH_DEFAULT),
-            repo_store_path=dict(type='str', default='/opt/omnia')
+            base_path=dict(
+                type='str', default=CLEANUP_BASE_PATH_DEFAULT
+            ),
+            repo_store_path=dict(
+                type='str', default='/opt/omnia'
+            )
         ),
         supports_check_mode=True
     )
@@ -966,16 +1008,25 @@ def run_module():
     logger = setup_standard_logger(log_dir)
 
     # Handle 'all' keyword for repositories only
-    cleanup_all_repos = cleanup_repos and len(cleanup_repos) == 1 and cleanup_repos[0].lower() == 'all'
+    cleanup_all_repos = (
+        cleanup_repos and len(cleanup_repos) == 1 and 
+        cleanup_repos[0].lower() == 'all'
+    )
     #if cleanup_repos and len(cleanup_repos) == 1 and cleanup_repos[0].lower() == 'all':
     if cleanup_all_repos:
         logger.info("cleanup_repos='all' - fetching all repositories from Pulp")
         cleanup_repos = get_all_repositories(logger)
         if not cleanup_repos:
-            module.fail_json(msg="Failed to retrieve repository list from Pulp. Please check if Pulp services are running.")
+            module.fail_json(
+                msg="Failed to retrieve repository list from Pulp. "
+                "Please check if Pulp services are running."
+            )
         logger.info(f"Found {len(cleanup_repos)} repositories to cleanup: {cleanup_repos}")
 
-    logger.info(f"Starting cleanup - repos: {cleanup_repos}, containers: {cleanup_containers}, files: {cleanup_files}")
+    logger.info(
+        f"Starting cleanup - repos: {cleanup_repos}, "
+        f"containers: {cleanup_containers}, files: {cleanup_files}"
+    )
 
     all_results = []
 
diff --git a/local_repo/pulp_cleanup.yml b/local_repo/pulp_cleanup.yml
index 93e379833b..6f54e5f45f 100644
--- a/local_repo/pulp_cleanup.yml
+++ b/local_repo/pulp_cleanup.yml
@@ -15,10 +15,15 @@
 # Pulp Cleanup Playbook - Clean Architecture
 #
 # Usage:
-#   ansible-playbook pulp_cleanup_v2.yml -e '{"cleanup_repos": ["epel", "baseos"]}'
-#   ansible-playbook pulp_cleanup_v2.yml -e '{"cleanup_containers": ["nginx", "redis"]}'
-#   ansible-playbook pulp_cleanup_v2.yml -e '{"cleanup_files": ["git", "chart-0.48.0"]}'
-#   ansible-playbook pulp_cleanup_v2.yml -e '{"cleanup_repos": ["epel"], "cleanup_containers": ["nginx"]}' -e force=true
+#   # Repository cleanup (include architecture prefix)
+#   ansible-playbook pulp_cleanup.yml -e "cleanup_repos=x86_64_epel,aarch64_epel"
+#   ansible-playbook pulp_cleanup.yml -e "cleanup_repos=x86_64_appstream"
+#   ansible-playbook pulp_cleanup.yml -e "cleanup_containers=nginx,redis"
+#   ansible-playbook pulp_cleanup.yml -e "cleanup_files=git,chart-0.48.0"
+#   ansible-playbook pulp_cleanup.yml -e "cleanup_repos=x86_64_epel" -e "cleanup_containers=nginx" -e "force=true"
+#
+#   # Examples: x86_64_epel, aarch64_epel, x86_64_appstream, aarch64_baseos
+#   # Note: Use architecture prefix (x86_64_ or aarch64_) for repository names
 
 - name: Pulp Cleanup
   hosts: localhost

From 2898ff029a86ea9c326bea156f2162d9548e1d86 Mon Sep 17 00:00:00 2001
From: pullan1 <sudha.pullalaravu@dell.com>
Date: Thu, 12 Feb 2026 17:36:48 +0530
Subject: [PATCH 143/172] input config changes

Signed-off-by: pullan1 <sudha.pullalaravu@dell.com>
---
 input/config/aarch64/rhel/10.0/slurm_custom.json | 5 +----
 input/config/x86_64/rhel/10.0/slurm_custom.json  | 5 +----
 input/local_repo_config.yml                      | 4 +++-
 3 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/input/config/aarch64/rhel/10.0/slurm_custom.json b/input/config/aarch64/rhel/10.0/slurm_custom.json
index 2483775495..2bdfda0ab9 100644
--- a/input/config/aarch64/rhel/10.0/slurm_custom.json
+++ b/input/config/aarch64/rhel/10.0/slurm_custom.json
@@ -9,10 +9,7 @@
             {"package": "pmix-devel", "type": "rpm", "repo_name": "aarch64_appstream"},
             {"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"},
             {"package": "apptainer", "type": "rpm", "repo_name": "epel" },
-            {"package": "doca-ofed",
-             "type": "iso",
-             "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.aarch64.rpm"
-            }
+	    {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" }
         ]
     },
     "slurm_control_node": {
diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json
index 9531239fd2..8781885cca 100644
--- a/input/config/x86_64/rhel/10.0/slurm_custom.json
+++ b/input/config/x86_64/rhel/10.0/slurm_custom.json
@@ -7,10 +7,7 @@
             {"package": "pmix", "type": "rpm", "repo_name": "x86_64_appstream"},
             {"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"},
             {"package": "apptainer", "type": "rpm", "repo_name": "epel" },
-            {"package": "doca-ofed",
-             "type": "iso",
-             "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.x86_64.rpm"
-            }
+	    {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" }
         ]
     },
     "slurm_control_node": {
diff --git a/input/local_repo_config.yml b/input/local_repo_config.yml
index 2f318f1deb..8428e6d94c 100644
--- a/input/local_repo_config.yml
+++ b/input/local_repo_config.yml
@@ -138,10 +138,12 @@ omnia_repo_url_rhel_x86_64:
   - { url: "https://download.docker.com/linux/centos/10/x86_64/stable/", gpgkey: "https://download.docker.com/linux/centos/gpg", name: "docker-ce"}
   - { url: "https://dl.fedoraproject.org/pub/epel/10/Everything/x86_64/", gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-10", name: "epel"}
   - { url: "https://pkgs.k8s.io/core:/stable:/v1.34/rpm/", gpgkey: "https://pkgs.k8s.io/core:/stable:/v1.34/rpm/repodata/repomd.xml.key", name: "kubernetes"}
-  - { url: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/", gpgkey: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/repodata/repomd.xml.key'", name: "cri-o"}
+  - { url: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/", gpgkey: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/repodata/repomd.xml.key", name: "cri-o"}
+  - { url: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/x86_64/", gpgkey: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/x86_64/repodata/repomd.xml.key", name: "doca"}
 omnia_repo_url_rhel_aarch64:
   - { url: "https://download.docker.com/linux/centos/10/aarch64/stable/", gpgkey: "https://download.docker.com/linux/centos/gpg", name: "docker-ce"}
   - { url: "https://dl.fedoraproject.org/pub/epel/10/Everything/aarch64/", gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-10", name: "epel"}
+  - { url: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/arm64-sbsa/", gpgkey: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/arm64-sbsa/repodata/repomd.xml.key", name: "doca"}
 # Example:
 # additional_repos_x86_64:
 #  - { url: "https://rpm.grafana.com/", gpgkey: "", name: "grafana" }

From 680aef3efb7c0249d2d88447e9f0d7f83541a80f Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Thu, 12 Feb 2026 17:44:57 +0530
Subject: [PATCH 144/172] Fixed ansible lint issues

---
 .../tasks/display_warnings.yml                | 18 ++++------
 .../restore_omnia_config_credentials.yml      | 23 ++++++++-----
 .../restore_user_registry_credential.yml      | 33 ++++++++++---------
 .../import_input_parameters/vars/main.yml     | 10 +++---
 4 files changed, 44 insertions(+), 40 deletions(-)

diff --git a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml
index ac1eb69998..2cc6dfed26 100644
--- a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml
+++ b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml
@@ -20,13 +20,11 @@
            UPGRADE WARNINGS SUMMARY
       =================================
 
-      {% if upgrade_warnings | length > 0 %}
       {{ upgrade_warnings | length }} warning{{ 's' if upgrade_warnings | length > 1 else '' }} detected.
       You will now be shown the detailed list.
-      {% else %}
-      No warnings detected. Upgrade completed successfully!
-      {% endif %}
-  when: upgrade_warnings is defined
+  when:
+    - upgrade_warnings is defined
+    - upgrade_warnings | length > 0
 
 
 - name: Pause for user to review warnings
@@ -36,7 +34,6 @@
       ║       ⚠️  UPGRADE WARNINGS REVIEW  ⚠️        ║
       ╚════════════════════════════════════════════╝
 
-      {% if upgrade_warnings | length > 0 %}
       {{ upgrade_warnings | length }} warning{{ 's' if upgrade_warnings | length > 1 else '' }} detected:
 
       {% for warning in upgrade_warnings %}
@@ -45,9 +42,6 @@
 
       Please review these warnings carefully.
       Press ENTER to continue or CTRL+C to abort.
-      {% else %}
-      No warnings detected. Upgrade completed successfully!
-
-      Press ENTER to continue...
-      {% endif %}
-  when: upgrade_warnings is defined
+  when:
+    - upgrade_warnings is defined
+    - upgrade_warnings | length > 0
diff --git a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml
index 0abafee26b..71e8fb7db2 100644
--- a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml
+++ b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml
@@ -31,16 +31,21 @@
     - not backup_omnia_config_credentials_stat.stat.exists
     - "'WARNING: omnia_config_credentials.yml not found in backup at' not in (upgrade_warnings | join(' '))"
 
+- name: Check if backup file is encrypted
+  ansible.builtin.command:
+    cmd: cat "{{ backup_location }}/omnia_config_credentials.yml"
+  register: backup_omnia_config_credentials_content
+  changed_when: false
+  failed_when: false
+  no_log: true
+  when: backup_omnia_config_credentials_stat.stat.exists
+
 - name: Process omnia_config_credentials.yml when present in backup
+  when: >-
+    backup_omnia_config_credentials_key_stat.stat.exists and
+    backup_omnia_config_credentials_content.stdout is defined and
+    '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout
   block:
-    - name: Check if backup file is encrypted
-      ansible.builtin.command:
-        cmd: cat "{{ backup_location }}/omnia_config_credentials.yml"
-      register: backup_omnia_config_credentials_content
-      changed_when: false
-      failed_when: false
-      no_log: true
-
     - name: "Case 1: Key present and file encrypted - Process and update"
       block:
         - name: Copy encrypted omnia_config_credentials.yml from backup to temp location
@@ -68,6 +73,7 @@
           no_log: true
           register: vault_decrypt_result
           failed_when: vault_decrypt_result.rc != 0
+          changed_when: false
 
         - name: Read decrypted content
           ansible.builtin.slurp:
@@ -126,6 +132,7 @@
           no_log: true
           register: vault_encrypt_result
           failed_when: vault_encrypt_result.rc != 0
+          changed_when: false
 
         - name: Clean up temporary files
           ansible.builtin.file:
diff --git a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml
index de337310b8..fe02a3d750 100644
--- a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml
+++ b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml
@@ -31,25 +31,26 @@
 - name: Add warning for missing user_registry_credential.yml to list
   ansible.builtin.set_fact:
     upgrade_warnings: >-
-      {{ upgrade_warnings + [
-        "WARNING: user_registry_credential.yml not found in backup at " +
-        backup_location + "/user_registry_credential.yml. " +
-        "This might be due to complete Omnia execution not being completed. " +
-        "Skipping restoration of this file."
-      ] }}
-  when: 
+      {{ upgrade_warnings + [msg_user_registry_credential_missing] }}
+  when:
     - not backup_user_registry_credential_stat.stat.exists
     - "'WARNING: user_registry_credential.yml not found in backup at' not in (upgrade_warnings | join(' '))"
 
+- name: Check if backup file is encrypted
+  ansible.builtin.command:
+    cmd: cat "{{ backup_location }}/user_registry_credential.yml"
+  register: backup_user_registry_content
+  changed_when: false
+  failed_when: false
+  no_log: true
+  when: backup_user_registry_credential_stat.stat.exists
+
 - name: Process user_registry_credential.yml when present in backup
+  when: >-
+    backup_local_repo_credentials_key_stat.stat.exists and
+    backup_user_registry_content.stdout is defined and
+    '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout
   block:
-    - name: Check if backup file is encrypted
-      ansible.builtin.command:
-        cmd: cat "{{ backup_location }}/user_registry_credential.yml"
-      register: backup_user_registry_content
-      changed_when: false
-      failed_when: false
-      no_log: true
 
     - name: "Case 1: Key present and file encrypted - Copy both"
       block:
@@ -64,6 +65,7 @@
           no_log: true
           register: vault_decrypt_result
           failed_when: vault_decrypt_result.rc != 0
+          changed_when: false
 
         - name: Copy encrypted user_registry_credential.yml from backup
           ansible.builtin.copy:
@@ -118,8 +120,7 @@
           {% elif backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout %}
           - Key file exists but file is not encrypted
           {% endif %}
-          Please check the backup integrity and ensure both files are present
-          in consistent states.
+          Please check the backup integrity and ensure both files are present in consistent states.
       when: >-
         (not backup_local_repo_credentials_key_stat.stat.exists and 
          backup_user_registry_content.stdout is defined and 
diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml
index 5eee4a2f50..9808da58bc 100644
--- a/upgrade/roles/import_input_parameters/vars/main.yml
+++ b/upgrade/roles/import_input_parameters/vars/main.yml
@@ -31,14 +31,16 @@ msg_upgrade_backup_dir_missing: "upgrade_backup_dir not found in /opt/omnia/.dat
 msg_restore_item_name_missing: "restore_item must define 'name'"
 msg_validation_failed: "Validation failed for {{ restore_item.name }}"
 msg_backup_file_missing: "Backup file missing: {{ restore_item.name }}"
-msg_user_registry_credential_missing: |- 
-  \033[93mWARNING: user_registry_credential.yml not found in backup at {{ backup_location }}/user_registry_credential.yml\033[0m
+msg_user_registry_credential_missing: |-
+  [93mWARNING:[0m user_registry_credential.yml not found in backup at
+  {{ backup_location }}/user_registry_credential.yml
   This might be due to complete Omnia execution not being completed.
   Skipping restoration of this file.
 
 # Omnia config credentials messages
-msg_omnia_config_credentials_missing: |- 
-  WARNING: omnia_config_credentials.yml not found in backup at {{ backup_location }}/omnia_config_credentials.yml.
+msg_omnia_config_credentials_missing: |-
+  WARNING: omnia_config_credentials.yml not found in backup at
+  {{ backup_location }}/omnia_config_credentials.yml.
   This might be due to complete Omnia execution not being completed.
   Skipping restoration of this file.
 

From ad7a5c08a6cf917814aefea6bef04145ad485534 Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Thu, 12 Feb 2026 18:05:14 +0530
Subject: [PATCH 145/172] fixed lint issues

---
 .../restore_omnia_config_credentials.yml      | 34 +++++++--------
 .../restore_user_registry_credential.yml      | 43 +++++++++++--------
 .../import_input_parameters/vars/main.yml     |  2 +-
 3 files changed, 42 insertions(+), 37 deletions(-)

diff --git a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml
index 71e8fb7db2..a129603dcc 100644
--- a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml
+++ b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml
@@ -27,7 +27,7 @@
   ansible.builtin.set_fact:
     upgrade_warnings: >-
       {{ upgrade_warnings + [msg_omnia_config_credentials_missing] }}
-  when: 
+  when:
     - not backup_omnia_config_credentials_stat.stat.exists
     - "'WARNING: omnia_config_credentials.yml not found in backup at' not in (upgrade_warnings | join(' '))"
 
@@ -93,6 +93,10 @@
             msg: "{{ msg_omnia_config_decrypt_error }}"
 
     - name: "Case 1.1: Apply template and encrypt"
+      when: >
+        backup_omnia_config_credentials_key_stat.stat.exists and
+        backup_omnia_config_credentials_content.stdout is defined and
+        '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout
       block:
         - name: Set template variables from credentials
           ansible.builtin.set_fact:
@@ -150,29 +154,25 @@
         - name: Fail with template/encryption error message
           ansible.builtin.fail:
             msg: "{{ msg_omnia_config_template_error }}\n{{ msg_omnia_config_encrypt_error }}"
-      when: >-
-        backup_omnia_config_credentials_key_stat.stat.exists and
-        backup_omnia_config_credentials_content.stdout is defined and
-        '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout
 
     - name: "Case 2: Both key and file missing - Add info warning"
-      ansible.builtin.set_fact:
-        upgrade_warnings: >-
-          {{ upgrade_warnings + [msg_omnia_config_credentials_info_missing] }}
-      when: >-
+      when: >
         not backup_omnia_config_credentials_key_stat.stat.exists and
-        (backup_omnia_config_credentials_content.stdout is not defined or 
+        (backup_omnia_config_credentials_content.stdout is not defined or
          '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout) and
         "'INFO: Both omnia_config_credentials.yml and .omnia_config_credentials_key' not in (upgrade_warnings | join(' '))"
+      ansible.builtin.set_fact:
+        upgrade_warnings: >
+          {{ upgrade_warnings + [msg_omnia_config_credentials_info_missing] }}
 
     - name: "Case 3: Error - Mismatched state"
-      ansible.builtin.fail:
-        msg: "{{ msg_omnia_config_credentials_error }}"
-      when: >-
-        (not backup_omnia_config_credentials_key_stat.stat.exists and 
-         backup_omnia_config_credentials_content.stdout is defined and 
+      when: >
+        (not backup_omnia_config_credentials_key_stat.stat.exists and
+         backup_omnia_config_credentials_content.stdout is defined and
          '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout) or
-        (backup_omnia_config_credentials_key_stat.stat.exists and 
-         backup_omnia_config_credentials_content.stdout is defined and 
+        (backup_omnia_config_credentials_key_stat.stat.exists and
+         backup_omnia_config_credentials_content.stdout is defined and
          '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout)
+      ansible.builtin.fail:
+        msg: "{{ msg_omnia_config_credentials_error }}"
   when: backup_omnia_config_credentials_stat.stat.exists
diff --git a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml
index fe02a3d750..69a6a391a2 100644
--- a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml
+++ b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml
@@ -53,6 +53,10 @@
   block:
 
     - name: "Case 1: Key present and file encrypted - Copy both"
+      when: >
+        backup_local_repo_credentials_key_stat.stat.exists and
+        backup_user_registry_content.stdout is defined and
+        '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout
       block:
         - name: Decrypt user_registry_credential.yml using the key
           ansible.builtin.shell:
@@ -92,12 +96,13 @@
         - name: Fail with decryption error message
           ansible.builtin.fail:
             msg: "{{ msg_user_registry_decrypt_error }}"
-      when: >-
-        backup_local_repo_credentials_key_stat.stat.exists and
-        backup_user_registry_content.stdout is defined and
-        '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout
 
     - name: "Case 2: Both key and file missing - Add info warning"
+      when: >-
+        not backup_local_repo_credentials_key_stat.stat.exists and
+        (backup_user_registry_content.stdout is not defined or
+         '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout) and
+        "'INFO: Both user_registry_credential.yml and .local_repo_credentials_key' not in (upgrade_warnings | join(' '))"
       ansible.builtin.set_fact:
         upgrade_warnings: >-
           {{ upgrade_warnings + [
@@ -105,27 +110,27 @@
             "are not present in backup. This is expected if registry credentials " +
             "were not configured in the source installation."
           ] }}
-      when: >-
-        not backup_local_repo_credentials_key_stat.stat.exists and
-        (backup_user_registry_content.stdout is not defined or 
-         '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout) and
-        "'INFO: Both user_registry_credential.yml and .local_repo_credentials_key' not in (upgrade_warnings | join(' '))"
 
     - name: "Case 3: Error - Mismatched state"
+      when: >-
+        (not backup_local_repo_credentials_key_stat.stat.exists and
+         backup_user_registry_content.stdout is defined and
+         '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout) or
+        (backup_local_repo_credentials_key_stat.stat.exists and
+         backup_user_registry_content.stdout is defined and
+         '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout)
       ansible.builtin.fail:
         msg: |
           ERROR: Inconsistent state detected for user_registry_credential.yml:
-          {% if not backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout %}
+          {% if not backup_local_repo_credentials_key_stat.stat.exists and
+             backup_user_registry_content.stdout is defined and
+             '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout %}
           - File is encrypted but key file (.local_repo_credentials_key) is missing
-          {% elif backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout %}
+          {% elif backup_local_repo_credentials_key_stat.stat.exists and
+             backup_user_registry_content.stdout is defined and
+             '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout %}
           - Key file exists but file is not encrypted
           {% endif %}
-          Please check the backup integrity and ensure both files are present in consistent states.
-      when: >-
-        (not backup_local_repo_credentials_key_stat.stat.exists and 
-         backup_user_registry_content.stdout is defined and 
-         '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout) or
-        (backup_local_repo_credentials_key_stat.stat.exists and 
-         backup_user_registry_content.stdout is defined and 
-         '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout)
+          Please check the backup integrity and ensure both files are present
+          in consistent states.
   when: backup_user_registry_credential_stat.stat.exists
diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml
index 9808da58bc..3bdf596641 100644
--- a/upgrade/roles/import_input_parameters/vars/main.yml
+++ b/upgrade/roles/import_input_parameters/vars/main.yml
@@ -32,7 +32,7 @@ msg_restore_item_name_missing: "restore_item must define 'name'"
 msg_validation_failed: "Validation failed for {{ restore_item.name }}"
 msg_backup_file_missing: "Backup file missing: {{ restore_item.name }}"
 msg_user_registry_credential_missing: |-
-  [93mWARNING:[0m user_registry_credential.yml not found in backup at
+  WARNING: user_registry_credential.yml not found in backup at
   {{ backup_location }}/user_registry_credential.yml
   This might be due to complete Omnia execution not being completed.
   Skipping restoration of this file.

From 31c5600391bad02cd31c9c2d3ad167100371f5d2 Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Thu, 12 Feb 2026 18:13:29 +0530
Subject: [PATCH 146/172] Fixed ansible lint issues

---
 .../restore_omnia_config_credentials.yml      |  2 +-
 .../restore_user_registry_credential.yml      |  2 +-
 .../import_input_parameters/vars/main.yml     | 46 ++++++++++---------
 3 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml
index a129603dcc..e04964e461 100644
--- a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml
+++ b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml
@@ -175,4 +175,4 @@
          '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout)
       ansible.builtin.fail:
         msg: "{{ msg_omnia_config_credentials_error }}"
-  when: backup_omnia_config_credentials_stat.stat.exists
+
diff --git a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml
index 69a6a391a2..47b62fedb1 100644
--- a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml
+++ b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml
@@ -133,4 +133,4 @@
           {% endif %}
           Please check the backup integrity and ensure both files are present
           in consistent states.
-  when: backup_user_registry_credential_stat.stat.exists
+
diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml
index 3bdf596641..2bd20f0076 100644
--- a/upgrade/roles/import_input_parameters/vars/main.yml
+++ b/upgrade/roles/import_input_parameters/vars/main.yml
@@ -44,48 +44,52 @@ msg_omnia_config_credentials_missing: |-
   This might be due to complete Omnia execution not being completed.
   Skipping restoration of this file.
 
-msg_omnia_config_credentials_info_missing: |- 
-  INFO: Both omnia_config_credentials.yml and .omnia_config_credentials_key 
-  are not present in backup. This is expected if credentials 
+msg_omnia_config_credentials_info_missing: |-
+  INFO: Both omnia_config_credentials.yml and .omnia_config_credentials_key
+  are not present in backup. This is expected if credentials
   were not configured in the source installation.
 
-msg_omnia_config_credentials_success: |- 
+msg_omnia_config_credentials_success: |-
   omnia_config_credentials.yml restored and updated from backup.
   Backup: {{ backup_location }}/omnia_config_credentials.yml
   Target: {{ input_project_dir }}/omnia_config_credentials.yml
   Status: Updated with postgres credentials and re-encrypted (key file also restored)
 
-msg_omnia_config_credentials_error: |- 
+msg_omnia_config_credentials_error: |-
   ERROR: Inconsistent state detected for omnia_config_credentials.yml:
-  {% if not backup_omnia_config_credentials_key_stat.stat.exists and backup_omnia_config_credentials_content.stdout is defined and '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout %}
+  {% if not backup_omnia_config_credentials_key_stat.stat.exists and
+     backup_omnia_config_credentials_content.stdout is defined and
+     '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout %}
   - File is encrypted but key file (.omnia_config_credentials_key) is missing
-  {% elif backup_omnia_config_credentials_key_stat.stat.exists and backup_omnia_config_credentials_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout %}
+  {% elif backup_omnia_config_credentials_key_stat.stat.exists and
+     backup_omnia_config_credentials_content.stdout is defined and
+     '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout %}
   - Key file exists but file is not encrypted
   {% endif %}
   Please check the backup integrity and ensure both files are present
   in consistent states.
 
 # Rescue warning messages
-msg_user_registry_decrypt_error: |- 
-  ERROR: Failed to decrypt user_registry_credential.yml. 
-  The backup key file may be corrupted or incompatible. 
-  Please check the backup integrity and ensure the key file 
+msg_user_registry_decrypt_error: |-
+  ERROR: Failed to decrypt user_registry_credential.yml.
+  The backup key file may be corrupted or incompatible.
+  Please check the backup integrity and ensure the key file
   matches the encrypted file.
 
-msg_omnia_config_decrypt_error: |- 
-  ERROR: Failed to decrypt omnia_config_credentials.yml. 
-  The backup key file may be corrupted or incompatible. 
-  Please check the backup integrity and ensure the key file 
+msg_omnia_config_decrypt_error: |-
+  ERROR: Failed to decrypt omnia_config_credentials.yml.
+  The backup key file may be corrupted or incompatible.
+  Please check the backup integrity and ensure the key file
   matches the encrypted file.
 
-msg_omnia_config_template_error: |- 
-  ERROR: Failed to generate updated omnia_config_credentials.yml. 
-  Template processing may have failed due to invalid data format. 
+msg_omnia_config_template_error: |-
+  ERROR: Failed to generate updated omnia_config_credentials.yml.
+  Template processing may have failed due to invalid data format.
   Please check the backup file format and ensure it contains valid YAML.
 
-msg_omnia_config_encrypt_error: |- 
-  ERROR: Failed to encrypt updated omnia_config_credentials.yml. 
-  The key file may be corrupted or there may be permission issues. 
+msg_omnia_config_encrypt_error: |-
+  ERROR: Failed to encrypt updated omnia_config_credentials.yml.
+  The key file may be corrupted or there may be permission issues.
   Please check the key file integrity and file permissions.
 
 msg_decryption_failed: "Decryption failed. Check warnings for details."

From da5423411cb969b8ddfd41856c195c4e8e443ac1 Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Thu, 12 Feb 2026 18:21:52 +0530
Subject: [PATCH 147/172] fixed ansible lint issues

---
 .../tasks/restore_omnia_config_credentials.yml                   | 1 -
 .../tasks/restore_user_registry_credential.yml                   | 1 -
 2 files changed, 2 deletions(-)

diff --git a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml
index e04964e461..6a20f371f8 100644
--- a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml
+++ b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml
@@ -175,4 +175,3 @@
          '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout)
       ansible.builtin.fail:
         msg: "{{ msg_omnia_config_credentials_error }}"
-
diff --git a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml
index 47b62fedb1..158b029ed3 100644
--- a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml
+++ b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml
@@ -133,4 +133,3 @@
           {% endif %}
           Please check the backup integrity and ensure both files are present
           in consistent states.
-

From cdaa98d829d7e32ee0a13955145a96c6b67f25db Mon Sep 17 00:00:00 2001
From: "balajikumaran.cs" <balajikumaran.cs@dellteam.com>
Date: Thu, 12 Feb 2026 19:05:57 +0530
Subject: [PATCH 148/172] offline build-image and discovery updates (#3956)

* Use Pulp-hosted builder images for x86_64 builds

* Added x86_64 image-builder image

* Update default_packages.json

Signed-off-by: balajikumaran.cs <balajikumaran.c.s@gmail.com>

* Refine image build prereqs and regctl handling

* Update omnia_metadata_file path to use variable

Signed-off-by: balajikumaran.cs <balajikumaran.c.s@gmail.com>

* Airgap: move telemetry/NFS prep offline and package installs to prepare_oim

* Added no_log: true

* Update prepare_oim_completion.yml

Signed-off-by: balajikumaran.cs <balajikumaran.c.s@gmail.com>

* Update aarch64_prereq.yml

Signed-off-by: balajikumaran.cs <balajikumaran.c.s@gmail.com>

* Update main.yml

Signed-off-by: balajikumaran.cs <balajikumaran.c.s@gmail.com>

* Update main.yml

Signed-off-by: balajikumaran.cs <balajikumaran.c.s@gmail.com>

* Update main.yml

Signed-off-by: balajikumaran.cs <balajikumaran.c.s@gmail.com>

* Replace command with podman_image module for image tasks

Signed-off-by: balajikumaran.cs <balajikumaran.c.s@gmail.com>

* Replace Podman command with Ansible module

Signed-off-by: balajikumaran.cs <balajikumaran.c.s@gmail.com>

* Align podman image pull with retries and tagging for x86_64 and aarch64

* Fix podman tagging for x86_64 and aarch64 images

---------

Signed-off-by: balajikumaran.cs <balajikumaran.c.s@gmail.com>
---
 .../roles/image_creation/vars/main.yml        |  5 +-
 .../roles/prepare_arm_node/tasks/main.yml     | 58 ++++++++------
 .../roles/prepare_arm_node/vars/main.yml      | 10 ++-
 build_image_x86_64/build_image_x86_64.yml     |  4 +-
 .../image_creation/tasks/build_image_tag.yml  | 28 -------
 .../tasks/prepare_pulp_image.yml              | 79 +++++++++++++++++++
 .../roles/image_creation/vars/main.yml        | 10 ++-
 .../roles/nfs_client/tasks/nfs_client.yml     |  5 --
 discovery/roles/nfs_client/vars/main.yml      |  7 --
 discovery/roles/telemetry/tasks/main.yml      |  4 +
 .../telemetry/tasks/telemetry_prereq.yml      | 27 ++++---
 .../tasks/update_ldms_agg_config.yml          |  5 --
 discovery/roles/telemetry/vars/main.yml       | 14 ++--
 .../x86_64/rhel/10.0/default_packages.json    |  3 +-
 prepare_oim/prepare_oim.yml                   | 10 +++
 .../common/tasks/aarch64_prereq.yml           | 26 ++++++
 .../deploy_containers/common/tasks/main.yml   |  2 +-
 .../common/tasks/package_installation.yml     | 29 +++++++
 .../common/tasks/prepare_oim_completion.yml   | 20 ++++-
 .../deploy_containers/common/vars/main.yml    | 28 ++++++-
 20 files changed, 272 insertions(+), 102 deletions(-)
 delete mode 100644 build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml
 create mode 100644 build_image_x86_64/roles/image_creation/tasks/prepare_pulp_image.yml
 create mode 100644 prepare_oim/roles/deploy_containers/common/tasks/aarch64_prereq.yml
 create mode 100644 prepare_oim/roles/deploy_containers/common/tasks/package_installation.yml

diff --git a/build_image_aarch64/roles/image_creation/vars/main.yml b/build_image_aarch64/roles/image_creation/vars/main.yml
index 67d11422ef..984f2497d8 100644
--- a/build_image_aarch64/roles/image_creation/vars/main.yml
+++ b/build_image_aarch64/roles/image_creation/vars/main.yml
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@ input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}"
 omnia_metadata_file: "/opt/omnia/.data/oim_metadata.yml"
 dir_permissions_644: "0644"
 dir_permissions_755: "0755"
+aarch64_local_tag: "aarch64-image-builder/ochami"
 openchami_dir: "/opt/omnia/openchami"
 openchami_clone_path: /opt/omnia/openchami/deployment-recipes
 job_retry: "120"
@@ -32,7 +33,7 @@ ochami_compute_mounts:
   - -v {{ openchami_work_dir }}/images/rhel-{{ item.key }}-{{ rhel_tag }}.yaml:/home/builder/config.yaml:z
 ochami_aarch64_image:
   - --entrypoint /bin/bash
-  - localhost/arm-image/ochami
+  - "localhost/{{ aarch64_local_tag }}"
 ochami_base_command:
   - -c 'update-ca-trust extract && image-build --config /home/builder/config.yaml --log-level DEBUG'
 
diff --git a/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml b/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml
index 1801448611..4a9d150850 100644
--- a/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml
+++ b/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml
@@ -167,32 +167,42 @@
 
 - name: Build full Podman image path
   ansible.builtin.set_fact:
-    pulp_aarch_image: "{{ hostvars['localhost']['oim_pxe_ip'] }}:2225/dellhpcomniaaisolution/image-build-aarch64:1.1"
-
-- name: Pull aarch64 image using Podman
-  ansible.builtin.command:
-    cmd: "podman pull {{ pulp_aarch_image }}"
-  register: podman_pull_result
-  ignore_errors: true
-  changed_when: false
+    pulp_aarch_image: "{{ hostvars['localhost']['oim_pxe_ip'] }}:2225/{{ pulp_aarch64_image_name }}"
+
+- name: Pull and tag aarch64 image
+  block:
+    - name: Pull aarch64 image using Podman
+      containers.podman.podman_image:
+        name: "{{ pulp_aarch_image }}"
+        state: present
+      register: podman_pull_result
+      retries: "{{ pull_image_retries }}"
+      delay: "{{ pull_image_delay }}"
+      until: podman_pull_result is not failed
+      changed_when: false
+
+    - name: Tag pulled image
+      containers.podman.podman_tag:
+        image: "{{ pulp_aarch_image }}"
+        target_names:
+          - "{{ aarch64_local_tag }}"
+      changed_when: false
+
+  rescue:
+    - name: Fail if Podman pull failed
+      ansible.builtin.fail:
+        msg: "Failed to pull image {{ pulp_aarch_image }}"
+
+- name: Check if regctl binary exists
+  ansible.builtin.stat:
+    path: "{{ ochami_aarch_64_dir }}/regctl"
+  register: regctl_stat
+  delegate_to: localhost
 
-- name: Fail if Podman pull failed
+- name: Fail if regctl binary not found
   ansible.builtin.fail:
-    msg: "{{ aarch64_image_fail_msg }}"
-  when: podman_pull_result.rc != 0
-
-- name: Tag pulled image
-  ansible.builtin.command:
-    cmd: "podman tag {{ pulp_aarch_image }} arm-image/ochami"
-  when: podman_pull_result.rc == 0
-  changed_when: false
-
-- name: Download regctl binary to NFS shared path
-  ansible.builtin.get_url:
-    url: "{{ aarch64_regctl_url }}"
-    dest: "{{ ochami_aarch_64_dir }}/regctl"
-    mode: "{{ hostvars['localhost']['dir_permissions_755'] }}"
-  delegate_to: localhost
+    msg: "{{ regctl_not_found_msg }}"
+  when: not regctl_stat.stat.exists
 
 - name: Copy regctl binary to /usr/local/bin on target host
   ansible.builtin.copy:
diff --git a/build_image_aarch64/roles/prepare_arm_node/vars/main.yml b/build_image_aarch64/roles/prepare_arm_node/vars/main.yml
index d240f27de4..c0ce2868aa 100644
--- a/build_image_aarch64/roles/prepare_arm_node/vars/main.yml
+++ b/build_image_aarch64/roles/prepare_arm_node/vars/main.yml
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,10 +15,13 @@
 
 # input files
 input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}"
+pulp_aarch64_image_name: "dellhpcomniaaisolution/image-build-aarch64:1.1"
+aarch64_local_tag: "aarch64-image-builder/ochami"
+pull_image_retries: "3"
+pull_image_delay: "10"
 network_spec: "{{ input_project_dir }}/network_spec.yml"
 ochami_aarch_64_dir: "/opt/omnia/openchami/aarch64"
 pulp_repo_store_path: "{{ ochami_aarch_64_dir }}/pulp.repo"
-aarch64_regctl_url: "https://github.com/regclient/regclient/releases/latest/download/regctl-linux-arm64"
 pulp_repo_file_path: "/etc/yum.repos.d/pulp.repo"
 pulp_webserver_cert_path: "/opt/omnia/pulp/settings/certs/pulp_webserver.crt"
 anchors_path: "/etc/pki/ca-trust/source/anchors/pulp_webserver.crt"
@@ -39,3 +42,6 @@ aarch64_image_fail_msg: >
   Unable to pull the Ochami aarch64 image builder image.
   Make sure you have added the default package for aarch64 in the software_config.json file and ran local_repo.yml.
   If not, add that package and rerun local_repo.yml.
+regctl_not_found_msg: >
+  regctl binary not found at {{ ochami_aarch_64_dir }}/regctl.
+  Please run prepare_oim.yml playbook to download the regctl binary.
diff --git a/build_image_x86_64/build_image_x86_64.yml b/build_image_x86_64/build_image_x86_64.yml
index 85ecaf93cd..676d8adbd6 100644
--- a/build_image_x86_64/build_image_x86_64.yml
+++ b/build_image_x86_64/build_image_x86_64.yml
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -80,7 +80,7 @@
     - name: Tag OpenCHAMI image
       ansible.builtin.include_role:
         name: image_creation
-        tasks_from: build_image_tag.yml
+        tasks_from: prepare_pulp_image.yml
 
 - name: OpenCHAMI build image for x86_64
   hosts: localhost
diff --git a/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml b/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml
deleted file mode 100644
index 0b7a56072d..0000000000
--- a/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
----
-
-- name: Pull image-build image
-  ansible.builtin.command:
-    cmd: "podman pull {{ image_build_el10 }}"
-  register: pull_result
-  retries: "{{ pull_image_retries }}"
-  delay: "{{ pull_image_delay }}"
-  until: pull_result.rc == 0
-  changed_when: "'Image is up to date' not in pull_result.stdout"
-
-- name: Fail if image not pulled successfully
-  ansible.builtin.fail:
-    msg: "{{ pull_result.stdout }}"
-  when: pull_result.rc != 0
diff --git a/build_image_x86_64/roles/image_creation/tasks/prepare_pulp_image.yml b/build_image_x86_64/roles/image_creation/tasks/prepare_pulp_image.yml
new file mode 100644
index 0000000000..22f336b849
--- /dev/null
+++ b/build_image_x86_64/roles/image_creation/tasks/prepare_pulp_image.yml
@@ -0,0 +1,79 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+# Load network specification
+- name: Load network spec file
+  ansible.builtin.include_vars:
+    file: "{{ network_spec }}"
+  register: include_network_spec
+  no_log: true
+
+- name: Fail if network spec cannot be loaded
+  ansible.builtin.fail:
+    msg: "{{ network_spec_syntax_fail_msg }} Error: {{ include_network_spec.message }}"
+  when: include_network_spec is failed
+
+# Parse network spec data
+- name: Parse network spec
+  ansible.builtin.set_fact:
+    network_data: "{{ network_data | default({}) | combine({item.key: item.value}) }}"
+  with_dict: "{{ Networks }}"
+
+# Set PXE IP fact
+- name: Set PXE IP fact
+  ansible.builtin.set_fact:
+    oim_pxe_ip: "{{ network_data.admin_network.primary_oim_admin_ip }}"
+    cacheable: true
+
+# Copy pulp certificate and update CA trust
+- name: Copy pulp webserver certificate to anchors
+  ansible.builtin.copy:
+    src: "{{ pulp_webserver_cert_path }}"
+    dest: "{{ anchors_path }}"
+    mode: "{{ dir_permissions_644 }}"
+  become: true
+
+- name: Update CA trust
+  ansible.builtin.command: update-ca-trust
+  register: update_ca
+  changed_when: false
+
+- name: Build full Podman image path for x86_64
+  ansible.builtin.set_fact:
+    pulp_x86_image: "{{ oim_pxe_ip }}:2225/{{ pulp_x86_64_image_name }}"
+
+- name: Pull and tag x86_64 image
+  block:
+    - name: Pull x86_64 image using Podman
+      containers.podman.podman_image:
+        name: "{{ pulp_x86_image }}"
+        state: present
+      register: pull_result
+      retries: "{{ pull_image_retries }}"
+      delay: "{{ pull_image_delay }}"
+      until: pull_result is not failed
+      changed_when: false
+
+    - name: Tag pulled image for x86_64 build
+      containers.podman.podman_tag:
+        image: "{{ pulp_x86_image }}"
+        target_names:
+          - "{{ x86_64_local_tag }}"
+      changed_when: false
+
+  rescue:
+    - name: Fail if Podman pull failed
+      ansible.builtin.fail:
+        msg: "Failed to pull image {{ pulp_x86_image }}."
diff --git a/build_image_x86_64/roles/image_creation/vars/main.yml b/build_image_x86_64/roles/image_creation/vars/main.yml
index a05a39d37d..60dcf0bc6f 100644
--- a/build_image_x86_64/roles/image_creation/vars/main.yml
+++ b/build_image_x86_64/roles/image_creation/vars/main.yml
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 ---
-image_build_el10: "docker.io/dellhpcomniaaisolution/image-build-el10:1.0"
+pulp_x86_64_image_name: "dellhpcomniaaisolution/image-build-el10:1.0"
+x86_64_local_tag: "x86_64-image-builder/ochami"
 pull_image_retries: "3"
 pull_image_delay: "10"
 input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}"
@@ -23,6 +24,9 @@ openchami_dir: "/opt/omnia/openchami"
 openchami_clone_path: /opt/omnia/openchami/deployment-recipes
 job_retry: "120"
 job_delay: "30"
+network_spec: "{{ input_project_dir }}/network_spec.yml"
+pulp_webserver_cert_path: "/opt/omnia/pulp/settings/certs/pulp_webserver.crt"
+anchors_path: "/etc/pki/ca-trust/source/anchors/pulp_webserver.crt"
 openchami_work_dir: "{{ oim_shared_path }}/omnia/openchami/workdir"
 ochami_mounts:
   - --user 0 --privileged
@@ -35,7 +39,7 @@ ochami_compute_mounts:
 
 ochami_x86_64_image:
   - --entrypoint /bin/bash
-  - docker.io/dellhpcomniaaisolution/image-build-el10:1.0
+  - "localhost/{{ x86_64_local_tag }}"
 ochami_base_command:
   - -c 'update-ca-trust extract && image-build --config /home/builder/config.yaml --log-level DEBUG'
 
@@ -54,3 +58,5 @@ compute_image_failure_msg: |
 # build_compute_image.yml
 openchami_compute_image_vars_template: "{{ role_path }}/templates/compute_images_templates.j2"
 openchami_compute_image_vars_path: "/opt/omnia/openchami/compute_images_template.yaml"
+
+network_spec_syntax_fail_msg: "Failed to load network_spec.yml due to syntax error"
diff --git a/discovery/roles/nfs_client/tasks/nfs_client.yml b/discovery/roles/nfs_client/tasks/nfs_client.yml
index 079933c26b..ca8a3c7660 100644
--- a/discovery/roles/nfs_client/tasks/nfs_client.yml
+++ b/discovery/roles/nfs_client/tasks/nfs_client.yml
@@ -32,11 +32,6 @@
     nfs_server_ip: "{{ hostvars['127.0.0.1']['admin_nic_ip'] }}"
   when: item.server_ip == "localhost"
 
-- name: Package installation for NFS
-  ansible.builtin.package:
-    name: "{{ nfs_packages[ansible_os_family] }}"
-    state: present
-
 - name: Mount facts items to dict
   ansible.builtin.set_fact:
     nfs_src: "{{ nfs_server_ip }}:{{ item.server_share_path }}"
diff --git a/discovery/roles/nfs_client/vars/main.yml b/discovery/roles/nfs_client/vars/main.yml
index b5e01fd82a..a3c20c054c 100644
--- a/discovery/roles/nfs_client/vars/main.yml
+++ b/discovery/roles/nfs_client/vars/main.yml
@@ -20,13 +20,6 @@ software_config_file: "{{ hostvars['localhost']['input_project_dir'] }}/software
 # Usage: nfs_client.yml
 mounted_dir_perm: "0755"
 default_client_mount_options: "nosuid,rw,sync,hard,intr"
-nfs_packages:
-  RedHat:
-    - nfs-utils
-    - nfs4-acl-tools
-  Debian:
-    - nfs-common
-    - nfs4-acl-tools
 slurm_nfs_fail_msg: "Failed to mount NFS share. Please check if the NFS server is reachable or NFS is configured properly."
 
 omnia_config_vars: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml"
diff --git a/discovery/roles/telemetry/tasks/main.yml b/discovery/roles/telemetry/tasks/main.yml
index c5a3dbefba..825c3988d7 100644
--- a/discovery/roles/telemetry/tasks/main.yml
+++ b/discovery/roles/telemetry/tasks/main.yml
@@ -28,6 +28,10 @@
   when:
     - hostvars['localhost']['idrac_telemetry_support'] or hostvars['localhost']['ldms_support']
   block:
+    - name: Set NFS info fact
+      ansible.builtin.set_fact:
+        oim_shared_path: "{{ hostvars['localhost']['oim_shared_path'] }}"
+
     - name: Service cluster prerequisite
       ansible.builtin.include_tasks: telemetry_prereq.yml
 
diff --git a/discovery/roles/telemetry/tasks/telemetry_prereq.yml b/discovery/roles/telemetry/tasks/telemetry_prereq.yml
index d720c57822..7eb45a89ab 100644
--- a/discovery/roles/telemetry/tasks/telemetry_prereq.yml
+++ b/discovery/roles/telemetry/tasks/telemetry_prereq.yml
@@ -47,23 +47,24 @@
     state: directory
     mode: "{{ hostvars['localhost']['dir_permissions_755'] }}"
 
-- name: Git clone for iDRAC Telemetry script
+- name: Ensure iDRAC Telemetry scripting destination exists
+  ansible.builtin.file:
+    path: "{{ idrac_telemetry_scripting_git_clone_path }}"
+    state: directory
+    mode: "{{ hostvars['localhost']['dir_permissions_755'] }}"
+
+- name: Copy iDRAC Telemetry Scripting to NFS share
   block:
-    - name: Checkout iDRAC Telemetry GitHub repo
-      ansible.builtin.git:
-        repo: "{{ idrac_telemetry_scripting_repo }}"
+    - name: Copy pre-cloned iDRAC Telemetry Scripting directory
+      ansible.builtin.copy:
+        src: "{{ idrac_telemetry_scripting_src_path }}/"
         dest: "{{ idrac_telemetry_scripting_git_clone_path }}"
-        version: "{{ idrac_telemetry_scripting_stable_commit }}"
-        update: false
-      register: clone_idrac_script
-      until: clone_idrac_script is succeeded
-      retries: "{{ max_retries }}"
-      delay: "{{ delay_count }}"
+        remote_src: true
+        mode: preserve
   rescue:
-    - name: Fail if iDRAC telemetry Git clone fails
+    - name: Fail if iDRAC telemetry copy fails
       ansible.builtin.fail:
-        msg: "{{ idrac_script_git_clone_error_msg.splitlines() | join(' ') }}"
-      when: clone_idrac_script is failed
+        msg: "{{ idrac_telemetry_scripting_copy_fail_msg.splitlines() | join(' ') }}"
 
 - name: Set kafka_support to true
   ansible.builtin.set_fact:
diff --git a/discovery/roles/telemetry/tasks/update_ldms_agg_config.yml b/discovery/roles/telemetry/tasks/update_ldms_agg_config.yml
index db4d4b1d3f..ee6c0c7d75 100644
--- a/discovery/roles/telemetry/tasks/update_ldms_agg_config.yml
+++ b/discovery/roles/telemetry/tasks/update_ldms_agg_config.yml
@@ -13,11 +13,6 @@
 #  limitations under the License.
 ---
 
-- name: Install make
-  ansible.builtin.package:
-    name: make
-    state: present
-
 - name: Verify values.yaml exists
   ansible.builtin.stat:
     path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/values.yaml"
diff --git a/discovery/roles/telemetry/vars/main.yml b/discovery/roles/telemetry/vars/main.yml
index 473fd74e19..5c5838ce29 100644
--- a/discovery/roles/telemetry/vars/main.yml
+++ b/discovery/roles/telemetry/vars/main.yml
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -32,14 +32,12 @@ telemetry_namespace: "telemetry"
 idrac_telemetry_k8s_name: idrac-telemetry
 
 # iDRAC Telemetry scripting repository
-idrac_telemetry_scripting_repo: "https://github.com/dell/iDRAC-Telemetry-Scripting.git"
-idrac_telemetry_scripting_stable_commit: "f6999f5"
+idrac_telemetry_scripting_src_path: "{{ oim_shared_path }}/omnia/telemetry/iDRAC-Telemetry-Scripting"
 idrac_telemetry_scripting_git_clone_path: "{{ service_cluster_idrac_telemetry_dir_path }}/iDRAC-Telemetry-Scripting"
-idrac_script_git_clone_error_msg: |
-  Failed to clone iDRAC Telemetry GitHub repository from {{ idrac_telemetry_scripting_repo }}
-  to {{ idrac_telemetry_scripting_git_clone_path }} directory in NFS share.
-max_retries: 10
-delay_count: 5
+idrac_telemetry_scripting_copy_fail_msg: |
+  Failed to copy iDRAC Telemetry Scripting from {{ idrac_telemetry_scripting_src_path }}
+  to {{ idrac_telemetry_scripting_git_clone_path }}. Please ensure prepare_oim.yml has been
+  executed successfully before running discovery.
 
 # Pre-built container images for iDRAC telemetry components
 # These default to your published images but can be overridden via telemetry_images
diff --git a/input/config/x86_64/rhel/10.0/default_packages.json b/input/config/x86_64/rhel/10.0/default_packages.json
index 813f9ad993..6002894568 100644
--- a/input/config/x86_64/rhel/10.0/default_packages.json
+++ b/input/config/x86_64/rhel/10.0/default_packages.json
@@ -34,7 +34,8 @@
       {"package": "wget", "type": "rpm", "repo_name": "x86_64_appstream"},
       {"package": "cloud-init", "type": "rpm", "repo_name": "x86_64_appstream"},
       {"package": "glibc-langpack-en", "type": "rpm", "repo_name": "x86_64_baseos"},
-      {"package": "gedit", "type": "rpm", "repo_name": "epel"}
+      {"package": "gedit", "type": "rpm", "repo_name": "epel"},
+      {"package": "docker.io/dellhpcomniaaisolution/image-build-el10", "tag": "1.0", "type": "image" }
     ]
   }
 }
diff --git a/prepare_oim/prepare_oim.yml b/prepare_oim/prepare_oim.yml
index a78d21e8d9..50c48fd3e5 100644
--- a/prepare_oim/prepare_oim.yml
+++ b/prepare_oim/prepare_oim.yml
@@ -63,6 +63,11 @@
         name: deploy_containers/common
         tasks_from: add_known_hosts.yml
 
+    - name: Download aarch64 prerequisites  # noqa:role-name[path]
+      ansible.builtin.include_role:
+        name: deploy_containers/common
+        tasks_from: aarch64_prereq.yml
+
 - name: OpenLDAP Pre_req generate ssha password
   hosts: localhost
   connection: local
@@ -156,6 +161,11 @@
         name: deploy_containers/common
         tasks_from: omnia_service.yml
 
+    - name: Install required packages  # noqa:role-name[path]
+      ansible.builtin.include_role:
+        name: deploy_containers/common
+        tasks_from: package_installation.yml
+
 - name: Prepare oim completion
   hosts: localhost
   connection: local
diff --git a/prepare_oim/roles/deploy_containers/common/tasks/aarch64_prereq.yml b/prepare_oim/roles/deploy_containers/common/tasks/aarch64_prereq.yml
new file mode 100644
index 0000000000..f5eae768bb
--- /dev/null
+++ b/prepare_oim/roles/deploy_containers/common/tasks/aarch64_prereq.yml
@@ -0,0 +1,26 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Create openchami aarch64 directory if not exists
+  ansible.builtin.file:
+    path: "{{ ochami_aarch64_dir }}"
+    state: directory
+    mode: "{{ dir_permissions_755 }}"
+
+- name: Download regctl binary (aarch64)
+  ansible.builtin.get_url:
+    url: "{{ regctl_aarch64_url }}"
+    dest: "{{ ochami_aarch64_dir }}/regctl"
+    mode: "{{ dir_permissions_755 }}"
diff --git a/prepare_oim/roles/deploy_containers/common/tasks/main.yml b/prepare_oim/roles/deploy_containers/common/tasks/main.yml
index 78c28e98ba..00287c628c 100644
--- a/prepare_oim/roles/deploy_containers/common/tasks/main.yml
+++ b/prepare_oim/roles/deploy_containers/common/tasks/main.yml
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
diff --git a/prepare_oim/roles/deploy_containers/common/tasks/package_installation.yml b/prepare_oim/roles/deploy_containers/common/tasks/package_installation.yml
new file mode 100644
index 0000000000..1d84877307
--- /dev/null
+++ b/prepare_oim/roles/deploy_containers/common/tasks/package_installation.yml
@@ -0,0 +1,29 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+- name: Install required packages
+  block:
+    - name: Install required packages
+      ansible.builtin.package:
+        name: "{{ item }}"
+        state: present
+      loop: "{{ oim_packages }}"
+      register: oim_pkg_result
+  rescue:
+    - name: Fail if required package installation fails
+      ansible.builtin.fail:
+        msg: >-
+          {{ prepare_oim_pkg_fail_msg.splitlines() | join(' ') }}
+          Failed package(s): {{ oim_pkg_result.results | selectattr('failed', 'defined') | selectattr('failed') | map(attribute='item') | list | join(', ') }}
+          Error: {{ (oim_pkg_result.results | selectattr('failed', 'defined') | selectattr('failed') | map(attribute='msg') | list | first) | default('') }}
diff --git a/prepare_oim/roles/deploy_containers/common/tasks/prepare_oim_completion.yml b/prepare_oim/roles/deploy_containers/common/tasks/prepare_oim_completion.yml
index 7c86cfaf6b..52e4009219 100644
--- a/prepare_oim/roles/deploy_containers/common/tasks/prepare_oim_completion.yml
+++ b/prepare_oim/roles/deploy_containers/common/tasks/prepare_oim_completion.yml
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -32,6 +32,24 @@
     mode: "{{ file_permissions }}"
   when: not bmc_group_data_status.stat.exists
 
+- name: Clone iDRAC Telemetry Scripting repository
+  block:
+    - name: Checkout iDRAC Telemetry GitHub repo
+      ansible.builtin.git:
+        repo: "{{ idrac_telemetry_scripting_repo }}"
+        dest: "{{ idrac_telemetry_scripting_clone_dest }}"
+        version: "{{ idrac_telemetry_scripting_stable_commit }}"
+        update: false
+      register: clone_idrac_script
+      until: clone_idrac_script is succeeded
+      retries: "{{ max_retries }}"
+      delay: "{{ delay_count }}"
+  rescue:
+    - name: Fail if iDRAC telemetry Git clone fails
+      ansible.builtin.fail:
+        msg: "{{ idrac_script_git_clone_fail_msg.splitlines() | join(' ') }}"
+      when: clone_idrac_script is failed
+
 - name: Prepare oim completion
   ansible.builtin.debug:
     msg: "{{ prepare_oim_completion_msg.splitlines() | join(' ') }}"
diff --git a/prepare_oim/roles/deploy_containers/common/vars/main.yml b/prepare_oim/roles/deploy_containers/common/vars/main.yml
index 30bb7b8125..855e7350b1 100644
--- a/prepare_oim/roles/deploy_containers/common/vars/main.yml
+++ b/prepare_oim/roles/deploy_containers/common/vars/main.yml
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -28,12 +28,34 @@ internal_nfs_services:
 
 ntp_firewall_service: ntp
 
+# Packages required on OIM
+oim_packages:
+  - nfs-utils
+  - nfs4-acl-tools
+  - git
+  - make
+prepare_oim_pkg_fail_msg: |
+  Failed to install required packages. Please ensure the repository is
+  configured on OIM and rerun the playbook.
+
 # Usage: prepare_oim_completion.yml
 telemetry_dir: "/opt/omnia/telemetry"
 dir_permissions_755: "0755"
 bmc_group_data_filename: "{{ telemetry_dir }}/bmc_group_data.csv"
 bmc_group_data_template: "bmc_group_data.j2"
 file_permissions: "0644"
+idrac_telemetry_scripting_repo: "https://github.com/dell/iDRAC-Telemetry-Scripting.git"
+idrac_telemetry_scripting_stable_commit: "f6999f5"
+idrac_telemetry_scripting_clone_dest: "{{ telemetry_dir }}/iDRAC-Telemetry-Scripting"
+max_retries: 10
+delay_count: 5
+git_install_timeout: 300
+git_install_fail_msg: |
+  Failed to install git. Please ensure the OS repository is configured on OIM.
+  Configure the repository and rerun the playbook.
+idrac_script_git_clone_fail_msg: |
+  Failed to clone iDRAC Telemetry GitHub repository from {{ idrac_telemetry_scripting_repo }}
+  to {{ idrac_telemetry_scripting_clone_dest }}. Please check network connectivity and rerun the playbook.
 prepare_oim_completion_msg: |
   The playbook prepare_oim.yml has completed successfully. To create the offline repositories and
   registry for the cluster nodes, please execute the playbook local_repo/local_repo.yml as the next step.
@@ -58,3 +80,7 @@ network_services:
 # Usage: configure_chrony.yml
 chrony_conf_path: "/etc/chrony.conf"
 chrony_no_sources_msg: "No chrony sources are reachable. Please give a valid NTP server configuration in network_spec.yml and re-run prepare_oim playbook."
+
+# Usage: aarch64_prereq.yml
+ochami_aarch64_dir: "/opt/omnia/openchami/aarch64"
+regctl_aarch64_url: "https://github.com/regclient/regclient/releases/latest/download/regctl-linux-arm64"

From 01dece90e8c421745419a1b81a46df85a3fa15eb Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Thu, 12 Feb 2026 19:24:06 +0530
Subject: [PATCH 149/172] Added flow if any munge key update, will be useful if
 munge key changes

---
 .../slurm_config/tasks/check_ctld_running.yml | 19 +----
 discovery/roles/slurm_config/tasks/confs.yml  |  2 +-
 .../slurm_config/tasks/create_slurm_dir.yml   | 19 ++++-
 .../tasks/read_slurm_hostnames.yml            |  1 +
 .../slurm_config/tasks/update_hosts_munge.yml | 84 +++++++++++++++++++
 discovery/roles/slurm_config/vars/main.yml    |  2 +-
 6 files changed, 106 insertions(+), 21 deletions(-)
 create mode 100644 discovery/roles/slurm_config/tasks/update_hosts_munge.yml

diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml
index 0c7626f3dd..5f2d41a904 100644
--- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml
+++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml
@@ -61,22 +61,11 @@
       ansible.builtin.set_fact:
         reachable_hosts: "{{ ip_map_ssh_check.results | rejectattr('failed', 'true') | map(attribute='host') | list }}"
 
-    - name: Update /etc/hosts with controller hostname and IP
-      ansible.builtin.lineinfile:
-        path: /etc/hosts
-        regexp: '^{{ host_entry.value }}\s+{{ host_entry.key }}'
-        line: "{{ host_entry.value }} {{ host_entry.key }}"
-        state: present
-      loop: "{{ reachable_hosts | product(ip_name_map | dict2items) | list }}"
+    - name: Update basics on reachable_hosts
+      ansible.builtin.include_tasks: update_hosts_munge.yml
+      loop: "{{ reachable_hosts }}"
       loop_control:
-        loop_var: host_combo
-      vars:
-        target_host: "{{ host_combo[0] }}"
-        host_entry: "{{ host_combo[1] }}"
-      delegate_to: "{{ target_host }}"
-      when: reachable_hosts | length > 0
-      ignore_unreachable: true
-      failed_when: false
+        loop_var: slurmhost_ip
 
     - name: Trigger the scontrol reconfigure
       ansible.builtin.command: scontrol reconfigure
diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml
index 12236d6ed8..799d4cd757 100644
--- a/discovery/roles/slurm_config/tasks/confs.yml
+++ b/discovery/roles/slurm_config/tasks/confs.yml
@@ -172,7 +172,7 @@
   ansible.builtin.copy:
     content: "{{ item.ini_lines | join('\n') }}\n"
     dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/{{ item.item.key }}.conf"
-    mode: "{{ conf_file_mode }}"
+    mode: "0640"
     owner: "{{ slurm_user }}"
     group: "{{ slurm_user_group }}"
     remote_src: "{{ copy_from_oim }}"
diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
index 81a08adfca..45e37ac243 100644
--- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
+++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
@@ -84,11 +84,21 @@
     share_prefix: "{{ slurm_config_path }}"
   when: conf_in_nfs
 
-- name: Clear the share directory
+- name: Clear Slurm-related files and directories
   ansible.builtin.file:
-    path: "{{ slurm_config_path }}"
+    path: "{{ slurm_config_path }}/{{ slurm_item }}"
     state: absent
-  when: clear_slurm_files
+  loop: "{{ (ctld_list | default([])
+   + cmpt_list | default([])
+   + login_list | default([])
+   + compiler_login_list | default([])
+   + dbd_list | default([])
+   + ['munge.key']) | flatten }}"
+  loop_control:
+    loop_var: slurm_item
+  failed_when: false
+  when:
+    - clear_slurm_files
 
 - name: Create the slurm directory in share
   ansible.builtin.file:
@@ -151,8 +161,9 @@
   ansible.builtin.copy:
     src: "{{ slurm_config_path }}/munge.key"
     dest: "{{ slurm_config_path }}/{{ item }}/etc/munge/munge.key"
-    mode: "{{ common_mode }}"
+    mode: "0600"
     remote_src: true
+  register: munge_key_copy
   loop: "{{ (ctld_list | default([])) +
             (cmpt_list | default([])) +
             (compiler_login_list | default([])) +
diff --git a/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml b/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml
index df19821983..0f7b3a16b2 100644
--- a/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml
+++ b/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml
@@ -46,6 +46,7 @@
 - name: Get bmc_ip
   ansible.builtin.set_fact:
     bmc_ip_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='bmc_ip') }}"
+    name_ip_map: "{{ dict(ip_name_map.values() | zip(ip_name_map.keys())) }}"
 
 - name: Assign slurm lists
   ansible.builtin.set_fact:
diff --git a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml
new file mode 100644
index 0000000000..ecaaad2beb
--- /dev/null
+++ b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml
@@ -0,0 +1,84 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+- name: Update /etc/hosts with controller hostname and IP
+  ansible.builtin.lineinfile:
+    path: /etc/hosts
+    regexp: '^{{ host_entry.value }}\s+{{ host_entry.key }}'
+    line: "{{ host_entry.value }} {{ host_entry.key }}"
+    state: present
+  loop: "{{ ip_name_map | dict2items | list }}"
+  loop_control:
+    loop_var: host_entry
+  ignore_unreachable: true
+  failed_when: false
+  delegate_to: "{{ slurmhost_ip }}"
+
+- name: Get munge changes
+  ansible.builtin.set_fact:
+    munge_key_changed: "{{ munge_key_copy.results | default([]) | rekey_on_member('item') }}"
+  when: munge_key_copy is defined
+
+- name: Block when munge key changed
+  when:
+    - munge_key_changed is defined
+    - munge_key_changed[name_ip_map[slurmhost_ip]]['changed'] | default(false)
+    - restart_slurm_services
+  delegate_to: "{{ slurmhost_ip }}"
+  ignore_errors: true
+  ignore_unreachable: true
+  block:
+    - name: Update munge key permissions
+      ansible.builtin.file:
+        path: /etc/munge/munge.key
+        owner: munge
+        group: munge
+        mode: '0600'
+      register: munge_key_permissions_result
+
+    - name: Restart munge service if key changed
+      ansible.builtin.service:
+        name: munge
+        state: restarted
+      register: munge_restart_result
+      when:
+        - munge_key_permissions_result is defined
+        - munge_key_permissions_result is success
+
+    - name: Restart slurmctld if munge restarted
+      ansible.builtin.service:
+        name: slurmctld
+        state: restarted
+      when:
+        - name_ip_map[slurmhost_ip] in ctld_list
+        - munge_restart_result is defined
+        - munge_restart_result is success
+
+    - name: Restart slurmd if munge restarted
+      ansible.builtin.service:
+        name: slurmd
+        state: restarted
+      when:
+        - name_ip_map[slurmhost_ip] in ((cmpt_list | default([])) + (login_list | default([])) + (compiler_login_list | default([])))
+        - munge_restart_result is defined
+        - munge_restart_result is success
+
+    - name: Restart slurmdbd if munge restarted
+      ansible.builtin.service:
+        name: slurmdbd
+        state: restarted
+      when:
+        - name_ip_map[slurmhost_ip] in dbd_list
+        - munge_restart_result is defined
+        - munge_restart_result is success
diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml
index 43ee995e5a..93aa0d2786 100644
--- a/discovery/roles/slurm_config/vars/main.yml
+++ b/discovery/roles/slurm_config/vars/main.yml
@@ -79,7 +79,7 @@ cluster_name: cluster # TODO: direct load vars omnia_config.yml
 slurm_uid: 6001
 slurm_user: slurm
 slurm_user_group: slurm
-restart_slurm_services: "{{ hostvars['localhost']['restart_slurm_services'] }}"
+restart_slurm_services: "{{ hostvars['localhost']['restart_slurm_services'] | default(true) }}"
 slurm_db_username: "{{ hostvars['localhost']['slurm_db_username'] | default('dbuser') }}"
 slurm_db_password: "{{ hostvars['localhost']['slurm_db_password'] }}"
 slurm_db_host: "{{ hostvars['localhost']['slurm_db_host'] | default(false) }}"

From 19a000cb663e94ed23a2e15c866c67b2bf4b7d26 Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Thu, 12 Feb 2026 19:44:38 +0530
Subject: [PATCH 150/172] lint issue fix

---
 discovery/roles/slurm_config/tasks/update_hosts_munge.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml
index ecaaad2beb..a326fa820d 100644
--- a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml
+++ b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml
@@ -36,7 +36,6 @@
     - munge_key_changed[name_ip_map[slurmhost_ip]]['changed'] | default(false)
     - restart_slurm_services
   delegate_to: "{{ slurmhost_ip }}"
-  ignore_errors: true
   ignore_unreachable: true
   block:
     - name: Update munge key permissions
@@ -82,3 +81,7 @@
         - name_ip_map[slurmhost_ip] in dbd_list
         - munge_restart_result is defined
         - munge_restart_result is success
+  rescue:
+    - name: Handle munge restart failure
+      ansible.builtin.debug:
+        msg: "Failed task {{ ansible_failed_task.name }} on {{ slurmhost_ip }}"

From 471d4e781435703aa2dba6d55e41139ca9a8ede7 Mon Sep 17 00:00:00 2001
From: Katakam Rakesh Naga Sai
 <125246792+Katakam-Rakesh@users.noreply.github.com>
Date: Thu, 12 Feb 2026 20:12:46 +0530
Subject: [PATCH 151/172] Update main.yml for copyright

Signed-off-by: Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com>
---
 discovery/roles/k8s_config/vars/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/discovery/roles/k8s_config/vars/main.yml b/discovery/roles/k8s_config/vars/main.yml
index a80fb9b257..601cc07097 100644
--- a/discovery/roles/k8s_config/vars/main.yml
+++ b/discovery/roles/k8s_config/vars/main.yml
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From 94a244fe9534c5feb3d950116c19e8f9b701aee9 Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Thu, 12 Feb 2026 21:55:11 +0530
Subject: [PATCH 152/172] centralize oim_metadata.yml path and remove static
 backup_location variable

---
 .../import_input_parameters/tasks/set_backup_location.yml    | 2 +-
 upgrade/roles/import_input_parameters/vars/main.yml          | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml b/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml
index 4f6a96e83f..94156606e5 100644
--- a/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml
+++ b/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml
@@ -15,7 +15,7 @@
 
 - name: Read oim_metadata.yml to get upgrade_backup_dir
   ansible.builtin.slurp:
-    src: /opt/omnia/.data/oim_metadata.yml
+    src: "{{ oim_metadata_path }}"
   register: oim_metadata_slurp
 
 - name: Parse oim_metadata.yml
diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml
index 2bd20f0076..ebaa33e492 100644
--- a/upgrade/roles/import_input_parameters/vars/main.yml
+++ b/upgrade/roles/import_input_parameters/vars/main.yml
@@ -15,7 +15,10 @@
 
 # backup_location will be set from oim_metadata.yml upgrade_backup_dir
 # Format: /opt/omnia/backups/upgrade/version_2.0.0.0/input/project_default
-backup_location: ""
+# Set dynamically from metadata, no static variable needed
+
+# Path to oim_metadata.yml
+oim_metadata_path: "/opt/omnia/.data/oim_metadata.yml"
 
 backup_dir_mode: '0755'
 default_file_mode: '0644'

From b64916bd08990d83d4f5cf0cd6895604c20f7d14 Mon Sep 17 00:00:00 2001
From: SOWJANYAJAGADISH123 <Sowjanya.Jagadish@dell.com>
Date: Fri, 13 Feb 2026 10:02:03 +0530
Subject: [PATCH 153/172] Update omnia.sh

---
 omnia.sh | 77 ++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 56 insertions(+), 21 deletions(-)

diff --git a/omnia.sh b/omnia.sh
index 9c46a04dc9..81e2094ccc 100755
--- a/omnia.sh
+++ b/omnia.sh
@@ -766,7 +766,7 @@ Description=${container_name^} Container
 [Container]
 ContainerName=${container_name}
 HostName=${container_name}
-Image=${container_name}:1.1
+Image=${container_name}:2.1
 Network=host
 
 # Capabilities
@@ -1001,16 +1001,16 @@ install_omnia_core() {
         fi
     fi
 
-    local omnia_core_tag="1.1"
+    local omnia_core_tag="2.1"
     local omnia_core_registry=""
     
-    # Check if local omnia_core:1.1 exists
+    # Check if local omnia_core:2.1 exists
     if podman inspect omnia_core:${omnia_core_tag} >/dev/null 2>&1; then
         echo -e "${GREEN}✓ Omnia core image (omnia_core:${omnia_core_tag}) found locally.${NC}"
     # Check if latest exists for backward compatibility
     elif podman inspect omnia_core:latest >/dev/null 2>&1; then
         echo -e "${GREEN}✓ Omnia core image (omnia_core:latest) found locally.${NC}"
-        # Tag it as 1.1 for consistency
+        # Tag it as 2.1 for consistency
         podman tag omnia_core:latest omnia_core:${omnia_core_tag}
     else
         echo -e "${RED}ERROR: Omnia core image (omnia_core:${omnia_core_tag}) not found locally.${NC}"
@@ -1018,11 +1018,11 @@ install_omnia_core() {
         echo ""
         echo -e "${YELLOW}One way to build the image locally:${NC}"
         echo -e "1. Clone the Omnia Artifactory repository:"
-        echo -e "   git clone https://github.com/dell/omnia-artifactory -b omnia-container"
+        echo -e "   git clone https://github.com/dell/omnia-artifactory -b omnia-container-<omnia version>"
         echo -e "2. Navigate to the repository directory:"
         echo -e "   cd omnia-artifactory"
         echo -e "3. Build the core image locally (loads into local Podman by default):"
-        echo -e "   ./build_images.sh core omnia_branch=<version/branch_name>"
+        echo -e "   ./build_images.sh core core_tag=2.1 omnia_branch=<omnia version/branch_name>"
         echo ""
         echo -e "${YELLOW}Then re-run:${NC}"
         echo -e "   ./omnia.sh --install"
@@ -1200,6 +1200,7 @@ phase1_validate() {
 
     if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
         echo "[ERROR] [ORCHESTRATOR] Prerequisite failed: omnia_core container is not running"
+        display_cleanup_instructions
         return 1
     fi
 
@@ -1249,9 +1250,19 @@ phase1_validate() {
         return 1
     fi
 
-    if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then
-        echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1"
-        echo "[ERROR] [ORCHESTRATOR] Omnia does not pull from Docker Hub. Build/load the image locally and retry."
+    if ! podman inspect "omnia_core:2.1" >/dev/null 2>&1; then
+        echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:2.1"
+        echo ""
+        echo -e "${YELLOW}Omnia does not pull images from Docker Hub. Build/load the image locally and retry.${NC}"
+        echo ""
+        echo -e "${YELLOW}To build the core image locally:${NC}"
+        echo -e "1. Clone the Omnia Artifactory repository:"
+        echo -e "   git clone https://github.com/dell/omnia-artifactory -b omnia-container-<omnia version>"
+        echo -e "2. Navigate to the repository directory:"
+        echo -e "   cd omnia-artifactory"
+        echo -e "3. Build the core image locally (loads into local Podman by default):"
+        echo -e "   ./build_images.sh core core_tag=2.1 omnia_branch=<omnia version/branch_name>"
+        echo ""
         return 1
     fi
 
@@ -1267,7 +1278,7 @@ phase2_approval() {
     echo "OMNIA UPGRADE SUMMARY"
     echo "============================================"
     echo "Current Container Tag: 1.0"
-    echo "Target Container Tag:  1.1"
+    echo "Target Container Tag:  2.1"
     echo "Current Omnia Release: 2.0.0.0"
     echo "Target Omnia Release:  2.1.0.0"
     echo "New Features:"
@@ -1386,17 +1397,17 @@ phase4_container_swap() {
         return 1
     fi
 
-    echo "[INFO] [ORCHESTRATOR] Starting omnia_core 1.1 Quadlet unit"
-    if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then
-        echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1"
-        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 1.1 image not available"
+    echo "[INFO] [ORCHESTRATOR] Starting omnia_core 2.1 Quadlet unit"
+    if ! podman inspect "omnia_core:2.1" >/dev/null 2>&1; then
+        echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:2.1"
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 2.1 image not available"
         echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..."
         rollback_omnia_core
         return 1
     fi
 
-    if ! sed -i 's/^Image=omnia_core:.*/Image=omnia_core:1.1/' "$quadlet_file"; then
-        echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to 1.1 in quadlet file"
+    if ! sed -i 's/^Image=omnia_core:.*/Image=omnia_core:2.1/' "$quadlet_file"; then
+        echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to 2.1 in quadlet file"
         echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update container image tag"
         echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..."
         rollback_omnia_core
@@ -1413,13 +1424,13 @@ phase4_container_swap() {
 
     systemctl start omnia_core.service || {
         echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to start omnia_core.service"
-        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not start 1.1 container"
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not start 2.1 container"
         echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..."
         rollback_omnia_core
         return 1
     }
 
-    echo "[INFO] [ORCHESTRATOR] Waiting for omnia_core 1.1 health check (60s)"
+    echo "[INFO] [ORCHESTRATOR] Waiting for omnia_core 2.1 health check (60s)"
     for i in $(seq 1 60); do
         if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
             break
@@ -1429,7 +1440,7 @@ phase4_container_swap() {
 
     if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
         echo "[ERROR] [ORCHESTRATOR] Phase 4.4 failed: Container failed health check after swap"
-        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 1.1 container failed health check"
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 2.1 container failed health check"
         echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..."
         rollback_omnia_core
         return 1
@@ -1607,6 +1618,23 @@ restore_from_backup() {
     return 0
 }
 
+# Print manual cleanup steps; shared by upgrade pre-check and rollback failure paths (uses RED/YELLOW/NC colors)
+display_cleanup_instructions() {
+    echo ""
+    echo -e "${RED}================================================================================${NC}"
+    echo -e "${RED}                    UPGRADE/ROLLBACK FAILED${NC}"
+    echo -e "${RED}================================================================================${NC}"
+    echo ""
+    echo -e "${YELLOW}Upgrade or rollback failed. Manual cleanup is required to restore a clean state before retrying.${NC}"
+    echo ""
+    echo -e "${YELLOW}Run the following on the OIM host:${NC}"
+    echo -e "${YELLOW}1. Clean Omnia shared path: rm -rf <shared_path>${NC}"
+    echo -e "${YELLOW}2. Stop Omnia core system service: systemctl stop omnia_core${NC}"
+    echo -e "${YELLOW}3. Remove the Omnia core container: podman rm -f omnia_core${NC}"
+    echo -e "${YELLOW}4. Perform a fresh Omnia core install: ./omnia.sh --install${NC}"
+    echo ""
+}
+
 # Main rollback function
 rollback_omnia_core() {
     echo -e "${GREEN}================================================================================${NC}"
@@ -1695,11 +1723,12 @@ rollback_omnia_core() {
     echo ""
     echo "[INFO] [ROLLBACK] Starting rollback process..."
     
-    # Step 1: Stop 1.1 container gracefully
+    # Step 1: Stop 2.1 container gracefully
     echo ""
-    echo "[INFO] [ROLLBACK] Step 1: Stopping Omnia core 1.1 container..."
+    echo "[INFO] [ROLLBACK] Step 1: Stopping Omnia core 2.1 container..."
     if ! stop_container_gracefully "omnia_core" 30; then
         echo -e "${RED}ERROR: Failed to stop container.${NC}"
+        display_cleanup_instructions
         exit 1
     fi
     
@@ -1715,6 +1744,7 @@ rollback_omnia_core() {
             podman tag omnia_core:latest omnia_core:1.0
         else
             echo -e "${RED}ERROR: Omnia core 1.0 image not available. Please load the image first.${NC}"
+            display_cleanup_instructions
             exit 1
         fi
     fi
@@ -1725,6 +1755,7 @@ rollback_omnia_core() {
     systemctl daemon-reload
     if ! systemctl start omnia_core.service; then
         echo -e "${RED}ERROR: Failed to start container service.${NC}"
+        display_cleanup_instructions
         exit 1
     fi
     
@@ -1747,6 +1778,7 @@ rollback_omnia_core() {
     if [ $health_count -ge $health_timeout ]; then
         echo ""
         echo -e "${RED}ERROR: Container failed to become healthy within 60 seconds.${NC}"
+        display_cleanup_instructions
         exit 1
     fi
     
@@ -1755,6 +1787,7 @@ rollback_omnia_core() {
     echo "[INFO] [ROLLBACK] Step 5: Validating backup directory structure..."
     if ! validate_backup_directory "$selected_backup"; then
         echo -e "${RED}ERROR: Backup validation failed.${NC}"
+        display_cleanup_instructions
         exit 1
     fi
     
@@ -1763,6 +1796,7 @@ rollback_omnia_core() {
     echo "[INFO] [ROLLBACK] Step 6: Restoring files from backup..."
     if ! restore_from_backup "$selected_backup"; then
         echo -e "${RED}ERROR: Failed to restore from backup.${NC}"
+        display_cleanup_instructions
         exit 1
     fi
     
@@ -1773,6 +1807,7 @@ rollback_omnia_core() {
     
     if [ "$verify_version" != "$backup_version" ]; then
         echo -e "${RED}ERROR: Version verification failed. Expected: $backup_version, Found: $verify_version${NC}"
+        display_cleanup_instructions
         exit 1
     fi
     

From a39e26f82cbe954e492e6438a745dce13e042b1f Mon Sep 17 00:00:00 2001
From: Nagachandan-P <Nagachandan.p@dell.com>
Date: Fri, 13 Feb 2026 06:38:40 +0000
Subject: [PATCH 154/172] updating /etc/hosts entries

---
 .../discovery_validations/tasks/update_hosts.yml | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/discovery/roles/discovery_validations/tasks/update_hosts.yml b/discovery/roles/discovery_validations/tasks/update_hosts.yml
index 43e7d3fc63..85c9ecf611 100644
--- a/discovery/roles/discovery_validations/tasks/update_hosts.yml
+++ b/discovery/roles/discovery_validations/tasks/update_hosts.yml
@@ -13,16 +13,22 @@
 #  limitations under the License.
 ---
 
-- name: Add hosts file entry for cluster
+- name: Ensure 127.0.0.1 localhost entry exists
   ansible.builtin.shell: |
     set -o pipefail
-    grep -qxF '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' {{ hosts_file_path }} || \
-    echo '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' >> {{ hosts_file_path }}
+    grep -qxF '127.0.0.1 localhost.localdomain localhost' {{ hosts_file_path }} || echo '127.0.0.1 localhost.localdomain localhost' >> {{ hosts_file_path }}
+  changed_when: true
+
+- name: Remove stale entries for IPs that are being updated
+  ansible.builtin.shell: |
+    set -o pipefail
+    grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} > {{ hosts_file_path }}.tmp && cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }} && rm -f {{ hosts_file_path }}.tmp
   changed_when: true
   loop: "{{ read_mapping_file.dict | dict2items }}"
 
-- name: Ensure 127.0.0.1 localhost entry exists uniquely using echo
+- name: Add hosts file entry for cluster
   ansible.builtin.shell: |
     set -o pipefail
-    grep -qxF '127.0.0.1 localhost.localdomain localhost' {{ hosts_file_path }} || echo '127.0.0.1 localhost.localdomain localhost' >> {{ hosts_file_path }}
+    echo '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' >> {{ hosts_file_path }}
   changed_when: true
+  loop: "{{ read_mapping_file.dict | dict2items }}"

From 00fd2e2942b97d2610cb720ba4b647bde3d876c6 Mon Sep 17 00:00:00 2001
From: Katakam Rakesh Naga Sai
 <125246792+Katakam-Rakesh@users.noreply.github.com>
Date: Fri, 13 Feb 2026 12:43:26 +0530
Subject: [PATCH 155/172] Update service_k8s.json

Signed-off-by: Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com>
---
 input/config/x86_64/rhel/10.0/service_k8s.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/input/config/x86_64/rhel/10.0/service_k8s.json b/input/config/x86_64/rhel/10.0/service_k8s.json
index afc073a19f..0ef4408a7f 100644
--- a/input/config/x86_64/rhel/10.0/service_k8s.json
+++ b/input/config/x86_64/rhel/10.0/service_k8s.json
@@ -33,7 +33,7 @@
       { "package": "strimzi-kafka-operator-helm-3-chart-0.48.0", "type": "tarball", "url": "https://github.com/strimzi/strimzi-kafka-operator/releases/download/0.48.0/strimzi-kafka-operator-helm-3-chart-0.48.0.tgz" },
       { "package": "quay.io/strimzi/kafka-bridge", "tag": "0.33.1", "type": "image" },
       { "package": "apptainer", "type": "rpm", "repo_name": "epel" },
-      {"package": "doca-ofed", "type": "iso", "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.x86_64.rpm"}
+      { "package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" }
     ]
   },
   "service_kube_control_plane": {

From 7b98e5ecd47d1d46b51aba587d4ee6eb99feeb7e Mon Sep 17 00:00:00 2001
From: Nagachandan-P <Nagachandan.p@dell.com>
Date: Fri, 13 Feb 2026 07:19:48 +0000
Subject: [PATCH 156/172] lint issue fixed

---
 discovery/roles/discovery_validations/tasks/update_hosts.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/discovery/roles/discovery_validations/tasks/update_hosts.yml b/discovery/roles/discovery_validations/tasks/update_hosts.yml
index 85c9ecf611..f040dd997f 100644
--- a/discovery/roles/discovery_validations/tasks/update_hosts.yml
+++ b/discovery/roles/discovery_validations/tasks/update_hosts.yml
@@ -22,7 +22,9 @@
 - name: Remove stale entries for IPs that are being updated
   ansible.builtin.shell: |
     set -o pipefail
-    grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} > {{ hosts_file_path }}.tmp && cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }} && rm -f {{ hosts_file_path }}.tmp
+    grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} > {{ hosts_file_path }}.tmp
+    cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }}
+    rm -f {{ hosts_file_path }}.tmp
   changed_when: true
   loop: "{{ read_mapping_file.dict | dict2items }}"
 

From 6ff5423831736dc86ea5227bd1702b553ccf81af Mon Sep 17 00:00:00 2001
From: Vrinda_Marwah <vrinda.marwah@dell.com>
Date: Fri, 13 Feb 2026 07:26:03 +0000
Subject: [PATCH 157/172] Add user registry to crio.conf

Signed-off-by: Vrinda_Marwah <vrinda.marwah@dell.com>
---
 .../tasks/fetch_additional_images.yml             |  9 +++++++++
 ...ervice_kube_control_plane_first_x86_64.yaml.j2 | 15 ++++++++++++---
 ...roup-service_kube_control_plane_x86_64.yaml.j2 | 15 ++++++++++++---
 .../ci-group-service_kube_node_x86_64.yaml.j2     | 14 +++++++++++---
 discovery/roles/configure_ochami/vars/main.yml    |  1 +
 5 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml b/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml
index 2fecb895e8..ca13f0c414 100644
--- a/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml
+++ b/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml
@@ -42,3 +42,12 @@
   ansible.builtin.debug:
     var: additional_images_dict
     verbosity: 2
+
+- name: Read local_repo_config.yml  
+  ansible.builtin.include_vars:
+    file: "{{ local_repo_config_path }}"
+    name: local_repo_config
+
+- name: Set fact for user_registry
+  ansible.builtin.set_fact:
+    user_registry: "{{ local_repo_config.user_registry | default([]) }}"
\ No newline at end of file
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2
index b8b71bf099..b98df53d7d 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2
@@ -169,6 +169,16 @@
             location = "gcr.io"
             [[registry.mirror]]
             location = "{{ pulp_mirror }}"
+{% if user_registry | default([]) | length > 0 %}
+{% for registry in user_registry %}
+
+            [[registry]]
+            prefix = "{{ registry.host }}"
+            location = "{{ registry.host }}"
+            [[registry.mirror]]
+            location = "{{ pulp_mirror }}"
+{% endfor %}
+{% endif %}
 
         - path: /tmp/kube-vip.yaml
           owner: root:root
@@ -415,13 +425,12 @@
         - update-ca-trust extract
         - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf
         - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh
+        - mkdir -p /etc/containers/registries.conf.d
+        - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf
         - systemctl start crio.service
         - systemctl enable crio.service
         - sudo systemctl enable --now kubelet
-        - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf
         - mv /tmp/generate-control-plane-join.sh {{ k8s_client_mount_path }}
-        - systemctl daemon-reload
-        - systemctl restart crio
         - kubeadm config images pull --kubernetes-version={{ service_k8s_version }}
 {% set role_name = 'service_kube_control_plane_first' %}
 {% include 'pull_additional_images.yaml.j2' %}
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2
index f3ba7a7330..922f63f852 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2
@@ -147,6 +147,16 @@
             location = "gcr.io"
             [[registry.mirror]]
             location = "{{ pulp_mirror }}"
+{% if user_registry | default([]) | length > 0 %}
+{% for registry in user_registry %}
+
+            [[registry]]
+            prefix = "{{ registry.host }}"
+            location = "{{ registry.host }}"
+            [[registry.mirror]]
+            location = "{{ pulp_mirror }}"
+{% endfor %}
+{% endif %}
         - path: /tmp/kube-vip.yaml
           owner: root:root
           permissions: '0644'
@@ -323,12 +333,11 @@
         - update-ca-trust extract
         - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf
         - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh
+        - mkdir -p /etc/containers/registries.conf.d
+        - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf
         - systemctl start crio.service
         - systemctl enable crio.service
         - sudo systemctl enable --now kubelet
-        - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf
-        - systemctl daemon-reload
-        - systemctl restart crio
         - kubeadm config images pull --kubernetes-version={{ service_k8s_version }}
 {% set role_name = 'service_kube_control_plane' %}
 {% include 'pull_additional_images.yaml.j2' %}
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2
index b380030ddd..df98035baa 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2
@@ -146,7 +146,16 @@
             location = "gcr.io"
             [[registry.mirror]]
             location = "{{ pulp_mirror }}"
+{% if user_registry | default([]) | length > 0 %}
+{% for registry in user_registry %}
 
+            [[registry]]
+            prefix = "{{ registry.host }}"
+            location = "{{ registry.host }}"
+            [[registry.mirror]]
+            location = "{{ pulp_mirror }}"
+{% endfor %}
+{% endif %}
       runcmd:
         - /usr/local/bin/set-ssh.sh
         - "systemctl enable chronyd"
@@ -226,12 +235,11 @@
         - update-ca-trust extract
         - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf
         - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh
+        - mkdir -p /etc/containers/registries.conf.d
+        - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf
         - systemctl start crio.service
         - systemctl enable crio.service
         - sudo systemctl enable --now kubelet
-        - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf
-        - systemctl daemon-reload
-        - systemctl restart crio
         - kubeadm config images pull --kubernetes-version={{ service_k8s_version }}
 {% set role_name = 'service_kube_node' %}
 {% include 'pull_additional_images.yaml.j2' %}
diff --git a/discovery/roles/configure_ochami/vars/main.yml b/discovery/roles/configure_ochami/vars/main.yml
index 7f75daa01d..053ee15c0d 100644
--- a/discovery/roles/configure_ochami/vars/main.yml
+++ b/discovery/roles/configure_ochami/vars/main.yml
@@ -108,3 +108,4 @@ cuda_runfile_aarch64: "{{ hostvars['oim']['cuda_runfile_aarch64'] | default('cud
 # Usage: fetch_additional_images.yml
 input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}"
 software_config_file_path: "{{ input_project_dir }}/software_config.json"
+local_repo_config_path: "{{ input_project_dir }}/local_repo_config.yml"

From a70b838c3a7e4707d0f0235b0c350e13d598c36f Mon Sep 17 00:00:00 2001
From: Nagachandan-P <Nagachandan.p@dell.com>
Date: Fri, 13 Feb 2026 08:14:30 +0000
Subject: [PATCH 158/172] duplicated hostnames

---
 discovery/roles/discovery_validations/tasks/update_hosts.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/discovery/roles/discovery_validations/tasks/update_hosts.yml b/discovery/roles/discovery_validations/tasks/update_hosts.yml
index f040dd997f..bd046032bc 100644
--- a/discovery/roles/discovery_validations/tasks/update_hosts.yml
+++ b/discovery/roles/discovery_validations/tasks/update_hosts.yml
@@ -19,10 +19,11 @@
     grep -qxF '127.0.0.1 localhost.localdomain localhost' {{ hosts_file_path }} || echo '127.0.0.1 localhost.localdomain localhost' >> {{ hosts_file_path }}
   changed_when: true
 
-- name: Remove stale entries for IPs that are being updated
+- name: Remove stale entries for IPs and hostnames that are being updated
   ansible.builtin.shell: |
     set -o pipefail
-    grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} > {{ hosts_file_path }}.tmp
+    grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} | \
+    grep -v '\s{{ item.value.HOSTNAME }}$' > {{ hosts_file_path }}.tmp
     cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }}
     rm -f {{ hosts_file_path }}.tmp
   changed_when: true

From aba17ded12da3c66de984e0cabb6dce24f7ca1a4 Mon Sep 17 00:00:00 2001
From: SOWJANYAJAGADISH123 <Sowjanya.Jagadish@dell.com>
Date: Fri, 13 Feb 2026 14:05:55 +0530
Subject: [PATCH 159/172] Update omnia.sh

---
 omnia.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/omnia.sh b/omnia.sh
index 81e2094ccc..b7a086545d 100755
--- a/omnia.sh
+++ b/omnia.sh
@@ -164,7 +164,7 @@ setup_omnia_core() {
 # It removes the container and performs the necessary cleanup steps.
 cleanup_omnia_core() {
     # Block if critical service containers exist
-    critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd')
+    critical_running=$(podman ps --format '{{.Names}}' | grep -E '^pulp$|^omnia_auth$|^minio-server$|^registry$|^step-ca$|^postgres$|^hydra$|^opaal-idp$|^smd$|^opaal$|^bss$|^cloud-init-server$|^haproxy$|^coresmd$|^omnia_build_stream$|^omnia_postgres$')
     if [ -n "$critical_running" ]; then
         echo -e "${RED}Failed to intiatiate omnia_core container cleanup. There are other critical service containers still running:${NC}"
         echo "$critical_running"
@@ -272,7 +272,7 @@ cleanup_config(){
 # Otherwise, it prints an error message.
 remove_container() {
     # Block if critical service containers exist
-    critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd')
+    critical_running=$(podman ps --format '{{.Names}}' | grep -E '^pulp$|^omnia_auth$|^minio-server$|^registry$|^step-ca$|^postgres$|^hydra$|^opaal-idp$|^smd$|^opaal$|^bss$|^cloud-init-server$|^haproxy$|^coresmd$|^omnia_build_stream$|^omnia_postgres$')
     if [ -n "$critical_running" ]; then
         echo -e "${RED}Failed to intiatiate omnia_core container cleanup. There are other critical service containers still running:${NC}"
         echo "$critical_running"
@@ -1083,7 +1083,7 @@ install_omnia_core() {
             # If the user wants to reinstall, call the remove_container function, and then call the setup_omnia_core function
             if [ "$choice" = "2" ]; then
                 # Block if critical service containers exist
-                critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd')
+                critical_running=$(podman ps --format '{{.Names}}' | grep -E '^pulp$|^omnia_auth$|^minio-server$|^registry$|^step-ca$|^postgres$|^hydra$|^opaal-idp$|^smd$|^opaal$|^bss$|^cloud-init-server$|^haproxy$|^coresmd$|^omnia_build_stream$|^omnia_postgres$')
                 if [ -n "$critical_running" ]; then
                     echo -e "${RED}Failed to intiatiate omnia_core container cleanup. There are other critical service containers still running:${NC}"
                     echo "$critical_running"

From 7c79b599c8fd89b75cdaf2eb082d9b95449cf84a Mon Sep 17 00:00:00 2001
From: Vrinda_Marwah <vrinda.marwah@dell.com>
Date: Fri, 13 Feb 2026 08:47:06 +0000
Subject: [PATCH 160/172] resolve input validation + lint

Signed-off-by: Vrinda_Marwah <vrinda.marwah@dell.com>
---
 .../validation_flows/common_validation.py           | 13 +++++++++++++
 .../tasks/fetch_additional_images.yml               |  4 ++--
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py
index 198c527440..f577a4e9b8 100644
--- a/common/library/module_utils/input_validation/validation_flows/common_validation.py
+++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py
@@ -233,6 +233,19 @@ def validate_software_config(
             )
         )
 
+    # Check for required subgroups when specific software names are present
+    software_requiring_subgroups = ["additional_packages", "slurm_custom", "service_k8s"]
+    for software_name in software_requiring_subgroups:
+        if software_name in software_names:
+            if software_name not in data or not data[software_name]:
+                errors.append(
+                    create_error_msg(
+                        "Validation Error: ",
+                        software_name,
+                        f"is present in softwares but corresponding subgroup '{software_name}' is missing or empty in software_config.json. Please refer to the examples directory for the correct format."
+                    )
+                )
+
     for software_pkg in data['softwares']:
         software = software_pkg['name']
         arch_list = software_pkg.get('arch')
diff --git a/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml b/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml
index ca13f0c414..d4e8425749 100644
--- a/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml
+++ b/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml
@@ -43,11 +43,11 @@
     var: additional_images_dict
     verbosity: 2
 
-- name: Read local_repo_config.yml  
+- name: Read local_repo_config.yml
   ansible.builtin.include_vars:
     file: "{{ local_repo_config_path }}"
     name: local_repo_config
 
 - name: Set fact for user_registry
   ansible.builtin.set_fact:
-    user_registry: "{{ local_repo_config.user_registry | default([]) }}"
\ No newline at end of file
+    user_registry: "{{ local_repo_config.user_registry | default([]) }}"

From 40f1595cd15c9f59b4c653c679a0acfaa1eb6c57 Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <jagadeesh_n_v@dell.com>
Date: Fri, 13 Feb 2026 16:09:23 +0530
Subject: [PATCH 161/172] Removed slurmd dependency issue where ssh key changes
 on slurmctld, live

---
 ...-group-login_compiler_node_aarch64.yaml.j2 |  8 +++--
 ...i-group-login_compiler_node_x86_64.yaml.j2 |  8 +++--
 .../ci-group-login_node_aarch64.yaml.j2       |  7 +++-
 .../ci-group-login_node_x86_64.yaml.j2        |  7 +++-
 .../ci-group-slurm_node_aarch64.yaml.j2       |  8 +++--
 .../ci-group-slurm_node_x86_64.yaml.j2        |  7 ++--
 .../slurm_config/tasks/check_ctld_running.yml | 32 +++++++++++++------
 discovery/roles/slurm_config/tasks/confs.yml  |  2 ++
 .../slurm_config/tasks/create_slurm_dir.yml   | 12 +------
 .../slurm_config/tasks/update_hosts_munge.yml |  1 +
 .../slurm_config/templates/slurmd.service.j2  | 22 -------------
 11 files changed, 62 insertions(+), 52 deletions(-)
 delete mode 100644 discovery/roles/slurm_config/templates/slurmd.service.j2

diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2
index dc2ddf9dcd..8918f03050 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2
@@ -209,6 +209,12 @@
             {{ ip_name_map[key] }} {{ key }}
 {% endfor %}
 
+        - path: /etc/sysconfig/slurmd
+          owner: root:root
+          permissions: '0644'
+          content: |
+            SLURMD_OPTIONS="{{ conf_server }}"
+
         - path: /usr/local/bin/check_slurm_controller_status.sh
           owner: root:root
           permissions: '{{ file_mode_755 }}'
@@ -278,12 +284,10 @@
 
 {% if hostvars['localhost']['ldms_support'] %}
         - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log
-
         - /root/ldms_sampler.sh
 {% endif %}
 
         - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh
-        - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/
         - /usr/local/bin/check_slurm_controller_status.sh
         - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }}
         - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }}
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
index 2c23b868c0..51121a2e82 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
@@ -209,6 +209,12 @@
             {{ ip_name_map[key] }} {{ key }}
 {% endfor %}
 
+        - path: /etc/sysconfig/slurmd
+          owner: root:root
+          permissions: '0644'
+          content: |
+            SLURMD_OPTIONS="{{ conf_server }}"
+
         - path: /usr/local/bin/check_slurm_controller_status.sh
           owner: root:root
           permissions: '{{ file_mode_755 }}'
@@ -282,12 +288,10 @@
 
 {% if hostvars['localhost']['ldms_support'] %}
         - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log
-
         - /root/ldms_sampler.sh
 {% endif %}
 
         - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh
-        - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/
         - /usr/local/bin/check_slurm_controller_status.sh
         - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }}
         - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }}
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2
index 8b3d771592..4aacc2222d 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2
@@ -102,6 +102,12 @@
             {{ ip_name_map[key] }} {{ key }}
 {% endfor %}
 
+        - path: /etc/sysconfig/slurmd
+          owner: root:root
+          permissions: '0644'
+          content: |
+            SLURMD_OPTIONS="{{ conf_server }}"
+
         - path: /usr/local/bin/check_slurm_controller_status.sh
           owner: root:root
           permissions: '{{ file_mode_755 }}'
@@ -131,7 +137,6 @@
         - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
         - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf
         - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh
-        - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/
         - /usr/local/bin/check_slurm_controller_status.sh
         - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }}
         - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }}
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2
index 4e68ba8d81..524553bd55 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2
@@ -108,6 +108,12 @@
             {{ ip_name_map[key] }} {{ key }}
 {% endfor %}
 
+        - path: /etc/sysconfig/slurmd
+          owner: root:root
+          permissions: '0644'
+          content: |
+            SLURMD_OPTIONS="{{ conf_server }}"
+
         - path: /usr/local/bin/check_slurm_controller_status.sh
           owner: root:root
           permissions: '{{ file_mode_755 }}'
@@ -142,7 +148,6 @@
         - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
         - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf
         - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh
-        - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/
         - /usr/local/bin/check_slurm_controller_status.sh
         - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }}
         - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }}
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
index 06a04a6068..dacade639b 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
@@ -277,8 +277,6 @@
 
             echo "[INFO] ===== Starting slurmd setup (service file, directories, epilog) (aarch64) ====="
 
-            echo "[INFO] Copying slurmd.service into /usr/lib/systemd/system/"
-            yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/
             bash /usr/local/bin/check_slurm_controller_status.sh
 
             echo "[INFO] Setting ownership for Slurm directories"
@@ -415,6 +413,12 @@
             {{ ip_name_map[key] }} {{ key }}
 {% endfor %}
 
+        - path: /etc/sysconfig/slurmd
+          owner: root:root
+          permissions: '0644'
+          content: |
+            SLURMD_OPTIONS="{{ conf_server }}"
+
         - path: /usr/local/bin/check_slurm_controller_status.sh
           owner: root:root
           permissions: '{{ file_mode_755 }}'
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
index c1b532908e..d21fcf9c5c 100644
--- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
+++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
@@ -244,6 +244,11 @@
 {% for key in ip_name_map | sort %}
             {{ ip_name_map[key] }} {{ key }}
 {% endfor %}
+        - path: /etc/sysconfig/slurmd
+          owner: root:root
+          permissions: '0644'
+          content: |
+            SLURMD_OPTIONS="{{ conf_server }}"
 
         - path: /usr/local/bin/configure_dirs_and_mounts.sh
           permissions: '{{ file_mode_755 }}'
@@ -288,8 +293,6 @@
 
             echo "[INFO] ===== Starting slurmd setup (service file, directories, epilog) ====="
 
-            echo "[INFO] Copying slurmd.service into /usr/lib/systemd/system/"
-            yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/
             bash /usr/local/bin/check_slurm_controller_status.sh
 
             echo "[INFO] Setting ownership for Slurm directories"
diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml
index 5f2d41a904..7d908169ab 100644
--- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml
+++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml
@@ -14,30 +14,37 @@
 ---
 - name: Check if remote host is reachable via SSH
   ansible.builtin.wait_for:
-    host: "{{ item }}"
+    host: "{{ ctld }}"
     port: 22 # TODO: make it configurable
     timeout: 10
     state: started
   delegate_to: localhost
   register: ssh_check
   ignore_errors: true
-  ignore_unreachable: true
 
-- name: Block when ssh_check is success
-  when: ssh_check is success
+- name: Enter slurm controller when pingable
+  when:
+    - ssh_check is success
+  ignore_unreachable: true
   block:
     - name: Initialize ctld_state dict
       ansible.builtin.set_fact:
-        ctld_state: "{{ ctld_state | default({}) | combine({item: false}) }}"
+        ctld_state: "{{ ctld_state | default({}) | combine({ctld: false}) }}"
 
     - name: Check if slurmctld is running on remote host
       ansible.builtin.service_facts:
-      delegate_to: "{{ item }}"
+      delegate_to: "{{ ctld }}"
       register: service_facts
+      ignore_unreachable: true
+
+    - name: Fail if slurmctld is unreachable
+      ansible.builtin.fail:
+        msg: "Failed to connect to {{ ctld }}."
+      when: service_facts is unreachable
 
     - name: Update ctld_state if slurmctld is running
       ansible.builtin.set_fact:
-        ctld_state: "{{ ctld_state | combine({item: true}) }}"
+        ctld_state: "{{ ctld_state | combine({ctld: true}) }}"
       when:
         - service_facts is success
         - ansible_facts.services['slurmctld.service'] is defined
@@ -72,6 +79,13 @@
       changed_when: scontrol_reconfig.rc == 0
       failed_when: false
       register: scontrol_reconfig
-      delegate_to: "{{ item }}"
+      delegate_to: "{{ ctld }}"
       when:
-        - ctld_state[item] is true
+        - ctld_state[ctld] is true
+
+  rescue:
+    - name: Report scontrol reconfigure failure on the controller
+      ansible.builtin.debug:
+        msg: "Failed to 'scontrol reconfigure' on {{ ctld }}.
+         As task '{{ ansible_failed_task.name }}' failed.
+         results: {{ ansible_failed_result }}"
diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml
index 799d4cd757..c5f7953b0d 100644
--- a/discovery/roles/slurm_config/tasks/confs.yml
+++ b/discovery/roles/slurm_config/tasks/confs.yml
@@ -197,3 +197,5 @@
     - ctld_list
     - ctld_conf_files is changed
   loop: "{{ ctld_list }}"
+  loop_control:
+    loop_var: ctld
diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
index 45e37ac243..e4ac760d77 100644
--- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
+++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
@@ -194,17 +194,7 @@
     group: "{{ root_group }}"
     mode: "{{ common_mode }}"
   when: cmpt_list
-  loop: "{{ cmpt_list | product(['logout_user.sh', 'slurmd.service']) }}"
-
-- name: Create logout_user.sh and slurmd.service in login and login_compiler
-  ansible.builtin.template:
-    src: "{{ item.1 }}.j2"
-    dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/epilog.d/{{ item.1 }}"
-    owner: "{{ root_user }}"
-    group: "{{ root_group }}"
-    mode: "{{ conf_file_mode }}"
-  when: login_list or compiler_login_list
-  loop: "{{ (login_list + compiler_login_list) | product(['slurmd.service']) }}"
+  loop: "{{ cmpt_list | product(['logout_user.sh']) }}"
 
 - name: Get the slurm NFS path
   ansible.builtin.debug:
diff --git a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml
index a326fa820d..64c36dbeaf 100644
--- a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml
+++ b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml
@@ -30,6 +30,7 @@
     munge_key_changed: "{{ munge_key_copy.results | default([]) | rekey_on_member('item') }}"
   when: munge_key_copy is defined
 
+# TODO: Clean unreachable handling
 - name: Block when munge key changed
   when:
     - munge_key_changed is defined
diff --git a/discovery/roles/slurm_config/templates/slurmd.service.j2 b/discovery/roles/slurm_config/templates/slurmd.service.j2
deleted file mode 100644
index 294d1fda75..0000000000
--- a/discovery/roles/slurm_config/templates/slurmd.service.j2
+++ /dev/null
@@ -1,22 +0,0 @@
-[Unit]
-Description=Slurm node daemon
-After=munge.service network-online.target remote-fs.target sssd.service
-Wants=network-online.target
-
-[Service]
-Type=notify
-EnvironmentFile=-/etc/sysconfig/slurmd
-EnvironmentFile=-/etc/default/slurmd
-RuntimeDirectory=slurm
-RuntimeDirectoryMode=0755
-ExecStart=/usr/sbin/slurmd --systemd $SLURMD_OPTIONS {{ conf_server }}
-ExecReload=/bin/kill -HUP $MAINPID
-KillMode=process
-LimitNOFILE=131072
-LimitMEMLOCK=infinity
-LimitSTACK=infinity
-Delegate=yes
-TasksMax=infinity
-
-[Install]
-WantedBy=multi-user.target
\ No newline at end of file

From 3f516a3dd38d4923dd318e9600fb110f457700cf Mon Sep 17 00:00:00 2001
From: pullan1 <sudha.pullalaravu@dell.com>
Date: Fri, 13 Feb 2026 20:32:27 +0530
Subject: [PATCH 162/172] Fix for local repo is failing as cuda run package
 download issue

Signed-off-by: pullan1 <sudha.pullalaravu@dell.com>
---
 .../local_repo/parse_and_download.py          | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/common/library/module_utils/local_repo/parse_and_download.py b/common/library/module_utils/local_repo/parse_and_download.py
index 72efd4566b..c8b8278eef 100644
--- a/common/library/module_utils/local_repo/parse_and_download.py
+++ b/common/library/module_utils/local_repo/parse_and_download.py
@@ -64,6 +64,26 @@ def execute_command(cmd_string, logger, type_json=False):
             stderr=subprocess.PIPE,
             shell=True,
         )
+        status["returncode"] = cmd.returncode
+        status["stdout"] = cmd.stdout.strip() if cmd.stdout else None
+        status["stderr"] = cmd.stderr.strip() if cmd.stderr else None
+
+        if cmd.returncode != 0:
+            logger.error(f"Command failed with return code {cmd.returncode}")
+            logger.error(f"Error: {status['stderr']}")
+            return False
+
+        if type_json:
+            if not status["stdout"]:
+                logger.error("Command succeeded but returned empty output when JSON was expected")
+                return False
+            try:
+                status["stdout"] = json.loads(status["stdout"])
+            except json.JSONDecodeError as error:
+                logger.error(f"Failed to parse JSON output: {error}")
+                logger.error(f"Raw output was: {status['stdout']}")
+                return False
+
         logger.info(f"Command succeeded: {cmd_string}")
         return True
     except subprocess.CalledProcessError as e:

From f531576a0a3ff35bb969225716f15b73c1329ce7 Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Mon, 16 Feb 2026 14:27:10 +0530
Subject: [PATCH 163/172] Addition of user guidance messages for cluster
 reprovisioning and rollback after upgrade to 2.1 (#3978)

* Added user guidance messages in rollback_omnia.yml and upgrade_cluster.yml

* Modification of Rollback guidance message

* Update main.yml

* Update main.yml

* Update main.yml

* Update main.yml

* Update main.yml

* Update main.yml
---
 upgrade/roles/upgrade_cluster/tasks/main.yml | 55 ++++++++++++++++++--
 upgrade/rollback_omnia.yml                   | 54 +++++++++++++++++++
 2 files changed, 106 insertions(+), 3 deletions(-)
 create mode 100644 upgrade/rollback_omnia.yml

diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml
index 196366870b..90b25611b5 100644
--- a/upgrade/roles/upgrade_cluster/tasks/main.yml
+++ b/upgrade/roles/upgrade_cluster/tasks/main.yml
@@ -13,6 +13,55 @@
 # limitations under the License.
 ---
 
-- name: Include import input parameters
-  ansible.builtin.include_role:
-    name: import_input_parameters
+
+- name: Display cluster reprovision guidance
+  ansible.builtin.pause:
+    prompt: "{{ '\x1b[32m' }}===================================================
+          CLUSTER REPROVISION REQUIRED
+      ===========================================================
+
+      Cluster reprovisioning is required after upgrade to enable new features.
+
+      Review and update new 2.1 input fields present at /opt/omnia/input/project_default/ directory before reprovisioning:
+
+        1. local_repo_config.yml
+
+            - Set additional_repos_x86_64 (list of extra repo URLs or file paths for x86_64)
+
+            - Set additional_repos_aarch64 (list of extra repo URLs or file paths for aarch64)
+
+        2. network_spec.yml (ib_network section)
+
+            - Define InfiniBand fabric settings (subnet manager/BMC, IP ranges, VLAN if applicable)
+
+            - Ensure host IB interfaces map to the IB network entries
+
+        3. omnia_config.yml (slurm_cluster.config_source)
+
+            - Use the new structure: config_source: { type: <local|url>, location: <path_or_url> }
+
+            - Populate location to point to your Slurm config bundle (local path or remote URL)
+
+      Do NFS cleanup (if NFS share is used for k8s/slurm)
+
+         - Clean stale mounts and ensure the NFS share is accessible before reprovision
+
+         - Remove any leftover cluster state on the NFS share that could conflict with fresh deployment
+
+
+      Run the following playbooks in sequence from the Omnia root directory to reprovision the cluster:
+
+        1. ansible-playbook local_repo/local_repo.yml
+
+        2. ansible-playbook build_image_x86_64/build_image_x86_64.yml
+
+        3. Only if the user is using aarch64 nodes, run the below playbook after build_image_x86_64:
+
+        ansible-playbook build_image_aarch64/build_image_aarch64.yml
+
+        4. ansible-playbook discovery/discovery.yml
+
+      Please follow the omnia documentation for steps in more detail.
+
+    {{ '\x1b[0m' }}"
+    seconds: 1
diff --git a/upgrade/rollback_omnia.yml b/upgrade/rollback_omnia.yml
new file mode 100644
index 0000000000..c0d5080c22
--- /dev/null
+++ b/upgrade/rollback_omnia.yml
@@ -0,0 +1,54 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+- name: Rollback Omnia guidance
+  hosts: localhost
+  connection: local
+  gather_facts: false
+  vars:
+    oim_metadata_path: "/opt/omnia/.data/oim_metadata.yml"
+  tasks:
+    - name: Read oim_metadata.yml for backup details
+      ansible.builtin.slurp:
+        src: "{{ oim_metadata_path }}"
+      register: oim_metadata_slurp
+      ignore_errors: true
+
+    - name: Parse oim_metadata.yml
+      ansible.builtin.set_fact:
+        oim_metadata: "{{ oim_metadata_slurp.content | b64decode | from_yaml }}"
+      when: oim_metadata_slurp is defined and oim_metadata_slurp.content is defined
+
+    - name: Derive backup_version from upgrade_backup_dir
+      ansible.builtin.set_fact:
+        backup_version: "{{ (oim_metadata.upgrade_backup_dir | regex_search('version_([^/]+)', '\\1')
+          | default(['previous version'], true)) | first }}"
+      when: oim_metadata is defined and oim_metadata.upgrade_backup_dir is defined
+
+    - name: Display rollback guidance (green)
+      ansible.builtin.debug:
+        msg:
+          - "================================="
+          - "       OMNIA ROLLBACK"
+          - "================================="
+          - ""
+          - "[Rollback Actions]"
+          - "1. Purpose: restore Omnia core to the last backup version (includes configs and container state)."
+          - "2. Target version: {{ backup_version | default('previous version from the backup location') }}."
+          - "3. How to run:"
+          - "   - Exit the Omnia core container shell if you are inside it."
+          - "   - From the OIM host prompt, execute: ./omnia.sh --rollback"
+          - "4. Note: ensure the backup location is accessible on the OIM host before running rollback."
+    - name: End play
+      ansible.builtin.meta: end_play

From 8066a19d5542f2acaaf042e8dd5ccb92cdbb9b32 Mon Sep 17 00:00:00 2001
From: Vrinda_Marwah <vrinda.marwah@dell.com>
Date: Mon, 16 Feb 2026 12:04:27 +0000
Subject: [PATCH 164/172] fix status return in execute command

Signed-off-by: Vrinda_Marwah <vrinda.marwah@dell.com>
---
 common/library/module_utils/local_repo/parse_and_download.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/library/module_utils/local_repo/parse_and_download.py b/common/library/module_utils/local_repo/parse_and_download.py
index c8b8278eef..15bed1efb3 100644
--- a/common/library/module_utils/local_repo/parse_and_download.py
+++ b/common/library/module_utils/local_repo/parse_and_download.py
@@ -85,7 +85,7 @@ def execute_command(cmd_string, logger, type_json=False):
                 return False
 
         logger.info(f"Command succeeded: {cmd_string}")
-        return True
+        return status
     except subprocess.CalledProcessError as e:
         logger.error(f"Command failed: {cmd_string} - {e}")
         return False

From f0928443075d08a01973bb8b6f3921d9b16c0ea4 Mon Sep 17 00:00:00 2001
From: Nethravathi M G <146437298+nethramg@users.noreply.github.com>
Date: Mon, 16 Feb 2026 23:12:44 +0530
Subject: [PATCH 165/172] Initial iDRAC Telemetry Node addition and deletion
 changes  (#3972)

* Initial set of changes for iDRAC Telemetry add and remove node

* Ansible link and pylint fixes

* Ansible lint fixes

* Updated Copyrights to 2026

* Addressed the comments
---
 .../modules/delete_idracips_from_mysqldb.py   | 251 ++++++++++++++++++
 .../modules/disable_idrac_telemetry.py        | 184 +++++++++++++
 .../initiate_telemetry_service_cluster.yml    |   5 +-
 .../tasks/remove_deleted_nodes.yml            | 101 +++++++
 .../templates/telemetry_report.j2             |  18 ++
 telemetry/roles/idrac_telemetry/vars/main.yml |  24 +-
 6 files changed, 581 insertions(+), 2 deletions(-)
 create mode 100644 common/library/modules/delete_idracips_from_mysqldb.py
 create mode 100644 common/library/modules/disable_idrac_telemetry.py
 create mode 100644 telemetry/roles/idrac_telemetry/tasks/remove_deleted_nodes.yml

diff --git a/common/library/modules/delete_idracips_from_mysqldb.py b/common/library/modules/delete_idracips_from_mysqldb.py
new file mode 100644
index 0000000000..cd81b943e2
--- /dev/null
+++ b/common/library/modules/delete_idracips_from_mysqldb.py
@@ -0,0 +1,251 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/python
+"""Module to delete iDRAC IPs from MySQL database.
+This module connects to a Kubernetes pod running MySQL and deletes iDRAC IPs
+that are not present in bmc_data.csv. It handles retries and delays for robustness."""
+
+import time
+from ansible.module_utils.basic import AnsibleModule
+from kubernetes import client, config
+from kubernetes.stream import stream
+from kubernetes.config.config_exception import ConfigException
+
+
+def load_kube_context():
+    """Load kubeconfig via load_kube_config(); on ConfigException use load_incluster_config()."""
+    try:
+        config.load_kube_config()
+    except ConfigException:
+        config.load_incluster_config()
+
+
+def run_mysql_query_in_pod(namespace, pod, container, mysql_user, mysql_password, query):
+    """Run a MySQL query in the specified pod through the Kubernetes exec API.
+
+    Args:
+        namespace: Kubernetes namespace
+        pod: Pod name
+        container: Container name
+        mysql_user: MySQL username
+        mysql_password: MySQL password
+        query: MySQL query to execute
+
+    Returns:
+        dict: "rc" is the return code reported by the exec stream, or 1 when
+        the call itself failed; "result" is the list of non-empty output lines
+        on success and the error text otherwise. API, config and socket errors
+        are returned instead of raised so that callers can retry.
+    """
+    core_v1 = client.CoreV1Api()
+    mysql_command = [
+        "mysql",
+        "-u", mysql_user,
+        "-N", "-B",
+        f"-p{mysql_password}",
+        "-e", query
+    ]
+
+    ws = None
+    try:
+        ws = stream(
+            core_v1.connect_get_namespaced_pod_exec,
+            name=pod,
+            namespace=namespace,
+            container=container,
+            command=mysql_command,
+            stderr=True,
+            stdin=False,
+            stdout=True,
+            tty=False,
+            _preload_content=False
+        )
+
+        stdout = ""
+        stderr = ""
+
+        # Drain both channels until the exec websocket is closed.
+        while ws.is_open():
+            ws.update(timeout=1)
+            if ws.peek_stdout():
+                stdout += ws.read_stdout()
+            if ws.peek_stderr():
+                stderr += ws.read_stderr()
+
+        rc = ws.returncode
+        if rc != 0:
+            return {"rc": rc, "result": stderr.strip() if stderr else "Unknown error"}
+
+        # Keep non-empty lines, dropping any that start with "mysql:".
+        query_result = [
+            line.strip() for line in stdout.strip().splitlines()
+            if line.strip() and not line.strip().startswith("mysql:")
+        ]
+
+        return {"rc": rc, "result": query_result}
+
+    # stream() surfaces API and handshake failures as ApiException, not OSError.
+    except (ConfigException, client.rest.ApiException, OSError) as e:
+        return {"rc": 1, "result": str(e)}
+    finally:
+        # Release the websocket even if reading from it raised.
+        if ws is not None:
+            ws.close()
+
+
+def delete_idrac_from_mysql(
+    namespace,
+    pod,
+    container,
+    mysqldb_name,
+    mysql_user,
+    mysql_password,
+    ip_to_delete,
+    retries=3,
+    delay=3
+):
+    """Delete a single iDRAC IP from MySQL database.
+
+    Args:
+        namespace: Kubernetes namespace
+        pod: Pod name
+        container: Container name
+        mysqldb_name: MySQL database name
+        mysql_user: MySQL username
+        mysql_password: MySQL password
+        ip_to_delete: IP address to delete
+        retries: Number of attempts; values below 1 are treated as 1
+        delay: Delay between retries in seconds
+
+    Returns:
+        dict: Result containing success status and message
+    """
+    # NOTE(review): the SQL text is built by interpolation; inputs assumed trusted.
+    query = f"DELETE FROM {mysqldb_name}.services WHERE ip = '{ip_to_delete}';"
+
+    # Always try at least once so "result" is bound for the failure message.
+    attempts = max(1, retries)
+    for attempt in range(attempts):
+        result = run_mysql_query_in_pod(
+            namespace=namespace,
+            pod=pod,
+            container=container,
+            mysql_user=mysql_user,
+            mysql_password=mysql_password,
+            query=query
+        )
+
+        if result.get("rc") == 0:
+            return {
+                "success": True,
+                "ip": ip_to_delete,
+                "msg": f"Successfully deleted iDRAC IP {ip_to_delete} from MySQL."
+            }
+
+        if attempt < attempts - 1:
+            time.sleep(delay)
+
+    return {
+        "success": False,
+        "ip": ip_to_delete,
+        "msg": f"Failed to delete iDRAC IP {ip_to_delete} after {attempts} attempts: {result.get('result')}"
+    }
+
+
+def main():
+    """Delete stale iDRAC IPs from MySQL pod by pod; check mode only reports them."""
+    module_args = {
+        "telemetry_namespace": {"type": "str", "required": True},
+        "idrac_podnames": {"type": "list", "required": True},
+        "mysqldb_k8s_name": {"type": "str", "required": True},
+        "mysqldb_name": {"type": "str", "required": True},
+        "mysqldb_user": {"type": "str", "required": True, "no_log": True},
+        "mysqldb_password": {"type": "str", "required": True, "no_log": True},
+        "ips_to_delete": {"type": "list", "required": True},
+        "pod_to_db_idrac_ips": {"type": "dict", "required": True},
+        "db_retries": {"type": "int", "default": 3},
+        "db_delay": {"type": "int", "default": 3},
+    }
+
+    module = AnsibleModule(argument_spec=module_args, supports_check_mode=True)
+
+    telemetry_namespace = module.params["telemetry_namespace"]
+    idrac_podnames = module.params["idrac_podnames"]
+    mysqldb_k8s_name = module.params["mysqldb_k8s_name"]
+    mysqldb_name = module.params["mysqldb_name"]
+    mysqldb_user = module.params["mysqldb_user"]
+    mysqldb_password = module.params["mysqldb_password"]
+    ips_to_delete = module.params["ips_to_delete"]
+    pod_to_db_idrac_ips = module.params["pod_to_db_idrac_ips"]
+    db_retries = module.params["db_retries"]
+    db_delay = module.params["db_delay"]
+
+    deleted_ips = []
+    failed_ips = []
+
+    try:
+        # Check mode must not touch the cluster, so it needs no kube context.
+        if not module.check_mode:
+            load_kube_context()
+
+        for pod in idrac_podnames:
+            pod_ips = pod_to_db_idrac_ips.get(pod, [])
+            ips_to_delete_from_pod = list(set(pod_ips) & set(ips_to_delete))
+            if not ips_to_delete_from_pod:
+                module.warn(f"No IPs to delete from pod {pod}. Skipping.")
+                continue
+
+            module.warn(f"Deleting IPs from pod {pod}: {ips_to_delete_from_pod}")
+
+            for ip in ips_to_delete_from_pod:
+                if module.check_mode:
+                    deleted_ips.append(ip)
+                    continue
+                result = delete_idrac_from_mysql(
+                    namespace=telemetry_namespace,
+                    pod=pod,
+                    container=mysqldb_k8s_name,
+                    mysqldb_name=mysqldb_name,
+                    mysql_user=mysqldb_user,
+                    mysql_password=mysqldb_password,
+                    ip_to_delete=ip,
+                    retries=db_retries,
+                    delay=db_delay
+                )
+                if result.get("success"):
+                    deleted_ips.append(ip)
+                else:
+                    failed_ips.append(
+                        {"pod": pod, "ip": ip, "msg": result.get("msg", "Unknown error")}
+                    )
+
+        action = "Would delete" if module.check_mode else "Deleted"
+        module.exit_json(
+            changed=bool(deleted_ips),
+            deleted_ips=deleted_ips,
+            failed_ips=failed_ips,
+            msg=f"{action} {len(deleted_ips)} iDRAC IPs from MySQL database."
+        )
+
+    except (ConfigException, OSError, ValueError) as e:
+        module.fail_json(
+            msg=f"An error occurred while deleting iDRAC IPs from MySQL: {str(e)}",
+            deleted_ips=deleted_ips,
+            failed_ips=failed_ips
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/common/library/modules/disable_idrac_telemetry.py b/common/library/modules/disable_idrac_telemetry.py
new file mode 100644
index 0000000000..cb7b885e1e
--- /dev/null
+++ b/common/library/modules/disable_idrac_telemetry.py
@@ -0,0 +1,184 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/python
+"""Module to disable telemetry on iDRAC nodes via Redfish API.
+This module connects to iDRAC nodes and disables telemetry collection
+by sending PATCH requests to the Redfish API endpoint."""
+
+import requests
+import urllib3
+from ansible.module_utils.basic import AnsibleModule
+
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+
+def disable_telemetry_on_idrac(idrac_ip, username, password, timeout=30):
+    """
+    Disable telemetry on a single iDRAC node using Redfish API.
+
+    Each candidate attribute name is sent in its own PATCH request. An HTTP 400
+    answer moves on to the next candidate; any other answer, or a request
+    error, ends the attempt immediately.
+
+    Args:
+        idrac_ip: IP address of the iDRAC
+        username: iDRAC username
+        password: iDRAC password
+        timeout: Request timeout in seconds
+
+    Returns:
+        dict: "success", "ip" and "msg", plus "status_code" when the iDRAC
+        answered with a status other than 400
+    """
+    url = (
+        f"https://{idrac_ip}/redfish/v1/Managers/"
+        f"iDRAC.Embedded.1/Attributes"
+    )
+
+    # Try different telemetry property names in order of preference
+    telemetry_properties = [
+        "Telemetry.1.EnableTelemetry",
+        "TelemetryService.1.EnableTelemetry",
+        "Telemetry.2.EnableTelemetry",
+        "Redfish.1.TelemetryServiceEnabled"
+    ]
+
+    headers = {
+        "Content-Type": "application/json"
+    }
+
+    for property_name in telemetry_properties:
+        payload = {"Attributes": {property_name: "Disabled"}}
+
+        # Only the request itself can raise; the status is inspected afterwards.
+        try:
+            response = requests.patch(
+                url,
+                json=payload,
+                headers=headers,
+                auth=(username, password),
+                verify=False,
+                timeout=timeout
+            )
+        except requests.exceptions.Timeout:
+            return {
+                "success": False,
+                "ip": idrac_ip,
+                "msg": f"Timeout while connecting to iDRAC {idrac_ip}"
+            }
+        except requests.exceptions.ConnectionError:
+            return {
+                "success": False,
+                "ip": idrac_ip,
+                "msg": f"Connection error while connecting to iDRAC {idrac_ip}"
+            }
+        except (requests.exceptions.RequestException, OSError) as e:
+            return {
+                "success": False,
+                "ip": idrac_ip,
+                "msg": f"Error disabling telemetry on iDRAC {idrac_ip}: {str(e)}"
+            }
+
+        if response.status_code == 400:
+            # Property not supported, try next one
+            continue
+
+        if response.status_code in [200, 202, 204]:
+            return {
+                "success": True,
+                "ip": idrac_ip,
+                "status_code": response.status_code,
+                "msg": f"Successfully disabled telemetry on iDRAC {idrac_ip} using {property_name}"
+            }
+
+        return {
+            "success": False,
+            "ip": idrac_ip,
+            "status_code": response.status_code,
+            "msg": (
+                f"Failed to disable telemetry on iDRAC {idrac_ip}. "
+                f"Status: {response.status_code}, Response: {response.text}"
+            )
+        }
+
+    # All properties failed
+    return {
+        "success": False,
+        "ip": idrac_ip,
+        "msg": (
+            f"Failed to disable telemetry on iDRAC {idrac_ip}. "
+            f"None of the supported telemetry properties were found: {', '.join(telemetry_properties)}"
+        )
+    }
+
+
+def main():
+    """Disable telemetry on every listed iDRAC; check mode only reports the targets."""
+    module_args = {
+        "idrac_ips": {"type": "list", "required": True, "elements": "str"},
+        "username": {"type": "str", "required": True, "no_log": True},
+        "password": {"type": "str", "required": True, "no_log": True},
+        "timeout": {"type": "int", "default": 30},
+    }
+
+    module = AnsibleModule(argument_spec=module_args, supports_check_mode=True)
+
+    idrac_ips = module.params["idrac_ips"]
+    username = module.params["username"]
+    password = module.params["password"]
+    timeout = module.params["timeout"]
+
+    if module.check_mode:
+        # Check mode must not send PATCH requests; report the targets instead.
+        module.exit_json(
+            changed=bool(idrac_ips),
+            disabled_ips=idrac_ips,
+            failed_ips=[],
+            msg=f"Check mode: would disable telemetry on {len(idrac_ips)} iDRAC nodes."
+        )
+
+    disabled_ips = []
+    failed_ips = []
+
+    try:
+        for idrac_ip in idrac_ips:
+            result = disable_telemetry_on_idrac(
+                idrac_ip=idrac_ip,
+                username=username,
+                password=password,
+                timeout=timeout
+            )
+            if result.get("success"):
+                disabled_ips.append(idrac_ip)
+            else:
+                failed_ips.append({"ip": idrac_ip, "msg": result.get("msg", "Unknown error")})
+
+        module.exit_json(
+            changed=bool(disabled_ips),
+            disabled_ips=disabled_ips,
+            failed_ips=failed_ips,
+            msg=f"Disabled telemetry on {len(disabled_ips)} iDRAC nodes."
+        )
+
+    except (requests.exceptions.RequestException, OSError) as e:
+        module.fail_json(
+            msg=f"An error occurred while disabling telemetry: {str(e)}",
+            disabled_ips=disabled_ips,
+            failed_ips=failed_ips
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry_service_cluster.yml b/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry_service_cluster.yml
index 8615897205..7078a2f056 100644
--- a/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry_service_cluster.yml
+++ b/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry_service_cluster.yml
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -87,6 +87,9 @@
   ansible.builtin.debug:
     msg: "Filtered BMC IPs: {{ filtered_bmc_ip_list }}"
 
+- name: Remove deleted nodes from telemetry (nodes not in bmc_data.csv)
+  ansible.builtin.include_tasks: remove_deleted_nodes.yml
+
 - name: Convert filtered_bmc_ip_list to a dictionary with bmc_ip
   ansible.builtin.set_fact:
     filtered_bmc_ip_dict_list: "{{ filtered_bmc_ip_list | map('community.general.dict_kv', 'bmc_ip') | list }}"
diff --git a/telemetry/roles/idrac_telemetry/tasks/remove_deleted_nodes.yml b/telemetry/roles/idrac_telemetry/tasks/remove_deleted_nodes.yml
new file mode 100644
index 0000000000..4c82abf9e1
--- /dev/null
+++ b/telemetry/roles/idrac_telemetry/tasks/remove_deleted_nodes.yml
@@ -0,0 +1,101 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Identify iDRAC IPs to remove (present in DB but not in bmc_data.csv)
+  ansible.builtin.set_fact:
+    ips_to_remove: "{{ db_idrac_ip_list | difference(bmc_ips) }}"
+
+- name: Show iDRAC IPs to be removed
+  ansible.builtin.debug:
+    msg: "iDRAC IPs to be removed: {{ ips_to_remove }}"
+  when: ips_to_remove | length > 0
+
+- name: Skip removal if no IPs to remove
+  ansible.builtin.debug:
+    msg: "{{ no_idracips_to_remove_msg }}"
+  when: ips_to_remove | length == 0
+
+- name: Disable telemetry on iDRAC nodes before removal
+  when: ips_to_remove | length > 0
+  block:
+    - name: Disable telemetry service on iDRAC nodes
+      disable_idrac_telemetry:
+        idrac_ips: "{{ ips_to_remove }}"
+        username: "{{ hostvars['localhost']['bmc_username'] }}"
+        password: "{{ hostvars['localhost']['bmc_password'] }}"
+        timeout: "{{ redfish_timeout }}"
+      register: disable_telemetry_result
+      ignore_errors: true
+
+    - name: Show successfully disabled telemetry IPs
+      ansible.builtin.debug:
+        msg: "Successfully disabled telemetry on: {{ disable_telemetry_result.disabled_ips | default([]) }}"
+      when:
+        - disable_telemetry_result.disabled_ips is defined
+        - disable_telemetry_result.disabled_ips | length > 0
+
+    - name: Show failed to disable telemetry IPs
+      ansible.builtin.debug:
+        msg: "Failed to disable telemetry on: {{ disable_telemetry_result.failed_ips | default([]) }}"
+      when:
+        - disable_telemetry_result.failed_ips is defined
+        - disable_telemetry_result.failed_ips | length > 0
+
+- name: Remove iDRAC IPs from MySQL database
+  when: ips_to_remove | length > 0
+  block:
+    - name: Delete iDRAC IPs from mysqldb
+      delete_idracips_from_mysqldb:
+        telemetry_namespace: "{{ telemetry_namespace }}"
+        idrac_podnames: "{{ idrac_podname_idracips.idrac_podname_ips.keys() | list }}"
+        mysqldb_k8s_name: "{{ mysqldb_k8s_name }}"
+        mysqldb_name: "{{ mysqldb_name }}"
+        mysqldb_user: "{{ hostvars['localhost']['mysqldb_user'] }}"
+        mysqldb_password: "{{ hostvars['localhost']['mysqldb_password'] }}"
+        ips_to_delete: "{{ ips_to_remove }}"
+        pod_to_db_idrac_ips: "{{ existing_pod_to_db_idrac_ips }}"
+        db_retries: "{{ db_retries }}"
+        db_delay: "{{ db_delay }}"
+      register: delete_idrac_result
+  rescue:
+    - name: Failed to delete iDRAC IPs from mysqldb
+      ansible.builtin.fail:
+        msg: "{{ mysqldb_delete_fail_msg }}"
+
+- name: Show deleted iDRAC IPs
+  ansible.builtin.debug:
+    msg: "Successfully deleted iDRAC IPs from mysqldb: {{ delete_idrac_result.deleted_ips | default([]) }}"
+  when:
+    - ips_to_remove | length > 0
+    - delete_idrac_result.deleted_ips is defined
+    - delete_idrac_result.deleted_ips | length > 0
+
+- name: Show failed to delete iDRAC IPs
+  ansible.builtin.debug:
+    msg: "Failed to delete iDRAC IPs from mysqldb: {{ delete_idrac_result.failed_ips | default([]) }}"
+  when:
+    - ips_to_remove | length > 0
+    - delete_idrac_result.failed_ips is defined
+    - delete_idrac_result.failed_ips | length > 0
+
+- name: Update telemetry report variables with deletion info
+  ansible.builtin.set_fact:
+    deleted_idrac_count: "{{ delete_idrac_result.deleted_ips | default([]) | length }}"
+    deleted_idrac_ips: "{{ delete_idrac_result.deleted_ips | default([]) }}"
+    failed_delete_count: "{{ delete_idrac_result.failed_ips | default([]) | length }}"
+    failed_delete_ips: "{{ delete_idrac_result.failed_ips | default([]) }}"
+    disabled_telemetry_count: "{{ disable_telemetry_result.disabled_ips | default([]) | length }}"
+    disabled_telemetry_ips: "{{ disable_telemetry_result.disabled_ips | default([]) }}"
+  when: ips_to_remove | length > 0
diff --git a/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 b/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2
index 4d8554cab3..06bf230980 100644
--- a/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2
+++ b/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2
@@ -14,5 +14,23 @@ Telemetry not supported IPs List:
   - {{ item }}
 {% endfor %}
 
+{% if deleted_idrac_count is defined and deleted_idrac_count | int > 0 %}
+----- Node Deletion Report -----
+
+Total IP count removed from telemetry (not in bmc_data.csv): {{ deleted_idrac_count | int }}
+Removed IPs List:
+{% for item in deleted_idrac_ips %}
+  - {{ item }}
+{% endfor %}
+
+{% if disabled_telemetry_count is defined and disabled_telemetry_count | int > 0 %}
+IPs with telemetry disabled via Redfish: {{ disabled_telemetry_count | int }}
+Disabled telemetry IPs List:
+{% for item in disabled_telemetry_ips %}
+  - {{ item }}
+{% endfor %}
+{% endif %}
+{% endif %}
+
 ===== Telemetry Report End =====
 
diff --git a/telemetry/roles/idrac_telemetry/vars/main.yml b/telemetry/roles/idrac_telemetry/vars/main.yml
index d2696f4ac8..7fe6730789 100644
--- a/telemetry/roles/idrac_telemetry/vars/main.yml
+++ b/telemetry/roles/idrac_telemetry/vars/main.yml
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -67,6 +67,13 @@ idrac_telemetry_statefulset_restart_failure_msg: |
   Failed to restart the  {{ idrac_telemetry_k8s_name }} StatefulSet.
   Please check the logs using the command kubectl logs -n {{ telemetry_namespace }} {{ idrac_telemetry_k8s_name }}-<pod-index> and try again.
 
+# Usage: remove_deleted_nodes.yml
+redfish_timeout: 30
+mysqldb_delete_fail_msg: |
+  Failed to delete iDRAC IPs from the mysql database.
+  This could be due to the tables in the mysqldb not being accessible at the moment. Please try running the playbook again after some time.
+no_idracips_to_remove_msg: "No iDRAC IPs to remove. All DB entries are present in bmc_data.csv."
+
 # Usage: create_telemetry_report.yml
 telemetry_report_path: "/opt/omnia/telemetry/idrac_telemetry_report.yml"
 telemetry_report_template: "telemetry_report.j2"
@@ -75,6 +82,9 @@ telemetry_report: |
 
       IP count with Telemetry not supported: {{ failed_idrac_count | int + invalid_idrac_count | int }}
       IP count with Telemetry activated in current execution: {{ telemetry_idrac_count | int }}
+      {% if deleted_idrac_count is defined %}
+      IP count removed from telemetry (not in bmc_data.csv): {{ deleted_idrac_count | int }}
+      {% endif %}
 
       {% if (failed_idrac_count | int + invalid_idrac_count | int) > 0 %}
       Potential reasons for telemetry not being initiated include Redfish connectivity problems, timeout issues,
@@ -105,3 +115,15 @@ telemetry_report: |
         - {{ item }}
       {% endfor %}
       {% endif %}
+      {% if deleted_idrac_ips is defined and deleted_idrac_ips | length > 0 %}
+      IPs removed from telemetry database (not present in bmc_data.csv):
+      {% for item in deleted_idrac_ips %}
+        - {{ item }}
+      {% endfor %}
+      {% endif %}
+      {% if disabled_telemetry_ips is defined and disabled_telemetry_ips | length > 0 %}
+      IPs with telemetry disabled via Redfish:
+      {% for item in disabled_telemetry_ips %}
+        - {{ item }}
+      {% endfor %}
+      {% endif %}

From 128cac669d133c7c6eb1f52b37b1d201e1a3810a Mon Sep 17 00:00:00 2001
From: SOWJANYAJAGADISH123 <Sowjanya.Jagadish@dell.com>
Date: Tue, 17 Feb 2026 08:37:15 +0530
Subject: [PATCH 166/172] support multiple Omnia versions (2.1.0.0, 2.1.0.1)
 using a single core container tag (2.1) (#3983)

---
 omnia.sh | 782 +++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 622 insertions(+), 160 deletions(-)

diff --git a/omnia.sh b/omnia.sh
index b7a086545d..3b320b0bf6 100755
--- a/omnia.sh
+++ b/omnia.sh
@@ -52,11 +52,226 @@ is_local_ip() {
     fi
 }
 
+# Version configuration variables
+OMNIA_CORE_CONTAINER_TAG="2.1"  # Default container tag
+OMNIA_VERSION=""  # Will be read from metadata
+TARGET_OMNIA_VERSION=""  # Target version for upgrade
+TARGET_CONTAINER_TAG=""  # Target container tag for upgrade
+
+# Centralized version list (in chronological order)
+ALL_OMNIA_VERSIONS=("2.0.0.0" "2.1.0.0")
+
 # Container-side paths (used inside podman exec commands)
 CONTAINER_INPUT_DIR="/opt/omnia/input"
 CONTAINER_BACKUPS_DIR="/opt/omnia/backups"
 CONTAINER_METADATA_FILE="/opt/omnia/.data/oim_metadata.yml"
 
+# Print every version listed after the current one in ALL_OMNIA_VERSIONS, then one description per version
+get_available_upgrade_versions() {
+    local current_version="$1"
+    local available_versions=()
+    local version_descriptions=()
+    
+    # Keep loop variables local and resolve the loop-invariant current tag only once
+    local found_current=false version current_tag target_tag
+    current_tag=$(get_container_tag_from_version "$current_version")
+    for version in "${ALL_OMNIA_VERSIONS[@]}"; do
+        if [ "$version" = "$current_version" ]; then
+            found_current=true
+            continue
+        fi
+        
+        if [ "$found_current" = true ]; then
+            available_versions+=("$version")
+            
+            # Same container tag means restart only; a different tag means a container swap
+            target_tag=$(get_container_tag_from_version "$version")
+            
+            if [ "$current_tag" = "$target_tag" ]; then
+                version_descriptions+=("Patch upgrade to $version (container restart only)")
+            else
+                version_descriptions+=("Major upgrade to $version (container swap required)")
+            fi
+        fi
+    done
+    
+    # Emit all versions first, then all descriptions, one per line (upgrade_omnia_core splits the output in half)
+    printf '%s\n' "${available_versions[@]}"
+    printf '%s\n' "${version_descriptions[@]}"
+}
+
+# Print the known versions that precede the given one, newest first (one per line)
+get_available_rollback_versions() {
+    local current_version="$1"
+    local rollback_candidates=()
+    
+    # Walk the chronological list until the current version is reached.
+    # Each older entry is prepended, which leaves the result newest-first
+    # without needing a separate reversal pass.
+    for version in "${ALL_OMNIA_VERSIONS[@]}"; do
+        [ "$version" = "$current_version" ] && break
+        rollback_candidates=("$version" "${rollback_candidates[@]}")
+    done
+    
+    # NOTE: a current version missing from the list never triggers the break,
+    # so every known version is emitted as a candidate.
+    #
+    # With no candidates, printf still emits a single empty line, so callers
+    # should ignore blank entries.
+    
+    printf '%s\n' "${rollback_candidates[@]}"
+}
+
+# Roll back to a version that shares the current container tag (metadata rewrite + service restart)
+rollback_same_tag() {
+    local target_version="$1"
+    local current_version="$2"
+    
+    echo "[INFO] [ROLLBACK] Phase: Same-Tag Rollback"
+    echo "[INFO] [ROLLBACK] Rolling back to $target_version within same container tag"
+    
+    # The metadata file is edited through podman exec, so the container must be running
+    if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
+        echo "[ERROR] [ROLLBACK] Container is not running for same-tag rollback"
+        return 1
+    fi
+    
+    echo "[INFO] [ROLLBACK] Updating metadata to version $target_version"
+    
+    # Replace the omnia_version entry in the metadata file, or append it when absent
+    if ! podman exec -u root omnia_core bash -c "
+        set -e
+        if [ ! -f '$CONTAINER_METADATA_FILE' ]; then
+            echo '[ERROR] Metadata file not found inside container: $CONTAINER_METADATA_FILE' >&2
+            exit 1
+        fi
+        if grep -q '^omnia_version:' '$CONTAINER_METADATA_FILE'; then
+            sed -i 's/^omnia_version:.*/omnia_version: $target_version/' '$CONTAINER_METADATA_FILE'
+        else
+            echo 'omnia_version: $target_version' >> '$CONTAINER_METADATA_FILE'
+        fi
+    "; then
+        echo "[ERROR] [ROLLBACK] Failed to update metadata version"
+        echo "[ERROR] [ROLLBACK] Rollback failed: Could not update version metadata"
+        return 1
+    fi
+    
+    echo "[INFO] [ROLLBACK] Restarting container to apply changes..."
+    
+    # Restart container to apply changes
+    if ! systemctl restart omnia_core.service; then
+        echo "[ERROR] [ROLLBACK] Failed to restart container service"
+        echo "[ERROR] [ROLLBACK] Rollback failed: Container restart failed"
+        return 1
+    fi
+    
+    # Poll once per second until podman reports omnia_core as "Up" (the health status itself is not inspected)
+    echo "[INFO] [ROLLBACK] Waiting for container health check after restart (30s)"
+    local health_timeout=30
+    local health_count=0
+    
+    while [ $health_count -lt $health_timeout ]; do
+        if podman ps --format '{{.Names}} {{.Status}}' | grep -qE '^omnia_core[[:space:]]+Up'; then
+            echo "[INFO] [ROLLBACK] Container is healthy after restart"
+            break
+        fi
+        sleep 1
+        health_count=$((health_count + 1))
+        echo -n "."
+    done
+    
+    if [ $health_count -ge $health_timeout ]; then
+        echo ""
+        echo "[ERROR] [ROLLBACK] Container failed to become healthy within 30 seconds after restart"
+        echo "[ERROR] [ROLLBACK] Rollback failed: Container health check failed"
+        return 1
+    fi
+    
+    # Re-read the version from the restarted container to confirm the metadata edit persisted
+    local updated_version=$(get_current_omnia_version)
+    if [ "$updated_version" != "$target_version" ]; then
+        echo "[ERROR] [ROLLBACK] Version update verification failed"
+        echo "[ERROR] [ROLLBACK] Expected: $target_version, Found: $updated_version"
+        return 1
+    fi
+    
+    echo "[INFO] [ROLLBACK] Same-tag rollback completed successfully"
+    echo "[INFO] [ROLLBACK] Version rolled back to: $target_version"
+    return 0
+}
+
+# Return 0 if omnia_core:<tag> exists locally; otherwise print build instructions and return 1
+validate_container_image() {
+    local target_version="${1:-<omnia version>}"  # placeholder keeps the printed commands readable when no version is given
+    local target_container_tag="$2"
+    local operation="${3:-upgrade}"
+    
+    echo -e "${BLUE}Validating target container image: omnia_core:$target_container_tag${NC}"
+    if ! podman inspect "omnia_core:$target_container_tag" >/dev/null 2>&1; then
+        echo -e "${RED}ERROR: Target image missing locally: omnia_core:$target_container_tag${NC}"
+        echo -e "${YELLOW}Omnia does not pull images from Docker Hub. Build/load the image locally and retry.${NC}"
+        echo -e "1. Clone the Omnia Artifactory repository:"
+        echo -e "   git clone https://github.com/dell/omnia-artifactory -b omnia-container-$target_version"
+        echo -e "2. Navigate to the repository directory:"
+        echo -e "   cd omnia-artifactory"
+        echo -e "3. Build the core image locally (loads into local Podman by default):"
+        echo -e "   ./build_images.sh core core_tag=$target_container_tag omnia_branch=$target_version"
+        echo -e "Then re-run:"
+        echo -e "   ./omnia.sh --$operation"
+        return 1
+    fi
+    
+    echo -e "${GREEN}✓ Target image available locally: omnia_core:$target_container_tag${NC}"
+    return 0
+}
+
+# Map an Omnia version to its core container tag: 2.0.x uses tag 1.0, anything else uses <major>.<minor>
+get_container_tag_from_version() {
+    local version="$1"
+    case "$version" in
+        2.0.*)
+            echo "1.0"
+            ;;
+        *)
+            echo "$version" | awk -F. '{print $1"."$2}'
+            ;;
+    esac
+}
+
+# Print omnia_version from the container metadata file, or an empty line when omnia_core is not running
+get_current_omnia_version() {
+    if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
+        podman exec omnia_core cat "$CONTAINER_METADATA_FILE" 2>/dev/null | grep -m1 '^omnia_version:' | cut -d':' -f2- | tr -d ' \t\r"'
+    else
+        echo ""
+    fi
+}
+
+show_post_upgrade_instructions() {
+    local upgraded_version="$1"
+    # Print a banner with the upgraded version and the playbook command the operator must run next
+    echo ""
+    echo -e "${YELLOW}================================================================================${NC}"
+    echo -e "${YELLOW}                    IMPORTANT POST-UPGRADE STEP${NC}"
+    echo -e "${YELLOW}================================================================================${NC}"
+    echo ""
+    echo -e "${GREEN}✓ Omnia core container has been successfully upgraded${NC}"
+    echo -e "${GREEN}✓ Version updated to: $upgraded_version${NC}"
+    echo ""
+    echo -e "${BLUE}NEXT REQUIRED ACTION:${NC}"
+    echo -e "${YELLOW}You must now run the upgrade playbook inside the omnia_core container:${NC}"
+    echo ""
+    echo -e "${GREEN}podman exec -it omnia_core ansible-playbook /omnia/upgrade/upgrade_omnia.yml${NC}"
+    echo ""
+    echo -e "${BLUE}This playbook will:${NC}"
+    echo -e "• Update input files"
+    echo -e "• Update internal configurations"
+    echo ""
+    echo -e "${YELLOW}Note: Run this command after the container is fully healthy and stable${NC}"
+    echo -e "${YELLOW}================================================================================${NC}"
+    echo ""
+}
+
 # Host-side paths (initialized dynamically after omnia_path is set)
 OMNIA_INPUT_DIR=""
 OMNIA_METADATA_DIR=""
@@ -1004,29 +1219,9 @@ install_omnia_core() {
     local omnia_core_tag="2.1"
     local omnia_core_registry=""
     
-    # Check if local omnia_core:2.1 exists
-    if podman inspect omnia_core:${omnia_core_tag} >/dev/null 2>&1; then
+    # Abort the install when the image is missing; validate_container_image has already printed the build steps
+    if validate_container_image "" "$omnia_core_tag" "install" || exit 1; then
         echo -e "${GREEN}✓ Omnia core image (omnia_core:${omnia_core_tag}) found locally.${NC}"
-    # Check if latest exists for backward compatibility
-    elif podman inspect omnia_core:latest >/dev/null 2>&1; then
-        echo -e "${GREEN}✓ Omnia core image (omnia_core:latest) found locally.${NC}"
-        # Tag it as 2.1 for consistency
-        podman tag omnia_core:latest omnia_core:${omnia_core_tag}
-    else
-        echo -e "${RED}ERROR: Omnia core image (omnia_core:${omnia_core_tag}) not found locally.${NC}"
-        echo -e "${YELLOW}Omnia no longer pulls images from Docker Hub. Build/load the image locally and retry.${NC}"
-        echo ""
-        echo -e "${YELLOW}One way to build the image locally:${NC}"
-        echo -e "1. Clone the Omnia Artifactory repository:"
-        echo -e "   git clone https://github.com/dell/omnia-artifactory -b omnia-container-<omnia version>"
-        echo -e "2. Navigate to the repository directory:"
-        echo -e "   cd omnia-artifactory"
-        echo -e "3. Build the core image locally (loads into local Podman by default):"
-        echo -e "   ./build_images.sh core core_tag=2.1 omnia_branch=<omnia version/branch_name>"
-        echo ""
-        echo -e "${YELLOW}Then re-run:${NC}"
-        echo -e "   ./omnia.sh --install"
-        exit 1
     fi
 
     # Check if any other containers with 'omnia' in their name are running
@@ -1148,9 +1343,6 @@ install_omnia_core() {
 
     # If core container is not present
     else
-
-        # Start the container setup
-        echo -e "${GREEN}Starting Omnia core container setup.${NC}"
         setup_omnia_core
     fi
 }
@@ -1216,16 +1408,6 @@ phase1_validate() {
         return 1
     fi
 
-    if [ "$previous_omnia_version" = "2.1.0.0" ]; then
-        echo "[ERROR] [ORCHESTRATOR] Upgrade already performed. Current Omnia version is 2.1.0.0. No further upgrade required."
-        return 1
-    fi
-
-    if [ "$previous_omnia_version" != "2.0.0.0" ]; then
-        echo "[ERROR] [ORCHESTRATOR] Previous Omnia version mismatch: expected 2.0.0.0, got: $previous_omnia_version"
-        return 1
-    fi
-
     shared_path=$(echo "$core_config" | grep "^oim_shared_path:" | cut -d':' -f2- | tr -d ' \t\n\r')
     if [ -z "$shared_path" ]; then
         echo "[ERROR] [ORCHESTRATOR] oim_shared_path not found in oim_metadata.yml"
@@ -1244,28 +1426,6 @@ phase1_validate() {
         return 1
     fi
 
-    current_image=$(podman inspect omnia_core --format '{{.ImageName}}' 2>/dev/null)
-    if [ -z "$current_image" ]; then
-        echo "[ERROR] [ORCHESTRATOR] Unable to inspect omnia_core container image"
-        return 1
-    fi
-
-    if ! podman inspect "omnia_core:2.1" >/dev/null 2>&1; then
-        echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:2.1"
-        echo ""
-        echo -e "${YELLOW}Omnia does not pull images from Docker Hub. Build/load the image locally and retry.${NC}"
-        echo ""
-        echo -e "${YELLOW}To build the core image locally:${NC}"
-        echo -e "1. Clone the Omnia Artifactory repository:"
-        echo -e "   git clone https://github.com/dell/omnia-artifactory -b omnia-container-<omnia version>"
-        echo -e "2. Navigate to the repository directory:"
-        echo -e "   cd omnia-artifactory"
-        echo -e "3. Build the core image locally (loads into local Podman by default):"
-        echo -e "   ./build_images.sh core core_tag=2.1 omnia_branch=<omnia version/branch_name>"
-        echo ""
-        return 1
-    fi
-
     echo "[INFO] [ORCHESTRATOR] Phase 1: Validation passed"
     return 0
 }
@@ -1277,13 +1437,18 @@ phase2_approval() {
     echo "============================================"
     echo "OMNIA UPGRADE SUMMARY"
     echo "============================================"
-    echo "Current Container Tag: 1.0"
-    echo "Target Container Tag:  2.1"
-    echo "Current Omnia Release: 2.0.0.0"
-    echo "Target Omnia Release:  2.1.0.0"
-    echo "New Features:"
-    echo "  - Add and remove node for slurm cluster"
-    echo "  - Additional Package Installation"
+    echo "Current Container Tag: $OMNIA_CORE_CONTAINER_TAG"
+    echo "Target Container Tag:  $TARGET_CONTAINER_TAG"
+    echo "Current Omnia Release: $OMNIA_VERSION"
+    echo "Target Omnia Release:  $TARGET_OMNIA_VERSION"
+    
+    # Show upgrade type
+    if [ "$OMNIA_CORE_CONTAINER_TAG" = "$TARGET_CONTAINER_TAG" ]; then
+        echo "Upgrade Type: Same-tag upgrade (container restart)"
+    else
+        echo "Upgrade Type: Cross-tag upgrade (container swap)"
+    fi
+    
     echo "============================================"
 
     current_omnia_version=$(podman exec -u root omnia_core /bin/bash -c "grep '^omnia_version:' '$CONTAINER_METADATA_FILE' | cut -d':' -f2 | tr -d ' \t\n\r'" 2>/dev/null)
@@ -1367,6 +1532,85 @@ phase3_backup_creation() {
     return 0
 }
 
+phase4_same_tag_upgrade() {
+    local target_version="$1"
+    # Same-tag upgrade: rewrite omnia_version in the metadata file, restart the service, then verify
+    echo "[INFO] [ORCHESTRATOR] Phase 4: Same-Tag Upgrade"
+    echo "[INFO] [ORCHESTRATOR] Upgrading to $target_version within same container tag"
+    
+    # The metadata file is edited through podman exec, so the container must be running
+    if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
+        echo "[ERROR] [ORCHESTRATOR] Container is not running for same-tag upgrade"
+        return 1
+    fi
+    
+    echo "[INFO] [ORCHESTRATOR] Updating metadata to version $target_version"
+    
+    # Replace the omnia_version entry in the metadata file, or append it when absent
+    if ! podman exec -u root omnia_core bash -c "
+        set -e
+        if [ ! -f '$CONTAINER_METADATA_FILE' ]; then
+            echo '[ERROR] Metadata file not found inside container: $CONTAINER_METADATA_FILE' >&2
+            exit 1
+        fi
+        if grep -q '^omnia_version:' '$CONTAINER_METADATA_FILE'; then
+            sed -i 's/^omnia_version:.*/omnia_version: $target_version/' '$CONTAINER_METADATA_FILE'
+        else
+            echo 'omnia_version: $target_version' >> '$CONTAINER_METADATA_FILE'
+        fi
+    "; then
+        echo "[ERROR] [ORCHESTRATOR] Failed to update metadata version"
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update version metadata"
+        return 1
+    fi
+    
+    echo "[INFO] [ORCHESTRATOR] Restarting container to apply changes..."
+    
+    # Restart container to apply changes
+    if ! systemctl restart omnia_core.service; then
+        echo "[ERROR] [ORCHESTRATOR] Failed to restart container service"
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Container restart failed"
+        return 1
+    fi
+    
+    # Poll once per second until podman reports omnia_core as "Up" (the health status itself is not inspected)
+    echo "[INFO] [ORCHESTRATOR] Waiting for container health check after restart (30s)"
+    local health_timeout=30
+    local health_count=0
+    
+    while [ $health_count -lt $health_timeout ]; do
+        if podman ps --format '{{.Names}} {{.Status}}' | grep -qE '^omnia_core[[:space:]]+Up'; then
+            echo "[INFO] [ORCHESTRATOR] Container is healthy after restart"
+            break
+        fi
+        sleep 1
+        health_count=$((health_count + 1))
+        echo -n "."
+    done
+    
+    if [ $health_count -ge $health_timeout ]; then
+        echo ""
+        echo "[ERROR] [ORCHESTRATOR] Container failed to become healthy within 30 seconds after restart"
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Container health check failed"
+        return 1
+    fi
+    
+    # Re-read the version from the restarted container to confirm the metadata edit persisted
+    local updated_version=$(get_current_omnia_version)
+    if [ "$updated_version" != "$target_version" ]; then
+        echo "[ERROR] [ORCHESTRATOR] Version update verification failed"
+        echo "[ERROR] [ORCHESTRATOR] Expected: $target_version, Found: $updated_version"
+        return 1
+    fi
+    
+    echo "[INFO] [ORCHESTRATOR] Same-tag upgrade completed successfully"
+    echo "[INFO] [ORCHESTRATOR] Version updated to: $target_version"
+
+    # Post-upgrade instructions are printed once by upgrade_omnia_core after this function returns
+    
+    return 0
+}
+
 phase4_container_swap() {
     local quadlet_file="/etc/containers/systemd/omnia_core.container"
     local i
@@ -1376,12 +1620,12 @@ phase4_container_swap() {
     if [ ! -f "$quadlet_file" ]; then
         echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Quadlet file not found: $quadlet_file"
         echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Quadlet configuration file missing"
-        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..."
+        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..."
         rollback_omnia_core
         return 1
     fi
 
-    echo "[INFO] [ORCHESTRATOR] Stopping omnia_core 1.0 container"
+    echo "[INFO] [ORCHESTRATOR] Stopping omnia_core $OMNIA_CORE_CONTAINER_TAG container"
     systemctl stop omnia_core.service >/dev/null 2>&1 || true
 
     if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
@@ -1391,25 +1635,25 @@ phase4_container_swap() {
 
     if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
         echo "[ERROR] [ORCHESTRATOR] Failed to stop omnia_core container"
-        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not stop 1.0 container"
-        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..."
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not stop $OMNIA_CORE_CONTAINER_TAG container"
+        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..."
         rollback_omnia_core
         return 1
     fi
 
-    echo "[INFO] [ORCHESTRATOR] Starting omnia_core 2.1 Quadlet unit"
-    if ! podman inspect "omnia_core:2.1" >/dev/null 2>&1; then
-        echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:2.1"
-        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 2.1 image not available"
-        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..."
+    echo "[INFO] [ORCHESTRATOR] Starting omnia_core $TARGET_CONTAINER_TAG Quadlet unit"
+    if ! podman inspect "omnia_core:$TARGET_CONTAINER_TAG" >/dev/null 2>&1; then
+        echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:$TARGET_CONTAINER_TAG"
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: $TARGET_CONTAINER_TAG image not available"
+        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..."
         rollback_omnia_core
         return 1
     fi
 
-    if ! sed -i 's/^Image=omnia_core:.*/Image=omnia_core:2.1/' "$quadlet_file"; then
-        echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to 2.1 in quadlet file"
+    if ! sed -i "s/^Image=omnia_core:.*/Image=omnia_core:$TARGET_CONTAINER_TAG/" "$quadlet_file"; then
+        echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to $TARGET_CONTAINER_TAG in quadlet file"
         echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update container image tag"
-        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..."
+        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..."
         rollback_omnia_core
         return 1
     fi
@@ -1417,20 +1661,20 @@ phase4_container_swap() {
     systemctl daemon-reload || {
         echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: systemctl daemon-reload failed"
         echo "[ERROR] [ORCHESTRATOR] Upgrade failed: System daemon reload failed"
-        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..."
+        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..."
         rollback_omnia_core
         return 1
     }
 
     systemctl start omnia_core.service || {
         echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to start omnia_core.service"
-        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not start 2.1 container"
-        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..."
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not start $TARGET_CONTAINER_TAG container"
+        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..."
         rollback_omnia_core
         return 1
     }
 
-    echo "[INFO] [ORCHESTRATOR] Waiting for omnia_core 2.1 health check (60s)"
+    echo "[INFO] [ORCHESTRATOR] Waiting for omnia_core $TARGET_CONTAINER_TAG health check (60s)"
     for i in $(seq 1 60); do
         if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
             break
@@ -1440,13 +1684,13 @@ phase4_container_swap() {
 
     if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
         echo "[ERROR] [ORCHESTRATOR] Phase 4.4 failed: Container failed health check after swap"
-        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 2.1 container failed health check"
-        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..."
+        echo "[ERROR] [ORCHESTRATOR] Upgrade failed: $TARGET_CONTAINER_TAG container failed health check"
+        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..."
         rollback_omnia_core
         return 1
     fi
 
-    echo "[INFO] [ORCHESTRATOR] Updating metadata omnia_version to 2.1.0.0"
+    echo "[INFO] [ORCHESTRATOR] Updating metadata omnia_version to $TARGET_OMNIA_VERSION"
     if ! podman exec -u root omnia_core bash -c "
         set -e
         if [ ! -f '$CONTAINER_METADATA_FILE' ]; then
@@ -1454,14 +1698,14 @@ phase4_container_swap() {
             exit 1
         fi
         if grep -q '^omnia_version:' '$CONTAINER_METADATA_FILE'; then
-            sed -i 's/^omnia_version:.*/omnia_version: 2.1.0.0/' '$CONTAINER_METADATA_FILE'
+            sed -i 's/^omnia_version:.*/omnia_version: $TARGET_OMNIA_VERSION/' '$CONTAINER_METADATA_FILE'
         else
-            echo 'omnia_version: 2.1.0.0' >> '$CONTAINER_METADATA_FILE'
+            echo 'omnia_version: $TARGET_OMNIA_VERSION' >> '$CONTAINER_METADATA_FILE'
         fi
     "; then
         echo "[ERROR] [ORCHESTRATOR] Phase 4.5 failed: Failed to update metadata version"
         echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update version metadata"
-        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..."
+        echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..."
         rollback_omnia_core
         return 1
     fi
@@ -1471,21 +1715,129 @@ phase4_container_swap() {
 }
 
 upgrade_omnia_core() {
-    local lock_file="/var/lock/omnia_core_upgrade.lock"
-    local backup_base
-
-    if [ -e "$lock_file" ]; then
-        echo -e "${RED}ERROR: Upgrade lock exists at $lock_file. Another upgrade may be running.${NC}"
+    echo -e "${BLUE}=================== Omnia Core Upgrade ====================${NC}"
+    echo -e "${BLUE}This script will upgrade Omnia core container.${NC}"
+    echo -e "${BLUE}Current version will be backed up and upgraded to target version.${NC}"
+    echo -e "${BLUE}=============================================================${NC}"
+    
+    # Read current version
+    OMNIA_VERSION=$(get_current_omnia_version)
+    if [ -z "$OMNIA_VERSION" ]; then
+        echo -e "${RED}ERROR: Could not determine current Omnia version${NC}"
+        echo -e "${YELLOW}Please ensure omnia_core container is running and metadata is accessible${NC}"
         exit 1
     fi
-
-    mkdir -p "$(dirname "$lock_file")" 2>/dev/null || true
-    echo "$$" > "$lock_file" || {
-        echo -e "${RED}ERROR: Failed to create lock file: $lock_file${NC}"
+    
+    # Get current container tag
+    OMNIA_CORE_CONTAINER_TAG=$(get_container_tag_from_version "$OMNIA_VERSION")
+    
+    echo -e "${GREEN}Current Omnia version: $OMNIA_VERSION${NC}"
+    echo -e "${GREEN}Current container tag: $OMNIA_CORE_CONTAINER_TAG${NC}"
+    
+    # Show available upgrade options
+    echo ""
+    echo "Available upgrade options:"
+    echo "========================="
+    
+    # Get available upgrade versions dynamically
+    local upgrade_output
+    upgrade_output=$(get_available_upgrade_versions "$OMNIA_VERSION")
+    
+    # Parse output into versions and descriptions
+    local available_versions=()
+    local version_descriptions=()
+    local line_count=0
+    local total_lines
+    
+    # Count total lines
+    total_lines=$(echo "$upgrade_output" | wc -l)
+    
+    # Split into versions and descriptions (first half = versions, second half = descriptions)
+    local mid_line=$((total_lines / 2))
+    local line_num=0
+    
+    while IFS= read -r line; do
+        line_num=$((line_num + 1))
+        if [ $line_num -le $mid_line ]; then
+            available_versions+=("$line")
+        else
+            version_descriptions+=("$line")
+        fi
+    done <<< "$upgrade_output"
+    
+    # Check if any upgrade options are available
+    if [ ${#available_versions[@]} -eq 0 ]; then
+        echo -e "${GREEN}Already at latest version $OMNIA_VERSION${NC}"
+        echo "No upgrade options available."
+        exit 0
+    fi
+    
+    # Display upgrade options
+    for i in "${!available_versions[@]}"; do
+        local target_version="${available_versions[$i]}"
+        local target_container_tag=$(get_container_tag_from_version "$target_version")
+        
+        # Check if target image exists locally
+        local image_status="✓ Available"
+        if ! podman inspect "omnia_core:$target_container_tag" >/dev/null 2>&1; then
+            image_status="✗ Missing (build required)"
+        fi
+        
+        echo "$((i+1)). Upgrade to $target_version (container tag: $target_container_tag) [$image_status]"
+    done
+    
+    # Prompt user to select upgrade version
+    echo -n "Select upgrade option (1-${#available_versions[@]}) or press Enter to cancel: "
+    read -r selection
+    
+    # Validate selection
+    if [ -z "$selection" ]; then
+        echo "Upgrade cancelled by user."
+        exit 0
+    fi
+    
+    if ! [[ "$selection" =~ ^[0-9]+$ ]] || [ "$selection" -lt 1 ] || [ "$selection" -gt ${#available_versions[@]} ]; then
+        echo -e "${RED}ERROR: Invalid selection.${NC}"
         exit 1
-    }
+    fi
+    
+    # Set target version based on user selection
+    TARGET_OMNIA_VERSION="${available_versions[$((selection-1))]}"
+    TARGET_CONTAINER_TAG=$(get_container_tag_from_version "$TARGET_OMNIA_VERSION")
+    
+    # Pre-validation: Check if target container image exists locally
+    if ! validate_container_image "$TARGET_OMNIA_VERSION" "$TARGET_CONTAINER_TAG" "upgrade"; then
+        exit 1
+    fi
+    
+    echo -e "${GREEN}Target Omnia version: $TARGET_OMNIA_VERSION${NC}"
+    echo -e "${GREEN}Target container tag: $TARGET_CONTAINER_TAG${NC}"
+    
+    # Check if container tag change is needed
+    if [ "$OMNIA_CORE_CONTAINER_TAG" = "$TARGET_CONTAINER_TAG" ]; then
+        echo -e "${BLUE}Upgrade within same container tag ($TARGET_CONTAINER_TAG)${NC}"
+        echo -e "${BLUE}Will restart container instead of swapping${NC}"
+        SAME_TAG_UPGRADE=true
+    else
+        echo -e "${BLUE}Container tag change required ($OMNIA_CORE_CONTAINER_TAG -> $TARGET_CONTAINER_TAG)${NC}"
+        echo -e "${BLUE}Will perform full container swap${NC}"
+        SAME_TAG_UPGRADE=false
+    fi
+    
+    # Pre-validation: Check if target container image exists locally
+    if ! validate_container_image "$TARGET_OMNIA_VERSION" "$TARGET_CONTAINER_TAG" "upgrade"; then
+        exit 1
+    fi
+    local lock_file="/tmp/omnia_upgrade.lock"
+    if [ -f "$lock_file" ]; then
+        echo -e "${RED}ERROR: Another upgrade process is already running${NC}"
+        echo -e "${YELLOW}If this is incorrect, remove the lock file: rm -f $lock_file${NC}"
+        exit 1
+    fi
+    touch "$lock_file"
     trap 'rm -f "$lock_file"' EXIT
 
+    # Run upgrade phases
     if ! phase1_validate; then
         echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 1"
         exit 1
@@ -1495,7 +1847,7 @@ upgrade_omnia_core() {
         exit 0
     fi
 
-    backup_base="$OMNIA_UPGRADE_BACKUP_PATH"
+    local backup_base="$OMNIA_UPGRADE_BACKUP_PATH"
     if [ -z "$backup_base" ]; then
         echo "[ERROR] [ORCHESTRATOR] Backup path is empty"
         exit 1
@@ -1506,13 +1858,26 @@ upgrade_omnia_core() {
         exit 1
     fi
 
-    if ! phase4_container_swap; then
-        echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 4"
-        exit 1
+    # Choose upgrade path based on container tag
+    if [ "$SAME_TAG_UPGRADE" = "true" ]; then
+        if ! phase4_same_tag_upgrade "$TARGET_OMNIA_VERSION"; then
+            echo "[ERROR] [ORCHESTRATOR] Upgrade failed in same-tag upgrade"
+            exit 1
+        fi
+    else
+        if ! phase4_container_swap; then
+            echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 4"
+            exit 1
+        fi
     fi
 
     echo "[INFO] [ORCHESTRATOR] Upgrade completed successfully"
     echo "[INFO] [ORCHESTRATOR] Backup location (inside omnia_core container): $backup_base"
+
+    show_post_upgrade_instructions "$TARGET_OMNIA_VERSION"
+    
+    # Initialize SSH config and start container session
+    init_ssh_config
     start_container_session
     exit 0
 }
@@ -1622,16 +1987,31 @@ restore_from_backup() {
 display_cleanup_instructions() {
     echo ""
     echo -e "${RED}================================================================================${NC}"
-    echo -e "${RED}                    ROLLBACK FAILED${NC}"
+    echo -e "${RED}                    UPGRADE/ROLLBACK FAILED${NC}"
     echo -e "${RED}================================================================================${NC}"
     echo ""
-    echo -e "${YELLOW}Rollback failed. Manual cleanup is required to restore a clean state before retrying.${NC}"
+    echo -e "${YELLOW}Operation failed. Manual cleanup is required to restore a clean state before retrying.${NC}"
+    echo ""
+    echo -e "${BLUE}Choose the appropriate cleanup scenario:${NC}"
+    echo ""
+    echo -e "${GREEN}CASE 1: If you can log into omnia_core container:${NC}"
+    echo -e "${YELLOW}1. Enter omnia_core container: podman exec -it omnia_core bash${NC}"
+    echo -e "${YELLOW}2. Run oim cleanup: ansible-playbook /omnia/oim_cleanup.yml${NC}"
+    echo -e "${YELLOW}3. Run uninstall inside container: ./omnia.sh --uninstall${NC}"
+    echo -e "${YELLOW}4. Exit container: exit${NC}"
+    echo -e "${YELLOW}5. Clean shared path: rm -rf <omnia_shared_path>${NC}"
+    echo -e "${YELLOW}6. Install required version: ./omnia.sh --install${NC}"
     echo ""
-    echo -e "${YELLOW}Run the following on the OIM host:${NC}"
-    echo -e "${YELLOW}1. Clean Omnia shared path: rm -rf <shared_path>${NC}"
-    echo -e "${YELLOW}2. Stop Omnia core system service: systemctl stop omnia_core${NC}"
-    echo -e "${YELLOW}3. Remove the Omnia core container: podman rm -f omnia_core${NC}"
-    echo -e "${YELLOW}4. Perform a fresh Omnia core install: ./omnia.sh --install${NC}"
+    echo -e "${GREEN}CASE 2: If you cannot log into omnia_core container (but other containers are running):${NC}"
+    echo -e "${YELLOW}1. Remove all container definitions: cd /etc/containers/systemd${NC}"
+    echo -e "${YELLOW}2. Delete all container files: rm -rf *${NC}"
+    echo -e "${YELLOW}3. Reload systemd daemon: systemctl daemon-reload${NC}"
+    echo -e "${YELLOW}4. Stop all containers: podman stop \$(podman ps -aq)${NC}"
+    echo -e "${YELLOW}5. Remove all containers: podman rm -f \$(podman ps -aq)${NC}"
+    echo -e "${YELLOW}6. Clean shared path: rm -rf <omnia_shared_path>${NC}"
+    echo -e "${YELLOW}7. Install required version: ./omnia.sh --install${NC}"
+    echo ""
+    echo -e "${BLUE}Note: Replace <omnia_shared_path> with your actual Omnia shared path.${NC}"
     echo ""
 }
 
@@ -1652,6 +2032,27 @@ rollback_omnia_core() {
         exit 1
     fi
     
+    # Create lock file to prevent concurrent rollbacks
+    local lock_file="/tmp/omnia_rollback.lock"
+    if [ -f "$lock_file" ]; then
+        local existing_pid
+        existing_pid=$(cat "$lock_file" 2>/dev/null | tr -d ' \t\n\r')
+
+        if [ -n "$existing_pid" ] && kill -0 "$existing_pid" >/dev/null 2>&1; then
+            echo -e "${RED}ERROR: Another rollback process is already running (PID: $existing_pid)${NC}"
+            echo -e "${YELLOW}If this is incorrect, remove the lock file: rm -f $lock_file${NC}"
+            exit 1
+        fi
+
+        if [ -n "$existing_pid" ]; then
+            echo -e "${YELLOW}[WARN] Stale rollback lock file found (PID: $existing_pid); removing: $lock_file${NC}"
+        fi
+        rm -f "$lock_file" >/dev/null 2>&1 || true
+    fi
+
+    echo "$$" > "$lock_file"
+    trap 'rm -f "$lock_file"' EXIT INT TERM
+    
     # Get current version
     if ! podman exec -u root omnia_core test -f "/opt/omnia/.data/oim_metadata.yml"; then
         echo -e "${RED}ERROR: Metadata file not found: /opt/omnia/.data/oim_metadata.yml${NC}"
@@ -1659,48 +2060,56 @@ rollback_omnia_core() {
     fi
     
     local current_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r')
-    if [ "$current_version" != "2.1.0.0" ]; then
-        echo -e "${RED}ERROR: Cannot rollback from version $current_version. Rollback is only supported from version 2.1.0.0.${NC}"
-        exit 1
-    fi
     
-    # List available backups
-    echo "[INFO] [ROLLBACK] Scanning for available backups..."
-    local backup_dirs=()
+    # Get available rollback versions dynamically
+    local rollback_versions
+    rollback_versions=$(get_available_rollback_versions "$current_version")
+    
+    # Convert to array
+    local available_versions=()
     while IFS= read -r line; do
-        backup_dirs+=("$line")
-    done < <(podman exec -u root omnia_core find /opt/omnia/backups/upgrade -maxdepth 1 -type d -name "version_*" 2>/dev/null | sort -r)
+        available_versions+=("$line")
+    done <<< "$rollback_versions"
     
-    if [ ${#backup_dirs[@]} -eq 0 ]; then
-        echo -e "${RED}ERROR: No backup directories found.${NC}"
+    # Check if any rollback options are available
+    if [ ${#available_versions[@]} -eq 0 ]; then
+        echo -e "${RED}ERROR: No rollback versions available from $current_version.${NC}"
         exit 1
     fi
     
     echo ""
-    echo "Available backup versions:"
-    for i in "${!backup_dirs[@]}"; do
-        local version=$(basename "${backup_dirs[$i]}" | sed 's/version_//')
-        local backup_date=$(podman exec -u root omnia_core stat -c '%y' "${backup_dirs[$i]}" 2>/dev/null | cut -d' ' -f1,2 | cut -d'.' -f1)
-        echo "  $((i+1)). Version $version (created: $backup_date)"
+    echo "Available rollback versions:"
+    echo "==========================="
+    for i in "${!available_versions[@]}"; do
+        local version="${available_versions[$i]}"
+        local container_tag=$(get_container_tag_from_version "$version")
+        
+        # Check if target image exists locally
+        local image_status="✓ Available"
+        if ! podman inspect "omnia_core:$container_tag" >/dev/null 2>&1; then
+            image_status="✗ Missing (build required)"
+        fi
+        
+        echo "  $((i+1)). Rollback to version $version (container tag: $container_tag) [$image_status]"
     done
     
-    # Prompt for backup selection
+    # Prompt for rollback selection
     echo ""
-    echo -n "Select backup to restore from (1-${#backup_dirs[@]}): "
+    echo -n "Select rollback version (1-${#available_versions[@]}): "
     read -r selection
     
     # Validate selection
-    if ! [[ "$selection" =~ ^[0-9]+$ ]] || [ "$selection" -lt 1 ] || [ "$selection" -gt ${#backup_dirs[@]} ]; then
+    if ! [[ "$selection" =~ ^[0-9]+$ ]] || [ "$selection" -lt 1 ] || [ "$selection" -gt ${#available_versions[@]} ]; then
         echo -e "${RED}ERROR: Invalid selection.${NC}"
         exit 1
     fi
     
-    local selected_backup="${backup_dirs[$((selection-1))]}"
-    local backup_version=$(basename "$selected_backup" | sed 's/version_//')
+    local selected_version="${available_versions[$((selection-1))]}"
+    local selected_container_tag=$(get_container_tag_from_version "$selected_version")
     
     echo ""
-    echo "Selected backup: Version $backup_version"
-    echo -n "Are you sure you want to rollback to version $backup_version? [y/N]: "
+    echo "Selected rollback: Version $selected_version"
+    echo -n "Are you sure you want to rollback to version $selected_version? [y/N]: "
     read -r confirm
     
     if [[ ! "$confirm" =~ ^[yY] ]]; then
@@ -1708,50 +2117,99 @@ rollback_omnia_core() {
         exit 0
     fi
     
-    # Validate selected backup - only check if directory exists without podman exec
-    if ! podman exec -u root omnia_core test -d "$selected_backup" 2>/dev/null; then
-        # Try to check on host if container check fails
-        # Get shared path from metadata to check on host
-        local shared_path=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r')
-        local host_backup_path="${selected_backup#/opt/omnia}"
-        if [ -z "$shared_path" ] || [ ! -d "$shared_path$host_backup_path" ]; then
-            echo -e "${RED}ERROR: Backup directory does not exist: $selected_backup${NC}"
+    # Pre-validation: Check if target container image exists locally
+    if ! validate_container_image "$selected_version" "$selected_container_tag" "rollback"; then
+        exit 1
+    fi
+    
+    # Check if container tag change is needed
+    local current_container_tag=$(get_container_tag_from_version "$current_version")
+    if [ "$current_container_tag" = "$selected_container_tag" ]; then
+        echo -e "${BLUE}Rollback within same container tag ($selected_container_tag)${NC}"
+        echo -e "${BLUE}Will restart container instead of swapping${NC}"
+        
+        # Perform same-tag rollback (container restart only)
+        if ! rollback_same_tag "$selected_version" "$current_version"; then
+            echo "[ERROR] [ROLLBACK] Rollback failed in same-tag rollback"
             exit 1
         fi
+        
+        echo "[INFO] [ROLLBACK] Rollback completed successfully"
+        echo "[INFO] [ROLLBACK] Version rolled back to: $selected_version"
+        exit 0
+    else
+        echo -e "${BLUE}Container tag change required ($current_container_tag -> $selected_container_tag)${NC}"
+        echo -e "${BLUE}Will perform full container swap${NC}"
+        # Continue with existing container swap logic
+    fi
+    
+    # List available backups for selected version
+    echo "[INFO] [ROLLBACK] Scanning for available backups for version $selected_version..."
+    local backup_dirs=()
+    while IFS= read -r line; do
+        backup_dirs+=("$line")
+    done < <(podman exec -u root omnia_core find /opt/omnia/backups/upgrade -maxdepth 1 -type d -name "version_${selected_version}*" 2>/dev/null | sort -r)
+    
+    if [ ${#backup_dirs[@]} -eq 0 ]; then
+        echo -e "${RED}ERROR: No backup directories found for version $selected_version.${NC}"
+        exit 1
+    fi
+    
+    echo ""
+    echo "Available backups for version $selected_version:"
+    for i in "${!backup_dirs[@]}"; do
+        local backup_path="${backup_dirs[$i]}"
+        local backup_date=$(podman exec -u root omnia_core stat -c '%y' "$backup_path" 2>/dev/null | cut -d' ' -f1,2 | cut -d'.' -f1)
+        echo "  $((i+1)). Backup created: $backup_date"
+    done
+    
+    # Prompt for backup selection
+    echo ""
+    echo -n "Select backup to restore from (1-${#backup_dirs[@]}): "
+    read -r backup_selection
+    
+    # Validate backup selection
+    if ! [[ "$backup_selection" =~ ^[0-9]+$ ]] || [ "$backup_selection" -lt 1 ] || [ "$backup_selection" -gt ${#backup_dirs[@]} ]; then
+        echo -e "${RED}ERROR: Invalid backup selection.${NC}"
+        exit 1
+    fi
+    
+    local selected_backup="${backup_dirs[$((backup_selection-1))]}"
+    
+    # Validate selected backup exists
+    if ! podman exec -u root omnia_core test -d "$selected_backup" 2>/dev/null; then
+        echo -e "${RED}ERROR: Backup directory does not exist: $selected_backup${NC}"
+        exit 1
     fi
     
     echo ""
     echo "[INFO] [ROLLBACK] Starting rollback process..."
     
-    # Step 1: Stop 2.1 container gracefully
+    # Step 1: Stop current container gracefully
     echo ""
-    echo "[INFO] [ROLLBACK] Step 1: Stopping Omnia core 2.1 container..."
+    echo "[INFO] [ROLLBACK] Step 1: Stopping Omnia core $current_container_tag container..."
     if ! stop_container_gracefully "omnia_core" 30; then
         echo -e "${RED}ERROR: Failed to stop container.${NC}"
         display_cleanup_instructions
         exit 1
     fi
     
-    # Step 2: Check for 1.0 image
+    # Step 2: Update Quadlet file to use target container tag
     echo ""
-    echo "[INFO] [ROLLBACK] Step 2: Checking for Omnia core 1.0 image..."
-    if ! podman inspect omnia_core:1.0 >/dev/null 2>&1; then
-        echo -e "${YELLOW}WARNING: Omnia core 1.0 image not found locally.${NC}"
-        echo -e "${YELLOW}Attempting to tag image...${NC}"
-        
-        # Try to tag latest as 1.0 if available
-        if podman inspect omnia_core:latest >/dev/null 2>&1; then
-            podman tag omnia_core:latest omnia_core:1.0
-        else
-            echo -e "${RED}ERROR: Omnia core 1.0 image not available. Please load the image first.${NC}"
-            display_cleanup_instructions
-            exit 1
-        fi
+    echo "[INFO] [ROLLBACK] Step 2: Updating Quadlet file to use container tag $selected_container_tag..."
+    local quadlet_file="/etc/containers/systemd/omnia_core.container"
+    
+    if ! sed -i "s/^Image=omnia_core:.*/Image=omnia_core:$selected_container_tag/" "$quadlet_file"; then
+        echo -e "${RED}ERROR: Failed to update Image to $selected_container_tag in quadlet file${NC}"
+        display_cleanup_instructions
+        exit 1
     fi
     
-    # Step 3: Start 1.0 container
+    echo "[INFO] [ROLLBACK] Quadlet file updated to use omnia_core:$selected_container_tag"
+    
+    # Step 3: Start target container
     echo ""
-    echo "[INFO] [ROLLBACK] Step 3: Starting Omnia core 1.0 container..."
+    echo "[INFO] [ROLLBACK] Step 3: Starting Omnia core $selected_container_tag container..."
     systemctl daemon-reload
     if ! systemctl start omnia_core.service; then
         echo -e "${RED}ERROR: Failed to start container service.${NC}"
@@ -1805,8 +2263,8 @@ rollback_omnia_core() {
     echo "[INFO] [ROLLBACK] Step 7: Verifying container version..."
     local verify_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r')
     
-    if [ "$verify_version" != "$backup_version" ]; then
-        echo -e "${RED}ERROR: Version verification failed. Expected: $backup_version, Found: $verify_version${NC}"
+    if [ "$verify_version" != "$selected_version" ]; then
+        echo -e "${RED}ERROR: Version verification failed. Expected: $selected_version, Found: $verify_version${NC}"
         display_cleanup_instructions
         exit 1
     fi
@@ -1814,18 +2272,22 @@ rollback_omnia_core() {
     # Audit log end
     local rollback_end=$(date -Iseconds)
     echo "[AUDIT] Rollback operation completed at: $rollback_end"
-    echo "[AUDIT] Rolled back from version $current_version to $backup_version"
+    echo "[AUDIT] Rolled back from version $current_version to $selected_version"
     
     echo ""
     echo -e "${GREEN}================================================================================${NC}"
     echo -e "${GREEN}                    ROLLBACK COMPLETED SUCCESSFULLY${NC}"
     echo -e "${GREEN}================================================================================${NC}"
     echo ""
-    echo -e "${GREEN}✓ Omnia core has been rolled back to version $backup_version${NC}"
+    echo -e "${GREEN}✓ Omnia core has been rolled back to version $selected_version${NC}"
     echo -e "${GREEN}✓ Container is running and healthy${NC}"
     echo -e "${GREEN}✓ Configuration restored from backup${NC}"
     echo ""
     
+    # Clean up lock file before starting long-running ssh session
+    rm -f "$lock_file" >/dev/null 2>&1 || true
+    echo "[INFO] Rollback lock file removed before starting container session"
+    
     # Initialize SSH config and start container session
     init_ssh_config
     start_container_session

From 2078496e82aa5525bfc6255373f8f42ca4a51fa2 Mon Sep 17 00:00:00 2001
From: sakshi-singla-1735 <sakshi.s@dell.com>
Date: Tue, 17 Feb 2026 12:35:42 +0530
Subject: [PATCH 167/172] LDMS Slurm node add/delete (#3976)

* LDMS slurm node add/delete

* pr review comments update
---
 .../telemetry/tasks/check_pxe_changes.yml     |  88 ++++++++++
 discovery/roles/telemetry/tasks/main.yml      |  10 ++
 .../telemetry/tasks/restart_ldms_configs.yml  | 151 ++++++++++++++++++
 discovery/roles/telemetry/vars/main.yml       |  21 +++
 4 files changed, 270 insertions(+)
 create mode 100644 discovery/roles/telemetry/tasks/check_pxe_changes.yml
 create mode 100644 discovery/roles/telemetry/tasks/restart_ldms_configs.yml

diff --git a/discovery/roles/telemetry/tasks/check_pxe_changes.yml b/discovery/roles/telemetry/tasks/check_pxe_changes.yml
new file mode 100644
index 0000000000..398c831961
--- /dev/null
+++ b/discovery/roles/telemetry/tasks/check_pxe_changes.yml
@@ -0,0 +1,88 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Check if current PXE mapping file exists
+  ansible.builtin.stat:
+    path: "{{ hostvars['localhost']['pxe_mapping_file_path'] }}"
+  delegate_to: localhost
+  register: current_pxe_file
+
+- name: Check if backup PXE mapping file exists
+  ansible.builtin.stat:
+    path: "{{ backup_pxe_mapping_ldms_path }}"
+  delegate_to: localhost
+  register: backup_pxe_file
+
+- name: Handle first discovery run (no backup exists)
+  when:  # the current mapping file exists but no backup copy has been saved yet
+    - current_pxe_file.stat.exists
+    - not backup_pxe_file.stat.exists
+  block:
+    - name: Create backup of PXE mapping file
+      ansible.builtin.copy:
+        src: "{{ hostvars['localhost']['pxe_mapping_file_path'] }}"
+        dest: "{{ backup_pxe_mapping_ldms_path }}"
+        remote_src: true  # src is read on the host the task runs on (localhost via delegate_to)
+        mode: preserve
+      delegate_to: localhost
+
+    - name: Set pxe_changed to false for first run
+      ansible.builtin.set_fact:
+        pxe_changed: false  # nothing to compare against yet, so no change is reported
+
+    - name: Display first run message
+      ansible.builtin.debug:
+        msg: "{{ pxe_first_run_msg }}"
+
+- name: Compare PXE mapping files when backup exists
+  when:  # both the current mapping file and the saved backup are present
+    - current_pxe_file.stat.exists
+    - backup_pxe_file.stat.exists
+  block:
+    - name: Get checksum of current PXE mapping file
+      ansible.builtin.stat:
+        path: "{{ hostvars['localhost']['pxe_mapping_file_path'] }}"
+        checksum_algorithm: sha256
+      delegate_to: localhost
+      register: current_pxe_checksum
+
+    - name: Get checksum of backup PXE mapping file
+      ansible.builtin.stat:
+        path: "{{ backup_pxe_mapping_ldms_path }}"
+        checksum_algorithm: sha256
+      delegate_to: localhost
+      register: backup_pxe_checksum
+
+    - name: Set pxe_changed based on checksum comparison
+      ansible.builtin.set_fact:  # pxe_changed becomes true when the two sha256 checksums differ
+        pxe_changed: "{{ current_pxe_checksum.stat.checksum != backup_pxe_checksum.stat.checksum }}"
+
+    - name: Update backup PXE mapping file when changed
+      ansible.builtin.copy:
+        src: "{{ hostvars['localhost']['pxe_mapping_file_path'] }}"
+        dest: "{{ backup_pxe_mapping_ldms_path }}"
+        remote_src: true
+        mode: preserve
+      delegate_to: localhost
+      when: pxe_changed | bool  # overwrite the backup so a later run compares against the current mapping
+
+    - name: Display PXE change status
+      ansible.builtin.debug:
+        msg: "{{ pxe_changed_msg if (pxe_changed | bool) else pxe_no_change_msg }}"
+
+- name: Set pxe_changed to false when PXE file is missing
+  ansible.builtin.set_fact:
+    pxe_changed: false
+  when: not current_pxe_file.stat.exists
diff --git a/discovery/roles/telemetry/tasks/main.yml b/discovery/roles/telemetry/tasks/main.yml
index 825c3988d7..e4e3d1846a 100644
--- a/discovery/roles/telemetry/tasks/main.yml
+++ b/discovery/roles/telemetry/tasks/main.yml
@@ -55,3 +55,13 @@
 - name: Update ldms agg configuration
   ansible.builtin.include_tasks: update_ldms_agg_config.yml
   when: hostvars['localhost']['ldms_support']
+
+- name: Check if PXE mapping has changed since last run
+  ansible.builtin.include_tasks: check_pxe_changes.yml
+  when: hostvars['localhost']['ldms_support']
+
+- name: Restart LDMS configs for node addition and deletion
+  ansible.builtin.include_tasks: restart_ldms_configs.yml
+  when:
+    - hostvars['localhost']['ldms_support']
+    - pxe_changed | default(false) | bool
diff --git a/discovery/roles/telemetry/tasks/restart_ldms_configs.yml b/discovery/roles/telemetry/tasks/restart_ldms_configs.yml
new file mode 100644
index 0000000000..0a176118f0
--- /dev/null
+++ b/discovery/roles/telemetry/tasks/restart_ldms_configs.yml
@@ -0,0 +1,151 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+- name: Load high availability config
+  ansible.builtin.include_vars:
+    file: "{{ hostvars['localhost']['input_project_dir'] }}/high_availability_config.yml"
+    name: ha_config
+
+- name: Set kube_vip fact
+  ansible.builtin.set_fact:
+    kube_vip: "{{ ha_config.service_k8s_cluster_ha[0].virtual_ip_address | default('') }}"
+
+- name: Test SSH connectivity to kube VIP only when PXE has changed
+  when:
+    - kube_vip | length > 0  # with an empty kube_vip this block is skipped and kube_vip_reachable is never set
+    - pxe_changed | default(false) | bool
+  block:
+    - name: SSH test to kube VIP
+      ansible.builtin.command:  # BatchMode=yes makes ssh fail instead of prompting; ConnectTimeout bounds the wait to 10s
+        cmd: "ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o BatchMode=yes {{ kube_vip }} echo reachable"
+      delegate_to: localhost
+      register: kube_vip_ssh_check
+      changed_when: false  # connectivity probe only; never reported as a change
+
+    - name: Set kube VIP reachable fact
+      ansible.builtin.set_fact:
+        kube_vip_reachable: "{{ kube_vip_ssh_check.rc == 0 }}"
+
+  rescue:  # runs when a task in the block above fails, e.g. the ssh probe exits non-zero
+    - name: Display kube VIP unreachable message
+      ansible.builtin.debug:
+        msg: "{{ kube_vip_unreachable_msg }}"
+
+    - name: Set kube VIP reachable fact to false
+      ansible.builtin.set_fact:
+        kube_vip_reachable: false
+
+- name: Restart LDMS aggregator when PXE has changed
+  when: pxe_changed | default(false) | bool
+  block:
+    - name: Check if LDMS aggregator is running on service k8s cluster
+      kubernetes.core.k8s_info:
+        api_version: apps/v1
+        kind: StatefulSet
+        name: nersc-ldms-aggr
+        namespace: "{{ telemetry_namespace }}"
+      delegate_to: "{{ kube_vip }}"
+      register: ldms_statefulset_info
+      failed_when: false  # best-effort lookup: errors from the query are ignored
+      when:
+        - kube_vip_reachable | default(false) | bool  # fact is unset when kube_vip is empty and the SSH probe block is skipped
+
+    - name: Set LDMS running state
+      ansible.builtin.set_fact:
+        ldms_running: "{{ ldms_statefulset_info.resources is defined and ldms_statefulset_info.resources | length > 0 }}"
+      when:
+        - kube_vip_reachable | default(false) | bool  # same guard: an unset fact is treated as unreachable
+
+    - name: Check if LDMS conf ConfigMap file exists
+      ansible.builtin.stat:
+        path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/nersc-ldms-aggr/templates/cm.nersc-ldms-conf.yaml"
+      register: ldms_conf_file
+      when: ldms_running | default(false) | bool
+
+    - name: Check if LDMS bin ConfigMap file exists
+      ansible.builtin.stat:
+        path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/nersc-ldms-aggr/templates/cm.nersc-ldms-bin.yaml"
+      register: ldms_bin_file
+      when: ldms_running | default(false) | bool
+
+    - name: Apply LDMS configuration ConfigMap
+      kubernetes.core.k8s:
+        state: present
+        src: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/nersc-ldms-aggr/templates/cm.nersc-ldms-conf.yaml"
+        namespace: "{{ telemetry_namespace }}"
+      delegate_to: "{{ kube_vip }}"
+      failed_when: false
+      when:
+        - ldms_running | default(false) | bool
+        - ldms_conf_file.stat.exists | default(false)
+
+    - name: Apply LDMS scripts ConfigMap
+      kubernetes.core.k8s:
+        state: present
+        src: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/nersc-ldms-aggr/templates/cm.nersc-ldms-bin.yaml"
+        namespace: "{{ telemetry_namespace }}"
+      delegate_to: "{{ kube_vip }}"
+      failed_when: false
+      when:
+        - ldms_running | default(false) | bool
+        - ldms_bin_file.stat.exists | default(false)
+
+    - name: Restart LDMS aggregator StatefulSet
+      kubernetes.core.k8s:
+        state: present
+        definition:
+          apiVersion: apps/v1
+          kind: StatefulSet
+          metadata:
+            name: nersc-ldms-aggr
+            namespace: "{{ telemetry_namespace }}"
+          spec:
+            template:
+              metadata:
+                annotations:
+                  kubectl.kubernetes.io/restartedAt: "{{ ansible_date_time.iso8601 }}"
+      delegate_to: "{{ kube_vip }}"
+      failed_when: false
+      when:
+        - ldms_running | default(false) | bool
+        - ldms_conf_file.stat.exists | default(false)
+        - ldms_bin_file.stat.exists | default(false)
+
+    - name: Wait for LDMS aggregator pod to be ready after restart
+      kubernetes.core.k8s_info:
+        api_version: v1
+        kind: Pod
+        namespace: "{{ telemetry_namespace }}"
+        label_selectors:
+          - "app=nersc-ldms-aggr"
+        wait: true
+        wait_condition:
+          type: Ready
+          status: "True"
+        wait_timeout: 120
+      delegate_to: "{{ kube_vip }}"
+      register: ldms_pod_ready
+      failed_when: false
+      when:
+        - ldms_running | default(false) | bool
+        - ldms_conf_file.stat.exists | default(false)
+        - ldms_bin_file.stat.exists | default(false)
+
+    - name: Display LDMS aggregator restart status
+      ansible.builtin.debug:
+        msg: "{{ ldms_pod_ready_msg if (ldms_pod_ready.resources | default([]) | length > 0) else ldms_pod_not_ready_msg }}"
+      when:
+        - ldms_running | default(false) | bool
+        - ldms_conf_file.stat.exists | default(false)
+        - ldms_bin_file.stat.exists | default(false)
diff --git a/discovery/roles/telemetry/vars/main.yml b/discovery/roles/telemetry/vars/main.yml
index 5c5838ce29..69b0c0c0ac 100644
--- a/discovery/roles/telemetry/vars/main.yml
+++ b/discovery/roles/telemetry/vars/main.yml
@@ -252,3 +252,24 @@ common_templates:
     skip_when: "{{ cluster_id_present | default(false) }}"
   - src: 'telemetry/kustomization.yaml.j2'
     dest: 'kustomization.yaml'
+
+# Usage: check_pxe_changes.yml
+backup_pxe_mapping_ldms_path: "/opt/omnia/telemetry/backup_pxe_mapping_ldms.csv"
+pxe_first_run_msg: "First discovery run detected. Saving PXE mapping backup. LDMS restart not required."
+pxe_no_change_msg: "PXE mapping file has not changed since last run. Skipping LDMS restart."
+pxe_changed_msg: "PXE mapping file has changed. LDMS restart will be triggered."
+
+# Usage: restart_ldms_configs.yml
+kube_vip_unreachable_msg: >-
+  Kube VIP ({{ kube_vip }}) is not reachable via SSH.
+  There might be issues with the k8s cluster.
+  LDMS aggregator restart will be skipped.
+
+  After discovery completes, manually restart the LDMS aggregator pod with:
+
+  ssh {{ kube_vip }}
+  kubectl rollout restart statefulset nersc-ldms-aggr -n {{ telemetry_namespace }}
+  kubectl get pods -n {{ telemetry_namespace }} -l app=nersc-ldms-aggr -w
+
+ldms_pod_ready_msg: "LDMS aggregator pod is ready."
+ldms_pod_not_ready_msg: "WARNING: LDMS aggregator pod did not become ready within 120s."

From 4dbc6a978fdbcbd74c7a7c62e75ab47c399784be Mon Sep 17 00:00:00 2001
From: Vrinda_Marwah <vrinda.marwah@dell.com>
Date: Wed, 18 Feb 2026 07:32:41 +0000
Subject: [PATCH 168/172] mask docker credentials in local_repo logs

Signed-off-by: Vrinda_Marwah <vrinda.marwah@dell.com>
---
 .../library/module_utils/local_repo/parse_and_download.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/common/library/module_utils/local_repo/parse_and_download.py b/common/library/module_utils/local_repo/parse_and_download.py
index 15bed1efb3..d5192e2bbe 100644
--- a/common/library/module_utils/local_repo/parse_and_download.py
+++ b/common/library/module_utils/local_repo/parse_and_download.py
@@ -84,16 +84,16 @@ def execute_command(cmd_string, logger, type_json=False):
                 logger.error(f"Raw output was: {status['stdout']}")
                 return False
 
-        logger.info(f"Command succeeded: {cmd_string}")
+        logger.info(f"Command succeeded: {safe_cmd_string}")
         return status
     except subprocess.CalledProcessError as e:
-        logger.error(f"Command failed: {cmd_string} - {e}")
+        logger.error(f"Command failed: {safe_cmd_string} - {e}")
         return False
     except subprocess.TimeoutExpired as e:
-        logger.error(f"Command timed out: {cmd_string} - {e}")
+        logger.error(f"Command timed out: {safe_cmd_string} - {e}")
         return False
     except OSError as e:
-        logger.error(f"OS error during command: {cmd_string} - {e}")
+        logger.error(f"OS error during command: {safe_cmd_string} - {e}")
         return False
 
     finally:

From 76d7f3cd0c9c77fd0467a18249be12edc4236b34 Mon Sep 17 00:00:00 2001
From: Nethravathi M G <146437298+nethramg@users.noreply.github.com>
Date: Thu, 19 Feb 2026 13:04:12 +0530
Subject: [PATCH 169/172] Removing the IP's from the Activated IP list (#3992)

---
 telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 b/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2
index 06bf230980..54986f418f 100644
--- a/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2
+++ b/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2
@@ -2,9 +2,9 @@
 
 ----- Telemetry Report for Cluster -----
 
-Total IP count with Telemetry activated: {{ (db_idrac_ip_list | length) + (telemetry_idrac | length) }}
+Total IP count with Telemetry activated: {{ ((db_idrac_ip_list + telemetry_idrac) | difference(deleted_idrac_ips | default([]))) | length }}
 Telemetry activated IPs List:
-{% for item in db_idrac_ip_list + telemetry_idrac %}
+{% for item in (db_idrac_ip_list + telemetry_idrac) | difference(deleted_idrac_ips | default([])) %}
   - {{ item }}
 {% endfor %}
 

From 272bfb51c94fe7283bc3256c32894882b7b032e8 Mon Sep 17 00:00:00 2001
From: pullan1 <sudha.pullalaravu@dell.com>
Date: Thu, 19 Feb 2026 14:41:35 +0530
Subject: [PATCH 170/172] Fix local_repo.yml passing even with invalid
 package names in JSON files.

Signed-off-by: pullan1 <sudha.pullalaravu@dell.com>
---
 .../library/module_utils/local_repo/config.py |   6 +-
 .../local_repo/container_repo_utils.py        | 161 ++++++++++--------
 .../module_utils/local_repo/download_rpm.py   |  89 +++++++++-
 3 files changed, 178 insertions(+), 78 deletions(-)

diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py
index a731c8528d..7bfea4b301 100644
--- a/common/library/module_utils/local_repo/config.py
+++ b/common/library/module_utils/local_repo/config.py
@@ -64,6 +64,10 @@
     "x86_64": ["dnf", "download", "--resolve", "--alldeps", "--arch=x86_64,noarch"],
     "aarch64": ["dnf", "download", "--forcearch", "aarch64", "--resolve", "--alldeps", "--exclude=*.x86_64"]
 }
+DNF_INFO_COMMANDS = {
+    "x86_64": ["dnf", "info", "--quiet"],
+    "aarch64": ["dnf", "info", "--quiet", "--forcearch=aarch64"]
+}
 
 # ----------------------------
 # Used by download_common.py
@@ -222,7 +226,7 @@
 # Naming convention: <arch>_omnia-additional to match existing filter patterns
 # ----------------------------
 ADDITIONAL_REPOS_KEY = "additional_repos"
-AGGREGATED_REPO_NAME_TEMPLATE = "{arch}_omnia-additional-repo"
+AGGREGATED_REPO_NAME_TEMPLATE = "{arch}_omnia-additional"
 AGGREGATED_REMOTE_NAME_TEMPLATE = "{arch}_omnia-additional-{name}"
 AGGREGATED_DISTRIBUTION_NAME_TEMPLATE = "{arch}_omnia-additional"
 AGGREGATED_BASE_PATH_TEMPLATE = "opt/omnia/offline_repo/cluster/{arch}/rhel/10.0/rpms/omnia-additional"
diff --git a/common/library/module_utils/local_repo/container_repo_utils.py b/common/library/module_utils/local_repo/container_repo_utils.py
index 0a4abb35fb..e3f47869af 100644
--- a/common/library/module_utils/local_repo/container_repo_utils.py
+++ b/common/library/module_utils/local_repo/container_repo_utils.py
@@ -13,6 +13,13 @@
 # limitations under the License.
 #pylint: disable=import-error,no-name-in-module
 
+"""
+Container repository utilities for Pulp operations.
+
+This module provides functions for creating, syncing, and managing
+container repositories and distributions in Pulp.
+"""
+
 import multiprocessing
 from ansible.module_utils.local_repo.parse_and_download import execute_command
 from ansible.module_utils.local_repo.config import (
@@ -114,109 +121,119 @@ def sync_container_repository(repo_name, remote_name, package_content, logger, t
         logger.info(f"Getting repository version before sync for {repo_name}")
         verify_command = pulp_container_commands["show_container_repo"] % repo_name
         verify_result_before = execute_command(verify_command, logger, type_json=True)
-        
+
         version_before = None
-        if verify_result_before and isinstance(verify_result_before, dict) and "stdout" in verify_result_before:
+        if (verify_result_before and isinstance(verify_result_before, dict) and 
+                "stdout" in verify_result_before):
             repo_data_before = verify_result_before["stdout"]
             if isinstance(repo_data_before, dict):
                 version_before = repo_data_before.get("latest_version_href")
                 logger.info(f"Repository version before sync: {version_before}")
-        
+
         command = pulp_container_commands["sync_container_repository"] % (repo_name, remote_name)
         result = execute_command(command,logger)
         if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0):
             logger.error(f"Sync command failed for repository {repo_name}")
             return False
-        
+
         logger.info(f"Validating sync result for repository {repo_name}")
         verify_result_after = execute_command(verify_command, logger, type_json=True)
-        
-        if verify_result_after and isinstance(verify_result_after, dict) and "stdout" in verify_result_after:
+
+        if (verify_result_after and isinstance(verify_result_after, dict) and 
+                "stdout" in verify_result_after):
             repo_data_after = verify_result_after["stdout"]
             if isinstance(repo_data_after, dict):
                 version_after = repo_data_after.get("latest_version_href")
                 logger.info(f"Repository version after sync: {version_after}")
-                
+
                 if not version_after or version_after.endswith("/versions/0/"):
                     logger.error(f"Sync completed but no content was downloaded for {repo_name}. "
                                f"The specified image tag likely does not exist in the upstream registry.")
                     return False
-                
+
                 if version_before and version_after and version_before == version_after:
                     # Check if tag actually exists using precise Pulp commands
                     try:
                         # Step 1: Get distribution to find repository href
                         dist_command = f"pulp container distribution show --name {repo_name}"
                         dist_result = execute_command(dist_command, logger, type_json=True)
-                        
+
                         if not dist_result or not isinstance(dist_result, dict) or "stdout" not in dist_result:
-                            logger.error(f"Failed to get distribution info for {repo_name}. Assuming tag doesn't exist.")
-                            return False
-                        
-                        dist_data = dist_result["stdout"]
-                        if not isinstance(dist_data, dict) or "repository" not in dist_data:
-                            logger.error(f"Invalid distribution data for {repo_name}. Assuming tag doesn't exist.")
-                            return False
-                        
-                        repo_href = dist_data["repository"]
-                        logger.info(f"Found repository href: {repo_href}")
-                        
-                        # Step 2: Get repository version href
-                        repo_command = f"pulp container repository show --href {repo_href}"
-                        repo_result = execute_command(repo_command, logger, type_json=True)
-                        
-                        if not repo_result or not isinstance(repo_result, dict) or "stdout" not in repo_result:
-                            logger.error(f"Failed to get repository info for {repo_href}. Assuming tag doesn't exist.")
-                            return False
-                        
-                        repo_data = repo_result["stdout"]
-                        if not isinstance(repo_data, dict) or "latest_version_href" not in repo_data:
-                            logger.error(f"Invalid repository data for {repo_href}. Assuming tag doesn't exist.")
-                            return False
-                        
-                        repo_ver_href = repo_data["latest_version_href"]
-                        logger.info(f"Found repository version href: {repo_ver_href}")
-                        
-                        # Step 3: Check if tag exists in content
-                        tags_command = f"pulp show --href '/pulp/api/v3/content/container/tags/?repository_version={repo_ver_href}'"
-                        tags_result = execute_command(tags_command, logger, type_json=True)
-                        
-                        if not tags_result or not isinstance(tags_result, dict) or "stdout" not in tags_result:
-                            logger.error(f"Failed to get content tags for {repo_ver_href}. Assuming tag doesn't exist.")
-                            return False
-                        
-                        tags_data = tags_result["stdout"]
-                        if not isinstance(tags_data, dict) or "results" not in tags_data:
-                            logger.error(f"Invalid tags data for {repo_ver_href}. Assuming tag doesn't exist.")
-                            return False
-                        
-                        tags = tags_data["results"]
-                        tag_exists = False
-                        
-                        # Use the tag parameter if provided, otherwise fall back to checking package_content
-                        tag_to_check = tag if tag else package_content
-                        
-                        for tag_item in tags:
-                            if isinstance(tag_item, dict) and "name" in tag_item and tag_item["name"] == tag_to_check:
-                                tag_exists = True
-                                break
-                        
-                        if tag_exists:
-                            logger.info(f"Tag '{tag_to_check}' already exists in Pulp repository {repo_name}. No sync needed - image is already available.")
+                            logger.info(f"Distribution {repo_name} does not exist yet - skipping tag validation, will create distribution")
+                            # No distribution yet: skip tag validation and fall through to create_container_distribution() below.
                         else:
-                            logger.error(f"Sync completed but repository version did not change for {repo_name}. "
-                                       f"Version remained at {version_after}. "
-                                       f"Tag '{tag_to_check}' does not exist in Pulp repository content. "
-                                       f"This indicates the tag likely does not exist in the upstream registry.")
-                            return False
+                            # Distribution exists, validate the tag
+                            dist_data = dist_result["stdout"]
+                            if not isinstance(dist_data, dict) or "repository" not in dist_data:
+                                logger.error(f"Invalid distribution data for {repo_name}. Assuming tag doesn't exist.")
+                                return False
+                            repo_href = dist_data["repository"]
+                            logger.info(f"Found repository href: {repo_href}")
+
+                            # Step 2: Get repository version href
+                            repo_command = f"pulp container repository show --href {repo_href}"
+                            repo_result = execute_command(repo_command, logger, type_json=True)
+
+                            if not repo_result or not isinstance(repo_result, dict) or "stdout" not in repo_result:
+                                logger.error(f"Failed to get repository info for {repo_href}. Assuming tag doesn't exist.")
+                                return False
+
+                            repo_data = repo_result["stdout"]
+                            if not isinstance(repo_data, dict) or "latest_version_href" not in repo_data:
+                                logger.error(f"Invalid repository data for {repo_href}. Assuming tag doesn't exist.")
+                                return False
+
+                            repo_ver_href = repo_data["latest_version_href"]
+                            logger.info(f"Found repository version href: {repo_ver_href}")
+
+                            # Step 3: Check if tag exists in content
+                            tags_command = (
+                                f"pulp show --href "
+                                f"'/pulp/api/v3/content/container/tags/"
+                                f"?repository_version={repo_ver_href}'"
+                            )
+                            tags_result = execute_command(tags_command, logger, type_json=True)
+
+                            if not tags_result or not isinstance(tags_result, dict) or "stdout" not in tags_result:
+                                logger.error(f"Failed to get content tags for {repo_ver_href}. Assuming tag doesn't exist.")
+                                return False
+
+                            tags_data = tags_result["stdout"]
+                            if not isinstance(tags_data, dict) or "results" not in tags_data:
+                                logger.error(f"Invalid tags data for {repo_ver_href}. Assuming tag doesn't exist.")
+                                return False
+
+                            tags = tags_data["results"]
+                            tag_exists = False
+
+                            # Use the tag parameter if provided, otherwise fall back to checking package_content
+                            tag_to_check = tag if tag else package_content
+
+                            for tag_item in tags:
+                                if isinstance(tag_item, dict) and "name" in tag_item and tag_item["name"] == tag_to_check:
+                                    tag_exists = True
+                                    break
+
+                            if tag_exists:
+                                logger.info(f"Tag '{tag_to_check}' already exists in Pulp repository {repo_name}. No sync needed - image is already available.")
+                            else:
+                                logger.error(f"Sync completed but repository version did not change for {repo_name}. "
+                                        f"Version remained at {version_after}. "
+                                        f"Tag '{tag_to_check}' does not exist in Pulp repository content. "
+                                        f"This indicates the tag likely does not exist in the upstream registry.")
+                                return False
                             
                     except Exception as e:
-                        logger.error(f"Error checking repository tag existence: {e}. Assuming tag doesn't exist.")
+                        logger.error(
+                            f"Error checking repository tag existence: {e}. Assuming tag doesn't exist."
+                        )
                         return False
-                
-                logger.info(f"Sync validation successful: repository {repo_name} version changed from {version_before} to {version_after}")
-        
-        result = create_container_distribution(repo_name,package_content,logger)
+
+                logger.info(
+                    f"Sync validation successful: repository {repo_name} version changed "
+                    f"from {version_before} to {version_after}"
+                )
+        result = create_container_distribution(repo_name, package_content, logger)
         return result
     except Exception as e:
         logger.error(f"Failed to synchronize repository {repo_name} with remote {remote_name}. Error: {e}")
diff --git a/common/library/module_utils/local_repo/download_rpm.py b/common/library/module_utils/local_repo/download_rpm.py
index 95b354dd6b..44b56c1799 100644
--- a/common/library/module_utils/local_repo/download_rpm.py
+++ b/common/library/module_utils/local_repo/download_rpm.py
@@ -20,7 +20,8 @@
 import shutil
 from pathlib import Path
 from ansible.module_utils.local_repo.config import (
-    DNF_COMMANDS
+    DNF_COMMANDS,
+    DNF_INFO_COMMANDS
 )
 from multiprocessing import Lock
 from ansible.module_utils.local_repo.parse_and_download import write_status_to_file
@@ -95,11 +96,30 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type,
             for pkg in rpm_list:
                 # Get repo_name for this specific RPM from mapping
                 pkg_repo_name = repo_mapping.get(pkg, "")
-                if any(pkg in line and ".rpm" in line for line in stdout_lines + stderr_lines):
+                # Check if package was downloaded successfully
+                # Look for "Already downloaded" or actual .rpm file in output
+                pkg_downloaded = False
+                for line in stdout_lines + stderr_lines:
+                    if pkg in line and (".rpm" in line or "Already downloaded" in line):
+                        pkg_downloaded = True
+                        break
+
+                # Also check for "No match for argument" or "No package" errors
+                pkg_not_found = False
+                for line in stderr_lines:
+                    if pkg in line and ("No match for argument" in line or 
+                                       "No package" in line or
+                                       "not found" in line.lower()):
+                        pkg_not_found = True
+                        break
+
+                if pkg_downloaded and not pkg_not_found:
                     downloaded.append(pkg)
                     write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock, pkg_repo_name)
                 else:
                     failed.append(pkg)
+                    if pkg_not_found:
+                        logger.warning(f"Package '{pkg}' not found in configured repositories")
 
             # Retry failed ones individually
             if failed:
@@ -110,6 +130,15 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type,
                     # Get repo_name for this specific RPM from mapping
                     pkg_repo_name = repo_mapping.get(pkg, "")
 
+                    # Check for package not found errors
+                    retry_stderr = retry_res.stderr.lower()
+                    pkg_invalid = any(err in retry_stderr for err in [
+                        "no match for argument",
+                        "no package",
+                        "not found",
+                        "unable to find a match"
+                    ])
+
                     if retry_res.returncode == 0 and ".rpm" in retry_res.stdout + retry_res.stderr:
                         downloaded.append(pkg)
                         failed.remove(pkg)
@@ -117,7 +146,10 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type,
                         logger.info(f"Package '{pkg}' downloaded successfully on retry.")
                     else:
                         write_status_to_file(status_file_path, pkg, "rpm", "Failed", logger, file_lock, pkg_repo_name)
-                        logger.error(f"Package '{pkg}' still failed after retry.")
+                        if pkg_invalid:
+                            logger.error(f"Package '{pkg}' does not exist in configured repositories.")
+                        else:
+                            logger.error(f"Package '{pkg}' still failed after retry.")
 
             # Determine final status
             if not failed:
@@ -128,12 +160,59 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type,
                 status = "Failed"
 
         else:
-            status = "Success"
             logger.info("RPM won't be downloaded when repo_config is partial or never")
+            logger.info("Validating package availability using dnf info...")
+
+            arch_key = "x86_64" if arc.lower() == "x86_64" else "aarch64"  # exact match, not substring
+            valid_packages = []
+            invalid_packages = []
+
             for pkg in package["rpm_list"]:
+                # Validate package using dnf info
+                dnf_info_command = DNF_INFO_COMMANDS[arch_key] + [
+                    "--repo=*",  # Search all enabled repositories
+                    pkg
+                ]
+                result = subprocess.run(
+                    dnf_info_command,
+                    check=False,
+                    capture_output=True,
+                    text=True
+                )
                 # Get repo_name for this specific RPM from mapping
                 pkg_repo_name = repo_mapping.get(pkg, "")
-                write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock, pkg_repo_name)
+                if result.returncode == 0:
+                    # Package exists and is available
+                    valid_packages.append(pkg)
+                    write_status_to_file(
+                        status_file_path, pkg, "rpm", "Success", 
+                        logger, file_lock, pkg_repo_name
+                    )
+                    logger.info(f"Package '{pkg}' validated successfully")
+                else:
+                    # Package not found or invalid
+                    invalid_packages.append(pkg)
+                    write_status_to_file(
+                        status_file_path, pkg, "rpm", "Failed", 
+                        logger, file_lock, pkg_repo_name
+                    )
+                    logger.error(
+                        f"Package '{pkg}' validation failed. "
+                        f"Package may not exist in configured repositories."
+                    )
+
+            # Determine final status based on validation results
+            if not invalid_packages:
+                status = "Success"
+            elif valid_packages:
+                status = "Partial"
+            else:
+                status = "Failed"
+
+            logger.info(
+                f"Validation complete - Valid: {len(valid_packages)}, "
+                f"Invalid: {len(invalid_packages)}"
+            )
 
     except Exception as e:
         logger.error(f"Exception occurred: {e}")

From c42782c8481703c5d0c10ba3e36ee7e242bd0304 Mon Sep 17 00:00:00 2001
From: mithileshreddy04 <mithilesh.reddy@dell.com>
Date: Fri, 20 Feb 2026 17:54:24 +0530
Subject: [PATCH 171/172] Lock Mechanism for Upgrade Sequence Integrity (#3994)

---
 build_image_aarch64/build_image_aarch64.yml   |  3 +
 build_image_x86_64/build_image_x86_64.yml     |  3 +
 discovery/discovery.yml                       |  3 +
 local_repo/local_repo.yml                     |  3 +
 omnia.sh                                      | 82 +++++++++++++------
 prepare_oim/prepare_oim.yml                   |  3 +
 .../tasks/display_warnings.yml                |  2 +
 upgrade/upgrade_omnia.yml                     | 10 +++
 .../get_config_credentials.yml                |  4 +
 utils/oim_cleanup.yml                         |  4 +
 utils/upgrade_checkup.yml                     | 33 ++++++++
 11 files changed, 126 insertions(+), 24 deletions(-)
 create mode 100644 utils/upgrade_checkup.yml

diff --git a/build_image_aarch64/build_image_aarch64.yml b/build_image_aarch64/build_image_aarch64.yml
index 08ee0b4ad8..d5dc76a82d 100644
--- a/build_image_aarch64/build_image_aarch64.yml
+++ b/build_image_aarch64/build_image_aarch64.yml
@@ -13,6 +13,9 @@
 # limitations under the License.
 ---
 
+- name: Check if upgrade is in progress
+  ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml
+
 - name: Set_fact for fetch omnia config credentials
   hosts: localhost
   connection: local
diff --git a/build_image_x86_64/build_image_x86_64.yml b/build_image_x86_64/build_image_x86_64.yml
index 676d8adbd6..8f56b86ef6 100644
--- a/build_image_x86_64/build_image_x86_64.yml
+++ b/build_image_x86_64/build_image_x86_64.yml
@@ -13,6 +13,9 @@
 # limitations under the License.
 ---
 
+- name: Check if upgrade is in progress
+  ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml
+
 - name: Set_fact for fetch omnia config credentials
   hosts: localhost
   connection: local
diff --git a/discovery/discovery.yml b/discovery/discovery.yml
index 75efadb47c..40fd00123c 100644
--- a/discovery/discovery.yml
+++ b/discovery/discovery.yml
@@ -12,6 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 ---
+- name: Check if upgrade is in progress
+  ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml
+
 - name: Include input project directory
   when: not project_dir_status | default(false) | bool
   ansible.builtin.import_playbook: ../utils/include_input_dir.yml
diff --git a/local_repo/local_repo.yml b/local_repo/local_repo.yml
index 3a743c3f47..963715b5e3 100644
--- a/local_repo/local_repo.yml
+++ b/local_repo/local_repo.yml
@@ -13,6 +13,9 @@
 # limitations under the License.
 ---
 
+- name: Check if upgrade is in progress
+  ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml
+
 - name: Set_fact for fetch omnia config credentials
   hosts: localhost
   connection: local
diff --git a/omnia.sh b/omnia.sh
index 3b320b0bf6..530c168e7d 100755
--- a/omnia.sh
+++ b/omnia.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright © 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -299,7 +299,18 @@ update_metadata_upgrade_backup_dir() {
     "
 }
 
-
+# Print the upgrade guard lock path: under <oim_shared_path>/omnia when oim_shared_path
+# can be read from omnia_core's oim_metadata.yml, otherwise the in-container /opt/omnia path.
+get_upgrade_guard_lock_path() {
+    local lock_rel=".data/upgrade_in_progress.lock"
+    local shared_path
+    shared_path=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r')
+    if [ -n "$shared_path" ]; then
+        echo "$shared_path/omnia/$lock_rel"
+    else
+        echo "/opt/omnia/$lock_rel"
+    fi
+}
 
 check_internal_nfs_export() {
     nfs_server_ip=$1
@@ -398,6 +409,11 @@ cleanup_omnia_core() {
         # Fetch the configuration from the Omnia core container.
         fetch_config
 
+        # Clear upgrade guard lock if present (shared path visible to container and host)
+        local upgrade_guard_lock_path=$(get_upgrade_guard_lock_path)
+        rm -f "$upgrade_guard_lock_path" >/dev/null 2>&1 || true
+        echo "[INFO] [CLEANUP] Cleared upgrade guard lock (if present): $upgrade_guard_lock_path"
+
         # Remove the container
         remove_container
 
@@ -1837,6 +1853,16 @@ upgrade_omnia_core() {
     touch "$lock_file"
     trap 'rm -f "$lock_file"' EXIT
 
+    # Create upgrade guard lock in shared path so other playbooks can block during upgrade
+    local upgrade_guard_lock_path
+    upgrade_guard_lock_path=$(get_upgrade_guard_lock_path)
+
+    mkdir -p "$(dirname "$upgrade_guard_lock_path")" 2>/dev/null || true
+    echo "Upgrade in progress. Complete upgrade_omnia.yml or rollback to clear." > "$upgrade_guard_lock_path" || {
+        echo -e "${RED}ERROR: Failed to create upgrade guard lock: $upgrade_guard_lock_path${NC}"
+        exit 1
+    }
+
     # Run upgrade phases
     if ! phase1_validate; then
         echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 1"
@@ -1874,8 +1900,10 @@ upgrade_omnia_core() {
     echo "[INFO] [ORCHESTRATOR] Upgrade completed successfully"
     echo "[INFO] [ORCHESTRATOR] Backup location (inside omnia_core container): $backup_base"
 
+    # Seed inputs and defaults after upgrade
+    post_setup_config
+
     show_post_upgrade_instructions "$TARGET_OMNIA_VERSION"
-    
     # Initialize SSH config and start container session
     init_ssh_config
     start_container_session
@@ -1885,15 +1913,15 @@ upgrade_omnia_core() {
 # Validate backup directory structure and files
 validate_backup_directory() {
     local backup_path="$1"
-    
+
     echo "[INFO] [ROLLBACK] Validating backup directory: $backup_path"
-    
+
     # Check if backup directory exists
     if ! podman exec -u root omnia_core test -d "$backup_path"; then
         echo "[ERROR] [ROLLBACK] Backup directory does not exist: $backup_path"
         return 1
     fi
-    
+
     # Check for required subdirectories
     for subdir in input metadata configs; do
         if ! podman exec -u root omnia_core test -d "$backup_path/$subdir"; then
@@ -1901,24 +1929,24 @@ validate_backup_directory() {
             return 1
         fi
     done
-    
+
     # Check for required files
     if ! podman exec -u root omnia_core test -f "$backup_path/metadata/oim_metadata.yml"; then
         echo "[ERROR] [ROLLBACK] Missing metadata file: $backup_path/metadata/oim_metadata.yml"
         return 1
     fi
-    
+
     if ! podman exec -u root omnia_core test -f "$backup_path/configs/omnia_core.container"; then
         echo "[ERROR] [ROLLBACK] Missing container config: $backup_path/configs/omnia_core.container"
         return 1
     fi
-    
+
     # Verify metadata contains version information
     if ! podman exec -u root omnia_core grep -q "^omnia_version:" "$backup_path/metadata/oim_metadata.yml"; then
         echo "[ERROR] [ROLLBACK] Metadata file does not contain version information"
         return 1
     fi
-    
+
     echo "[INFO] [ROLLBACK] Backup validation successful"
     return 0
 }
@@ -1927,15 +1955,15 @@ validate_backup_directory() {
 stop_container_gracefully() {
     local container_name="$1"
     local timeout="${2:-30}"
-    
+
     echo "[INFO] [ROLLBACK] Stopping $container_name container gracefully..."
-    
+
     # Try graceful stop first
     if podman stop -t "$timeout" "$container_name" >/dev/null 2>&1; then
         echo "[INFO] [ROLLBACK] Container stopped gracefully"
         return 0
     fi
-    
+
     # Check if container is still running
     if podman ps --format '{{.Names}}' | grep -qw "$container_name"; then
         echo "[WARN] [ROLLBACK] Graceful stop failed, force stopping container..."
@@ -1947,16 +1975,16 @@ stop_container_gracefully() {
             return 1
         fi
     fi
-    
+
     return 0
 }
 
 # Restore files from backup
 restore_from_backup() {
     local backup_path="$1"
-    
+
     echo "[INFO] [ROLLBACK] Restoring from backup: $backup_path"
-    
+
     # Restore input files
     if ! podman exec -u root omnia_core bash -c "
         set -e
@@ -1966,19 +1994,19 @@ restore_from_backup() {
         echo "[ERROR] [ROLLBACK] Failed to restore input files"
         return 1
     fi
-    
+
     # Restore metadata
     if ! podman exec -u root omnia_core cp -a "$backup_path/metadata/oim_metadata.yml" /opt/omnia/.data/; then
         echo "[ERROR] [ROLLBACK] Failed to restore metadata"
         return 1
     fi
-    
+
     # Restore container config on host
     if ! podman cp "omnia_core:$backup_path/configs/omnia_core.container" /etc/containers/systemd/; then
         echo "[ERROR] [ROLLBACK] Failed to restore container config"
         return 1
     fi
-    
+
     echo "[INFO] [ROLLBACK] Files restored successfully"
     return 0
 }
@@ -2006,8 +2034,8 @@ display_cleanup_instructions() {
     echo -e "${YELLOW}1. Remove all container definitions: cd /etc/containers/systemd${NC}"
     echo -e "${YELLOW}2. Delete all container files: rm -rf *${NC}"
     echo -e "${YELLOW}3. Reload systemd daemon: systemctl daemon-reload${NC}"
-    echo -e "${YELLOW}4. Stop all containers: podman stop \$(podman ps -aq)${NC}"
-    echo -e "${YELLOW}5. Remove all containers: podman rm -f \$(podman ps -aq)${NC}"
+    echo -e "${YELLOW}4. Stop all containers: podman stop \$(podman ps -aq)${NC}"
+    echo -e "${YELLOW}5. Remove all containers: podman rm -f \$(podman ps -aq)${NC}"
     echo -e "${YELLOW}6. Clean shared path: rm -rf <omnia_shared_path>${NC}"
     echo -e "${YELLOW}7. Install required version: ./omnia.sh --install${NC}"
     echo ""
@@ -2015,7 +2043,6 @@ display_cleanup_instructions() {
     echo ""
 }
 
-# Main rollback function
 rollback_omnia_core() {
     echo -e "${GREEN}================================================================================${NC}"
     echo -e "${GREEN}                         OMNIA CORE ROLLBACK${NC}"
@@ -2287,7 +2314,14 @@ rollback_omnia_core() {
     # Clean up lock file before starting long-running ssh session
     rm -f "$lock_file" >/dev/null 2>&1 || true
     echo "[INFO] Rollback lock file removed before starting container session"
-    
+
+    # Clear upgrade guard lock if it exists (shared path visible to container and host)
+    local upgrade_guard_lock_path
+    upgrade_guard_lock_path=$(get_upgrade_guard_lock_path)
+
+    rm -f "$upgrade_guard_lock_path" >/dev/null 2>&1 || true
+    echo "[INFO] [ROLLBACK] Cleared upgrade guard lock: $upgrade_guard_lock_path"
+
     # Initialize SSH config and start container session
     init_ssh_config
     start_container_session
@@ -2325,4 +2359,4 @@ main() {
 }
 
 # Call the main function
-main "$1"
+main "$1"
\ No newline at end of file
diff --git a/prepare_oim/prepare_oim.yml b/prepare_oim/prepare_oim.yml
index 50c48fd3e5..f5ea607994 100644
--- a/prepare_oim/prepare_oim.yml
+++ b/prepare_oim/prepare_oim.yml
@@ -13,6 +13,9 @@
 # limitations under the License.
 ---
 
+- name: Check if upgrade is in progress
+  ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml
+
 - name: Set_fact for fetch omnia config credentials
   hosts: localhost
   connection: local
diff --git a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml
index 2cc6dfed26..444869291b 100644
--- a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml
+++ b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml
@@ -29,6 +29,7 @@
 
 - name: Pause for user to review warnings
   ansible.builtin.pause:
+    seconds: 30
     prompt: |
       ╔════════════════════════════════════════════╗
       ║       ⚠️  UPGRADE WARNINGS REVIEW  ⚠️        ║
@@ -42,6 +43,7 @@
 
       Please review these warnings carefully.
       Press ENTER to continue or CTRL+C to abort.
+      Continuing automatically in 30 seconds (a timed pause ignores ENTER: use CTRL+C then C to continue early)...
   when:
     - upgrade_warnings is defined
     - upgrade_warnings | length > 0
diff --git a/upgrade/upgrade_omnia.yml b/upgrade/upgrade_omnia.yml
index 61050ec244..ade6b1f173 100644
--- a/upgrade/upgrade_omnia.yml
+++ b/upgrade/upgrade_omnia.yml
@@ -18,3 +18,13 @@
 
 - name: Upgrade cluster tasks
   ansible.builtin.import_playbook: upgrade_cluster.yml
+
+- name: Clear upgrade guard lock
+  hosts: localhost
+  connection: local
+  gather_facts: false
+  tasks:
+    - name: Remove upgrade guard lock
+      ansible.builtin.file:
+        path: /opt/omnia/.data/upgrade_in_progress.lock
+        state: absent
diff --git a/utils/credential_utility/get_config_credentials.yml b/utils/credential_utility/get_config_credentials.yml
index 0e4c323b94..b77ba14b9b 100644
--- a/utils/credential_utility/get_config_credentials.yml
+++ b/utils/credential_utility/get_config_credentials.yml
@@ -13,6 +13,10 @@
 # limitations under the License.
 ---
 
+- name: Check if upgrade is in progress
+  ansible.builtin.import_playbook: ../upgrade_checkup.yml
+  tags: always
+
 - name: Include input project directory
   when: not project_dir_status | default(false) | bool
   ansible.builtin.import_playbook: ../include_input_dir.yml
diff --git a/utils/oim_cleanup.yml b/utils/oim_cleanup.yml
index edb9cfb207..4d959d5ea4 100644
--- a/utils/oim_cleanup.yml
+++ b/utils/oim_cleanup.yml
@@ -13,6 +13,10 @@
 #  limitations under the License.
 ---
 
+- name: Check if upgrade is in progress
+  ansible.builtin.import_playbook: upgrade_checkup.yml
+  tags: always
+
 - name: Include input project directory
   when: not project_dir_status | default(false) | bool
   ansible.builtin.import_playbook: include_input_dir.yml
diff --git a/utils/upgrade_checkup.yml b/utils/upgrade_checkup.yml
new file mode 100644
index 0000000000..5fb8582000
--- /dev/null
+++ b/utils/upgrade_checkup.yml
@@ -0,0 +1,33 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+- name: "Guard: block if upgrade is in progress"
+  hosts: localhost
+  connection: local
+  gather_facts: false
+  tasks:
+    - name: Check upgrade lock file
+      ansible.builtin.stat:
+        path: /opt/omnia/.data/upgrade_in_progress.lock
+      register: upgrade_lock
+
+    - name: Block playbook while upgrade is in progress
+      ansible.builtin.fail:
+        msg: >-
+          Upgrade is not completed fully.
+          Please run upgrade_omnia.yml to complete upgrade before running any other playbook using the below command:
+          "ansible-playbook /omnia/upgrade/upgrade_omnia.yml"
+          If you don't require input files to be migrated, reconfigure the default input files, remove the lock file using the following command
+          "rm /opt/omnia/.data/upgrade_in_progress.lock" and then proceed.
+      when: upgrade_lock.stat.exists

From d11fde8e868837f3c5403bd3b55f36b72ee60ae5 Mon Sep 17 00:00:00 2001
From: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com>
Date: Fri, 20 Feb 2026 18:14:34 +0530
Subject: [PATCH 172/172] Slurm delete node - drain node before delete -
 skip_merge new option (#3986)

* Node drain logic for deletion

* Shell instead of command for piping

* lint fixes

* Updated permission for slurmdbd
Added new force_conf option to allow confs to pass through validation

* Removed new file

* Renamed force_conf to skip_merge
---
 .../input_validation/schema/omnia_config.json |   4 +
 .../validation_flows/common_validation.py     |   9 +-
 .../slurm_config/tasks/build_slurm_conf.yml   |   5 +
 .../slurm_config/tasks/check_ctld_running.yml |  12 +-
 discovery/roles/slurm_config/tasks/confs.yml  |  14 ++-
 .../slurm_config/tasks/create_slurm_dir.yml   |   1 +
 .../tasks/drain_and_remove_node.yml           | 109 ++++++++++++++++++
 .../roles/slurm_config/tasks/remove_node.yml  |   2 +-
 discovery/roles/slurm_config/vars/main.yml    |   6 +-
 input/omnia_config.yml                        |  10 ++
 10 files changed, 161 insertions(+), 11 deletions(-)
 create mode 100644 discovery/roles/slurm_config/tasks/drain_and_remove_node.yml

diff --git a/common/library/module_utils/input_validation/schema/omnia_config.json b/common/library/module_utils/input_validation/schema/omnia_config.json
index f53485770f..ca7266124c 100644
--- a/common/library/module_utils/input_validation/schema/omnia_config.json
+++ b/common/library/module_utils/input_validation/schema/omnia_config.json
@@ -19,6 +19,10 @@
             "minLength": 1,
             "description": "Name of the nfs storage in storage_config.yml" 
           },
+          "skip_merge": { 
+            "type": "boolean", 
+            "description": "Variable indicates whether a specific configuration file path under config_sources should be used as-is without merging" 
+          },
           "config_sources": {
             "type": "object",
             "description": "Config can be a file path or inline mapping",
diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py
index f577a4e9b8..36f55130d4 100644
--- a/common/library/module_utils/input_validation/validation_flows/common_validation.py
+++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py
@@ -1074,9 +1074,12 @@ def validate_omnia_config(
                     "slurm NFS not provided",
                     f"NFS name {', '.join(diff_set)} required for slurm is not defined in {storage_config}"
                     ))
-        cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')]
+
         skip_conf_validation = os.path.exists("/opt/omnia/input/.skip_slurm_conf_validation")
-        for cfg_path_dict in cnfg_src:
+        cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')]
+        skip_merge_list = [clst.get('skip_merge', False) for clst in data.get('slurm_cluster')]
+        for idx, cfg_path_dict in enumerate(cnfg_src):
+            skip_merge = skip_merge_list[idx]
             for k,v in cfg_path_dict.items():
                 conf_dict = None
                 if isinstance(v, str):
@@ -1086,7 +1089,7 @@ def validate_omnia_config(
                                 f"provided conf path for {k} - {v} does not exist"))
                         continue
                     else: # path exists
-                        if not skip_conf_validation:
+                        if not skip_merge and not skip_conf_validation:
                             conf_dict, duplicate_keys = parse_slurm_conf(v, k, False)
                             if duplicate_keys:
                                 errors.append(
diff --git a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml
index 9d5d0f0944..40b6137172 100644
--- a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml
+++ b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml
@@ -12,6 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 ---
+- name: Read NodeName parameters from iDRAC
+  ansible.builtin.include_tasks: read_node_idrac.yml
+  when: cmpt_list
+  loop: "{{ cmpt_list }}"
+
 - name: Append node_params list into NodeName list
   ansible.builtin.set_fact:
     apply_config: "{{ apply_config | default({})
diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml
index 7d908169ab..ce27d3c362 100644
--- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml
+++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml
@@ -22,6 +22,16 @@
   register: ssh_check
   ignore_errors: true
 
+- name: Drain and remove nodes if any
+  ansible.builtin.include_tasks: drain_and_remove_node.yml
+  loop: "{{ nodes_in_normal_not_in_cmpt | default([]) }}"  # templated before 'when', so default an undefined list
+  loop_control:
+    loop_var: node_to_remove
+  when:
+    - ssh_check is success
+    - nodes_in_normal_not_in_cmpt is defined
+    - nodes_in_normal_not_in_cmpt | length > 0
+
 - name: Enter slurm controller when pingable
   when:
     - ssh_check is success
@@ -37,7 +47,7 @@
       register: service_facts
       ignore_unreachable: true
 
-    - name: Fail if slurmctld is unreachable
+    - name: Check slurmctld is reachable
       ansible.builtin.fail:
         msg: "Failed to connect to {{ ctld }}."
       when: service_facts is unreachable
diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml
index c5f7953b0d..1e5a4e507e 100644
--- a/discovery/roles/slurm_config/tasks/confs.yml
+++ b/discovery/roles/slurm_config/tasks/confs.yml
@@ -17,13 +17,16 @@
     apply_config: "{{ __default_config }}"
   no_log: true
 
-- name: Read NodeName parameters
-  ansible.builtin.include_tasks: read_node_idrac.yml
-  when: cmpt_list
-  loop: "{{ cmpt_list }}"
+- name: Remove keys from conf_files if they have string values in configs_input (when skip_merge is true)
+  ansible.builtin.set_fact:
+    conf_files: "{{ conf_files | difference(configs_input | dict2items | selectattr('value', 'string') | map(attribute='key') | list) }}"
+  when:
+    - skip_merge | default(false)
+    - configs_input is defined
 
 - name: Build slurm.conf
   ansible.builtin.include_tasks: build_slurm_conf.yml
+  when: "'slurm' in conf_files"
 
 - name: Slurm dbd opts
   ansible.builtin.set_fact:
@@ -167,12 +170,13 @@
 - name: Generate slurmd opts for Configless # TODO: Move to $SLURMD_OPTIONS /etc/default/slurmd
   ansible.builtin.set_fact:
     conf_server: "--conf-server {{ ctld_list | map('regex_replace', '$', ':' ~ (slurm_conf_dict.get('SlurmctldPort', '6817') | string)) | join(',') }}"
+  when: slurm_conf_dict is defined
 
 - name: Write merged .conf
   ansible.builtin.copy:
     content: "{{ item.ini_lines | join('\n') }}\n"
     dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/{{ item.item.key }}.conf"
-    mode: "0640"
+    mode: "{{ slurm_dbd_mode if item.item.key == 'slurmdbd' else slurm_mode }}"
     owner: "{{ slurm_user }}"
     group: "{{ slurm_user_group }}"
     remote_src: "{{ copy_from_oim }}"
diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
index e4ac760d77..b68bcbbded 100644
--- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
+++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml
@@ -60,6 +60,7 @@
   ansible.builtin.set_fact:
     cluster_name: "{{ slurm_cluster[0].cluster_name }}"
     configs_input: "{{ slurm_cluster[0].config_sources | default({}) | dict2items | rejectattr('value', 'falsy') | list | items2dict }}"
+    skip_merge: "{{ slurm_cluster[0].skip_merge | default(false) }}"
     slurm_config_path: "{{ share_path }}/{{ slurm_dir_name }}"
     controller_trackfile_path: "{{ share_path }}/ctld_track"
 
diff --git a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml
new file mode 100644
index 0000000000..da1c41d3fe
--- /dev/null
+++ b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml
@@ -0,0 +1,109 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+- name: Check if node exists in Slurm cluster
+  ansible.builtin.command: scontrol show node {{ node_to_remove }}
+  register: node_exists_check
+  failed_when: false
+  ignore_unreachable: true
+  changed_when: false
+  delegate_to: "{{ ctld }}"
+
+- name: Skip if node does not exist
+  ansible.builtin.debug:
+    msg: "Node {{ node_to_remove }} not found in cluster, skipping removal"
+  when:
+    - node_exists_check is reachable
+    - node_exists_check.rc != 0
+
+- name: Process node removal
+  when:
+    - node_exists_check is reachable
+    - node_exists_check.rc == 0
+  ignore_unreachable: true
+  block:
+    - name: Get current job count on node
+      ansible.builtin.shell:
+        cmd: |
+          set -o pipefail
+          squeue -w {{ node_to_remove }} -h | wc -l
+      register: current_jobs
+      changed_when: false
+      delegate_to: "{{ ctld }}"
+
+    - name: Display job information
+      ansible.builtin.debug:
+        msg: "Node {{ node_to_remove }} currently has {{ current_jobs.stdout }} running job(s)"
+
+    - name: Drain the node to prevent new job assignments
+      ansible.builtin.command: >
+        scontrol update NodeName={{ node_to_remove }}
+        State=DRAIN
+        Reason="Scheduled removal - waiting for jobs to complete"
+      changed_when: true
+      delegate_to: "{{ ctld }}"
+
+    - name: Wait for all jobs to complete on the node
+      ansible.builtin.shell:
+        cmd: |
+          set -o pipefail
+          squeue -w {{ node_to_remove }} -h | wc -l
+      register: job_count_check
+      until: job_count_check.stdout | int == 0
+      retries: "{{ (node_drain_timeout / node_drain_delay) | int }}"
+      delay: "{{ node_drain_delay }}"
+      changed_when: false
+      delegate_to: "{{ ctld }}"
+      when: current_jobs.stdout | int > 0
+
+    - name: Confirm jobs completed
+      ansible.builtin.debug:
+        msg: "All jobs on {{ node_to_remove }} have completed"
+      when: current_jobs.stdout | int > 0
+
+    - name: Log node removal
+      ansible.builtin.debug:
+        msg: "Node {{ node_to_remove }} has been drained and all of its jobs have completed"
+
+  rescue:
+    - name: Log node removal failure
+      ansible.builtin.debug:
+        msg: "Failed to drain node {{ node_to_remove }}"
+
+    - name: Remove slurm node with running job after timeout
+      ansible.builtin.pause:
+        prompt: |
+          Node {{ node_to_remove }} has been DRAINED to prevent new job assignments.
+          Jobs are still running on {{ node_to_remove }} after wait of {{ node_drain_timeout }} seconds.
+          Options:
+            1. Press Ctrl+C then 'A' to abort
+            2. Press Enter to force removal (jobs will be killed)
+      when: not (force_scancel_node | bool)  # cast: extra-vars arrive as strings and "false" is truthy
+
+    - name: Force cancel jobs if timeout reached
+      ansible.builtin.command: scancel -f -w {{ node_to_remove }}
+      changed_when: true
+      failed_when: false
+      delegate_to: "{{ ctld }}"
+
+  always:
+    - name: Set node to DOWN state
+      ansible.builtin.command: >
+        scontrol update NodeName={{ node_to_remove }}
+        State=DOWN
+        Reason="Node removed from cluster"
+      changed_when: true
+      failed_when: false
+      delegate_to: "{{ ctld }}"
+      when: node_exists_check.rc == 0
diff --git a/discovery/roles/slurm_config/tasks/remove_node.yml b/discovery/roles/slurm_config/tasks/remove_node.yml
index 4dc0217559..ba93bb086a 100644
--- a/discovery/roles/slurm_config/tasks/remove_node.yml
+++ b/discovery/roles/slurm_config/tasks/remove_node.yml
@@ -30,7 +30,7 @@
 - name: Update normal partition Nodes to match cmpt_list
   ansible.builtin.set_fact:
     updated_partitions: "{{ updated_partitions | default([])
-     + [item | combine({'Nodes': cmpt_list | join(',')}) if item.PartitionName == slurm_partition_name else item] }}"
+     + [item | combine({'Nodes': (cmpt_list | join(',')) if cmpt_list | length > 0 else 'ALL'}) if item.PartitionName == slurm_partition_name else item] }}"
   loop: "{{ slurm_conf_dict.PartitionName | default([]) }}"
   when:
     - "'slurm' in conf_merge_dict"
diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml
index 1593f791cb..d708eb0777 100644
--- a/discovery/roles/slurm_config/vars/main.yml
+++ b/discovery/roles/slurm_config/vars/main.yml
@@ -68,6 +68,7 @@ gpu_slurm_conf:
   SlurmdParameters: l3cache_as_socket
 innodb_buffer_pool_size: 4G
 innodb_lock_wait_timeout: 900
+conf_server: "--conf-server {{ ctld_list | join(',') }}"
 # TODO tmp
 nodes_yaml: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir/nodes/nodes.yaml"
 bmc_username: "{{ hostvars['localhost']['bmc_username'] }}"
@@ -117,12 +118,15 @@ munge_dir_mode: "0700"
 common_mode: "0755"
 slurm_dbd_mode: "0600"
 slurm_db_cnf_mode: "0600"
+node_drain_timeout: 900
+node_drain_delay: 30
+force_scancel_node: false
 dbd_slurm_conf:
   AccountingStoragePort: "{{ slurm_dbd_port }}"
   AccountingStorageType: accounting_storage/slurmdbd
 partition_params:
   PartitionName: "{{ slurm_partition_name }}"
-  Nodes: "{{ cmpt_list | join(',') }}"
+  Nodes: "{{ cmpt_list | join(',') if cmpt_list else 'ALL' }}"
   MaxTime: "INFINITE"
   State: "UP"
   Default: "YES"
diff --git a/input/omnia_config.yml b/input/omnia_config.yml
index bb5a4f06fa..943d70e530 100644
--- a/input/omnia_config.yml
+++ b/input/omnia_config.yml
@@ -27,6 +27,15 @@
 # Storage name corresponding to the NFS share to be used by slurm cluster 
 # This should match with exactly with a entry in storage_config.yml
 
+# skip_merge
+# Variable indicates whether a specific configuration file path
+# under config_sources should be used as-is without merging
+# If skip_merge is set to true for a configuration source path,
+# that configuration file will be applied directly
+# without merging with defaults or existing configurations
+# It accepts true and false values
+# Default value is false
+
 # config_sources
 # defines how the Slurm configuration files are provided to the cluster.
 # <conf name>: 
@@ -50,6 +59,7 @@
 slurm_cluster:
   - cluster_name: slurm_cluster
     nfs_storage_name: nfs_slurm
+    # skip_merge: true
     # config_sources:
     #   slurm:
     #     SlurmctldTimeout: 60