From 97a86f9ff5a9b33581d894d38e1a7c839eae2b3a Mon Sep 17 00:00:00 2001
From: Nagachandan-P
Date: Tue, 23 Jun 2026 10:08:02 +0000
Subject: [PATCH 1/3] cleanup k8s script files also

Extend the K8s NFS cleanup so that, in addition to the configured
directories, the root-level files listed in the new k8s_cleanup_files
variable are deleted from every path in all_k8s_base_paths. The
pre-cleanup warning, the completion summary and oim_cleanup_note now
mention files as well as directories.

The delete task loops over product(all_k8s_base_paths, k8s_cleanup_files),
so every loop item is a [base_path, file] pair rather than a string. The
completion summary therefore filters on item.0 (the base path) and joins
the pair with '/' when printing the deleted file.

Signed-off-by: Nagachandan-P
---
Note: the blob ids on the index lines below were recorded before the
item.0 / join('/') correction to the completion summary.

 .../tasks/cleanup_k8s.yml                     | 28 +++++++++++++++++--
 .../oim_container_cleanup/vars/main.yml       | 19 +++++++++----
 2 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/utils/roles/oim_cleanup/oim_container_cleanup/tasks/cleanup_k8s.yml b/utils/roles/oim_cleanup/oim_container_cleanup/tasks/cleanup_k8s.yml
index 6f0da6b43e..3b6d39633b 100644
--- a/utils/roles/oim_cleanup/oim_container_cleanup/tasks/cleanup_k8s.yml
+++ b/utils/roles/oim_cleanup/oim_container_cleanup/tasks/cleanup_k8s.yml
@@ -121,9 +121,10 @@
 - name: Display k8s cleanup information
   ansible.builtin.debug:
     msg: |
-      WARNING: This will delete K8s-related directories from NFS shares:
+      WARNING: This will delete K8s-related directories and files from NFS shares:
       {% for mount in k8s_storage_mounts %}
       Storage: {{ mount.name }} ({{ mount.mount_point }})
+        Directories:
       {% for item in k8s_static_dirs_stat.results %}
       {% if item.stat.exists and item.item.startswith(mount.mount_point) %}
         - {{ item.item }} ({{ item.item | basename }})
@@ -141,6 +142,10 @@
       {% else %}
         Node IP directories: Skipped (k8s_cleanup_node_ips: false)
       {% endif %}
+        Root-level files:
+      {% for file in k8s_cleanup_files %}
+        - {{ mount.mount_point }}/{{ file }}
+      {% endfor %}
       {% endfor %}

       CRITICAL WARNING: Deleting NFS shared data will affect ALL nodes!
@@ -173,6 +178,17 @@
   when: k8s_cleanup_needed | default(false)
   loop: "{{ k8s_all_cleanup_paths }}"

+- name: Delete K8s root-level files
+  ansible.builtin.file:
+    path: "{{ item.0 }}/{{ item.1 }}"
+    state: absent
+  register: k8s_files_cleanup_result
+  when: k8s_cleanup_needed | default(false)
+  loop: "{{ all_k8s_base_paths | product(k8s_cleanup_files) | list }}"
+  loop_control:
+    label: "{{ item.0 }}/{{ item.1 }}"
+  failed_when: false
+
 - name: Display k8s cleanup completion message
   ansible.builtin.debug:
     msg: |
@@ -182,10 +198,18 @@
       {% set mount_deleted = k8s_cleanup_result.results | selectattr('item', 'search', '^' + mount.mount_point) | selectattr('changed') | list %}
       {% if mount_deleted %}
       {% for item in mount_deleted %}
-        -> Deleted: {{ item.item }}
+        -> Deleted directory: {{ item.item }}
       {% endfor %}
       {% else %}
         -> No directories deleted from this storage
       {% endif %}
+      {% set mount_files_deleted = k8s_files_cleanup_result.results | selectattr('item.0', 'search', '^' + mount.mount_point) | selectattr('changed') | list %}
+      {% if mount_files_deleted %}
+      {% for item in mount_files_deleted %}
+        -> Deleted file: {{ item.item | join('/') }}
+      {% endfor %}
+      {% else %}
+        -> No files deleted from this storage
+      {% endif %}
       {% endfor %}
   when: k8s_cleanup_needed | default(false)
diff --git a/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml b/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml
index 28dd327351..5f3d60ea72 100644
--- a/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml
+++ b/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml
@@ -218,13 +218,14 @@ oim_cleanup_note: |
      - For Slurm configuration backup, use the separate utility: ansible-playbook utils/slurm_config_util.yml --tags config_backup
      - To skip slurm cleanup, run: ansible-playbook utils/oim_cleanup.yml --skip-tags slurm

-  3. The playbook removes K8s-related directories from NFS shares:
-     - ssh, calico, metallb, helm, packages, telemetry, karavi-observability, csi-driver-powerscale, nfs-client-provisioner
+  3. The playbook removes K8s-related directories and files from NFS shares:
+     - Directories: ssh, calico, metallb, helm, packages, telemetry, karavi-observability, csi-driver-powerscale, nfs-client-provisioner
+     - Files: control-plane-join-command.sh, generate-control-plane-join.sh, worker-join-command.sh, pulp_webserver.crt
      - Node IP directories (when k8s_cleanup_node_ips: true)
-     - Directory list is configurable via k8s_cleanup_directories variable in vars/main.yml
-     - Supports multi-storage: Cleans directories from all K8s storage mounts configured in omnia_config.yml
+     - Directory and file lists are configurable via k8s_cleanup_directories and k8s_cleanup_files variables in vars/main.yml
+     - Supports multi-storage: Cleans directories and files from all K8s storage mounts configured in omnia_config.yml
      - To skip k8s cleanup, run: ansible-playbook utils/oim_cleanup.yml --skip-tags k8s
-     - No backup is created for k8s directories (directory deletion only)
+     - No backup is created for k8s directories and files (deletion only)

   4. The omnia_core container is NOT removed by oim_cleanup.yml.
      - To delete it, log in to the OIM node and run:
@@ -248,6 +249,14 @@ k8s_cleanup_directories:
   - csi-driver-powerscale
   - nfs-client-provisioner

+# List of k8s root-level files to delete from NFS share
+# Edit this list to add/remove files as needed
+k8s_cleanup_files:
+  - control-plane-join-command.sh
+  - generate-control-plane-join.sh
+  - worker-join-command.sh
+  - pulp_webserver.crt
+
 # Delete node IP directories (pattern: x.x.x.x)
 # Set to false to skip node directories
 k8s_cleanup_node_ips: true

From 6e97680e4cd3c21ddca9df214b9961374396ddbb Mon Sep 17 00:00:00 2001
From: Nagachandan-P
Date: Tue, 23 Jun 2026 11:00:21 +0000
Subject: [PATCH 2/3] lint issue fix

Wrap the two long Jinja2 "set" statements in the cleanup completion
summary across several lines. No change to the filters applied: the
directory list still matches on item, the file list on item.0.

Signed-off-by: Nagachandan-P
---
Note: the blob ids on the index line below were recorded before the
item.0 / join('/') correction carried through from patch 1.

 .../oim_container_cleanup/tasks/cleanup_k8s.yml | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/utils/roles/oim_cleanup/oim_container_cleanup/tasks/cleanup_k8s.yml b/utils/roles/oim_cleanup/oim_container_cleanup/tasks/cleanup_k8s.yml
index 3b6d39633b..52e324cd76 100644
--- a/utils/roles/oim_cleanup/oim_container_cleanup/tasks/cleanup_k8s.yml
+++ b/utils/roles/oim_cleanup/oim_container_cleanup/tasks/cleanup_k8s.yml
@@ -195,7 +195,10 @@
       K8s-related cleanup completed.
       {% for mount in k8s_storage_mounts %}
       Storage: {{ mount.name }} ({{ mount.mount_point }})
-      {% set mount_deleted = k8s_cleanup_result.results | selectattr('item', 'search', '^' + mount.mount_point) | selectattr('changed') | list %}
+      {% set mount_deleted = k8s_cleanup_result.results |
+          selectattr('item', 'search', '^' + mount.mount_point) |
+          selectattr('changed') |
+          list %}
       {% if mount_deleted %}
       {% for item in mount_deleted %}
         -> Deleted directory: {{ item.item }}
@@ -203,7 +206,10 @@
       {% else %}
         -> No directories deleted from this storage
       {% endif %}
-      {% set mount_files_deleted = k8s_files_cleanup_result.results | selectattr('item.0', 'search', '^' + mount.mount_point) | selectattr('changed') | list %}
+      {% set mount_files_deleted = k8s_files_cleanup_result.results |
+          selectattr('item.0', 'search', '^' + mount.mount_point) |
+          selectattr('changed') |
+          list %}
       {% if mount_files_deleted %}
       {% for item in mount_files_deleted %}
         -> Deleted file: {{ item.item | join('/') }}

From 4c064e375c3381eaaa6202b6cb39f0f19e3c4c44 Mon Sep 17 00:00:00 2001
From: Nagachandan-P
Date: Wed, 24 Jun 2026 06:28:40 +0000
Subject: [PATCH 3/3] mpi env set for default

Drop the openmpi_support guard from the login/compiler and slurm node
cloud-init templates (aarch64 and x86_64), so the OpenMPI banner lines
and the setup_doca_mpi_env.sh call are rendered unconditionally. The
ucx_support guard is left unchanged. The accompanying comment now reads
"configured by default" instead of "used by default".

Signed-off-by: Nagachandan-P
---
 .../cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 | 4 +---
 .../cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2  | 5 +----
 .../cloud_init/ci-group-slurm_node_aarch64.yaml.j2          | 6 +-----
 .../templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 | 6 +-----
 4 files changed, 4 insertions(+), 17 deletions(-)

diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2
index dfdaa37111..5879e0ba3d 100644
--- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2
+++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2
@@ -281,18 +281,16 @@
 {% endif %}

   # UCX and OpenMPI auto-compilation disabled
-  # DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 used by default
+  # DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 configured by default
 {% if hostvars['localhost']['ucx_support'] %}
   - echo "===== UCX Configuration ====="
   - echo "UCX version specified in software_config.json (available for manual compilation)"
   - echo "Default stack - DOCA UCX 1.20.0 (system default)"
 {% endif %}
-{% if hostvars['localhost']['openmpi_support'] %}
   - echo "===== OpenMPI Configuration ====="
   - echo "OpenMPI version specified in software_config.json (available for manual compilation)"
   - echo "Default stack - DOCA OpenMPI 4.1.9a1 (system default)"
   - bash /usr/local/bin/setup_doca_mpi_env.sh || echo "DOCA MPI environment setup failed (non-critical)"
-{% endif %}

 {% if ldms_support %}
   - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log
diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
index 0c727b0c01..77109a4828 100644
--- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
+++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2
@@ -282,19 +282,16 @@
 {% endif %}

   # UCX and OpenMPI auto-compilation disabled
-  # DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 used by default
+  # DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 configured by default
 {% if hostvars['localhost']['ucx_support'] %}
   - echo "===== UCX Configuration ====="
   - echo "UCX version specified in software_config.json (available for manual compilation)"
   - echo "Default stack - DOCA UCX 1.20.0 (system default)"
 {% endif %}
-{% if hostvars['localhost']['openmpi_support'] %}
   - echo "===== OpenMPI Configuration ====="
   - echo "OpenMPI version specified in software_config.json (available for manual compilation)"
   - echo "Default stack - DOCA OpenMPI 4.1.9a1 (system default)"
   - bash /usr/local/bin/setup_doca_mpi_env.sh || echo "DOCA MPI environment setup failed (non-critical)"
-
-{% endif %}

 {% if ldms_support %}
   - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log
diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
index cdce20193e..449cedfa8f 100644
--- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
+++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2
@@ -422,9 +422,7 @@
   # DOCA and IB configuration - now ready before vendor_data mounts
   - bash /usr/local/bin/doca-install.sh || true
   - bash /usr/local/bin/configure-ib-network.sh
-{% if hostvars['localhost']['openmpi_support'] %}
   - bash /usr/local/bin/setup_doca_mpi_env.sh || echo "DOCA MPI environment setup failed (non-critical)"
-{% endif %}

 {# Mount-specific runcmd entries - moved after DOCA to ensure RDMA is available #}
 {%- if cloud_init_groups_dict[functional_group_name].runcmd is defined and cloud_init_groups_dict[functional_group_name].runcmd is not none %}
@@ -513,17 +511,15 @@
   - mount -av
 {% endif %}
   # UCX and OpenMPI auto-compilation disabled
-  # DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 used by default
+  # DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 configured by default
 {% if hostvars['localhost']['ucx_support'] %}
   - echo "===== UCX Configuration ====="
   - echo "UCX version specified in software_config.json (available for manual compilation)"
   - echo "Default stack - DOCA UCX 1.20.0 (system default)"
 {% endif %}
-{% if hostvars['localhost']['openmpi_support'] %}
   - echo "===== OpenMPI Configuration ====="
   - echo "OpenMPI version specified in software_config.json (available for manual compilation)"
   - echo "Default stack - DOCA OpenMPI 4.1.9a1 (system default)"
-{% endif %}

 {% if ldms_support %}
   - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log
diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
index ee33e0ff6f..ecf388cdeb 100644
--- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
+++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2
@@ -428,9 +428,7 @@
   # DOCA and IB configuration - now ready before vendor_data mounts
   - bash /usr/local/bin/doca-install.sh || echo "DOCA install failed (non-critical)"
   - bash /usr/local/bin/configure-ib-network.sh || echo "IB network configuration failed (non-critical)"
-{% if hostvars['localhost']['openmpi_support'] %}
   - bash /usr/local/bin/setup_doca_mpi_env.sh || echo "DOCA MPI environment setup failed (non-critical)"
-{% endif %}

 {# Mount-specific runcmd entries - moved after DOCA to ensure RDMA is available #}
 {%- if cloud_init_groups_dict[functional_group_name].runcmd is defined and cloud_init_groups_dict[functional_group_name].runcmd is not none %}
@@ -515,17 +513,15 @@
   - mount -av
 {% endif %}
   # UCX and OpenMPI auto-compilation disabled
-  # DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 used by default
+  # DOCA UCX 1.20.0 and OpenMPI 4.1.9a1 configured by default
 {% if hostvars['localhost']['ucx_support'] %}
   - echo "===== UCX Configuration ====="
   - echo "UCX version specified in software_config.json (available for manual compilation)"
   - echo "Default stack - DOCA UCX 1.20.0 (system default)"
 {% endif %}
-{% if hostvars['localhost']['openmpi_support'] %}
   - echo "===== OpenMPI Configuration ====="
   - echo "OpenMPI version specified in software_config.json (available for manual compilation)"
   - echo "Default stack - DOCA OpenMPI 4.1.9a1 (system default)"
-{% endif %}

 {% if ldms_support %}
   - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log