Skip to content
Merged
19 changes: 8 additions & 11 deletions discovery/roles/ome_discovery/tasks/generate_discovery_report.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,7 @@
- ""
- "3. Update HOSTNAME, FUNCTIONAL_GROUP_NAME, GROUP_NAME as needed."
- ""
- "4. Update the following parameter in provision_config.yml:"
- " pxe_mapping_file_path: {{ pxe_mapping_output_file }}"
- ""
- "5. Run:"
- "4. Run:"
- " ansible-playbook provision/provision.yml"
- "============================================================"

Expand All @@ -84,12 +81,12 @@
- ""
- "3. Update HOSTNAME, FUNCTIONAL_GROUP_NAME, GROUP_NAME as needed."
- ""
- "4. Update the following parameter in provision_config.yml:"
- " pxe_mapping_file_path: {{ pxe_mapping_output_file }}"
- ""
- "5. If using BuildStream, manually copy the PXE mapping file to GitLab:"
- " input/pxe_mapping_file.csv"
- "4. If GitLab server is not yet up, copy the generated file to"
- " /opt/omnia/input/project_default/pxe_mapping_file.csv in the omnia_core container"
- ""
- "6. Run:"
- " ansible-playbook provision/provision.yml"
- "5. If the GitLab server is up and running, copy the file to"
- " input/pxe_mapping_file.csv in the GitLab project and commit the changes"
- " after building the images using the build pipeline. Committing the PXE mapping"
- " file will automatically trigger the deploy pipeline and deploy the images on"
- " the nodes listed in the newly committed PXE mapping file."
- "============================================================"
238 changes: 236 additions & 2 deletions omnia.sh

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,12 @@ echo "====================================================="
# Detect CUDA major version for DCGM package selection
echo "[INFO] Detecting CUDA version for DCGM package compatibility..."
# Try to get CUDA version from nvidia-smi
CUDA_VERSION=$(nvidia-smi | grep "CUDA Version" | awk '{print $9}' | cut -d'.' -f1)
CUDA_VERSION=$(nvidia-smi | sed -nE 's/.*CUDA( UMD)? Version: *([0-9]+).*/\2/p')

# Fallback: Try to get CUDA version from nvcc if available
if [ -z "$CUDA_VERSION" ]; then
if command -v nvcc &>/dev/null; then
CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $5}' | cut -d',' -f1 | cut -d'.' -f1)
echo "[INFO] CUDA version detected from nvcc: $CUDA_VERSION"
else
echo "[ERROR] Could not detect CUDA version from nvidia-smi or nvcc."
echo "[ERROR] CUDA toolkit is required for DCGM package version detection. Skipping DCGM setup."
echo "[ERROR] Could not detect CUDA version from nvidia-smi"
echo "[ERROR] CUDA driver is required for DCGM package version detection. Skipping DCGM setup."
exit 1
fi
else
echo "[INFO] CUDA major version detected from nvidia-smi: $CUDA_VERSION"
fi
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ spec:
- ip: "127.0.0.1"
hostnames:
- "mysqldb"
terminationGracePeriodSeconds: 10
terminationGracePeriodSeconds: 120
tolerations:
- effect: NoExecute
key: node.kubernetes.io/not-ready
Expand Down
103 changes: 62 additions & 41 deletions rollback/playbooks/rollback_slurm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
---

# ============================================================================
# Play 1: Pre-flight — manifest gating, BuildStream terminal gate
# ============================================================================
- name: Rollback Slurm feature updates
hosts: localhost
connection: local
Expand All @@ -36,66 +40,83 @@
- name: Read rollback_manifest.yml
ansible.builtin.include_vars:
file: "{{ rollback_manifest_path }}"
name: rollback_manifest
name: manifest

- name: Skip if slurm already rolled back
ansible.builtin.meta: end_play
- name: Initialize slurm_skip
ansible.builtin.set_fact:
slurm_skip: false

- name: Set slurm_skip when already completed
ansible.builtin.set_fact:
slurm_skip: true
when:
- rollback_manifest.component_status[component_name] | default('pending') == 'completed'
- manifest.component_status[component_name] | default('pending') == 'completed'

- name: "Mark as skipped — BuildStream terminal gate active (C-24)"
ansible.builtin.copy:
content: >-
{{ rollback_manifest | combine({
'component_status': rollback_manifest.component_status | combine({
{{ manifest | combine({
'component_status': manifest.component_status | combine({
component_name: 'skipped'
})
}) | to_nice_yaml }}
dest: "{{ rollback_manifest_path }}"
mode: '0644'
when:
- not slurm_skip
- hostvars['localhost']['build_stream_terminal'] | default(false) | bool
- manifest.component_status.build_stream | default('pending') == 'completed'

# BuildStream terminal gate (C-24).
# The gate is considered active only when BOTH hold:
#   1. localhost carries a truthy build_stream_terminal fact, and
#   2. the manifest records the build_stream component as 'completed'.
# Later plays in this playbook decide whether to end early by reading
# hostvars['localhost']['slurm_skip'], so the flag must be set before this
# play is allowed to end.
- name: "Set slurm_skip — BuildStream terminal gate active (C-24)"
  ansible.builtin.set_fact:
    slurm_skip: true
  when:
    - hostvars['localhost']['build_stream_terminal'] | default(false) | bool
    - manifest.component_status.build_stream | default('pending') == 'completed'

# End the play under exactly the same two-part condition as the set_fact
# above. Gating on build_stream_terminal alone would end the play with
# slurm_skip still false and the pre-flight block skipped, while the later
# plays (which only look at slurm_skip) would still proceed.
- name: "Skip — BuildStream terminal gate active (C-24)"
  ansible.builtin.meta: end_play
  when:
    - hostvars['localhost']['build_stream_terminal'] | default(false) | bool
    - manifest.component_status.build_stream | default('pending') == 'completed'

- name: Set slurm rollback status to in-progress
ansible.builtin.copy:
content: >-
{{ rollback_manifest | combine({
'component_status': rollback_manifest.component_status | combine({
component_name: 'in-progress'
})
}) | to_nice_yaml }}
dest: "{{ rollback_manifest_path }}"
mode: '0644'
- name: Block when slurm is already completed
when: not slurm_skip
block:
- name: Set slurm rollback status to in-progress
ansible.builtin.copy:
content: >-
{{ manifest | combine({
'component_status': manifest.component_status | combine({
component_name: 'in-progress'
})
}) | to_nice_yaml }}
dest: "{{ rollback_manifest_path }}"
mode: '0644'

- name: "Display rollback status in-progress — {{ component_name }}"
ansible.builtin.debug:
msg: "[ROLLBACK] Component '{{ component_name }}' — status changed to: in-progress"
- name: "Display rollback status in-progress — {{ component_name }}"
ansible.builtin.debug:
msg: "[ROLLBACK] Component '{{ component_name }}' — status changed to: in-progress"

- name: Check for existing reboot state file
ansible.builtin.stat:
path: /opt/omnia/.data/slurm_rollback_reboot_state.yml
register: _reboot_state_stat
- name: Check for existing reboot state file
ansible.builtin.stat:
path: /opt/omnia/.data/slurm_rollback_reboot_state.yml
register: _reboot_state_stat

- name: Load reboot state from previous run
ansible.builtin.include_vars:
file: /opt/omnia/.data/slurm_rollback_reboot_state.yml
name: _reboot_state
when: _reboot_state_stat.stat.exists | default(false)
- name: Load reboot state from previous run
ansible.builtin.include_vars:
file: /opt/omnia/.data/slurm_rollback_reboot_state.yml
name: _reboot_state
when: _reboot_state_stat.stat.exists | default(false)

- name: Set previously successful reboot list
ansible.builtin.set_fact:
slurm_previously_rebooted: "{{ _reboot_state.successfully_rebooted | default([]) }}"
when: _reboot_state_stat.stat.exists | default(false)
- name: Set previously successful reboot list
ansible.builtin.set_fact:
slurm_previously_rebooted: "{{ _reboot_state.successfully_rebooted | default([]) }}"
when: _reboot_state_stat.stat.exists | default(false)

- name: Initialize previously rebooted list (no prior state)
ansible.builtin.set_fact:
slurm_previously_rebooted: []
when: not (_reboot_state_stat.stat.exists | default(false))
- name: Initialize previously rebooted list (no prior state)
ansible.builtin.set_fact:
slurm_previously_rebooted: []
when: not (_reboot_state_stat.stat.exists | default(false))

- name: Create OIM host group for cloud-init/BSS update
ansible.builtin.import_playbook: ../../utils/create_container_group.yml
Expand All @@ -111,8 +132,7 @@
tasks:
- name: Skip if slurm upgrade not needed
ansible.builtin.meta: end_play
when:
- hostvars['localhost']['slurm_skip'] | default(false) | bool
when: hostvars['localhost']['slurm_skip'] | default(false) | bool

- name: Read rollback_manifest.yml
ansible.builtin.include_vars:
Expand Down Expand Up @@ -142,6 +162,7 @@
}) | to_nice_yaml }}
dest: "{{ rollback_manifest_path }}"
mode: '0644'
delegate_to: localhost

- name: End play
ansible.builtin.meta: end_play
Expand Down Expand Up @@ -249,13 +270,13 @@
tasks:
- name: Skip if slurm upgrade not needed
ansible.builtin.meta: end_play
when: slurm_skip | default(false) | bool
when: hostvars['localhost']['slurm_skip'] | default(false) | bool

- name: Initialize state
ansible.builtin.set_fact:
node_status:
hostname: "{{ inventory_hostname }}"
reboot: false # TODO: rename as reboot_failed
reboot: false
ssh: false
sinfo: false
unreachable: false
Expand Down Expand Up @@ -356,7 +377,7 @@
tasks:
- name: Skip if slurm rollback not needed
ansible.builtin.meta: end_play
when: slurm_skip | default(false) | bool
when: hostvars['localhost']['slurm_skip'] | default(false) | bool

- name: Set slurm nodes from inventory
ansible.builtin.set_fact:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
rollback_oim_host_quadlets: >-
{{ rollback_oim_host_backup_dir }}/{{ backup_quadlets_subpath }}

- name: List backed-up quadlet files
- name: List backed-up quadlet container files
ansible.builtin.find:
paths: "{{ rollback_oim_host_quadlets }}"
patterns: "*.container"
Expand All @@ -63,14 +63,43 @@
delegate_facts: true
connection: ssh

# Look up *.network quadlet units in the backup directory; the task is
# delegated to the `oim` host over ssh. failed_when: false makes the lookup
# best-effort: consumers of rollback_network_files read `.files` through
# default([]), so a result without that key is tolerated.
- name: List backed-up quadlet network files
  delegate_to: oim
  delegate_facts: true
  connection: ssh
  ansible.builtin.find:
    patterns: "*.network"
    paths: "{{ rollback_oim_host_quadlets }}"
  failed_when: false
  register: rollback_network_files

- name: Display backed-up quadlet files found
ansible.builtin.debug:
verbosity: 1
msg: >-
Found {{ rollback_quadlet_files.files | length }} quadlet files in backup:
{{ rollback_quadlet_files.files | map(attribute='path') | map('basename') | list }}
Found {{ rollback_quadlet_files.files | length }} container files and
{{ rollback_network_files.files | default([]) | length }} network files in backup:
Containers: {{ rollback_quadlet_files.files | map(attribute='path') | map('basename') | list }}
Networks: {{ rollback_network_files.files | default([]) | map(attribute='path') | map('basename') | list }}

# Network quadlets go back FIRST: they must be present before the container
# quadlets that use them are restored. Each file found in the backup is copied
# (remote_src: true, so the source path is read on the delegated `oim` host)
# into the systemd quadlet directory under its original basename.
- name: Restore each v2.1 network quadlet file to systemd quadlet directory
  delegate_to: oim
  delegate_facts: true
  connection: ssh
  when: rollback_network_files.files | default([]) | length > 0
  ansible.builtin.copy:
    remote_src: true
    src: "{{ item.path }}"
    dest: "{{ systemd_quadlet_dir }}/{{ item.path | basename }}"
    mode: "{{ file_permissions_644 }}"
    owner: root
    group: root
  loop_control:
    label: "{{ item.path | basename }}"
  loop: "{{ rollback_network_files.files | default([]) }}"

- name: Restore each v2.1 quadlet file to systemd quadlet directory
- name: Restore each v2.1 container quadlet file to systemd quadlet directory
ansible.builtin.copy:
src: "{{ item.path }}"
dest: "{{ systemd_quadlet_dir }}/{{ item.path | basename }}"
Expand Down Expand Up @@ -216,7 +245,8 @@
ansible.builtin.debug:
msg:
- "{{ rollback_messages.restore.quadlets_success }}"
- "Quadlet files restored: {{ rollback_quadlet_files.files | length }}"
- "Container quadlet files restored: {{ rollback_quadlet_files.files | length }}"
- "Network quadlet files restored: {{ rollback_network_files.files | default([]) | length }}"
- "v2.2-only quadlets removed: {{ rollback_v22_only_quadlets | join(', ') }}"
- "openchami.target: restored from backup (references coresmd.service)"
- "/etc/openchami: {{ 'restored from backup' if rollback_etc_openchami_backup_stat.stat.exists | default(false) else 'backup NOT found' }}"
Expand Down
Loading
Loading