diff --git a/.github/workflows/ansible-lint.yml b/.github/workflows/ansible-lint.yml index 2d851373d1..35ab68f037 100644 --- a/.github/workflows/ansible-lint.yml +++ b/.github/workflows/ansible-lint.yml @@ -11,6 +11,8 @@ on: - pub/ochami - pub/ochami_aarch64 - pub/k8s_telemetry + - pub/ib_support + - pub/v2.1_rc1 jobs: build: diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index b7e5c822d0..c9544dc16c 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -10,6 +10,8 @@ on: - pub/ochami - pub/ochami_aarch64 - pub/k8s_telemetry + - pub/ib_support + - pub/v2.1_rc1 jobs: build: diff --git a/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml b/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml index 941d575ebf..1801448611 100644 --- a/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml +++ b/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml @@ -167,7 +167,7 @@ - name: Build full Podman image path ansible.builtin.set_fact: - pulp_aarch_image: "{{ hostvars['localhost']['oim_pxe_ip'] }}:2225/dellhpcomniaaisolution/image-build-aarch64:1.0" + pulp_aarch_image: "{{ hostvars['localhost']['oim_pxe_ip'] }}:2225/dellhpcomniaaisolution/image-build-aarch64:1.1" - name: Pull aarch64 image using Podman ansible.builtin.command: diff --git a/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml b/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml index c344f59d8f..0b7a56072d 100644 --- a/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml +++ b/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml @@ -13,21 +13,16 @@ # limitations under the License. 
--- -- name: Pull specific OpenCHAMI image by version tag +- name: Pull image-build image ansible.builtin.command: - cmd: "podman pull {{ openchami_image_sha }}" + cmd: "podman pull {{ image_build_el10 }}" register: pull_result + retries: "{{ pull_image_retries }}" + delay: "{{ pull_image_delay }}" + until: pull_result.rc == 0 changed_when: "'Image is up to date' not in pull_result.stdout" - name: Fail if image not pulled successfully ansible.builtin.fail: msg: "{{ pull_result.stdout }}" when: pull_result.rc != 0 - -- name: Tagging OpenCHAMI image with stable name - ansible.builtin.command: - cmd: "{{ ochami_stable_image_tag }}" - args: - creates: "{{ ochami_stable_image_path }}" - register: tag_result - changed_when: "'Tagged' in tag_result.stdout" diff --git a/build_image_x86_64/roles/image_creation/vars/main.yml b/build_image_x86_64/roles/image_creation/vars/main.yml index 66b7b2538d..a05a39d37d 100644 --- a/build_image_x86_64/roles/image_creation/vars/main.yml +++ b/build_image_x86_64/roles/image_creation/vars/main.yml @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- -openchami_image_sha: "ghcr.io/openchami/image-build@sha256:52dd9d546951ce4f2f6f9febd08a228cfcb5b9e8e204ca4f5ee232f6be65d3a4" +image_build_el10: "docker.io/dellhpcomniaaisolution/image-build-el10:1.0" +pull_image_retries: "3" +pull_image_delay: "10" input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" omnia_metadata_file: "/opt/omnia/.data/oim_metadata.yml" dir_permissions_644: "0644" @@ -33,7 +35,7 @@ ochami_compute_mounts: ochami_x86_64_image: - --entrypoint /bin/bash - - ghcr.io/openchami/image-build:stable + - docker.io/dellhpcomniaaisolution/image-build-el10:1.0 ochami_base_command: - -c 'update-ca-trust extract && image-build --config /home/builder/config.yaml --log-level DEBUG' @@ -52,7 +54,3 @@ compute_image_failure_msg: | # build_compute_image.yml openchami_compute_image_vars_template: "{{ role_path }}/templates/compute_images_templates.j2" openchami_compute_image_vars_path: "/opt/omnia/openchami/compute_images_template.yaml" - -# build_image_tag.yml -ochami_stable_image_tag: "podman tag {{ openchami_image_sha }} ghcr.io/openchami/image-build:stable" -ochami_stable_image_path: "/var/lib/containers/storage/overlay-images/{{ openchami_image_sha }}" diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index 77bc8c3544..e72c474513 100644 --- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -326,6 +326,12 @@ def json_file_mandatory(file_path): "Please ensure the CSV file has the required headers." ) NETWORK_SPEC_FILE_NOT_FOUND_MSG = "network_spec.yml file not found in input folder." +IB_NETMASK_BITS_MISMATCH_MSG = ( + "netmask_bits configured for ib_network must match admin_network netmask_bits in network_spec.yml." 
+) +IB_SUBNET_IN_ADMIN_RANGE_MSG = ( + "ib_network subnet must be outside the admin network range derived from primary_oim_admin_ip/netmask_bits in network_spec.yml." +) # telemetry MANDATORY_FIELD_FAIL_MSG = "must not be empty" @@ -427,3 +433,4 @@ def get_logic_failed(input_file_path): def get_logic_success(input_file_path): """Returns a formatted message indicating logic validation success for a file.""" return f"{'#' * 10} Logic validation successful for {input_file_path} {'#' * 10}" + diff --git a/common/library/module_utils/input_validation/schema/network_spec.json b/common/library/module_utils/input_validation/schema/network_spec.json index 64fe70f407..bea5622095 100644 --- a/common/library/module_utils/input_validation/schema/network_spec.json +++ b/common/library/module_utils/input_validation/schema/network_spec.json @@ -100,9 +100,35 @@ } }, "additionalProperties": false + }, + { + "type": "object", + "required": ["ib_network"], + "properties": { + "ib_network": { + "type": "object", + "required": [ + "subnet", + "netmask_bits" + ], + "properties": { + "subnet": { + "type": "string", + "pattern": "^(?:(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})\\.){3}(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})$" + }, + "netmask_bits": { + "type": "string", + "pattern": "^(1[0-9]|2[0-9]|[1-9])$|^3[0-2]$" + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false } ] } } } } + diff --git a/common/library/module_utils/input_validation/schema/storage_config.json b/common/library/module_utils/input_validation/schema/storage_config.json index 9cae297a43..41746905f1 100644 --- a/common/library/module_utils/input_validation/schema/storage_config.json +++ b/common/library/module_utils/input_validation/schema/storage_config.json @@ -49,6 +49,36 @@ ] }, "minItems": 1 + }, + "powervault_config": { + "required": ["ip", "isci_initiators", "volume_id"], + "properties": { + "ip": { + "description": "List of target controller IP addresses", + "type": "array", + "minItems": 1, + 
"items": { + "type": "string", + "format": "ipv4" + }, + "uniqueItems": true + }, + + "port": { + "description": "TCP port for iSCSI (default 3260)", + "type": "integer" + }, + + "isci_initiators": { + "description": "iSCSI initiator IQN", + "type": "string" + }, + + "volume_id": { + "description": "Volume identifier (hex string)", + "type": "string" + } + } } }, "required": [ diff --git a/common/library/module_utils/input_validation/validation_flows/provision_validation.py b/common/library/module_utils/input_validation/validation_flows/provision_validation.py index e598bc155e..7eef7bef20 100644 --- a/common/library/module_utils/input_validation/validation_flows/provision_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/provision_validation.py @@ -21,6 +21,7 @@ import itertools import csv import yaml +import ipaddress from ansible.module_utils.input_validation.common_utils import validation_utils from ansible.module_utils.input_validation.common_utils import config from ansible.module_utils.input_validation.common_utils import en_us_validation_msg @@ -744,6 +745,54 @@ def validate_network_spec( ) return errors + # Extract admin and IB parameters for cross-validation + admin_netmask_bits = None + admin_primary_ip = None + ib_netmask_bits = None + ib_subnet = None + ib_present = False + + for network in data["Networks"]: + if "admin_network" in network and isinstance(network["admin_network"], dict): + admin_net = network["admin_network"] + admin_netmask_bits = admin_net.get("netmask_bits", admin_netmask_bits) + admin_primary_ip = admin_net.get("primary_oim_admin_ip", admin_primary_ip) + + if "ib_network" in network and isinstance(network["ib_network"], dict): + ib_net = network["ib_network"] + # Consider IB network present only when config is non-empty + if ib_net: + ib_present = True + ib_netmask_bits = ib_net.get("netmask_bits", ib_netmask_bits) + ib_subnet = ib_net.get("subnet", ib_subnet) + + # If IB network is configured and both 
netmask bits are available, they must match + if ib_present and ib_netmask_bits and admin_netmask_bits and ib_netmask_bits != admin_netmask_bits: + errors.append( + create_error_msg( + "ib_network.netmask_bits", + ib_netmask_bits, + en_us_validation_msg.IB_NETMASK_BITS_MISMATCH_MSG, + ) + ) + + # If IB subnet and admin primary IP are available, ensure IB subnet is not in admin range + if ib_present and ib_subnet and admin_primary_ip and admin_netmask_bits: + try: + admin_network = ipaddress.IPv4Network(f"{admin_primary_ip}/{admin_netmask_bits}", strict=False) + ib_ip = ipaddress.IPv4Address(ib_subnet) + if ib_ip in admin_network: + errors.append( + create_error_msg( + "ib_network.subnet", + ib_subnet, + en_us_validation_msg.IB_SUBNET_IN_ADMIN_RANGE_MSG, + ) + ) + except ValueError: + # If IPs/netmask are invalid, rely on existing validations to report issues + pass + for network in data["Networks"]: errors.extend(_validate_admin_network(network)) @@ -941,3 +990,4 @@ def _validate_ip_ranges(dynamic_range, network_type, netmask_bits): ) return errors + diff --git a/common/library/module_utils/local_repo/download_common.py b/common/library/module_utils/local_repo/download_common.py index c8d8bd1339..ed886bd69e 100644 --- a/common/library/module_utils/local_repo/download_common.py +++ b/common/library/module_utils/local_repo/download_common.py @@ -477,7 +477,7 @@ def process_manifest(file,repo_store_path, status_file_path, cluster_os_type, cl manifest_directory = os.path.join(repo_store_path, "offline_repo", "cluster",arc.lower(), cluster_os_type, cluster_os_version, "manifest", package_name) # # Determine the manifest file path file_path = os.path.join(manifest_directory, f"{package_name}.yaml") - repository_name = "manifest" + package_name + repository_name = arc.lower() + "_manifest" + package_name output_file = package_name + ".yml" relative_path = output_file base_path = manifest_directory.strip("/") @@ -531,7 +531,7 @@ def process_git(file,repo_store_path, 
status_file_path, cluster_os_type, cluster clone_directory = os.path.join(git_modules_directory, package_name) clone_directory = shlex.quote(clone_directory).strip("'\"") tarball_path = os.path.join(git_modules_directory, f'{package_name}.tar.gz') - repository_name = "git" + package_name + repository_name = arc.lower() + "_git" + package_name output_file = package_name + ".tar.gz" relative_path = output_file base_path = git_modules_directory.strip("/") @@ -600,7 +600,7 @@ def process_shell(file,repo_store_path, status_file_path, cluster_os_type, clus os.makedirs(sh_directory, exist_ok=True) # Ensure the directory exists sh_path = os.path.join(sh_directory, f"{package_name}.sh") - repository_name = "shell" + package_name + repository_name = arc.lower() + "_shell" + package_name output_file = package_name + ".sh" relative_path = output_file base_path = sh_directory.strip("/") @@ -651,7 +651,7 @@ def process_ansible_galaxy_collection(file, repo_store_path, status_file_path, c galaxy_collections_directory = shlex.quote(galaxy_collections_directory).strip("'\"") os.makedirs(galaxy_collections_directory, exist_ok=True) # Ensure the directory exists collections_tarball_path = os.path.join(galaxy_collections_directory, f'{package_name.replace(".", "-")}-{version}.tar.gz') - repository_name = "ansible_galaxy_collection" + package_name + repository_name = arc.lower() + "_ansible_galaxy_collection" + package_name output_file = f"{file['package'].replace('.', '-')}-{file['version']}.tar.gz" relative_path = output_file base_path = galaxy_collections_directory.strip("/") @@ -758,7 +758,7 @@ def process_tarball(package, repo_store_path, status_file_path, version_variable tarball_path = os.path.join(tarball_directory, f"{package_name}.tar.gz") tarball_path = shlex.quote(tarball_path).strip("'\"") - repository_name = "tarball" + package_name + repository_name = arc.lower() + "_tarball" + package_name output_file = package_name + ".tar.gz" relative_path = output_file base_path = 
tarball_directory.strip("/") @@ -844,7 +844,7 @@ def process_iso(package, repo_store_path, status_file_path, url_support = True package_name = package['package'] package_type = package['type'] - repository_name = "iso" + package_name + arc + repository_name = arc.lower() + "_iso" + package_name distribution_name = repository_name if 'url' in package: @@ -941,7 +941,7 @@ def process_pip(package, repo_store_path, status_file_path, cluster_os_type, cl package_name = shlex.quote(package['package']).strip("'\"") package_type = package['type'] version = package.get('version', None) - pip_repo = "pip_module" + package_name + pip_repo = arc.lower() + "_pip_module" + package_name distribution_name = pip_repo logger.info(f"Processing Pip Package: {package_name}, Version: {version}") diff --git a/common/vars/openchami_image_cmd.yml b/common/vars/openchami_image_cmd.yml index 4746bb4037..96cd3abcb2 100644 --- a/common/vars/openchami_image_cmd.yml +++ b/common/vars/openchami_image_cmd.yml @@ -20,8 +20,6 @@ rhel_aarch64_base_image_name: "rhel-aarch64_base" base_image_commands: - "dracut --add 'dmsquash-live livenet network-manager' --install '/usr/lib/systemd/systemd-sysroot-fstab-check' --kver $(basename /lib/modules/*) -N -f --logfile /tmp/dracut.log 2>/dev/null" # noqa: yaml[line-length] - "echo DRACUT LOG:; cat /tmp/dracut.log" - - "rm -f /var/lib/rpm/__db*" - - "rpmdb --rebuilddb" # x86_64 compute commands default_x86_64_compute_commands: diff --git a/discovery/discovery.yml b/discovery/discovery.yml index 877a137e34..75efadb47c 100644 --- a/discovery/discovery.yml +++ b/discovery/discovery.yml @@ -75,6 +75,18 @@ name: discovery_validations tasks_from: validate_oim_timezone.yml +- name: Build cluster host lists from PXE mapping + hosts: localhost + connection: local + roles: + - passwordless_ssh + +- name: Configure OIM SSH from cluster host lists + hosts: oim + connection: ssh + roles: + - passwordless_ssh + - name: Validate discovery parameters hosts: oim connection: ssh 
@@ -102,6 +114,11 @@ ansible.builtin.include_role: name: configure_ochami tasks_from: discover_mapping_nodes.yml + + - name: Read nodes.yaml and derive Omnia node facts + ansible.builtin.include_role: + name: passwordless_ssh + tasks_from: read_nodes_yaml.yml roles: - nfs_client - k8s_config diff --git a/discovery/roles/configure_ochami/tasks/configure_bss_cloud_init.yml b/discovery/roles/configure_ochami/tasks/configure_bss_cloud_init.yml index 2d7858b16c..96a0cbd556 100644 --- a/discovery/roles/configure_ochami/tasks/configure_bss_cloud_init.yml +++ b/discovery/roles/configure_ochami/tasks/configure_bss_cloud_init.yml @@ -66,6 +66,12 @@ register: read_ssh_key no_log: true +- name: Read the ssh private key + ansible.builtin.command: cat {{ ssh_private_key_path }} + changed_when: false + register: read_ssh_private_key + no_log: true + - name: Hash the password ansible.builtin.command: openssl passwd -6 "{{ hostvars['localhost']['provision_password'] }}" changed_when: false diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index c39b27005a..47949323a3 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -15,9 +15,26 @@ ssh_authorized_keys: "{{ read_ssh_key.stdout }}" lock_passwd: false hashed_passwd: "{{ hashed_password_output.stdout }}" + - name: {{ slurm_user }} + uid: {{ slurm_uid }} + system: true + no_create_home: true + shell: /sbin/nologin disable_root: false write_files: + - path: /usr/local/bin/doca-install.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + + - path: /usr/local/bin/configure-ib-network.sh + owner: root:root + 
permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '0755' content: | @@ -78,25 +95,37 @@ exit 1 fi - echo "[INFO] Installing CUDA toolkit..." - if [ -f "/cuda-runfile/cuda_13.0.2_580.95.05_linux_sbsa.run" ]; then - # Install only the toolkit component - bash /cuda-runfile/cuda_13.0.2_580.95.05_linux_sbsa.run --silent --toolkit --toolkitpath=/usr/local/cuda --override + echo "[INFO] Setting up shared CUDA directory..." + # Create and mount shared directory for compute nodes + mkdir -p /shared-cuda-toolkit + mount -t nfs {{ cloud_init_nfs_path }}/cuda/ /shared-cuda-toolkit + + if [ $? -ne 0 ]; then + echo "[ERROR] Failed to mount NFS cuda share. Exiting." + umount /cuda-runfile 2>/dev/null + exit 1 + fi + + echo "[INFO] Installing CUDA toolkit directly to shared NFS location..." + if [ -f "/cuda-runfile/{{ cuda_runfile_aarch64 }}" ]; then + mkdir -p /shared-cuda-toolkit/tmp + # Install toolkit directly to the NFS-mounted shared location + bash /cuda-runfile/{{ cuda_runfile_aarch64 }} --silent --toolkit --tmpdir=/shared-cuda-toolkit/tmp --toolkitpath=/shared-cuda-toolkit --override if [ $? -eq 0 ]; then - echo "[SUCCESS] CUDA toolkit installed successfully." + echo "[SUCCESS] CUDA toolkit installed successfully to shared location." 
- # Set up environment variables + # Set up environment variables pointing to shared location cat > /etc/profile.d/cuda.sh << 'ENDOFFILE' - export PATH=/usr/local/cuda/bin:$PATH - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/usr/local/cuda + export PATH=/shared-cuda-toolkit/bin:$PATH + export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH + export CUDA_HOME=/shared-cuda-toolkit ENDOFFILE # Apply environment variables for current session - export PATH=/usr/local/cuda/bin:$PATH - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/usr/local/cuda + export PATH=/shared-cuda-toolkit/bin:$PATH + export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH + export CUDA_HOME=/shared-cuda-toolkit echo "[INFO] CUDA environment configured" else @@ -115,17 +144,6 @@ echo "[ERROR] CUDA toolkit (nvcc) not found after installation." fi - echo "[INFO] Setting up shared CUDA directory for compute nodes..." - # Create shared directory for compute nodes to mount - mkdir -p /shared-cuda-toolkit - # Mount the shared NFS location where compute nodes will access the toolkit - mount -t nfs {{ cloud_init_nfs_path }}/cuda/ /shared-cuda-toolkit - - echo "[INFO] Copying CUDA toolkit to shared location..." - # Copy the installed CUDA toolkit to the shared location for compute nodes - #rsync -av /usr/local/cuda/ /shared-cuda-toolkit/ --exclude='*.a' --exclude='doc/' - cp -r /usr/local/cuda/* /shared-cuda-toolkit/ 2>/dev/null || true - echo "[INFO] Cleaning up temporary mounts..." 
umount /cuda-runfile 2>/dev/null rmdir /cuda-runfile 2>/dev/null @@ -171,32 +189,34 @@ runcmd: - /usr/local/bin/set-ssh.sh - /usr/local/bin/install_cuda_toolkit.sh - - groupadd -r {{ slurm_group_name }} - - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} - - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track - - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab + - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - mount -a + - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - - chown -R {{ user }}:{{ slurm_group_name }} /var/log/slurm - - chown -R {{ user }}:{{ slurm_group_name }} 
/var/run/slurm - - chown -R {{ user }}:{{ slurm_group_name }} /var/spool - - chown -R {{ user }}:{{ slurm_group_name }} /var/lib/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm - chmod {{ file_mode_400 }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ - mkdir -p /var/spool/slurmd - chmod {{ file_mode_755 }} /var/spool/slurmd - - chown -R {{ user }}:{{ slurm_group_name }} /var/spool/slurmd + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd - setenforce 0 - systemctl enable firewalld - systemctl start firewalld diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 5b9ad2dcd1..2ba3b39d42 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -15,9 +15,26 @@ ssh_authorized_keys: "{{ read_ssh_key.stdout }}" lock_passwd: false hashed_passwd: "{{ hashed_password_output.stdout }}" + - name: {{ slurm_user }} + uid: {{ slurm_uid }} + system: true + no_create_home: true + shell: /sbin/nologin disable_root: false write_files: + - path: /usr/local/bin/doca-install.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + + - path: /usr/local/bin/configure-ib-network.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 
'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '0755' content: | @@ -52,7 +69,13 @@ fi done fi - + + - path: /root/.ssh/config + permissions: '0600' + content: | + Host {{ slurm_control_ssh_patterns }} + IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa + IdentitiesOnly yes - path: /usr/local/bin/install_cuda_toolkit.sh permissions: '0755' @@ -79,25 +102,37 @@ exit 1 fi - echo "[INFO] Installing CUDA toolkit..." - if [ -f "/cuda-runfile/cuda_13.0.2_580.95.05_linux.run" ]; then - # Install only the toolkit component - bash /cuda-runfile/cuda_13.0.2_580.95.05_linux.run --silent --toolkit --toolkitpath=/usr/local/cuda --override + echo "[INFO] Setting up shared CUDA directory..." + # Create and mount shared directory for compute nodes + mkdir -p /shared-cuda-toolkit + mount -t nfs {{ cloud_init_nfs_path }}/cuda/ /shared-cuda-toolkit + + if [ $? -ne 0 ]; then + echo "[ERROR] Failed to mount NFS cuda share. Exiting." + umount /cuda-runfile 2>/dev/null + exit 1 + fi + + echo "[INFO] Installing CUDA toolkit directly to shared NFS location..." + if [ -f "/cuda-runfile/{{ cuda_runfile_x86_64 }}" ]; then + mkdir -p /shared-cuda-toolkit/tmp + # Install toolkit directly to the NFS-mounted shared location + bash /cuda-runfile/{{ cuda_runfile_x86_64 }} --silent --toolkit --tmpdir=/shared-cuda-toolkit/tmp --toolkitpath=/shared-cuda-toolkit --override if [ $? -eq 0 ]; then - echo "[SUCCESS] CUDA toolkit installed successfully." + echo "[SUCCESS] CUDA toolkit installed successfully to shared location." 
- # Set up environment variables + # Set up environment variables pointing to shared location cat > /etc/profile.d/cuda.sh << 'ENDOFFILE' - export PATH=/usr/local/cuda/bin:$PATH - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/usr/local/cuda + export PATH=/shared-cuda-toolkit/bin:$PATH + export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH + export CUDA_HOME=/shared-cuda-toolkit ENDOFFILE # Apply environment variables for current session - export PATH=/usr/local/cuda/bin:$PATH - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/usr/local/cuda + export PATH=/shared-cuda-toolkit/bin:$PATH + export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH + export CUDA_HOME=/shared-cuda-toolkit echo "[INFO] CUDA environment configured" else @@ -116,17 +151,6 @@ echo "[ERROR] CUDA toolkit (nvcc) not found after installation." fi - echo "[INFO] Setting up shared CUDA directory for compute nodes..." - # Create shared directory for compute nodes to mount - mkdir -p /shared-cuda-toolkit - # Mount the shared NFS location where compute nodes will access the toolkit - mount -t nfs {{ cloud_init_nfs_path }}/cuda/ /shared-cuda-toolkit - - echo "[INFO] Copying CUDA toolkit to shared location..." - # Copy the installed CUDA toolkit to the shared location for compute nodes - #rsync -av /usr/local/cuda/ /shared-cuda-toolkit/ --exclude='*.a' --exclude='doc/' - cp -r /usr/local/cuda/* /shared-cuda-toolkit/ 2>/dev/null || true - echo "[INFO] Cleaning up temporary mounts..." 
umount /cuda-runfile 2>/dev/null rmdir /cuda-runfile 2>/dev/null @@ -173,10 +197,11 @@ runcmd: - /usr/local/bin/set-ssh.sh - /usr/local/bin/install_cuda_toolkit.sh - - groupadd -r {{ slurm_group_name }} - - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} - - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track + # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia) + - mkdir -p {{ client_mount_path }}/slurm/ssh + - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages + - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab @@ -184,21 +209,28 @@ - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab + - chmod {{ file_mode }} /etc/fstab - mount -a + - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - 
/usr/local/bin/check_slurm_controller_status.sh - - chown -R {{ user }}:{{ slurm_group_name }} /var/log/slurm - - chown -R {{ user }}:{{ slurm_group_name }} /var/run/slurm - - chown -R {{ user }}:{{ slurm_group_name }} /var/spool - - chown -R {{ user }}:{{ slurm_group_name }} /var/lib/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm - chmod {{ file_mode_400 }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ - mkdir -p /var/spool/slurmd - chmod {{ file_mode_755 }} /var/spool/slurmd - - chown -R {{ user }}:{{ slurm_group_name }} /var/spool/slurmd + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd - setenforce 0 - systemctl enable firewalld - systemctl start firewalld diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index 3079364950..a77ac4158b 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -17,9 +17,26 @@ ssh_authorized_keys: "{{ read_ssh_key.stdout }}" lock_passwd: false hashed_passwd: "{{ hashed_password_output.stdout }}" + - name: {{ slurm_user }} + uid: {{ slurm_uid }} + system: true + no_create_home: true + shell: /sbin/nologin disable_root: false write_files: + - path: /usr/local/bin/doca-install.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + + - path: 
/usr/local/bin/configure-ib-network.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '{{ file_mode_755 }}' content: | @@ -93,32 +110,34 @@ runcmd: - /usr/local/bin/set-ssh.sh - - groupadd -r {{ slurm_group_name }} - - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} - - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track - - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab + - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - mount -a + - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - yes | cp /etc/slurm/epilog.d/slurmd.service 
/usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - - chown -R {{ user }}:{{ slurm_group_name }} /var/log/slurm - - chown -R {{ user }}:{{ slurm_group_name }} /var/run/slurm - - chown -R {{ user }}:{{ slurm_group_name }} /var/spool - - chown -R {{ user }}:{{ slurm_group_name }} /var/lib/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm - chmod {{ file_mode_400 }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ - mkdir -p /var/spool/slurmd - chmod {{ file_mode_755 }} /var/spool/slurmd - - chown -R {{ user }}:{{ slurm_group_name }} /var/spool/slurmd + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd - setenforce 0 - systemctl enable firewalld - systemctl start firewalld diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index a1dfc3708a..93c0d5ce45 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -17,9 +17,26 @@ ssh_authorized_keys: "{{ read_ssh_key.stdout }}" lock_passwd: false hashed_passwd: "{{ hashed_password_output.stdout }}" + - name: {{ slurm_user }} + uid: {{ slurm_uid }} + system: true + no_create_home: true + shell: /sbin/nologin disable_root: false write_files: + - path: /usr/local/bin/doca-install.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + + - path: 
/usr/local/bin/configure-ib-network.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '{{ file_mode_755 }}' content: | @@ -55,6 +72,13 @@ fi done fi + + - path: /root/.ssh/config + permissions: '0600' + content: | + Host {{ slurm_control_ssh_patterns }} + IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa + IdentitiesOnly yes {% if hostvars['localhost']['openldap_support'] %} - path: /etc/sssd/sssd.conf @@ -92,10 +116,11 @@ runcmd: - /usr/local/bin/set-ssh.sh - - groupadd -r {{ slurm_group_name }} - - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} - - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track + + # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia) + - mkdir -p {{ client_mount_path }}/slurm/ssh + - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab @@ -103,21 +128,29 @@ - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab + + - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab + - echo 
"{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab + - chmod {{ file_mode }} /etc/fstab - mount -a + - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - - chown -R {{ user }}:{{ slurm_group_name }} /var/log/slurm - - chown -R {{ user }}:{{ slurm_group_name }} /var/run/slurm - - chown -R {{ user }}:{{ slurm_group_name }} /var/spool - - chown -R {{ user }}:{{ slurm_group_name }} /var/lib/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm - chmod {{ file_mode_400 }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ - mkdir -p /var/spool/slurmd - chmod {{ file_mode_755 }} /var/spool/slurmd - - chown -R {{ user }}:{{ slurm_group_name }} /var/spool/slurmd + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd - setenforce 0 - systemctl enable firewalld - systemctl start firewalld diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index 39068b4f40..14616e9226 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ 
b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -20,6 +20,18 @@ disable_root: false write_files: + - path: /usr/local/bin/doca-install.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + + - path: /usr/local/bin/configure-ib-network.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '0755' content: | @@ -54,6 +66,13 @@ fi done fi + + - path: /root/.ssh/config + permissions: '0600' + content: | + Host {{ k8s_control_ssh_patterns }} + IdentityFile {{ k8s_client_mount_path }}/ssh/oim_rsa + IdentitiesOnly yes - path: /etc/chrony.conf permissions: '0644' @@ -90,6 +109,7 @@ {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}/kubelet /var/lib/kubelet nfs noatime,nolock 0 0 {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}/kubernetes /etc/kubernetes nfs noatime,nolock 0 0 {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}/pod-logs /var/log/pods nfs noatime,nolock 0 0 + {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/packages /var/lib/packages nfs noatime,nolock 0 0 tmpfs /tmp/crio-storage tmpfs size={{ k8s_crio_storage_size }},noatime,nodev,nosuid 0 0 permissions: '0644' @@ -369,7 +389,7 @@ - sudo modprobe nf_conntrack || true - sudo modprobe vxlan || true - sysctl --system - - mkdir -p /tmp/crio-storage {{ k8s_client_mount_path }} /var/lib/etcd /var/lib/kubelet /etc/kubernetes /var/log/pods + - mkdir -p /tmp/crio-storage {{ k8s_client_mount_path }} /var/lib/etcd /var/lib/kubelet /etc/kubernetes /var/log/pods /var/lib/packages - | tmpfile=$(mktemp) 
# Extract the first 'search' line only (ignore duplicates) @@ -391,14 +411,15 @@ chattr +i /etc/resolv.conf || true fi - mount -a + - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors + - update-ca-trust extract + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - systemctl start crio.service - systemctl enable crio.service - sudo systemctl enable --now kubelet - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - mv /tmp/generate-control-plane-join.sh {{ k8s_client_mount_path }} - - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors - - update-ca-trust extract - - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - systemctl daemon-reload - systemctl restart crio - kubeadm config images pull --kubernetes-version={{ service_k8s_version }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 index 7d04398cca..c27216fcdf 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 @@ -20,6 +20,18 @@ disable_root: false write_files: + - path: /usr/local/bin/doca-install.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + + - path: /usr/local/bin/configure-ib-network.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '0755' content: | @@ -55,6 +67,13 @@ done fi + - path: /root/.ssh/config + permissions: 
'0600' + content: | + Host {{ k8s_control_ssh_patterns }} + IdentityFile {{ k8s_client_mount_path }}/ssh/oim_rsa + IdentitiesOnly yes + - path: /etc/modules-load.d/k8s.conf content: | br_netfilter @@ -77,6 +96,7 @@ {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}/kubelet /var/lib/kubelet nfs noatime,nolock 0 0 {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}/kubernetes /etc/kubernetes nfs noatime,nolock 0 0 {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}/pod-logs /var/log/pods nfs noatime,nolock 0 0 + {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/packages /var/lib/packages nfs noatime,nolock 0 0 tmpfs /tmp/crio-storage tmpfs size={{ k8s_crio_storage_size }},noatime,nodev,nosuid 0 0 permissions: '0644' - path: /etc/containers/storage.conf @@ -276,7 +296,7 @@ - sudo modprobe nf_conntrack || true - sudo modprobe vxlan || true - sysctl --system - - mkdir -p /tmp/crio-storage {{ k8s_client_mount_path }} /var/lib/etcd /var/lib/kubelet /etc/kubernetes /var/log/pods + - mkdir -p /tmp/crio-storage {{ k8s_client_mount_path }} /var/lib/etcd /var/lib/kubelet /etc/kubernetes /var/log/pods /var/lib/packages - | tmpfile=$(mktemp) @@ -299,15 +319,16 @@ chattr +i /etc/resolv.conf || true fi - mount -a + - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors + - update-ca-trust extract + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - systemctl start crio.service - systemctl enable crio.service - sudo systemctl enable --now kubelet - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors - - update-ca-trust extract - systemctl daemon-reload - systemctl restart 
crio - - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - kubeadm config images pull --kubernetes-version={{ service_k8s_version }} - echo "Installing helm" - /usr/local/bin/install-helm.sh diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 index 6b52f12c55..7f115c766d 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 @@ -20,6 +20,18 @@ disable_root: false write_files: + - path: /usr/local/bin/doca-install.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + + - path: /usr/local/bin/configure-ib-network.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '0755' content: | @@ -55,6 +67,13 @@ done fi + - path: /root/.ssh/config + permissions: '0600' + content: | + Host {{ k8s_control_ssh_patterns }} + IdentityFile {{ k8s_client_mount_path }}/ssh/oim_rsa + IdentitiesOnly yes + - path: /etc/modules-load.d/k8s.conf content: | br_netfilter @@ -76,6 +95,7 @@ {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}/kubelet /var/lib/kubelet nfs noatime,nolock 0 0 {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}/kubernetes /etc/kubernetes nfs noatime,nolock 0 0 {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}/pod-logs /var/log/pods nfs noatime,nolock 0 0 + {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path 
}}/packages /var/lib/packages nfs noatime,nolock 0 0 tmpfs /tmp/crio-storage tmpfs size={{ k8s_crio_storage_size }},noatime,nodev,nosuid 0 0 permissions: '0644' - path: /etc/containers/storage.conf @@ -178,7 +198,7 @@ - sudo modprobe nf_conntrack || true - sudo modprobe vxlan || true - sysctl --system - - mkdir -p /tmp/crio-storage {{ k8s_client_mount_path }} /var/lib/kubelet /etc/kubernetes /var/log/pods + - mkdir -p /tmp/crio-storage {{ k8s_client_mount_path }} /var/lib/kubelet /etc/kubernetes /var/log/pods /var/lib/packages - | tmpfile=$(mktemp) @@ -202,14 +222,15 @@ fi - systemctl restart rpcbind - mount -a + - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors + - update-ca-trust extract + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - systemctl start crio.service - systemctl enable crio.service - sudo systemctl enable --now kubelet - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors - - update-ca-trust extract - systemctl daemon-reload - - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - systemctl restart crio - kubeadm config images pull --kubernetes-version={{ service_k8s_version }} - | diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index 8e9d66b214..35079fb0f5 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -17,9 +17,26 @@ ssh_authorized_keys: "{{ read_ssh_key.stdout }}" lock_passwd: false hashed_passwd: "{{ hashed_password_output.stdout }}" + - name: {{ slurm_user }} + uid: {{ slurm_uid }} + system: 
true + no_create_home: true + shell: /sbin/nologin disable_root: false write_files: + - path: /usr/local/bin/doca-install.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + + - path: /usr/local/bin/configure-ib-network.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '{{ file_mode_755 }}' content: | @@ -54,6 +71,208 @@ done fi + - path: /root/.ssh/config + permissions: '0600' + content: | + Host {{ slurm_control_ssh_patterns }} + IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa + IdentitiesOnly yes + +{% if powervault_config is defined %} + - path: /usr/local/bin/setup_iscsi_storage.sh + permissions: '{{ file_mode_755 }}' + content: | + #!/bin/bash + set -euo pipefail + + LOGFILE="/var/log/omnia_iscsi_setup.log" + exec > >(tee -a "$LOGFILE") 2>&1 + + log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; } + + PORTALS=({% for ip in powervault_config.ip %}"{{ ip }}" {% endfor %}) + PORT="{{ powervault_config.port | default(3260) }}" + INITIATOR_IQN="{{ powervault_config.isci_initiators | default('') }}" + VOLUME_ID="{{ powervault_config.volume_id | default('') }}" + FS_TYPE="{{ powervault_config.fs_type | default('xfs') }}" + MOUNT_OPTS="{{ powervault_config.mount_options | default('defaults,_netdev,noatime') }}" + + PERSIST_MOUNT="/mnt/slurm-persist" + MYSQL_SUBDIR="${PERSIST_MOUNT}/mysql" + SPOOL_SUBDIR="${PERSIST_MOUNT}/spool" + + log "Enabling iSCSI daemon" + systemctl enable --now iscsid + /sbin/mpathconf --enable || true + + if [[ -n "${INITIATOR_IQN}" ]]; then + log "Setting InitiatorName to ${INITIATOR_IQN}" + if [[ -f /etc/iscsi/initiatorname.iscsi ]] && grep -q "^InitiatorName=${INITIATOR_IQN}$" /etc/iscsi/initiatorname.iscsi; then + log "InitiatorName already set; not changing" + else + 
printf "InitiatorName=%s\n" "${INITIATOR_IQN}" > /etc/iscsi/initiatorname.iscsi + log "Restarting iscsid after InitiatorName change" + systemctl restart iscsid + fi + else + log "INITIATOR_IQN not set; leaving /etc/iscsi/initiatorname.iscsi unchanged" + fi + + log "Current initiatorname:" + cat /etc/iscsi/initiatorname.iscsi || true + + log "Discovering iSCSI targets from all portals" + TARGET_IQN="" + + for ip in "${PORTALS[@]}"; do + log "Trying discovery on ${ip}:${PORT}" + OUT=$(iscsiadm -m discovery -t sendtargets -p "${ip}:${PORT}" 2>/dev/null || true) + echo "$OUT" + if [[ -z "${TARGET_IQN}" ]]; then + CANDIDATE_IQN=$(echo "$OUT" | awk '{print $2}' | head -1) + if [[ -n "${CANDIDATE_IQN}" ]]; then + TARGET_IQN="${CANDIDATE_IQN}" + fi + fi + done + + if [[ -z "${TARGET_IQN}" ]]; then + log "ERROR: Unable to determine target IQN from discovery output" + exit 1 + fi + log "Discovered TARGET_IQN=${TARGET_IQN}" + + log "Logging in to ALL discovered iSCSI targets" + iscsiadm -m node --login || true + + log "Setting automatic startup for all nodes" + iscsiadm -m node --op update -n node.startup -v automatic || true + + log "Waiting for devices to settle..." 
+ sleep 5 + + log "Enabling multipathd" + systemctl enable --now multipathd || true + + log "Rescanning iSCSI sessions" + iscsiadm -m session --rescan || true + + log "Reloading multipath configuration" + multipath -r || true + + sleep 3 + + log "Verifying disks" + lsblk || true + lsscsi -s 2>/dev/null | grep -iE "ME|DELL" || true + + log "Multipath devices:" + multipath -ll || true + + LATEST_MPATH="" + + if [[ -n "${VOLUME_ID}" ]]; then + log "Selecting multipath using VOLUME_ID match: ${VOLUME_ID}" + LATEST_MPATH=$(multipath -ll 2>/dev/null | grep -iF "${VOLUME_ID}" | awk '{print $1}' | head -1 || true) + fi + + if [[ -z "${LATEST_MPATH}" ]]; then + log "Selecting multipath using vendor match DellEMC,ME5" + LATEST_MPATH=$(multipath -ll 2>/dev/null | grep -i "DellEMC,ME5" | awk '{print $1}' | head -1 || true) + fi + + if [[ -z "${LATEST_MPATH}" ]]; then + log "Selecting multipath using vendor match DellEMC,ME4" + LATEST_MPATH=$(multipath -ll 2>/dev/null | grep -i "DellEMC,ME4" | awk '{print $1}' | head -1 || true) + fi + + if [[ -z "${LATEST_MPATH}" ]]; then + log "Selecting multipath using latest dm-* fallback" + LATEST=$(multipath -ll 2>/dev/null | grep -oP 'dm-\d+' | sort -t- -k2 -n | tail -1 || true) + if [[ -z "${LATEST}" ]]; then + log "ERROR: No multipath dm-* devices detected" + exit 1 + fi + LATEST_MPATH=$(multipath -ll 2>/dev/null | grep "${LATEST}" | awk '{print $1}' | head -1 || true) + fi + + if [[ -z "${LATEST_MPATH}" ]]; then + log "ERROR: Unable to determine multipath device" + exit 1 + fi + + MPATH_DEV="/dev/mapper/${LATEST_MPATH}" + log "Using multipath device: ${MPATH_DEV}" + + PART_DEV="/dev/mapper/${LATEST_MPATH}1" + + if [[ ! 
-e "${PART_DEV}" ]]; then + log "Creating GPT label and partition on ${MPATH_DEV}" + parted -s "${MPATH_DEV}" mklabel gpt + parted -s "${MPATH_DEV}" mkpart primary "${FS_TYPE}" 0% 100% + sleep 2 + partprobe "${MPATH_DEV}" || true + kpartx -av "${MPATH_DEV}" || true + sleep 2 + fi + + log "Using partition device: ${PART_DEV}" + + if ! blkid -s TYPE -o value "${PART_DEV}" 2>/dev/null | grep -q .; then + log "Formatting ${PART_DEV} with ${FS_TYPE}" + mkfs."${FS_TYPE}" -f "${PART_DEV}" + else + log "Filesystem already exists on ${PART_DEV}; skipping format" + fi + + mkdir -p "${PERSIST_MOUNT}" + + UUID=$(blkid -s UUID -o value "${PART_DEV}" 2>/dev/null || true) + + if [[ -n "${UUID}" ]]; then + log "Using UUID=${UUID} for fstab" + FSTAB_ENTRY="UUID=${UUID}" + FSTAB_MATCH="^UUID=${UUID}\\s" + else + log "UUID not available, using device path ${PART_DEV} for fstab" + FSTAB_ENTRY="${PART_DEV}" + FSTAB_MATCH="^${PART_DEV}\\s" + fi + + if ! grep -qE "${FSTAB_MATCH}" /etc/fstab; then + log "Adding persistent mount to /etc/fstab" + echo "${FSTAB_ENTRY} ${PERSIST_MOUNT} ${FS_TYPE} ${MOUNT_OPTS} 0 0" >> /etc/fstab + fi + + if ! mountpoint -q "${PERSIST_MOUNT}"; then + log "Mounting ${PERSIST_MOUNT}" + mount "${PART_DEV}" "${PERSIST_MOUNT}" + fi + + df -h "${PERSIST_MOUNT}" || true + + mkdir -p "${MYSQL_SUBDIR}" "${SPOOL_SUBDIR}" /var/lib/mysql /var/spool + + grep -qE "\s+/var/lib/mysql\s+none\s+bind" /etc/fstab || echo "${MYSQL_SUBDIR} /var/lib/mysql none bind 0 0" >> /etc/fstab + grep -qE "\s+/var/spool\s+none\s+bind" /etc/fstab || echo "${SPOOL_SUBDIR} /var/spool none bind 0 0" >> /etc/fstab + + mount /var/lib/mysql || true + mount /var/spool || true + + chown -R {{ mysql_user }}:{{ mysql_group }} /var/lib/mysql + + log "Final mount summary:" + mount | grep -E "/mnt/slurm-persist|/var/lib/mysql|/var/spool" || true + + log "iSCSI sessions:" + iscsiadm -m session || true + + log "Multipath status:" + multipath -ll || true + + log "iSCSI/multipath setup complete. 
Log saved to ${LOGFILE}" +{% endif %} + {% if hostvars['localhost']['openldap_support'] %} - path: /etc/sssd/sssd.conf owner: root:root @@ -108,8 +327,8 @@ content: | #!/bin/bash SLURMDBD_CONF="/etc/slurm/slurmdbd.conf" - SLURM_USER="{{ user }}" - SLURM_GROUP="{{ slurm_group_name }}" + SLURM_USER="{{ slurm_user }}" + SLURM_GROUP="{{ slurm_user }}" # Function to extract value from slurm.conf get_value_slurm_conf() { local key="$1" @@ -119,8 +338,8 @@ echo "${value:-$default}" } chown -R {{ mysql_user }}:{{ mysql_group }} /var/lib/mysql - chown -R {{ user }}:{{ slurm_group_name }} /var/log/mariadb - chown -R {{ user }}:{{ slurm_group_name }} /etc/my.cnf.d # Required? why slurm user for my.cnf?? + chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/mariadb + chown -R {{ slurm_user }}:{{ slurm_user }} /etc/my.cnf.d # Required? why slurm user for my.cnf?? chmod {{ file_mode_755 }} /etc/my.cnf.d /var/lib/mysql /var/log/mariadb #firewall systemctl enable firewalld @@ -137,8 +356,8 @@ content: | #!/bin/bash SLURMDBD_CONF="/etc/slurm/slurmdbd.conf" - SLURM_USER="{{ user }}" - SLURM_GROUP="{{ slurm_group_name }}" + SLURM_USER="{{ slurm_user }}" + SLURM_GROUP="{{ slurm_user }}" # Function to extract value from slurm.conf get_value_slurm_conf() { local key="$1" @@ -148,7 +367,7 @@ echo "${value:-$default}" } chmod {{ file_mode_600 }} /etc/slurm/slurmdbd.conf - chown {{ user }}:{{ slurm_group_name }} /etc/slurm/slurmdbd.conf + chown {{ slurm_user }}:{{ slurm_user }} /etc/slurm/slurmdbd.conf #file PidFile PidFile=$(get_value_slurm_conf "PidFile" "/var/run/slurmdbd.pid") mkdir -pv $(dirname "$PidFile") @@ -175,8 +394,8 @@ content: | #!/bin/bash SLURM_CONF="/etc/slurm/slurm.conf" - SLURM_USER="{{ user }}" - SLURM_GROUP="{{ slurm_group_name }}" + SLURM_USER="{{ slurm_user }}" + SLURM_GROUP="{{ slurm_user }}" # Function to extract value from slurm.conf get_value_slurm_conf() { local key="$1" @@ -238,28 +457,40 @@ runcmd: - /usr/local/bin/set-ssh.sh - - useradd -mG wheel -p 
'$6$VHdSKZNm$O3iFYmRiaFQCemQJjhfrpqqV7DdHBi5YpY6Aq06JSQpABPw.3d8PQ8bNY9NuZSmDv7IL/TsrhRJ6btkgKaonT.' testuser # Required?? - - groupadd -r {{ slurm_group_name }} - - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} + + # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia) + - mkdir -p {{ client_mount_path }}/slurm/ssh + + # slurm user and group created in the users module # Create directories for nfs and mount all - - mkdir -p /var/log/slurm /etc/slurm {{ home_dir }} /etc/my.cnf.d /etc/munge /var/lib/mysql /var/log/mariadb /cert /var/log/track + - mkdir -p /var/log/slurm /etc/slurm {{ home_dir }} /etc/my.cnf.d /etc/munge /var/lib/mysql /var/log/mariadb /cert /var/log/track /var/lib/packages - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm /etc/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/my.cnf.d /etc/my.cnf.d nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/lib/mysql /var/lib/mysql nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/mariadb /var/log/mariadb nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab +{% if powervault_config is not defined %} + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/lib/mysql /var/lib/mysql nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab +{% endif %} - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" 
>> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - mount -a + - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh +{% if powervault_config is defined %} + - /usr/local/bin/setup_iscsi_storage.sh +{% endif %} - - chown -R {{ user }}:{{ slurm_group_name }} {{ home_dir }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ home_dir }} - chmod {{ file_mode_755 }} {{ home_dir }} - - chown -R {{ user }}:{{ slurm_group_name }} /etc/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /etc/slurm - chmod {{ file_mode_755 }} /etc/slurm - chmod {{ file_mode }} /etc/slurm/slurm.conf @@ -277,9 +508,6 @@ - systemctl enable sshd - systemctl start sshd - - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - {% if hostvars['localhost']['openldap_support'] %} - /usr/local/bin/update_ldap_conf.sh - mkdir /ldapcerts diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 8f032ed3b6..ddaf5fb4f4 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -17,9 +17,26 @@ ssh_authorized_keys: "{{ read_ssh_key.stdout }}" lock_passwd: false hashed_passwd: "{{ hashed_password_output.stdout }}" + - name: {{ slurm_user }} + uid: {{ slurm_uid }} + system: true + no_create_home: true + shell: /sbin/nologin disable_root: false write_files: + - path: /usr/local/bin/doca-install.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + 
{{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + + - path: /usr/local/bin/configure-ib-network.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '{{ file_mode_755 }}' content: | @@ -87,8 +104,8 @@ fi echo "[INFO] Installing NVIDIA driver..." - if [ -f "/gpu-runfile/cuda_13.0.2_580.95.05_linux_sbsa.run" ]; then - bash /gpu-runfile/cuda_13.0.2_580.95.05_linux_sbsa.run --silent --driver --no-opengl-libs --kernel-source-path=/lib/modules/$(uname -r)/build + if [ -f "/gpu-runfile/{{ cuda_runfile_aarch64 }}" ]; then + bash /gpu-runfile/{{ cuda_runfile_aarch64 }} --silent --driver --no-opengl-libs --kernel-source-path=/lib/modules/$(uname -r)/build if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then echo "[SUCCESS] NVIDIA driver installed successfully." else @@ -219,8 +236,8 @@ echo "[INFO] ===== Starting directory creation and NFS mounts for Pulp cert, Slurm and Munge (aarch64) =====" - echo "[INFO] Creating base directories for Pulp cert, Slurm and Munge" - mkdir -pv /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track + echo "[INFO] Creating base directories for Slurm and Munge" + mkdir -pv /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages echo "[INFO] Updating /etc/fstab with NFS entries for Pulp cert, Slurm and Munge paths" echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab @@ -230,6 +247,9 @@ echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ 
cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab + chmod {{ file_mode }} /etc/fstab echo "[INFO] Mounting all NFS entries from /etc/fstab" @@ -251,10 +271,10 @@ bash /usr/local/bin/check_slurm_controller_status.sh echo "[INFO] Setting ownership for Slurm directories" - chown -R {{ user }}:{{ slurm_group_name }} /var/log/slurm - chown -R {{ user }}:{{ slurm_group_name }} /var/run/slurm - chown -R {{ user }}:{{ slurm_group_name }} /var/spool - chown -R {{ user }}:{{ slurm_group_name }} /var/lib/slurm + chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm + chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm + chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool + chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm echo "[INFO] Setting permissions for Slurm directories" chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm @@ -266,7 +286,7 @@ echo "[INFO] Creating and configuring /var/spool/slurmd" mkdir -p /var/spool/slurmd chmod {{ file_mode_755 }} /var/spool/slurmd - chown -R {{ user }}:{{ slurm_group_name }} /var/spool/slurmd + chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd echo "[INFO] ===== Completed slurmd setup (aarch64) =====" @@ -372,13 +392,11 @@ runcmd: - /usr/local/bin/set-ssh.sh - /usr/local/bin/install_nvidia_driver.sh - - useradd -mG wheel -p '$6$VHdSKZNm$O3iFYmRiaFQCemQJjhfrpqqV7DdHBi5YpY6Aq06JSQpABPw.3d8PQ8bNY9NuZSmDv7IL/TsrhRJ6btkgKaonT.' 
testuser - - groupadd -r {{ slurm_group_name }} - - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} - /usr/local/bin/configure_dirs_and_mounts.sh - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/configure_slurmd_setup.sh - /usr/local/bin/configure_munge_and_pam.sh diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 94c12fd6d2..9a2f27dfb7 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -17,9 +17,27 @@ ssh_authorized_keys: "{{ read_ssh_key.stdout }}" lock_passwd: false hashed_passwd: "{{ hashed_password_output.stdout }}" + - name: {{ slurm_user }} + uid: {{ slurm_uid }} + system: true + no_create_home: true + shell: /sbin/nologin + disable_root: false write_files: + - path: /usr/local/bin/doca-install.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + + - path: /usr/local/bin/configure-ib-network.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '{{ file_mode_755 }}' content: | @@ -54,6 +72,13 @@ fi done fi + + - path: /root/.ssh/config + permissions: '0600' + content: | + Host {{ slurm_control_ssh_patterns }} + IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa + IdentitiesOnly yes - path: /usr/local/bin/install_nvidia_driver.sh permissions: '0755' @@ -87,8 +112,8 @@ fi echo "[INFO] 
Installing NVIDIA driver..." - if [ -f "/gpu-runfile/cuda_13.0.2_580.95.05_linux.run" ]; then - bash /gpu-runfile/cuda_13.0.2_580.95.05_linux.run --silent --driver --no-opengl-libs --kernel-source-path=/lib/modules/$(uname -r)/build + if [ -f "/gpu-runfile/{{ cuda_runfile_x86_64 }}" ]; then + bash /gpu-runfile/{{ cuda_runfile_x86_64 }} --silent --driver --no-opengl-libs --kernel-source-path=/lib/modules/$(uname -r)/build if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then echo "[SUCCESS] NVIDIA driver installed successfully." else @@ -227,9 +252,11 @@ exec > >(tee -a "$LOGFILE") 2>&1 echo "[INFO] ===== Starting directory creation and NFS mounts for Pulp cert, Slurm and Munge =====" - + + # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia) + mkdir -p {{ client_mount_path }}/slurm/ssh echo "[INFO] Creating base directories for Pulp cert, Slurm and Munge" - mkdir -pv /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track + mkdir -pv /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages echo "[INFO] Updating /etc/fstab with NFS entries for Pulp cert, Slurm and Munge paths" echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab @@ -238,6 +265,8 @@ echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab chmod {{ file_mode }} /etc/fstab echo "[INFO] Mounting all NFS entries from /etc/fstab" @@ -259,10 +288,10 @@ bash 
/usr/local/bin/check_slurm_controller_status.sh echo "[INFO] Setting ownership for Slurm directories" - chown -R {{ user }}:{{ slurm_group_name }} /var/log/slurm - chown -R {{ user }}:{{ slurm_group_name }} /var/run/slurm - chown -R {{ user }}:{{ slurm_group_name }} /var/spool - chown -R {{ user }}:{{ slurm_group_name }} /var/lib/slurm + chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm + chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm + chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool + chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm echo "[INFO] Setting permissions for Slurm directories" chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm @@ -274,7 +303,7 @@ echo "[INFO] Creating and configuring /var/spool/slurmd" mkdir -p /var/spool/slurmd chmod {{ file_mode_755 }} /var/spool/slurmd - chown -R {{ user }}:{{ slurm_group_name }} /var/spool/slurmd + chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd echo "[INFO] ===== Completed slurmd setup =====" @@ -374,17 +403,16 @@ runcmd: - /usr/local/bin/set-ssh.sh - /usr/local/bin/install_nvidia_driver.sh - - useradd -mG wheel -p '$6$VHdSKZNm$O3iFYmRiaFQCemQJjhfrpqqV7DdHBi5YpY6Aq06JSQpABPw.3d8PQ8bNY9NuZSmDv7IL/TsrhRJ6btkgKaonT.' 
testuser - - groupadd -r {{ slurm_group_name }} - - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} + # slurm user and group created in the users module - /usr/local/bin/configure_dirs_and_mounts.sh - - /usr/local/bin/configure_slurmd_setup.sh - - /usr/local/bin/configure_munge_and_pam.sh - - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - /usr/local/bin/configure_slurmd_setup.sh + - /usr/local/bin/configure_munge_and_pam.sh + - setenforce 0 - /usr/local/bin/configure_firewall_and_services.sh diff --git a/discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 b/discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 new file mode 100644 index 0000000000..1cb95d6f9b --- /dev/null +++ b/discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 @@ -0,0 +1,72 @@ +#!/bin/bash +set -euo pipefail + +ADMIN_NIC_IP="{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}" +NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" +IB_NETWORK_SUBNET="{{ hostvars['localhost']['ib_network_subnet'] }}" + +ip_to_int() { + local IFS=. 
+ read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) +} + +int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" +} + + +ADMIN_IP_INT=$(ip_to_int "$ADMIN_NIC_IP") +IB_NET_INT=$(ip_to_int "$IB_NETWORK_SUBNET") + +HOST_BITS=$(( 32 - NETMASK_BITS )) +HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + +HOST_OFFSET=$(( ADMIN_IP_INT & HOST_MASK )) +IB_IP_INT=$(( IB_NET_INT + HOST_OFFSET )) + +IB_IP=$(int_to_ip "$IB_IP_INT") + +echo "Derived IB IP : $IB_IP/$NETMASK_BITS" + +MAX_WAIT=120 # total wait time in seconds (2 minutes) +INTERVAL=10 # check every 10 seconds +ELAPSED=0 +IB_NIC="" + +while [[ $ELAPSED -lt $MAX_WAIT ]]; do + for nic in $(ip -o link show | awk -F': ' '{print $2}' | grep '^ib'); do + if ip link show "$nic" | grep -q "UP,LOWER_UP"; then + IB_NIC="$nic" + break 2 + fi + done + + echo "IB interface not ready yet. Waiting..." + sleep $INTERVAL + ELAPSED=$((ELAPSED + INTERVAL)) +done + +if [[ -z "$IB_NIC" ]]; then + echo "No active InfiniBand interface found after ${MAX_WAIT}s. Exiting." 
+ exit 0 +fi + +echo "Using IB interface: $IB_NIC" + +if command -v nmcli >/dev/null 2>&1; then + echo "Configuring IB interface using NetworkManager" + nmcli con delete "$IB_NIC" &>/dev/null || true + nmcli con add type infiniband ifname "$IB_NIC" con-name "$IB_NIC" + nmcli con modify "$IB_NIC" ipv4.method manual ipv4.addresses "$IB_IP/$NETMASK_BITS" + nmcli con up "$IB_NIC" +else + echo "Configuring IB interface using iproute2" + ip addr flush dev "$IB_NIC" + ip addr add "$IB_IP/$NETMASK_BITS" dev "$IB_NIC" + ip link set "$IB_NIC" up +fi + +echo "SUCCESS: Assigned $IB_IP/$NETMASK_BITS to $IB_NIC" + diff --git a/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 b/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 new file mode 100644 index 0000000000..111abcb3a1 --- /dev/null +++ b/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 @@ -0,0 +1,92 @@ +#!/bin/bash +set -euo pipefail + +# Optimize firewall ports declaration later +DOCA_FIREWALL_PORTS=( + "18515-18520/tcp" + "18515-18520/udp" + "18515/tcp" + "18515/udp" +) + +echo "Checking for Mellanox / ConnectX / InfiniBand card..." + +if ! lspci | grep -i 'mellanox'; then + echo "No Mellanox RDMA hardware detected. Skipping DOCA-OFED installation." + exit 0 +fi + +echo "Mellanox RDMA hardware detected. Proceeding with DOCA-OFED installation." + +sys_arch="$(uname -m)" +case "${sys_arch}" in + x86_64|amd64) arch="x86_64" ;; + aarch64|arm64) arch="aarch64" ;; + *) + echo "Unsupported architecture: ${sys_arch}" + exit 1 + ;; +esac + +echo "Check if kernel-devel package is present" +if rpm -q kernel-devel-$(uname -r) >/dev/null 2>&1; then + echo "kernel-devel package is already installed." +else + echo "kernel-devel package is not installed. Installing..." 
+ dnf install -y kernel-devel-$(uname -r) +fi + +echo "Check if kernel-headers package is present" +if rpm -q kernel-headers-$(uname -r) >/dev/null 2>&1; then + echo "kernel-headers package is already installed." +else + echo "kernel-headers package is not installed. Installing..." + dnf install -y kernel-headers-$(uname -r) +fi + +echo "Bootstrap doca-ofed package..." +rpm -i "/var/lib/packages/${arch}/doca-ofed/doca-host-3.2.1-044000_25.10_rhel10.${arch}.rpm" + +echo "Installing doca-ofed..." +if rpm -q doca-ofed >/dev/null 2>&1; then + echo "doca-ofed package is already installed." +else + echo "doca-ofed package is not installed. Installing..." + dnf install -y doca-ofed +fi + +echo "Unloading RDMA kernel modules..." +rmmod bnxt_re || true +rmmod mlx5_ib || true +rmmod ib_uverbs || true +rmmod xpmem || true +rmmod ib_core || true +rmmod mlx5_core || true + +echo "Loading RDMA kernel modules..." +modprobe mlx5_core || true +modprobe mlx5_ib || true +modprobe ib_core || true +modprobe ib_uverbs || true +modprobe ib_umad || true +modprobe ib_cm || true +modprobe rdma_cm || true +modprobe rdma_ucm || true +modprobe xpmem || true +modprobe knem || true +modprobe ib_ipoib || true + +if command -v firewall-cmd &>/dev/null; then + echo "Adding firewall ports..." + + for port in "${DOCA_FIREWALL_PORTS[@]}"; do + firewall-cmd --zone=public --add-port="$port" --permanent || true + done + + firewall-cmd --reload || true +else + echo "firewalld not running. Skipping firewall configuration." +fi + +echo "DOCA-OFED installation completed successfully." + diff --git a/discovery/roles/configure_ochami/vars/main.yml b/discovery/roles/configure_ochami/vars/main.yml index 3f82454590..8c7ff96a82 100644 --- a/discovery/roles/configure_ochami/vars/main.yml +++ b/discovery/roles/configure_ochami/vars/main.yml @@ -49,6 +49,7 @@ ci_defaults_dest: '{{ cloud_init_dir }}/ci-defaults.yaml' ci_group_load_fail_msg: | "Template loading failed. 
Ensure the template exists in the specified path and is compatible with the defined functional groups." default_file_path: "{{ playbook_dir }}/roles/slurm_config/defaults/main.yml" +ssh_private_key_path: /root/.ssh/oim_rsa # Usage: configure_cloud_init_common.yml ci_group_common_template: cloud_init/ci-group-common.yaml.j2 @@ -74,7 +75,6 @@ ldap_starttls_port: 389 ldap_ssl_port: 636 # Usage: ci-group-slurm_control_node_x86_64.yaml.j2 -slurm_group_name: slurm home_dir: /var/lib/slurm user: slurm munge_user: munge @@ -86,3 +86,22 @@ file_mode_755: "0755" file_mode_600: "0600" ip_timeout: 10 ip_wait_loop: 60 + +# Hostname lists for stack-specific SSH configs (populated by passwordless_ssh role) +k8s_cluster_hostnames: "{{ hostvars['localhost']['k8s_cluster_hostnames'] | default([]) }}" +slurm_cluster_hostnames: "{{ hostvars['localhost']['slurm_cluster_hostnames'] | default([]) }}" + +# IP wildcard lists for stack-specific SSH configs +k8s_cluster_ip_patterns: "{{ hostvars['localhost']['k8s_cluster_ip_patterns'] | default([]) }}" +slurm_cluster_ip_patterns: "{{ hostvars['localhost']['slurm_cluster_ip_patterns'] | default([]) }}" + +# SSH Host patterns precomputed on OIM by passwordless_ssh/read_nodes_yaml.yml +slurm_control_ssh_patterns: "{{ hostvars['oim']['slurm_ssh_patterns'] | default('*') }}" +k8s_control_ssh_patterns: "{{ hostvars['oim']['k8s_ssh_patterns'] | default('*') }}" + +# Passwordless SSH mode flag derived from nodes.yaml (set on OIM by passwordless_ssh role) +all_group_names_present: "{{ hostvars['oim']['all_group_names_present'] | default(false) }}" + +# CUDA/NVIDIA runfile names (extracted from slurm_custom.json in slurm_config role) +cuda_runfile_x86_64: "{{ hostvars['oim']['cuda_runfile_x86_64'] | default('cuda_13.0.2_580.95.05_linux.run') }}" +cuda_runfile_aarch64: "{{ hostvars['oim']['cuda_runfile_aarch64'] | default('cuda_13.0.2_580.95.05_linux_sbsa.run') }}" diff --git a/discovery/roles/discovery_validations/tasks/include_software_config.yml 
b/discovery/roles/discovery_validations/tasks/include_software_config.yml index 321e74df41..f4b8b40466 100644 --- a/discovery/roles/discovery_validations/tasks/include_software_config.yml +++ b/discovery/roles/discovery_validations/tasks/include_software_config.yml @@ -41,6 +41,7 @@ admin_nic_ip: "{{ network_data.admin_network.primary_oim_admin_ip }}" admin_nic: "{{ network_data.admin_network.oim_nic_name }}" admin_netmask_bits: "{{ network_data.admin_network.netmask_bits }}" + ib_network_subnet: "{{ network_data.ib_network.subnet }}" dns: "{{ network_data.admin_network.dns }}" - name: Initialise variables diff --git a/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml b/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml index f83250f7e8..40e9328cdd 100644 --- a/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml +++ b/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml @@ -39,6 +39,22 @@ k8s_nfs_server_ip: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).server_ip }}" k8s_server_share_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).server_share_path }}" +- name: Ensure SSH key directory exists on K8s share + ansible.builtin.file: + path: "{{ k8s_client_mount_path }}/ssh" + state: directory + owner: root + group: root + mode: '0700' + +- name: Copy OIM private key to K8s share for node-to-node SSH + ansible.builtin.copy: + src: "{{ ssh_private_key_path }}" + dest: "{{ k8s_client_mount_path }}/ssh/oim_rsa" + owner: root + group: root + mode: '0600' + - name: Set admin network nic and ip ansible.builtin.set_fact: admin_nic_ip: "{{ hostvars['localhost']['admin_nic_ip'] }}" @@ -104,6 +120,79 @@ - name: Creating the persist folders in nfs share ansible.builtin.include_tasks: create_node_dir.yml +# additional packages +- name: Create x86_64 package base directory + ansible.builtin.file: + path: "{{ packages_base_dir_x86_64 }}" + state: directory + mode: '{{ 
common_mode }}' + +- name: Create aarch64 package base directory + ansible.builtin.file: + path: "{{ packages_base_dir_aarch64 }}" + state: directory + mode: '{{ common_mode }}' + +- name: Create x86_64 package layout directories + ansible.builtin.file: + path: "{{ packages_base_dir_x86_64 }}/{{ item }}" + state: directory + mode: '{{ common_mode }}' + loop: "{{ packages_layout_x86_64 }}" + +- name: Create aarch64 package layout directories + ansible.builtin.file: + path: "{{ packages_base_dir_aarch64 }}/{{ item }}" + state: directory + mode: '{{ common_mode }}' + loop: "{{ packages_layout_aarch64 }}" + +- name: Print copy paths for x86_64 + ansible.builtin.debug: + msg: "{{ print_copy_msg }}" + loop: "{{ offline_path_x86_64 | default([]) }}" + +- name: Print copy paths for aarch64 + ansible.builtin.debug: + msg: "{{ print_copy_msg }}" + loop: "{{ offline_path_aarch64 | default([]) }}" + +- name: Check x86_64 offline package sources + ansible.builtin.stat: + path: "{{ item.source_path }}" + loop: "{{ offline_path_x86_64 | default([]) }}" + register: x86_64_offline_pkg_sources + +- name: Check aarch64 offline package sources + ansible.builtin.stat: + path: "{{ item.source_path }}" + loop: "{{ offline_path_aarch64 | default([]) }}" + register: aarch64_offline_pkg_sources + +- name: Copy x86_64 offline packages + ansible.builtin.copy: + src: "{{ item.item.source_path }}/" + dest: "{{ item.item.dest_path }}/" + remote_src: true + mode: preserve + loop: "{{ x86_64_offline_pkg_sources.results | default([]) }}" + when: + - item.stat.exists + - item.item.source_path | length > 0 + - item.item.dest_path | length > 0 + +- name: Copy aarch64 offline packages + ansible.builtin.copy: + src: "{{ item.item.source_path }}/" + dest: "{{ item.item.dest_path }}/" + remote_src: true + mode: preserve + loop: "{{ aarch64_offline_pkg_sources.results | default([]) }}" + when: + - item.stat.exists + - item.item.source_path | length > 0 + - item.item.dest_path | length > 0 + - name: Include 
local repo access variable file ansible.builtin.include_vars: "{{ local_repo_access_config_file }}" diff --git a/discovery/roles/k8s_config/vars/main.yml b/discovery/roles/k8s_config/vars/main.yml index 1e5400c270..433b8e9f76 100644 --- a/discovery/roles/k8s_config/vars/main.yml +++ b/discovery/roles/k8s_config/vars/main.yml @@ -71,3 +71,26 @@ nfs_export_help_msg: | 1) Run 'exportfs -ra' on the NFS server and verify permissions/mounts 2) Execute 'systemctl restart nfs-server' 3) Rerun the playbook. + +# Usage create_k8s_config_nfs.yml +packages_base_dir_x86_64: "{{ k8s_client_mount_path }}/packages/x86_64" +packages_base_dir_aarch64: "{{ k8s_client_mount_path }}/packages/aarch64" +offline_repo_basepath_x86_64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso" +offline_repo_basepath_aarch64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso" +packages_layout_x86_64: + - doca-ofed + - cuda +packages_layout_aarch64: + - doca-ofed + - cuda +print_copy_msg: "Copying {{ item.name }} from {{ item.source_path }} to {{ item.dest_path }}" +offline_path_x86_64: + - name: doca-ofed + source_path: "{{ offline_repo_basepath_x86_64 }}/doca-ofed" + dest_path: "{{ packages_base_dir_x86_64 }}/doca-ofed" +offline_path_aarch64: + - name: doca-ofed + source_path: "{{ offline_repo_basepath_aarch64 }}/doca-ofed" + dest_path: "{{ packages_base_dir_aarch64 }}/doca-ofed" + +ssh_private_key_path: /root/.ssh/oim_rsa diff --git a/discovery/roles/passwordless_ssh/tasks/build_host_lists.yml b/discovery/roles/passwordless_ssh/tasks/build_host_lists.yml new file mode 100644 index 0000000000..53b734ac89 --- /dev/null +++ b/discovery/roles/passwordless_ssh/tasks/build_host_lists.yml @@ -0,0 +1,129 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# tasks/build_host_lists.yml + +- name: Ensure PXE mapping file path is set + ansible.builtin.assert: + that: pxe_mapping_file_path is defined + fail_msg: "pxe_mapping_file_path is not defined. Check provision_config.yml." + +- name: Read PXE mapping file (FUNCTIONAL_GROUP_NAME, HOSTNAME, ...) + community.general.read_csv: + path: "{{ pxe_mapping_file_path }}" + key: ADMIN_MAC + register: pxe_mapping_dict + +- name: Initialize per-stack hostname lists and IP wildcard patterns + ansible.builtin.set_fact: + k8s_cluster_hostnames: [] + slurm_cluster_hostnames: [] + k8s_cluster_ip_patterns: [] + slurm_cluster_ip_patterns: [] + omnia_cluster_ip_patterns: [] + omnia_hosts_map: {} + when: inventory_hostname == 'localhost' + +- name: Build per-stack hostname lists and IP wildcard patterns from PXE mapping + ansible.builtin.set_fact: + k8s_cluster_hostnames: >- + {{ + (k8s_cluster_hostnames + [item.value.HOSTNAME]) + if item.value.FUNCTIONAL_GROUP_NAME in k8s_functional_groups + else k8s_cluster_hostnames + }} + slurm_cluster_hostnames: >- + {{ + (slurm_cluster_hostnames + [item.value.HOSTNAME]) + if item.value.FUNCTIONAL_GROUP_NAME in slurm_functional_groups + else slurm_cluster_hostnames + }} + k8s_cluster_ip_patterns: >- + {{ + (k8s_cluster_ip_patterns + [ (item.value.ADMIN_IP | regex_replace('\\.[0-9]+$', '.*')) ]) + if ( + item.value.ADMIN_IP | default('') | length > 0 and + item.value.FUNCTIONAL_GROUP_NAME in k8s_functional_groups + ) + else k8s_cluster_ip_patterns + }} + slurm_cluster_ip_patterns: >- + {{ + (slurm_cluster_ip_patterns + [ 
(item.value.ADMIN_IP | regex_replace('\\.[0-9]+$', '.*')) ]) + if ( + item.value.ADMIN_IP | default('') | length > 0 and + item.value.FUNCTIONAL_GROUP_NAME in slurm_functional_groups + ) + else slurm_cluster_ip_patterns + }} + omnia_cluster_ip_patterns: >- + {{ + (omnia_cluster_ip_patterns + [ (item.value.ADMIN_IP | regex_replace('\\.[0-9]+$', '.*')) ]) + if ( + item.value.ADMIN_IP | default('') | length > 0 and + (item.value.FUNCTIONAL_GROUP_NAME in k8s_functional_groups or + item.value.FUNCTIONAL_GROUP_NAME in slurm_functional_groups) + ) + else omnia_cluster_ip_patterns + }} + omnia_hosts_map: >- + {{ + (omnia_hosts_map | default({})) + | combine( + ({ (item.value.HOSTNAME): item.value.ADMIN_IP } + if (item.value.HOSTNAME | default('') | length > 0 and + item.value.ADMIN_IP | default('') | length > 0) + else {}), + recursive=False + ) + }} + loop: "{{ pxe_mapping_dict.dict | dict2items }}" + loop_control: + label: "{{ item.value.FUNCTIONAL_GROUP_NAME }} -> {{ item.value.HOSTNAME }} ({{ item.value.ADMIN_IP | default('no-ip') }})" + +- name: Deduplicate host lists and IP wildcard patterns + ansible.builtin.set_fact: + k8s_cluster_hostnames: "{{ k8s_cluster_hostnames | unique }}" + slurm_cluster_hostnames: "{{ slurm_cluster_hostnames | unique }}" + k8s_cluster_ip_patterns: >- + {{ + (k8s_cluster_ip_patterns | default([])) + | map('regex_replace', '\\.[0-9]+$', '.*') + | list + | unique + }} + slurm_cluster_ip_patterns: >- + {{ + (slurm_cluster_ip_patterns | default([])) + | map('regex_replace', '\\.[0-9]+$', '.*') + | list + | unique + }} + omnia_cluster_ip_patterns: >- + {{ + (omnia_cluster_ip_patterns | default([])) + | map('regex_replace', '\\.[0-9]+$', '.*') + | list + | unique + }} + +- name: DEBUG passwordless_ssh facts built from PXE mapping + ansible.builtin.debug: + msg: + k8s_cluster_hostnames: "{{ k8s_cluster_hostnames | default([]) }}" + slurm_cluster_hostnames: "{{ slurm_cluster_hostnames | default([]) }}" + k8s_cluster_ip_patterns: "{{ 
k8s_cluster_ip_patterns | default([]) }}" + slurm_cluster_ip_patterns: "{{ slurm_cluster_ip_patterns | default([]) }}" + omnia_cluster_ip_patterns: "{{ omnia_cluster_ip_patterns | default([]) }}" + omnia_hosts_map: "{{ omnia_hosts_map | default({}) }}" diff --git a/discovery/roles/passwordless_ssh/tasks/configure_oim_ssh.yml b/discovery/roles/passwordless_ssh/tasks/configure_oim_ssh.yml new file mode 100644 index 0000000000..6c7c297724 --- /dev/null +++ b/discovery/roles/passwordless_ssh/tasks/configure_oim_ssh.yml @@ -0,0 +1,80 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+---
+# tasks/configure_oim_ssh.yml
+
+- name: Gather cluster hostnames and IP wildcard patterns from localhost facts
+  ansible.builtin.set_fact:
+    k8s_cluster_hostnames: "{{ hostvars['localhost']['k8s_cluster_hostnames'] | default([]) }}"
+    slurm_cluster_hostnames: "{{ hostvars['localhost']['slurm_cluster_hostnames'] | default([]) }}"
+    omnia_cluster_ip_patterns_raw: "{{ hostvars['localhost']['omnia_cluster_ip_patterns'] | default([]) }}"
+    omnia_hosts_map: "{{ hostvars['localhost']['omnia_hosts_map'] | default({}) }}"
+
+- name: Normalize OIM cluster IP patterns to wildcard subnets (x.x.x.*)
+  ansible.builtin.set_fact:
+    omnia_cluster_ip_patterns: >-
+      {{
+        (omnia_cluster_ip_patterns_raw | default([]))
+        | map('regex_replace', '\\.[0-9]+$', '.*')
+        | list
+        | unique
+      }}
+
+- name: Build hostname wildcard patterns from actual cluster hostnames
+  ansible.builtin.set_fact:
+    omnia_cluster_hostname_patterns: >-
+      {{
+        (
+          (k8s_cluster_hostnames | default([]))
+          +
+          (slurm_cluster_hostnames | default([]))
+        )
+        | map('regex_replace', '[0-9]+$', '*')
+        | list
+        | unique
+      }}
+
+- name: Build combined OIM SSH match list (hostname patterns + IP wildcard patterns)
+  ansible.builtin.set_fact:  # regex escaped as '\\.' to match the sibling tasks in this role
+    omnia_cluster_ssh_matches: >-
+      {{
+        (omnia_cluster_hostname_patterns + omnia_cluster_ip_patterns)
+        | map('regex_replace', '\\.[0-9]+$', '.*')
+        | list
+        | unique
+      }}
+
+# Write one "<value> <key>" line per omnia_hosts_map entry into a marked /etc/hosts block.
+- name: Manage /etc/hosts entries on OIM for Omnia cluster nodes
+  ansible.builtin.blockinfile:
+    path: /etc/hosts
+    create: true
+    mode: '0644'
+    marker: "# {mark} OMNIA_CLUSTER_NODES"
+    block: |
+      {% for h in omnia_hosts_map | dict2items %}
+      {{ h.value }} {{ h.key }}
+      {% endfor %}
+  when: omnia_hosts_map | default({}) | length > 0
+
+# - name: DEBUG configure_oim_ssh facts
+#   ansible.builtin.debug:
+#     msg:
+#       k8s_cluster_hostnames: "{{ k8s_cluster_hostnames | default([]) }}"
+#       slurm_cluster_hostnames: "{{ slurm_cluster_hostnames | default([]) }}"
+#       omnia_cluster_ip_patterns_raw: "{{ omnia_cluster_ip_patterns_raw | default([]) }}"
+#       omnia_cluster_ip_patterns: "{{ omnia_cluster_ip_patterns | default([]) }}"
+#       omnia_cluster_hostname_patterns: "{{ omnia_cluster_hostname_patterns | default([]) }}"
+#       omnia_cluster_ssh_matches: "{{ omnia_cluster_ssh_matches | default([]) }}"
+#       omnia_hosts_map: "{{ omnia_hosts_map | default({}) }}"
diff --git a/discovery/roles/passwordless_ssh/tasks/main.yml b/discovery/roles/passwordless_ssh/tasks/main.yml
new file mode 100644
index 0000000000..aff8bee7e7
--- /dev/null
+++ b/discovery/roles/passwordless_ssh/tasks/main.yml
@@ -0,0 +1,23 @@
+# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# tasks/main.yml
+
+- name: Build cluster host lists from PXE mapping (run on localhost/omnia_core)
+  when: inventory_hostname == 'localhost'
+  ansible.builtin.include_tasks: build_host_lists.yml
+
+- name: Configure OIM SSH based on PXE mapping (run on oim)
+  when: inventory_hostname == 'oim'
+  ansible.builtin.include_tasks: configure_oim_ssh.yml
diff --git a/discovery/roles/passwordless_ssh/tasks/read_nodes_yaml.yml b/discovery/roles/passwordless_ssh/tasks/read_nodes_yaml.yml
new file mode 100644
index 0000000000..093ce44790
--- /dev/null
+++ b/discovery/roles/passwordless_ssh/tasks/read_nodes_yaml.yml
@@ -0,0 +1,136 @@
+# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# tasks/read_nodes_yaml.yml
+
+- name: DEBUG passwordless_ssh facts from PXE mapping flow
+  ansible.builtin.debug:
+    msg:
+      k8s_cluster_hostnames: "{{ hostvars['localhost']['k8s_cluster_hostnames'] | default([]) }}"
+      slurm_cluster_hostnames: "{{ hostvars['localhost']['slurm_cluster_hostnames'] | default([]) }}"
+      k8s_cluster_ip_patterns: "{{ hostvars['localhost']['k8s_cluster_ip_patterns'] | default([]) }}"
+      slurm_cluster_ip_patterns: "{{ hostvars['localhost']['slurm_cluster_ip_patterns'] | default([]) }}"
+      omnia_cluster_ip_patterns: "{{ hostvars['localhost']['omnia_cluster_ip_patterns'] | default([]) }}"
+      omnia_hosts_map: "{{ hostvars['localhost']['omnia_hosts_map'] | default({}) }}"
+
+- name: Set nodes.yaml path for nodes.yaml debugging
+  ansible.builtin.set_fact:
+    omnia_nodes_yaml_path: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir/nodes/nodes.yaml"
+
+- name: Read nodes.yaml for group/host/IP data
+  ansible.builtin.slurp:
+    src: "{{ omnia_nodes_yaml_path }}"
+  register: omnia_nodes_yaml_raw
+
+- name: Parse nodes.yaml content
+  ansible.builtin.set_fact:
+    omnia_nodes_data: "{{ omnia_nodes_yaml_raw.content | b64decode | from_yaml }}"
+
+- name: Build groups, hostnames and admin IPs from nodes.yaml
+  ansible.builtin.set_fact:
+    omnia_nodes_groups_from_yaml: >-
+      {{
+        (omnia_nodes_data.nodes | default([]))
+        | map(attribute='group')
+        | list
+        | unique
+      }}
+
+- name: Initialize all_group_names_present flag
+  ansible.builtin.set_fact:
+    all_group_names_present: false
+
+- name: Set all_group_names_present when all required groups are present
+  ansible.builtin.set_fact:  # only omnia_required_groups_from_nodes_yaml is checked here
+    all_group_names_present: true
+  when: >-
+    (
+      omnia_required_groups_from_nodes_yaml
+      | difference(omnia_nodes_groups_from_yaml | default([]))
+    ) | length == 0
+
+- name: Build SSH Host pattern strings for k8s and slurm based on nodes.yaml completeness
+  ansible.builtin.set_fact:
+    k8s_ssh_patterns: >-
+      {{
+        '*'
+        if (all_group_names_present | default(false))
+        else (
+          (
+            (hostvars['localhost']['k8s_cluster_hostnames'] | default([]))
+            | map('regex_replace', '[0-9]+$', '*')
+            | list
+            | unique
+          )
+          + (hostvars['localhost']['k8s_cluster_ip_patterns'] | default([]))
+        )
+        | unique
+        | join(' ')
+      }}
+    slurm_ssh_patterns: >-
+      {{
+        '*'
+        if (all_group_names_present | default(false))
+        else (
+          (
+            (hostvars['localhost']['slurm_cluster_hostnames'] | default([]))
+            | map('regex_replace', '[0-9]+$', '*')
+            | list
+            | unique
+          )
+          + (hostvars['localhost']['slurm_cluster_ip_patterns'] | default([]))
+        )
+        | unique
+        | join(' ')
+      }}
+
+- name: Configure SSH on OIM with Host * when all groups are present in nodes.yaml
+  ansible.builtin.blockinfile:
+    path: "{{ ssh_config_path }}"  # SSH client config file, never the private key
+    create: true
+    mode: '0600'
+    marker: "# {mark} OMNIA_CLUSTER_SSH"
+    block: |
+      Host *
+        IdentityFile ~/.ssh/oim_rsa
+        IdentitiesOnly yes
+  when: all_group_names_present | default(false) | bool
+
+- name: Configure SSH on OIM with derived hostname/IP patterns when groups are incomplete
+  ansible.builtin.blockinfile:
+    path: "{{ ssh_config_path }}"  # same marker as the task above, so the block is replaced in place
+    create: true
+    mode: '0600'
+    marker: "# {mark} OMNIA_CLUSTER_SSH"
+    block: |
+      Host {{ omnia_cluster_ssh_matches
+              | default([])
+              | list
+              | unique
+              | join(' ') }}
+        IdentityFile ~/.ssh/oim_rsa
+        IdentitiesOnly yes
+  when:
+    - not all_group_names_present | default(false) | bool
+    - omnia_cluster_ssh_matches | default([]) | length > 0
+
+# - name: DEBUG summary from read_nodes_yaml flow
+#   ansible.builtin.debug:
+#     msg:
+#       omnia_nodes_yaml_path: "{{ omnia_nodes_yaml_path }}"
+#       omnia_nodes_groups_from_yaml: "{{ omnia_nodes_groups_from_yaml | default([]) }}"
+#       all_group_names_present: "{{ all_group_names_present | default(false) }}"
+#       omnia_cluster_ssh_matches: "{{ omnia_cluster_ssh_matches | default([]) }}"
+#       k8s_ssh_patterns: "{{ k8s_ssh_patterns | default('') }}"
+#       slurm_ssh_patterns: "{{ slurm_ssh_patterns | default('') }}"
diff --git a/discovery/roles/passwordless_ssh/vars/main.yml b/discovery/roles/passwordless_ssh/vars/main.yml
new file mode 100644
index 0000000000..edd72e1e90
--- /dev/null
+++ b/discovery/roles/passwordless_ssh/vars/main.yml
@@ -0,0 +1,51 @@
+# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# vars/main.yml
+
+# K8s functional groups (x86_64 example; extend if you have aarch64 variants)
+k8s_functional_groups:
+  - service_kube_control_plane_first_x86_64
+  - service_kube_control_plane_x86_64
+  - service_kube_node_x86_64
+
+# Slurm / login functional groups
+slurm_functional_groups:
+  - slurm_control_node_x86_64
+  - slurm_node_x86_64
+  - login_node_x86_64
+  - login_compiler_node_x86_64
+
+# Nodes.yaml group completeness checks
+omnia_required_groups_from_nodes_yaml:
+  - service_kube_control_plane_first_x86_64
+  - service_kube_control_plane_x86_64
+  - service_kube_node_x86_64
+  - slurm_control_node_x86_64
+  - slurm_node_x86_64
+  - login_node_x86_64
+  - login_compiler_node_x86_64
+
+omnia_optional_groups_from_nodes_yaml:
+  - service_kube_control_plane_first_aarch64
+  - service_kube_control_plane_aarch64
+  - service_kube_node_aarch64
+
+# SSH client config file that read_nodes_yaml.yml manages with blockinfile.
+ssh_config_path: /root/.ssh/config
+
+# Backward-compatible alias for ssh_config_path. Despite its name this has
+# always pointed at the SSH client config, not at a private key; other roles
+# define ssh_private_key_path as /root/.ssh/oim_rsa, so use ssh_config_path.
+ssh_private_key_path: "{{ ssh_config_path }}"
diff --git a/discovery/roles/slurm_config/defaults/main.yml b/discovery/roles/slurm_config/defaults/main.yml
index 6429271040..acfe61e401 100644
--- a/discovery/roles/slurm_config/defaults/main.yml
+++ b/discovery/roles/slurm_config/defaults/main.yml
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@ slurmctld_service_default_path: '/usr/lib/systemd/system/slurmctld.service' slurmd_service_default_path: '/usr/lib/systemd/system/slurmd.service' slurmdbd_service_default_path: '/usr/lib/systemd/system/slurmdbd.service' sys_env_path: '/etc/environment' -default_real_memory: 4 # This is the minimum default memory in GiB +default_real_memory: 4096 # This is the minimum default memory in MiB default_threadspercore: 1 default_corespersocket: 1 share_prefix: "/" diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index f3547fee71..48b39097fe 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -18,6 +18,34 @@ - name: Include storage vars ansible.builtin.include_vars: "{{ input_project_dir }}/storage_config.yml" +- name: Load slurm_custom.json for x86_64 + ansible.builtin.include_vars: + file: "{{ input_project_dir }}/config/x86_64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/slurm_custom.json" + name: slurm_custom_x86_64 + failed_when: false + +- name: Load slurm_custom.json for aarch64 + ansible.builtin.include_vars: + file: "{{ input_project_dir }}/config/aarch64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/slurm_custom.json" + name: slurm_custom_aarch64 + failed_when: false + +- name: Extract CUDA runfile name for x86_64 from slurm_custom.json + ansible.builtin.set_fact: + cuda_runfile_x86_64: "{{ (slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}" + when: + - slurm_custom_x86_64 is defined + - slurm_custom_x86_64.slurm_node is defined + - slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0 + +- name: Extract CUDA runfile name for aarch64 from slurm_custom.json + ansible.builtin.set_fact: + cuda_runfile_aarch64: "{{ (slurm_custom_aarch64.slurm_node.cluster | 
selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}" + when: + - slurm_custom_aarch64 is defined + - slurm_custom_aarch64.slurm_node is defined + - slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0 + - name: Set facts for slurm ansible.builtin.set_fact: nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}" @@ -103,6 +131,7 @@ mode: "{{ file_mode }}" become: true +# Move to packages directory moving forward - name: Create the cuda directory on share ansible.builtin.file: path: "{{ slurm_config_path }}/cuda" @@ -111,6 +140,78 @@ group: root mode: "{{ common_mode }}" +- name: Create x86_64 package base directory + ansible.builtin.file: + path: "{{ packages_base_dir_x86_64 }}" + state: directory + mode: '{{ common_mode }}' + +- name: Create aarch64 package base directory + ansible.builtin.file: + path: "{{ packages_base_dir_aarch64 }}" + state: directory + mode: '{{ common_mode }}' + +- name: Create x86_64 package layout directories + ansible.builtin.file: + path: "{{ packages_base_dir_x86_64 }}/{{ item }}" + state: directory + mode: '{{ common_mode }}' + loop: "{{ packages_layout_x86_64 }}" + +- name: Create aarch64 package layout directories + ansible.builtin.file: + path: "{{ packages_base_dir_aarch64 }}/{{ item }}" + state: directory + mode: '{{ common_mode }}' + loop: "{{ packages_layout_aarch64 }}" + +- name: Print copy paths for x86_64 + ansible.builtin.debug: + msg: "{{ print_copy_msg }}" + loop: "{{ offline_path_x86_64 | default([]) }}" + +- name: Print copy paths for aarch64 + ansible.builtin.debug: + msg: "{{ print_copy_msg }}" + loop: "{{ offline_path_aarch64 | default([]) }}" + +- name: Check x86_64 offline package sources + ansible.builtin.stat: + path: "{{ item.source_path }}" + loop: "{{ offline_path_x86_64 | default([]) }}" + register: x86_64_offline_pkg_sources + +- name: Check aarch64 offline package sources + ansible.builtin.stat: + path: "{{ item.source_path }}" + 
loop: "{{ offline_path_aarch64 | default([]) }}" + register: aarch64_offline_pkg_sources + +- name: Copy x86_64 offline packages + ansible.builtin.copy: + src: "{{ item.item.source_path }}/" + dest: "{{ item.item.dest_path }}/" + remote_src: true + mode: preserve + loop: "{{ x86_64_offline_pkg_sources.results | default([]) }}" + when: + - item.stat.exists + - item.item.source_path | length > 0 + - item.item.dest_path | length > 0 + +- name: Copy aarch64 offline packages + ansible.builtin.copy: + src: "{{ item.item.source_path }}/" + dest: "{{ item.item.dest_path }}/" + remote_src: true + mode: preserve + loop: "{{ aarch64_offline_pkg_sources.results | default([]) }}" + when: + - item.stat.exists + - item.item.source_path | length > 0 + - item.item.dest_path | length > 0 + - name: Create the runfile directory on share ansible.builtin.file: path: "{{ slurm_config_path }}/runfile" @@ -128,7 +229,6 @@ path: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso/cuda-run/" register: src_dir_check_x86_64 - - name: Check if source directory exists ansible.builtin.stat: path: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso/cuda-run/" @@ -300,3 +400,19 @@ ansible.builtin.set_fact: cloud_init_slurm_nfs_path: "{{ nfs_server_ip }}:{{ nfs_server_path }}" client_mount_path: "{{ share_path }}" + +- name: Ensure SSH key directory exists on Slurm share + ansible.builtin.file: + path: "{{ slurm_config_path }}/ssh" + state: directory + owner: root + group: root + mode: '0700' + +- name: Copy OIM private key to Slurm share for node-to-node SSH + ansible.builtin.copy: + src: "{{ ssh_private_key_path }}" + dest: "{{ slurm_config_path }}/ssh/oim_rsa" + owner: root + group: root + mode: '0600' diff --git a/discovery/roles/slurm_config/tasks/read_node_idrac.yml b/discovery/roles/slurm_config/tasks/read_node_idrac.yml index 5b8b29f571..8955bc8a03 100644 --- a/discovery/roles/slurm_config/tasks/read_node_idrac.yml +++ 
b/discovery/roles/slurm_config/tasks/read_node_idrac.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -44,7 +44,7 @@ - name: Read Memory NodeParams ansible.builtin.uri: - url: "https://{{ bmc_ip_map[item] }}/redfish/v1/Systems/System.Embedded.1" + url: "https://{{ bmc_ip_map[item] }}/redfish/v1/Systems/System.Embedded.1/Memory?$expand=*($levels=1)" user: "{{ bmc_username }}" password: "{{ bmc_password }}" method: GET @@ -62,13 +62,16 @@ register: mem_info failed_when: false -- name: Calculate total memory in MB (GiB → MB) +- name: Calculate total memory sum of slots ansible.builtin.set_fact: - total_memory_mb: "{{ (mem_info.json.MemorySummary.TotalSystemMemoryGiB | default(default_real_memory)) * 1024 | int }}" + total_memory_mb: "{{ mem_info.json.Members | default([{'CapacityMiB': default_real_memory}]) + | map(attribute='CapacityMiB', default=default_real_memory) + | map('int', default=default_real_memory) | sum | int }}" -- name: Calculate 90% of real memory +- name: Calculate percentage of real memory ansible.builtin.set_fact: - real_memory: "{{ ((total_memory_mb | float) * 0.90) | int }}" + # Jinja's round filter returns a float, so round first and cast to int last; real_memory stays a whole number of MiB. + real_memory: "{{ ((total_memory_mb | int) * (memory_percentage / 100)) | round | int }}" - name: Calculate proc facts ansible.builtin.set_fact: diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 2c93ad32cb..fe30d967b1 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -103,3 +103,25 @@ auth_tls_certs_path: "/opt/omnia/auth/tls_certs/ldapserver.crt" slurm_installation_type: configless pulp_webserver_cert_path: "/opt/omnia/pulp/settings/certs/pulp_webserver.crt" controller_empty_msg: "Slurm controller functional group is 
missing from PXE mapping file. Please update the file and rerun discovery.yml." +memory_percentage: 90 +packages_base_dir_x86_64: "{{ slurm_config_path }}/packages/x86_64" +packages_base_dir_aarch64: "{{ slurm_config_path }}/packages/aarch64" +offline_repo_basepath_x86_64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso" +offline_repo_basepath_aarch64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso" +packages_layout_x86_64: + - doca-ofed + - cuda +packages_layout_aarch64: + - doca-ofed + - cuda +print_copy_msg: "Copying {{ item.name }} from {{ item.source_path }} to {{ item.dest_path }}" +offline_path_x86_64: + - name: doca-ofed + source_path: "{{ offline_repo_basepath_x86_64 }}/doca-ofed" + dest_path: "{{ packages_base_dir_x86_64 }}/doca-ofed" +offline_path_aarch64: + - name: doca-ofed + source_path: "{{ offline_repo_basepath_aarch64 }}/doca-ofed" + dest_path: "{{ packages_base_dir_aarch64 }}/doca-ofed" + +ssh_private_key_path: /root/.ssh/oim_rsa diff --git a/input/config/aarch64/rhel/10.0/default_packages.json b/input/config/aarch64/rhel/10.0/default_packages.json index 3a49bf8f88..84709a7c66 100644 --- a/input/config/aarch64/rhel/10.0/default_packages.json +++ b/input/config/aarch64/rhel/10.0/default_packages.json @@ -59,7 +59,7 @@ {"package": "kexec-tools", "type": "rpm", "repo_name": "aarch64_baseos"}, {"package": "which", "type": "rpm", "repo_name": "aarch64_baseos"}, {"package": "iperf3", "type": "rpm", "repo_name": "aarch64_appstream"}, - { "package": "docker.io/dellhpcomniaaisolution/image-build-aarch64", "tag": "1.0", "type": "image" } + { "package": "docker.io/dellhpcomniaaisolution/image-build-aarch64", "tag": "1.1", "type": "image" } ] } } diff --git a/input/config/aarch64/rhel/10.0/slurm_custom.json b/input/config/aarch64/rhel/10.0/slurm_custom.json index 1571a82198..3292aeab7d 100644 --- a/input/config/aarch64/rhel/10.0/slurm_custom.json +++ b/input/config/aarch64/rhel/10.0/slurm_custom.json @@ -6,7 
+6,12 @@ {"package": "firewalld", "type": "rpm", "repo_name": "aarch64_baseos"}, {"package": "python3-firewall", "type": "rpm", "repo_name": "aarch64_baseos"}, {"package": "pmix", "type": "rpm", "repo_name": "aarch64_appstream"}, - {"package": "pmix-devel", "type": "rpm", "repo_name": "aarch64_appstream"} + {"package": "pmix-devel", "type": "rpm", "repo_name": "aarch64_appstream"}, + {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, + {"package": "doca-ofed", + "type": "iso", + "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.aarch64.rpm" + } ] }, "slurm_control_node": { diff --git a/input/config/x86_64/rhel/10.0/service_k8s.json b/input/config/x86_64/rhel/10.0/service_k8s.json index 62dc041faa..afc073a19f 100644 --- a/input/config/x86_64/rhel/10.0/service_k8s.json +++ b/input/config/x86_64/rhel/10.0/service_k8s.json @@ -31,7 +31,9 @@ { "package": "quay.io/strimzi/kafka", "tag": "0.48.0-kafka-4.1.0", "type": "image" }, { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.0", "type": "image" }, { "package": "strimzi-kafka-operator-helm-3-chart-0.48.0", "type": "tarball", "url": "https://github.com/strimzi/strimzi-kafka-operator/releases/download/0.48.0/strimzi-kafka-operator-helm-3-chart-0.48.0.tgz" }, - { "package": "quay.io/strimzi/kafka-bridge", "tag": "0.33.1", "type": "image" } + { "package": "quay.io/strimzi/kafka-bridge", "tag": "0.33.1", "type": "image" }, + { "package": "apptainer", "type": "rpm", "repo_name": "epel" }, + {"package": "doca-ofed", "type": "iso", "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.x86_64.rpm"} ] }, "service_kube_control_plane": { diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json index b52ca5540b..fc08673a34 100644 --- a/input/config/x86_64/rhel/10.0/slurm_custom.json +++ b/input/config/x86_64/rhel/10.0/slurm_custom.json @@ -4,7 +4,12 @@ 
{"package": "munge", "type": "rpm", "repo_name": "x86_64_appstream"}, {"package": "firewalld", "type": "rpm", "repo_name": "x86_64_baseos"}, {"package": "python3-firewall", "type": "rpm", "repo_name": "x86_64_baseos"}, - {"package": "pmix", "type": "rpm", "repo_name": "x86_64_appstream"} + {"package": "pmix", "type": "rpm", "repo_name": "x86_64_appstream"}, + {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, + {"package": "doca-ofed", + "type": "iso", + "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.x86_64.rpm" + } ] }, "slurm_control_node": { @@ -12,7 +17,11 @@ {"package": "slurm-slurmctld", "type": "rpm", "repo_name": "x86_64_slurm_custom"}, {"package": "slurm-slurmdbd", "type": "rpm", "repo_name": "x86_64_slurm_custom"}, {"package": "python3-PyMySQL", "type": "rpm", "repo_name": "x86_64_appstream"}, - {"package": "mariadb-server", "type": "rpm", "repo_name": "x86_64_appstream"} + {"package": "mariadb-server", "type": "rpm", "repo_name": "x86_64_appstream"}, + {"package": "iscsi-initiator-utils", "type": "rpm", "repo_name": "x86_64_baseos"}, + {"package": "device-mapper-multipath", "type": "rpm", "repo_name": "x86_64_baseos"}, + {"package": "sg3_utils", "type": "rpm", "repo_name": "x86_64_baseos"}, + {"package": "lsscsi", "type": "rpm", "repo_name": "x86_64_baseos"} ] }, "slurm_node": { diff --git a/input/network_spec.yml b/input/network_spec.yml index 0bb3a5e196..76e69b605b 100644 --- a/input/network_spec.yml +++ b/input/network_spec.yml @@ -33,6 +33,12 @@ # ntp_servers: # - { address: "172.16.10.80", type: "server" } +# 'ib_network' is a mandatory field, essential for IB network configuration. +# The 'ib_network' section contains the following variables: +# - 'subnet': The subnet of the IB network. +# - 'netmask_bits': The number of bits in the subnet mask. This value must be same as the admin_network netmask_bits. 
+ + Networks: - admin_network: oim_nic_name: "eno1" @@ -42,3 +48,7 @@ Networks: dynamic_range: "172.16.107.201-172.16.107.250" dns: [] ntp_servers: [] + +- ib_network: + subnet: "192.168.0.0" + netmask_bits: "24" diff --git a/input/storage_config.yml b/input/storage_config.yml index 563ae0eb65..48eac2d5cc 100644 --- a/input/storage_config.yml +++ b/input/storage_config.yml @@ -12,12 +12,38 @@ # See the License for the specific language governing permissions and # limitations under the License. --- - # *********************************************************************** # DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. # SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. # *********************************************************************** +# -----------------------------Powervault------------------------------------------- +# powervault_config +# ip: ipv4 +# A list of PowerVault controller IP addresses used for iSCSI target discovery and login. +# In this configuration, a single controller portal is provided. + +# port: +# Defines the TCP port for the iSCSI target service. +# Port 3260 is the standard port for iSCSI communication. + +# isci_initiators: +# Specifies the InitiatorName used by the host when connecting to the iSCSI target. +# This IQN uniquely identifies the host to the storage array. + +# volume_id: +# This is the unique WWN/identifier for the +# specific volume that should be used for persistent storage. 
+# The script uses this value during multipath scanning to select the correct mapped device + +#powervault_config: +# ip: +# - 172.1.2.3 +# port: 3260 +# isci_initiators: iqn.initiator.com.example:7d7d7d7d7d7 +# volume_id: 00c0ff4343f1f1f1001c8c4e6901000000 + + # -----------------------------NFS------------------------------------------------ # This variable is used for mounting NFS share on slurm_control_node, slurm_node, login_node @@ -49,4 +75,5 @@ nfs_client_params: server_share_path: "/mnt/share/omnia_k8s" # Provide server share path of the NFS Server client_share_path: /share_omnia_k8s client_mount_options: "nosuid,rw,sync,hard,intr" - nfs_name: nfs_k8s \ No newline at end of file + nfs_name: nfs_k8s +