diff --git a/ansible.cfg b/ansible.cfg index 5d6abe5216..fd49f43315 100644 --- a/ansible.cfg +++ b/ansible.cfg @@ -9,6 +9,8 @@ executable = /bin/bash display_skipped_hosts = false deprecation_warnings = false show_task_path_on_failure = false +stdout_callback = omnia_default +callback_plugins = common/callback_plugins library = common/library/modules module_utils = common/library/module_utils diff --git a/build_image_aarch64/ansible.cfg b/build_image_aarch64/ansible.cfg index 6b59ea34a3..4e1714ecda 100644 --- a/build_image_aarch64/ansible.cfg +++ b/build_image_aarch64/ansible.cfg @@ -8,6 +8,8 @@ executable = /bin/bash interpreter_python = /usr/bin/python3 deprecation_warnings = false show_task_path_on_failure = false +stdout_callback = omnia_default +callback_plugins = ../common/callback_plugins library = ../common/library/modules module_utils = ../common/library/module_utils diff --git a/build_image_x86_64/ansible.cfg b/build_image_x86_64/ansible.cfg index 1e6bc196a4..6d2dc793de 100644 --- a/build_image_x86_64/ansible.cfg +++ b/build_image_x86_64/ansible.cfg @@ -8,6 +8,8 @@ executable = /bin/bash interpreter_python = /usr/bin/python3 deprecation_warnings = false show_task_path_on_failure = false +stdout_callback = omnia_default +callback_plugins = ../common/callback_plugins library = ../common/library/modules module_utils = ../common/library/module_utils diff --git a/build_image_x86_64/roles/image_creation/vars/main.yml b/build_image_x86_64/roles/image_creation/vars/main.yml index 84785d385b..ba39b00f79 100644 --- a/build_image_x86_64/roles/image_creation/vars/main.yml +++ b/build_image_x86_64/roles/image_creation/vars/main.yml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- -pulp_x86_64_image_name: "dellhpcomniaaisolution/image-build-el10:1.1" +pulp_x86_64_image_name: "dellhpcomniaaisolution/image-build-el10:1.2" x86_64_local_tag: "x86_64-image-builder/ochami" pull_image_retries: "5" pull_image_delay: "10" diff --git a/build_stream/core/catalog/test_fixtures/catalog_rhel.json b/build_stream/core/catalog/test_fixtures/catalog_rhel.json index 8d8b8dd526..3b7c2c11a0 100644 --- a/build_stream/core/catalog/test_fixtures/catalog_rhel.json +++ b/build_stream/core/catalog/test_fixtures/catalog_rhel.json @@ -3272,8 +3272,8 @@ "x86_64" ], "Type": "image", - "Tag": "1.1", - "Version": "1.1" + "Tag": "1.2", + "Version": "1.2" }, "os_package_id_45": { "Name": "which", @@ -4734,4 +4734,4 @@ } } } -} \ No newline at end of file +} diff --git a/build_stream/orchestrator/upload/use_cases/upload_files.py b/build_stream/orchestrator/upload/use_cases/upload_files.py index de26a71119..75554b46e5 100644 --- a/build_stream/orchestrator/upload/use_cases/upload_files.py +++ b/build_stream/orchestrator/upload/use_cases/upload_files.py @@ -15,6 +15,7 @@ """Upload files use case implementation.""" import hashlib +import shutil from datetime import datetime, timezone from pathlib import Path from typing import List @@ -22,7 +23,7 @@ import yaml from api.logging_utils import log_secure_info -from common.config import BuildStreamConfig +from common.config import BuildStreamConfig, load_config from core.artifacts.entities import ArtifactRecord from core.artifacts.exceptions import ArtifactAlreadyExistsError from core.artifacts.interfaces import ArtifactMetadataRepository, ArtifactStore @@ -167,6 +168,13 @@ def execute(self, command: UploadFilesCommand) -> UploadFilesResult: # Always emit audit event with file details (for all uploads) self._emit_upload_files_audit_event(command, uploaded_files) + # Copy software_config.json from job artifacts to shared input directory. 
+ # During build pipeline, generate-input-files has not run yet so the + # file won't exist — the copy is safely skipped. + # During deploy pipeline, the file was generated during the prior build + # and must be synced so the deploy uses the correct software config. + self._copy_software_config_from_artifacts(str(command.job_id)) + # Build result summary = UploadSummary( total_files=len(uploaded_files), @@ -574,6 +582,52 @@ def _emit_upload_files_audit_event( f"Files uploaded: job_id={command.job_id}, total={len(uploaded_files)}, changed={changed_count}, unchanged={unchanged_count}" ) + def _copy_software_config_from_artifacts(self, job_id: str) -> None: + """Copy software_config.json from job artifacts to shared input directory. + + The generate-input-files stage produces software_config.json in the + job-specific artifacts directory (artifacts/{job_id}/input/). + This method copies it to the shared playbook input directory so that + the deploy pipeline uses the software config matching the catalog + that was used to build the image. + + If the file does not exist (e.g. upload called from the build pipeline + before generate-input-files has run), the copy is silently skipped. + + Args: + job_id: Job identifier. 
+ """ + try: + config = load_config() + artifacts_base = Path(config.file_store.base_path) + source = artifacts_base / job_id / "software_config.json" + + if not source.exists(): + log_secure_info( + 'debug', + "software_config.json not found in job artifacts, skipping copy", + job_id=job_id, + ) + return + + shared_input_dir = Path(DEFAULT_PLAYBOOK_INPUT_DIR) + shared_input_dir.mkdir(parents=True, exist_ok=True) + dest = shared_input_dir / "software_config.json" + + shutil.copy2(source, dest) + log_secure_info( + 'info', + f"Copied software_config.json from {source} to {dest}", + job_id=job_id, + ) + except Exception as exc: + log_secure_info( + 'warning', + f"Failed to copy software_config.json from job artifacts: {exc}", + job_id=job_id, + exc_info=True, + ) + def _emit_audit_event( self, command: UploadFilesCommand, diff --git a/common/callback_plugins/omnia_default.py b/common/callback_plugins/omnia_default.py new file mode 100644 index 0000000000..8679cfdea3 --- /dev/null +++ b/common/callback_plugins/omnia_default.py @@ -0,0 +1,171 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Custom Ansible stdout callback plugin for Omnia. + +Extends the built-in ``default`` callback to suppress the ``[ERROR]`` +source-context block introduced in ansible-core 2.19/2.20 (Data Tagging). +Renders multiline ``msg`` fields with real newlines on failure. 
class CallbackModule(DefaultCallback):  # pylint: disable=too-many-ancestors
    """
    Omnia stdout callback plugin.

    Subclasses the built-in ``default`` callback. It wraps
    ``Display.display`` so that any message matching
    ``_ERROR_CONTEXT_PATTERN`` is dropped, and it overrides
    ``v2_runner_on_failed`` to print a single ``fatal:`` line in which a
    multiline ``msg`` is rendered with real line breaks.
    """

    CALLBACK_VERSION = 2.0
    CALLBACK_TYPE = "stdout"
    CALLBACK_NAME = "omnia_default"

    def __init__(self):
        super().__init__()
        self._patched = False
        # Display.display as it was before the filter was installed; used to
        # print this plugin's own ``fatal:`` line without filtering.
        self._unfiltered_display = None

    def _patch_display(self):
        """Wrap ``self._display.display`` once so [ERROR] context blocks are dropped."""
        if self._patched:
            return
        self._patched = True

        original_display = self._display.display
        self._unfiltered_display = original_display

        def filtered_display(msg, *args, **kwargs):
            # Swallow the message entirely when it looks like error context
            if _ERROR_CONTEXT_PATTERN.search(str(msg)):
                return
            original_display(msg, *args, **kwargs)

        self._display.display = filtered_display

    def set_options(self, task_keys=None, var_options=None, direct=None):
        """Load options and apply the display patch."""
        super().set_options(task_keys=task_keys, var_options=var_options, direct=direct)
        self._patch_display()

    def v2_playbook_on_play_start(self, play):
        """Ensure patch is active before the first play."""
        self._patch_display()
        super().v2_playbook_on_play_start(play)

    def _format_result_msg(self, result_dict):
        """
        Format a result dict for display.

        When ``msg`` is a string containing newlines, the remaining keys are
        rendered through ``_dump_results`` and ``msg`` is appended as an
        indented block with real line breaks. Otherwise the whole dict goes
        through ``_dump_results`` unchanged.

        Both paths use ``_dump_results`` so the multiline branch gets the
        same result clean-up and encoder as the regular one, instead of a
        plain ``json.dumps`` that exposes internal keys and can raise on
        values the stock JSON encoder does not support.
        """
        msg = result_dict.get("msg", "")
        if isinstance(msg, str) and "\n" in msg:
            remainder = {k: v for k, v in result_dict.items() if k != "msg"}
            indented_msg = msg.replace("\n", "\n ")
            return f"{self._dump_results(remainder)}\nmsg: |-\n {indented_msg}"
        return self._dump_results(result_dict)

    def v2_runner_on_failed(self, result, ignore_errors=False):
        """
        Render task failures as the classic ``fatal:`` message.

        Looped tasks that carry per-item ``results`` are delegated to
        ``_process_items``. For everything else a single ``fatal:`` line is
        printed, with the delegation target appended to the host name when
        the result carries ``_ansible_delegated_vars``. ``...ignoring`` is
        printed afterwards when ``ignore_errors`` is set.
        """
        # pylint: disable=protected-access
        self._patch_display()
        delegated_vars = result._result.get("_ansible_delegated_vars", None)
        self._clean_results(result._result, result._task.action)

        if self._last_task_banner != result._task._uuid:
            self._print_task_banner(result._task)

        self._handle_exception(
            result._result,
            use_stderr=self.get_option("display_failed_stderr"),
        )
        self._handle_warnings(result._result)

        if result._task.loop and "results" in result._result:
            self._process_items(result)
        else:
            formatted = self._format_result_msg(result._result)
            host_label = result._host.get_name()
            if delegated_vars:
                host_label = f"{host_label} -> {delegated_vars['ansible_host']}"

            # Bypass the filter for our own output: result text (for example
            # captured stderr of a nested Ansible run) may match the
            # suppression pattern, which would drop the whole failure line.
            emit = self._unfiltered_display or self._display.display
            emit(
                f"fatal: [{host_label}]: FAILED! => {formatted}",
                color=getattr(C, "COLOR_ERROR", "red"),
                stderr=self.get_option("display_failed_stderr"),
            )

        if ignore_errors:
            color_skip = getattr(C, "COLOR_SKIP", "cyan")
            self._display.display("...ignoring", color=color_skip)
        # pylint: enable=protected-access

    def v2_playbook_on_stats(self, stats):
        """Ensure patch is active during PLAY RECAP to suppress replayed errors."""
        self._patch_display()
        super().v2_playbook_on_stats(stats)

    def _display_error_context(self, *args, **kwargs):
        """Intentionally suppressed — prevents [ERROR] source-context rendering."""
"rocm": "6.3.1", "service_k8s": "1.35.1" } diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index d77c8e32db..dbaa2acc94 100644 --- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -210,6 +210,46 @@ ) CLUSTER_OS_FAIL_MSG = "Cluster OS must be 'rhel' for RHEL Omnia Infrastructure Manager" +# additional_cloud_init +ADDITIONAL_CLOUD_INIT_FILE_NOT_FOUND_MSG = ( + "File not found. Verify additional_cloud_init_config_file " + "in provision_config.yml points to a valid file." +) +ADDITIONAL_CLOUD_INIT_YAML_SYNTAX_MSG = ( + "YAML syntax error in additional cloud-init config file." +) +ADDITIONAL_CLOUD_INIT_NOT_DICT_MSG = ( + "additional cloud-init config file must contain a YAML mapping." +) +ADDITIONAL_CLOUD_INIT_UNKNOWN_TOP_KEY_MSG = ( + "Unknown top-level key. Only 'common' and 'groups' are allowed." +) +ADDITIONAL_CLOUD_INIT_PROHIBITED_KEY_MSG = ( + "Prohibited key found. The keys 'bootcmd', 'network', " + "'network-config', and 'packages' are platform-managed " + "and must NOT be overridden." +) +ADDITIONAL_CLOUD_INIT_UNKNOWN_KEY_MSG = ( + "Unknown key found. Only 'write_files' and 'runcmd' " + "are allowed." +) +ADDITIONAL_CLOUD_INIT_WRITE_FILES_NOT_LIST_MSG = ( + "'write_files' must be a list." +) +ADDITIONAL_CLOUD_INIT_WRITE_FILES_MISSING_PATH_MSG = ( + "write_files entry is missing the required 'path' field." +) +ADDITIONAL_CLOUD_INIT_RUNCMD_NOT_LIST_MSG = "'runcmd' must be a list." +ADDITIONAL_CLOUD_INIT_RUNCMD_NOT_STRING_MSG = ( + "runcmd entry is not a string." +) +ADDITIONAL_CLOUD_INIT_INVALID_FG_MSG = ( + "is not a valid functional group name in the 'groups' section." +) +ADDITIONAL_CLOUD_INIT_SECTION_NOT_DICT_MSG = ( + "Section must be a mapping/dict." 
+) + # local_repo.yml REPO_STORE_PATH_MSG = "Please provide a valid repo_store_path value." OMNIA_REPO_URL_MSG = "Repo urls are empty. Please provide a url and corresponding key." @@ -619,7 +659,9 @@ def tls_ext_fail_msg(valid_extensions): "Check telemetry_config.yml and network_spec.yml") # high_availability -VIRTUAL_IP_NOT_IN_ADMIN_SUBNET = ("virtual ip address provided is not in admin subnet. " +VIRTUAL_IP_NOT_IN_ADMIN_SUBNET = ("virtual ip address provided is not in a valid subnet. " + "The VIP must be in either the admin subnet or the " + "additional subnet where the Kubernetes control plane nodes are configured. " "Check high_availability_config.yml and network_spec.yml") VIRTUAL_IP_NOT_VALID = ("should be outside the admin static and dynamic ranges. " "Check high_availability_config.yml and network_spec.yml") diff --git a/common/library/module_utils/input_validation/schema/provision_config.json b/common/library/module_utils/input_validation/schema/provision_config.json index bf313a9a94..26024077bf 100644 --- a/common/library/module_utils/input_validation/schema/provision_config.json +++ b/common/library/module_utils/input_validation/schema/provision_config.json @@ -27,6 +27,11 @@ "description": "Optional kernel version to pin for boot image selection. Leave empty to auto-select latest.", "pattern": "^(|[0-9]+\\.[0-9]+\\.[0-9]+-.+)$", "default": "" + }, + "additional_cloud_init_config_file": { + "type": "string", + "description": "Path to additional cloud-init configuration file for stateless node provisioning. 
Leave empty to disable.", + "default": "" } }, "required": [ diff --git a/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py b/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py index 1630e5e9cc..1d635eaec3 100644 --- a/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py @@ -315,7 +315,9 @@ def validate_vip_address( admin_netmaskbits, oim_admin_ip, pxe_mapping_file_path=None, - additional_subnets=None + additional_subnets=None, + kcp_subnet_ip=None, + kcp_subnet_bits=None ): """ Validate a virtual IP address against a list of existing service node VIPs, @@ -330,6 +332,9 @@ def validate_vip_address( - admin_netmaskbits (str): The netmask bits value of the admin network. - oim_admin_ip (str): The IP address of the OIM admin interface. - pxe_mapping_file_path (str, optional): Path to PXE mapping file for additional validation. + - additional_subnets (list, optional): List of additional subnet dicts from network_spec.yml. + - kcp_subnet_ip (str, optional): Reference IP of the control plane nodes' subnet. + - kcp_subnet_bits (str, optional): Netmask bits of the control plane nodes' subnet. Returns: - None: The function does not return any value, it only appends @@ -359,8 +364,16 @@ def validate_vip_address( ) ) - # validate virtual_ip_address is in the admin subnet - if not validation_utils.is_ip_in_subnet(oim_admin_ip, admin_netmaskbits, vip_address): + # validate virtual_ip_address is in the expected subnet + # VIP is valid if it is in the primary admin subnet OR the control plane + # nodes' subnet (which may be an additional subnet). 
+ in_admin_subnet = validation_utils.is_ip_in_subnet( + oim_admin_ip, admin_netmaskbits, vip_address) + in_kcp_subnet = False + if kcp_subnet_ip and kcp_subnet_bits: + in_kcp_subnet = validation_utils.is_ip_in_subnet( + kcp_subnet_ip, kcp_subnet_bits, vip_address) + if not in_admin_subnet and not in_kcp_subnet: errors.append( create_error_msg( f"{config_type} virtual_ip_address", @@ -390,7 +403,7 @@ def validate_vip_address( validate_vip_vs_pxe_mapping_host_ips(errors, config_type, vip_address, pxe_mapping_file_path) # Check all HOST_IPs are in same subnet as VIP - validate_all_host_ips_same_subnet_as_vip(errors, vip_address, pxe_mapping_file_path, admin_netmaskbits, additional_subnets) + validate_all_host_ips_same_subnet_as_vip(errors, vip_address, pxe_mapping_file_path, admin_netmaskbits, additional_subnets, oim_admin_ip) def validate_service_k8s_cluster_ha( errors, @@ -434,6 +447,26 @@ def validate_service_k8s_cluster_ha( pxe_admin_ips = [item["ADMIN_IP"] for item in pxe_list] pxe_bmc_ips = [item["BMC_IP"] for item in pxe_list] + # Determine control plane nodes' subnet for VIP validation + additional_subnets = network_spec_data.get("additional_subnets", []) + kcp_ips = [item["ADMIN_IP"] for item in pxe_list + if item.get("FUNCTIONAL_GROUP_NAME", "").startswith("service_kube_control_plane")] + kcp_subnet_ip = None + kcp_subnet_bits = None + if kcp_ips: + ref_ip = kcp_ips[0] + if validation_utils.is_ip_in_subnet(oim_admin_ip, admin_netmaskbits, ref_ip): + kcp_subnet_ip = oim_admin_ip + kcp_subnet_bits = admin_netmaskbits + elif additional_subnets: + for subnet_entry in additional_subnets: + s_addr = subnet_entry.get("subnet", "") + s_bits = subnet_entry.get("netmask_bits", "") + if s_addr and s_bits and validation_utils.is_ip_in_subnet(s_addr, s_bits, ref_ip): + kcp_subnet_ip = s_addr + kcp_subnet_bits = s_bits + break + with open(os.path.join(input_file_path, "omnia_config.yml"), "r", encoding="utf-8") as omniacfg: omnia_config = yaml.safe_load(omniacfg) 
def _get_fg_names_from_mapping_file(pxe_mapping_file_path):
    """Extract unique functional group names from PXE mapping CSV.

    Blank lines are dropped before parsing, and the header is matched
    case-insensitively after stripping surrounding whitespace.

    Args:
        pxe_mapping_file_path (str): Path to the PXE mapping CSV file.

    Returns:
        list: Sorted list of unique functional group names. Empty when the
            path is unset, the file does not exist, the file has no header
            row, or it has no FUNCTIONAL_GROUP_NAME column.
    """
    if not pxe_mapping_file_path or not os.path.isfile(pxe_mapping_file_path):
        return []
    with open(pxe_mapping_file_path, "r", encoding="utf-8") as fh:
        non_blank_lines = [ln for ln in fh if ln.strip()]
    reader = csv.DictReader(non_blank_lines)
    # fieldnames is None when there is no header row (empty/blank-only file)
    fieldname_map = {fn.strip().upper(): fn for fn in (reader.fieldnames or [])}
    fg_col = fieldname_map.get("FUNCTIONAL_GROUP_NAME")
    if not fg_col:
        return []
    # Short rows yield None for missing columns, hence the `or ""`
    fg_names = {(row.get(fg_col) or "").strip() for row in reader}
    fg_names.discard("")
    return sorted(fg_names)


def _validate_cloud_init_section(section_name, section_data):
    """Validate a single cloud-init section (common or per-FG).

    Checks that the section is a mapping, that it contains no prohibited or
    unknown keys, that ``write_files`` is a list of mappings each carrying a
    non-blank ``path``, and that ``runcmd`` is a list of strings.

    Args:
        section_name (str): Dotted name of the section, appended to
            ``additional_cloud_init.`` to build the error keys.
        section_data: Parsed YAML value of the section.

    Returns:
        list: List of error dicts from create_error_msg.
    """
    errors = []
    key_prefix = f"additional_cloud_init.{section_name}"

    if not isinstance(section_data, dict):
        errors.append(create_error_msg(
            key_prefix, str(type(section_data).__name__),
            en_us_validation_msg.ADDITIONAL_CLOUD_INIT_SECTION_NOT_DICT_MSG,
        ))
        return errors

    prohibited_keys = ["bootcmd", "network", "network-config", "packages"]
    allowed_keys = ["write_files", "runcmd"]

    for key in section_data:
        if key in prohibited_keys:
            errors.append(create_error_msg(
                f"{key_prefix}.{key}", key,
                en_us_validation_msg.ADDITIONAL_CLOUD_INIT_PROHIBITED_KEY_MSG,
            ))
        elif key not in allowed_keys:
            errors.append(create_error_msg(
                f"{key_prefix}.{key}", key,
                en_us_validation_msg.ADDITIONAL_CLOUD_INIT_UNKNOWN_KEY_MSG,
            ))

    # write_files
    if "write_files" in section_data:
        wf = section_data["write_files"]
        if not isinstance(wf, list):
            errors.append(create_error_msg(
                f"{key_prefix}.write_files", str(type(wf).__name__),
                en_us_validation_msg.ADDITIONAL_CLOUD_INIT_WRITE_FILES_NOT_LIST_MSG,
            ))
        else:
            for idx, entry in enumerate(wf):
                if not isinstance(entry, dict):
                    # A scalar/list entry cannot carry the required 'path'
                    # field, so report it rather than letting it through.
                    errors.append(create_error_msg(
                        f"{key_prefix}.write_files[{idx}].path",
                        str(type(entry).__name__),
                        en_us_validation_msg.ADDITIONAL_CLOUD_INIT_WRITE_FILES_MISSING_PATH_MSG,
                    ))
                    continue
                path_val = entry.get("path", "")
                if not path_val or not str(path_val).strip():
                    errors.append(create_error_msg(
                        f"{key_prefix}.write_files[{idx}].path", "",
                        en_us_validation_msg.ADDITIONAL_CLOUD_INIT_WRITE_FILES_MISSING_PATH_MSG,
                    ))

    # runcmd
    if "runcmd" in section_data:
        rc = section_data["runcmd"]
        if not isinstance(rc, list):
            errors.append(create_error_msg(
                f"{key_prefix}.runcmd", str(type(rc).__name__),
                en_us_validation_msg.ADDITIONAL_CLOUD_INIT_RUNCMD_NOT_LIST_MSG,
            ))
        else:
            for idx, entry in enumerate(rc):
                if not isinstance(entry, str):
                    errors.append(create_error_msg(
                        f"{key_prefix}.runcmd[{idx}]", str(type(entry).__name__),
                        en_us_validation_msg.ADDITIONAL_CLOUD_INIT_RUNCMD_NOT_STRING_MSG,
                    ))

    return errors


def validate_additional_cloud_init_config(config_file_path, pxe_mapping_file_path):
    """Validate the additional cloud-init configuration file.

    Checks:
    - File exists and is valid YAML
    - Top-level keys are only 'common' and 'groups'
    - Per-section: no prohibited keys, only allowed keys, type checks
    - Group names match functional groups from pxe_mapping_file (skipped
      when no functional group names can be read from the mapping file)

    An unset/blank path or an empty YAML document is treated as "feature
    disabled" and yields no errors. Errors found inside a group section are
    keyed ``additional_cloud_init.groups.<group>...``, the same prefix used
    for the invalid-group-name error.

    Args:
        config_file_path (str): Path to additional_cloud_init config file.
        pxe_mapping_file_path (str): Path to PXE mapping CSV for FG name validation.

    Returns:
        list: List of error dicts (empty if valid).
    """
    errors = []
    if not config_file_path or not config_file_path.strip():
        return errors

    config_file_path = config_file_path.strip()

    if not os.path.isfile(config_file_path):
        errors.append(create_error_msg(
            "additional_cloud_init_config_file", config_file_path,
            en_us_validation_msg.ADDITIONAL_CLOUD_INIT_FILE_NOT_FOUND_MSG,
        ))
        return errors

    try:
        with open(config_file_path, "r", encoding="utf-8") as fh:
            config_data = yaml.safe_load(fh)
    except yaml.YAMLError as exc:
        errors.append(create_error_msg(
            "additional_cloud_init_config_file", config_file_path,
            f"{en_us_validation_msg.ADDITIONAL_CLOUD_INIT_YAML_SYNTAX_MSG} {exc}",
        ))
        return errors

    if config_data is None:
        return errors

    if not isinstance(config_data, dict):
        errors.append(create_error_msg(
            "additional_cloud_init_config_file", str(type(config_data).__name__),
            en_us_validation_msg.ADDITIONAL_CLOUD_INIT_NOT_DICT_MSG,
        ))
        return errors

    top_level_keys = ["common", "groups"]
    for key in config_data:
        if key not in top_level_keys:
            errors.append(create_error_msg(
                f"additional_cloud_init.{key}", key,
                en_us_validation_msg.ADDITIONAL_CLOUD_INIT_UNKNOWN_TOP_KEY_MSG,
            ))

    common_data = config_data.get("common") or {}
    groups_data = config_data.get("groups") or {}

    if common_data:
        errors.extend(_validate_cloud_init_section("common", common_data))

    if groups_data:
        if not isinstance(groups_data, dict):
            errors.append(create_error_msg(
                "additional_cloud_init.groups", str(type(groups_data).__name__),
                en_us_validation_msg.ADDITIONAL_CLOUD_INIT_SECTION_NOT_DICT_MSG,
            ))
        else:
            valid_fg_names = _get_fg_names_from_mapping_file(pxe_mapping_file_path)
            for fg_name, section_data in groups_data.items():
                if valid_fg_names and fg_name not in valid_fg_names:
                    errors.append(create_error_msg(
                        f"additional_cloud_init.groups.{fg_name}", fg_name,
                        en_us_validation_msg.ADDITIONAL_CLOUD_INIT_INVALID_FG_MSG,
                    ))
                if section_data:
                    # Qualify with "groups." so the key points at the real
                    # location and a group named "common" cannot be confused
                    # with the top-level common section.
                    errors.extend(
                        _validate_cloud_init_section(f"groups.{fg_name}", section_data)
                    )

    return errors
Parameters: errors (list): List to append error messages @@ -108,17 +108,24 @@ def validate_all_host_ips_same_subnet_as_vip( admin_netmaskbits (str): Netmask bits for subnet validation additional_subnets (list, optional): List of additional subnet dicts with 'subnet' and 'netmask_bits' keys. + oim_admin_ip (str, optional): Primary OIM admin IP address for + checking the primary admin subnet. """ host_ips = extract_host_ips_from_pxe_mapping(pxe_mapping_file_path) if additional_subnets is None: additional_subnets = [] for host_ip in host_ips: - # Check if host_ip is in the primary admin subnet (VIP subnet) + # Check if host_ip is in the VIP subnet if validation_utils.is_ip_in_subnet( vip_address, admin_netmaskbits, host_ip): continue + # Check if host_ip is in the primary admin subnet + if oim_admin_ip and validation_utils.is_ip_in_subnet( + oim_admin_ip, admin_netmaskbits, host_ip): + continue + # Check if host_ip is in any additional subnet in_additional = False for subnet_entry in additional_subnets: diff --git a/discovery/ansible.cfg b/discovery/ansible.cfg index 8573a5fe3c..37d3eac714 100644 --- a/discovery/ansible.cfg +++ b/discovery/ansible.cfg @@ -8,6 +8,8 @@ executable = /bin/bash interpreter_python = /usr/bin/python3 deprecation_warnings = false show_task_path_on_failure = false +stdout_callback = omnia_default +callback_plugins = ../common/callback_plugins library = library:../common/library/modules module_utils = ../common/library/module_utils diff --git a/examples/additional_cloud_init.yml b/examples/additional_cloud_init.yml new file mode 100644 index 0000000000..310fa11513 --- /dev/null +++ b/examples/additional_cloud_init.yml @@ -0,0 +1,59 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# Additional cloud-init configuration for stateless node provisioning. +# +# Structure: +# common: Applied to ALL nodes (common cloud-init group) +# groups: Per-functional-group overrides (keyed by functional group name) +# +# ALLOWED keys per section (config and final stage only): +# write_files - Write files to nodes (list of file entries) +# runcmd - Run commands at final stage (list of commands) +# +# PROHIBITED keys (validation will fail if present): +# bootcmd, network, network-config, packages +# These are platform-managed and must not be overridden by users. +# +# Merge behavior: +# Platform-defined defaults always take precedence (merge_how: no_replace). +# User entries are appended to platform lists (write_files, runcmd). +# Group-specific entries are merged AFTER common entries. 
+ +# Common cloud-init applied to ALL nodes +# Example: +# common: +# write_files: +# - path: /etc/motd +# content: "Welcome to the HPC cluster\n" +# permissions: '0644' +# runcmd: +# - echo "Custom node setup complete" >> /var/log/custom_setup.log +common: {} + +# Per-functional-group cloud-init overrides +# Group names must match functional groups defined in pxe_mapping_file.csv +# Example: +# groups: +# slurm_node_x86_64: +# runcmd: +# - echo "Slurm node setup" >> /var/log/custom.log +# os_x86_64: +# write_files: +# - path: /etc/profile.d/cluster.sh +# content: | +# export CLUSTER_NAME=mycluster +# permissions: '0644' +groups: {} diff --git a/examples/catalog/catalog_rhel.json b/examples/catalog/catalog_rhel.json index 4132c82c20..6da9d9b3bb 100644 --- a/examples/catalog/catalog_rhel.json +++ b/examples/catalog/catalog_rhel.json @@ -2706,8 +2706,8 @@ "x86_64" ], "Type": "image", - "Tag": "1.1", - "Version": "1.1" + "Tag": "1.2", + "Version": "1.2" }, "dracut": { "Name": "dracut", diff --git a/examples/catalog/catalog_rhel_aarch64_with_slurm_only.json b/examples/catalog/catalog_rhel_aarch64_with_slurm_only.json index 9dc880c004..fda0cb815e 100644 --- a/examples/catalog/catalog_rhel_aarch64_with_slurm_only.json +++ b/examples/catalog/catalog_rhel_aarch64_with_slurm_only.json @@ -1307,8 +1307,8 @@ "x86_64" ], "Type": "image", - "Tag": "1.1", - "Version": "1.1" + "Tag": "1.2", + "Version": "1.2" }, "dracut": { "Name": "dracut", diff --git a/examples/catalog/catalog_rhel_with_nfs_provisioner.json b/examples/catalog/catalog_rhel_with_nfs_provisioner.json index 556c97d65e..860fb775ab 100644 --- a/examples/catalog/catalog_rhel_with_nfs_provisioner.json +++ b/examples/catalog/catalog_rhel_with_nfs_provisioner.json @@ -2579,8 +2579,8 @@ "x86_64" ], "Type": "image", - "Tag": "1.1", - "Version": "1.1" + "Tag": "1.2", + "Version": "1.2" }, "dracut": { "Name": "dracut", diff --git a/examples/catalog/catalog_rhel_x86_64.json b/examples/catalog/catalog_rhel_x86_64.json index 
a70b4f94aa..85d1a666ea 100644 --- a/examples/catalog/catalog_rhel_x86_64.json +++ b/examples/catalog/catalog_rhel_x86_64.json @@ -2451,8 +2451,8 @@ "x86_64" ], "Type": "image", - "Tag": "1.1", - "Version": "1.1" + "Tag": "1.2", + "Version": "1.2" }, "dracut": { "Name": "dracut", diff --git a/examples/catalog/catalog_rhel_x86_64_with_slurm_only.json b/examples/catalog/catalog_rhel_x86_64_with_slurm_only.json index 7c8d819d0e..906cc8908e 100644 --- a/examples/catalog/catalog_rhel_x86_64_with_slurm_only.json +++ b/examples/catalog/catalog_rhel_x86_64_with_slurm_only.json @@ -1077,8 +1077,8 @@ "x86_64" ], "Type": "image", - "Tag": "1.1", - "Version": "1.1" + "Tag": "1.2", + "Version": "1.2" }, "dracut": { "Name": "dracut", diff --git a/gitlab/ansible.cfg b/gitlab/ansible.cfg index beeac5e8fc..099abd8c8e 100644 --- a/gitlab/ansible.cfg +++ b/gitlab/ansible.cfg @@ -8,6 +8,8 @@ executable = /bin/bash interpreter_python = /usr/bin/python3 deprecation_warnings = false show_task_path_on_failure = false +stdout_callback = omnia_default +callback_plugins = ../common/callback_plugins library = ../common/library/modules module_utils = ../common/library/module_utils diff --git a/input/additional_cloud_init.yml b/input/additional_cloud_init.yml new file mode 100644 index 0000000000..310fa11513 --- /dev/null +++ b/input/additional_cloud_init.yml @@ -0,0 +1,59 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +# Additional cloud-init configuration for stateless node provisioning. +# +# Structure: +# common: Applied to ALL nodes (common cloud-init group) +# groups: Per-functional-group overrides (keyed by functional group name) +# +# ALLOWED keys per section (config and final stage only): +# write_files - Write files to nodes (list of file entries) +# runcmd - Run commands at final stage (list of commands) +# +# PROHIBITED keys (validation will fail if present): +# bootcmd, network, network-config, packages +# These are platform-managed and must not be overridden by users. +# +# Merge behavior: +# Platform-defined defaults always take precedence (merge_how: no_replace). +# User entries are appended to platform lists (write_files, runcmd). +# Group-specific entries are merged AFTER common entries. + +# Common cloud-init applied to ALL nodes +# Example: +# common: +# write_files: +# - path: /etc/motd +# content: "Welcome to the HPC cluster\n" +# permissions: '0644' +# runcmd: +# - echo "Custom node setup complete" >> /var/log/custom_setup.log +common: {} + +# Per-functional-group cloud-init overrides +# Group names must match functional groups defined in pxe_mapping_file.csv +# Example: +# groups: +# slurm_node_x86_64: +# runcmd: +# - echo "Slurm node setup" >> /var/log/custom.log +# os_x86_64: +# write_files: +# - path: /etc/profile.d/cluster.sh +# content: | +# export CLUSTER_NAME=mycluster +# permissions: '0644' +groups: {} diff --git a/input/config/x86_64/rhel/10.0/csi_driver_powerscale.json b/input/config/x86_64/rhel/10.0/csi_driver_powerscale.json index 7ff53df763..b33b8cf744 100644 --- a/input/config/x86_64/rhel/10.0/csi_driver_powerscale.json +++ b/input/config/x86_64/rhel/10.0/csi_driver_powerscale.json @@ -2,81 +2,81 @@ "csi_driver_powerscale": { "cluster": [ { - "package": "csi-powerscale-v2.16.0", + "package": "csi-powerscale-v2.17.0", "url": "https://github.com/dell/csi-powerscale.git", "type": "git", - "version": "v2.16.0" + "version": "v2.17.0" }, { - 
"package": "external-snapshotter-v8.4.0", + "package": "external-snapshotter-v8.5.0", "url": "https://github.com/kubernetes-csi/external-snapshotter.git", "type": "git", - "version": "v8.4.0" + "version": "v8.5.0" }, { - "package": "helm-charts-2.16.0", + "package": "helm-charts-2.17.0", "url": "https://github.com/dell/helm-charts.git", "type": "git", - "version": "csi-isilon-2.16.0" + "version": "csi-isilon-2.17.0" }, { "package": "quay.io/dell/container-storage-modules/csi-isilon", - "tag": "v2.16.0", + "tag": "v2.17.0", "type": "image" }, { "package": "registry.k8s.io/sig-storage/csi-attacher", - "tag": "v4.10.0", + "tag": "v4.11.0", "type": "image" }, { "package": "registry.k8s.io/sig-storage/csi-provisioner", - "tag": "v6.1.0", + "tag": "v6.2.0", "type": "image" }, { "package": "registry.k8s.io/sig-storage/csi-snapshotter", - "tag": "v8.4.0", + "tag": "v8.5.0", "type": "image" }, { "package": "registry.k8s.io/sig-storage/csi-resizer", - "tag": "v2.0.0", + "tag": "v2.1.0", "type": "image" }, { "package": "registry.k8s.io/sig-storage/csi-node-driver-registrar", - "tag": "v2.15.0", + "tag": "v2.16.0", "type": "image" }, { "package": "registry.k8s.io/sig-storage/csi-external-health-monitor-controller", - "tag": "v0.16.0", + "tag": "v0.17.0", "type": "image" }, { "package": "quay.io/dell/container-storage-modules/dell-csi-replicator", - "tag": "v1.14.0", + "tag": "v1.15.0", "type": "image" }, { "package": "quay.io/dell/container-storage-modules/podmon", - "tag": "v1.15.0", + "tag": "v1.16.0", "type": "image" }, { "package": "quay.io/dell/container-storage-modules/csm-authorization-sidecar", - "tag": "v2.4.0", + "tag": "v2.5.0", "type": "image" }, { "package": "quay.io/dell/container-storage-modules/csi-metadata-retriever", - "tag": "v1.13.0", + "tag": "v1.14.0", "type": "image" }, { "package": "registry.k8s.io/sig-storage/snapshot-controller", - "tag": "v8.4.0", + "tag": "v8.5.0", "type": "image" }, { @@ -87,3 +87,4 @@ ] } } + diff --git 
a/input/config/x86_64/rhel/10.0/default_packages.json b/input/config/x86_64/rhel/10.0/default_packages.json index 29a091f26a..6e6d16279f 100644 --- a/input/config/x86_64/rhel/10.0/default_packages.json +++ b/input/config/x86_64/rhel/10.0/default_packages.json @@ -35,7 +35,7 @@ {"package": "cloud-init", "type": "rpm", "repo_name": "appstream"}, {"package": "glibc-langpack-en", "type": "rpm", "repo_name": "baseos"}, {"package": "gedit", "type": "rpm", "repo_name": "epel"}, - {"package": "docker.io/dellhpcomniaaisolution/image-build-el10", "tag": "1.1", "type": "image" } + {"package": "docker.io/dellhpcomniaaisolution/image-build-el10", "tag": "1.2", "type": "image" } ] } } diff --git a/input/provision_config.yml b/input/provision_config.yml index 91184df878..33493ba6af 100644 --- a/input/provision_config.yml +++ b/input/provision_config.yml @@ -49,6 +49,15 @@ dns_enabled: false #### Optional # Pin a specific kernel version for boot image selection. +# The specified version applies to both x86_64 and aarch64 architectures. # Leave empty ("") to auto-select the latest available image from S3. -# Example: kernel_version_override: "6.12.0-55.76.1.el10_0.x86_64" +# Example: kernel_version_override: "6.12.0-55.76.1.el10_0" kernel_version_override: "" + +#### Optional +# Path to additional cloud-init configuration file for stateless node provisioning. +# The file supports both common (all nodes) and per-functional-group cloud-init sections. +# An example file is provided at omnia/examples/additional_cloud_init.yml +# Leave empty ("") to disable additional cloud-init. 
+# Default: "" (disabled) +additional_cloud_init_config_file: "" diff --git a/input_validation/ansible.cfg b/input_validation/ansible.cfg index c1fb788afb..ad2f32eb86 100644 --- a/input_validation/ansible.cfg +++ b/input_validation/ansible.cfg @@ -8,6 +8,8 @@ executable = /bin/bash interpreter_python = /usr/bin/python3 deprecation_warnings = false show_task_path_on_failure = false +stdout_callback = omnia_default +callback_plugins = ../common/callback_plugins library = ../common/library/modules module_utils = ../common/library/module_utils diff --git a/local_repo/ansible.cfg b/local_repo/ansible.cfg index 0580c918dc..677c2f78a3 100644 --- a/local_repo/ansible.cfg +++ b/local_repo/ansible.cfg @@ -8,6 +8,8 @@ executable = /bin/bash interpreter_python = /usr/bin/python3 deprecation_warnings = false show_task_path_on_failure = false +stdout_callback = omnia_default +callback_plugins = ../common/callback_plugins library = roles/parse_and_download/library:../common/library/modules module_utils = ../common/library/module_utils diff --git a/log_collector/ansible.cfg b/log_collector/ansible.cfg index d4161bca13..9ebf18c08c 100644 --- a/log_collector/ansible.cfg +++ b/log_collector/ansible.cfg @@ -8,6 +8,8 @@ executable = /bin/bash interpreter_python = /usr/bin/python3 deprecation_warnings = false show_task_path_on_failure = false +stdout_callback = omnia_default +callback_plugins = ../common/callback_plugins library = ../common/library/modules module_utils = ../common/library/module_utils diff --git a/prepare_oim/ansible.cfg b/prepare_oim/ansible.cfg index e42f6fea14..e04c90fd11 100644 --- a/prepare_oim/ansible.cfg +++ b/prepare_oim/ansible.cfg @@ -8,6 +8,8 @@ executable = /bin/bash interpreter_python = /usr/bin/python3 deprecation_warnings = false show_task_path_on_failure = false +stdout_callback = omnia_default +callback_plugins = ../common/callback_plugins library = ../common/library/modules module_utils = ../common/library/module_utils diff --git 
a/prepare_oim/roles/deploy_containers/build_stream/vars/main.yml b/prepare_oim/roles/deploy_containers/build_stream/vars/main.yml index d9b1ccbb2b..d4eec6b6c9 100644 --- a/prepare_oim/roles/deploy_containers/build_stream/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/build_stream/vars/main.yml @@ -92,8 +92,9 @@ postgres_password: "{{ hostvars['localhost']['postgres_password'] }}" postgres_db_name: "build_stream_db" # MinIO S3 configuration (credentials loaded from vault via get_config_credentials.yml) -minio_s3_username: "{{ hostvars['localhost']['minio_s3_username'] | default('admin') }}" -minio_s3_password: "{{ hostvars['localhost']['minio_s3_password'] | default('') }}" +# For MinIO provider: s3_access_id falls back to 'admin' when undefined or empty; s3_secret_key is prompted +minio_s3_username: "{{ hostvars['localhost']['s3_access_id'] | default('admin', true) }}" +minio_s3_password: "{{ hostvars['localhost']['s3_secret_key'] | default('') }}" admin_nic_ip: "{{ hostvars['localhost']['admin_nic_ip'] | default('localhost') }}" # Systemd service name generated by Quadlet diff --git a/prepare_oim/roles/deploy_containers/openchami/tasks/configs/firewall.yml b/prepare_oim/roles/deploy_containers/openchami/tasks/configs/firewall.yml index 7a8b02eb5a..bafa6ee74f 100644 --- a/prepare_oim/roles/deploy_containers/openchami/tasks/configs/firewall.yml +++ b/prepare_oim/roles/deploy_containers/openchami/tasks/configs/firewall.yml @@ -33,6 +33,16 @@ state: enabled loop: "{{ udp_ports }}" +- name: Open DNS port for CoreDNS (when dns_enabled) + ansible.posix.firewalld: + port: "{{ item }}" + permanent: true + state: enabled + loop: + - 53/tcp + - 53/udp + when: dns_enabled | default(false) | bool + - name: Add Podman interfaces to trusted zone ansible.posix.firewalld: interface: "{{ item }}" diff --git a/provision/ansible.cfg b/provision/ansible.cfg index 56c96f755c..0d728ce383 100644 --- a/provision/ansible.cfg +++ b/provision/ansible.cfg @@ -8,6 +8,8 @@
executable = /bin/bash interpreter_python = /usr/bin/python3 deprecation_warnings = false show_task_path_on_failure = false +stdout_callback = omnia_default +callback_plugins = ../common/callback_plugins library = library:../common/library/modules module_utils = ../common/library/module_utils diff --git a/provision/roles/configure_ochami/tasks/configure_bss_cloud_init.yml b/provision/roles/configure_ochami/tasks/configure_bss_cloud_init.yml index 171224dd21..0b5a0da05e 100644 --- a/provision/roles/configure_ochami/tasks/configure_bss_cloud_init.yml +++ b/provision/roles/configure_ochami/tasks/configure_bss_cloud_init.yml @@ -127,6 +127,10 @@ - hostname_yaml_stat.stat.exists | default(false) - not (hostvars['localhost']['upgrade_mode'] | default(false) | bool) +- name: Configure additional cloud-init group + ansible.builtin.include_tasks: configure_cloud_init_additional.yml + when: additional_cloud_init_enabled | default(false) | bool + - name: Set openchami SELinux context for Local flow ansible.builtin.command: chcon -R system_u:object_r:container_file_t:s0 "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami" changed_when: true diff --git a/provision/roles/configure_ochami/tasks/configure_cloud_init_additional.yml b/provision/roles/configure_ochami/tasks/configure_cloud_init_additional.yml new file mode 100644 index 0000000000..2f9f6b50e2 --- /dev/null +++ b/provision/roles/configure_ochami/tasks/configure_cloud_init_additional.yml @@ -0,0 +1,72 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# Renders and sets additional cloud-init groups (common + per-functional-group). +# Common group is applied to ALL nodes; per-FG groups target specific functional groups. +# Uses merge_how with no_replace so platform defaults always take precedence. + +# --- Common additional cloud-init group --- +- name: Delete ci group configuration - {{ additional_cloud_init_common_group_name }} + ansible.builtin.command: >- + /usr/bin/ochami cloud-init group delete --no-confirm + -f yaml -d @{{ cloud_init_dir }}/ci-group-{{ additional_cloud_init_common_group_name }}.yaml + changed_when: true + failed_when: false + when: + - additional_cloud_init_common_enabled | default(false) | bool + - not (hostvars['localhost']['upgrade_mode'] | default(false) | bool) + +- name: Render ci group template - {{ additional_cloud_init_common_group_name }} + ansible.builtin.template: + src: "{{ ci_group_additional_template }}" + dest: "{{ cloud_init_dir }}/ci-group-{{ additional_cloud_init_common_group_name }}.yaml" + mode: "{{ hostvars['localhost']['file_permissions_644'] }}" + vars: + ci_additional_group_name: "{{ additional_cloud_init_common_group_name }}" + ci_additional_data: "{{ additional_cloud_init_common_data }}" + when: additional_cloud_init_common_enabled | default(false) | bool + +- name: Set ci group configuration - {{ additional_cloud_init_common_group_name }} + ansible.builtin.command: >- + /usr/bin/ochami cloud-init group set + -f yaml -d @{{ cloud_init_dir }}/ci-group-{{ additional_cloud_init_common_group_name }}.yaml + changed_when: true + when: + - 
additional_cloud_init_common_enabled | default(false) | bool + - not (hostvars['localhost']['upgrade_mode'] | default(false) | bool) + +- name: Verify ci group - {{ additional_cloud_init_common_group_name }} + ansible.builtin.command: >- + /usr/bin/ochami cloud-init group get config {{ additional_cloud_init_common_group_name }} + changed_when: false + register: _ci_common_additional_output + when: + - additional_cloud_init_common_enabled | default(false) | bool + - not (hostvars['localhost']['upgrade_mode'] | default(false) | bool) + +- name: Verify ci group output - {{ additional_cloud_init_common_group_name }} + ansible.builtin.debug: + msg: "{{ _ci_common_additional_output.stdout_lines }}" + verbosity: 2 + when: + - additional_cloud_init_common_enabled | default(false) | bool + - not (hostvars['localhost']['upgrade_mode'] | default(false) | bool) + +# --- Per-functional-group additional cloud-init groups --- +- name: Configure per-FG additional cloud-init groups + ansible.builtin.include_tasks: configure_cloud_init_additional_fg.yml + vars: + _ci_fg_name: "{{ item }}" + with_items: "{{ additional_cloud_init_fg_names | default([]) }}" + when: additional_cloud_init_groups_enabled | default(false) | bool diff --git a/provision/roles/configure_ochami/tasks/configure_cloud_init_additional_fg.yml b/provision/roles/configure_ochami/tasks/configure_cloud_init_additional_fg.yml new file mode 100644 index 0000000000..3b5cf5f9dd --- /dev/null +++ b/provision/roles/configure_ochami/tasks/configure_cloud_init_additional_fg.yml @@ -0,0 +1,60 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# Renders and sets a single per-functional-group additional cloud-init group. +# Variables expected: +# _ci_fg_name - the functional group name (e.g. slurm_node_x86_64) + +- name: Set per-FG cloud-init group variables + ansible.builtin.set_fact: + _ci_fg_group_name: "{{ additional_cloud_init_fg_group_prefix }}{{ _ci_fg_name }}" + _ci_fg_data: "{{ additional_cloud_init_groups_data[_ci_fg_name] }}" + _ci_fg_dest: "{{ cloud_init_dir }}/ci-group-{{ additional_cloud_init_fg_group_prefix }}{{ _ci_fg_name }}.yaml" + +- name: Delete ci group configuration - {{ _ci_fg_group_name }} + ansible.builtin.command: >- + /usr/bin/ochami cloud-init group delete --no-confirm + -f yaml -d @{{ _ci_fg_dest }} + changed_when: true + failed_when: false + when: not (hostvars['localhost']['upgrade_mode'] | default(false) | bool) + +- name: Render ci group template - {{ _ci_fg_group_name }} + ansible.builtin.template: + src: "{{ ci_group_additional_template }}" + dest: "{{ _ci_fg_dest }}" + mode: "{{ hostvars['localhost']['file_permissions_644'] }}" + vars: + ci_additional_group_name: "{{ _ci_fg_group_name }}" + ci_additional_data: "{{ _ci_fg_data }}" + +- name: Set ci group configuration - {{ _ci_fg_group_name }} + ansible.builtin.command: >- + /usr/bin/ochami cloud-init group set + -f yaml -d @{{ _ci_fg_dest }} + changed_when: true + when: not (hostvars['localhost']['upgrade_mode'] | default(false) | bool) + +- name: Verify ci group configuration - {{ _ci_fg_group_name }} + ansible.builtin.command: >- + /usr/bin/ochami cloud-init group get config {{ _ci_fg_group_name }} + 
changed_when: false + register: _ci_fg_additional_output + when: not (hostvars['localhost']['upgrade_mode'] | default(false) | bool) + +- name: Verify ci group output - {{ _ci_fg_group_name }} + ansible.builtin.debug: + msg: "{{ _ci_fg_additional_output.stdout_lines }}" + verbosity: 2 + when: not (hostvars['localhost']['upgrade_mode'] | default(false) | bool) diff --git a/provision/roles/configure_ochami/tasks/create_groups_additional_fg.yml b/provision/roles/configure_ochami/tasks/create_groups_additional_fg.yml new file mode 100644 index 0000000000..095d893bc1 --- /dev/null +++ b/provision/roles/configure_ochami/tasks/create_groups_additional_fg.yml @@ -0,0 +1,43 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# Creates an SMD group for per-functional-group additional cloud-init. +# The group name is prefixed with additional_cloud_init_ and contains +# only nodes belonging to the target functional group. 
+ +- name: Set per-FG additional cloud-init group variables + ansible.builtin.set_fact: + functional_group_name: "{{ item }}" + additional_cloud_init_fg_smd_group_name: "{{ additional_cloud_init_fg_group_prefix }}{{ item }}" + +- name: Render SMD group template - {{ additional_cloud_init_fg_smd_group_name }} + ansible.builtin.template: + src: "{{ additional_cloud_init_fg_groups_template }}" + dest: "{{ nodes_dir }}/groups-{{ additional_cloud_init_fg_smd_group_name }}.yml" + mode: "{{ hostvars['localhost']['file_permissions_644'] }}" + vars: + nodes: "{{ hostvars['localhost']['read_mapping_file']['dict'] | dict2items }}" + +- name: Delete SMD group - {{ additional_cloud_init_fg_smd_group_name }} + ansible.builtin.command: >- + /usr/bin/ochami smd group delete --no-confirm + {{ additional_cloud_init_fg_smd_group_name }} + changed_when: true + failed_when: false + +- name: POST SMD group - {{ additional_cloud_init_fg_smd_group_name }} + ansible.builtin.command: >- + /usr/bin/ochami smd group add -f yaml + -d @{{ nodes_dir }}/groups-{{ additional_cloud_init_fg_smd_group_name }}.yml + changed_when: true diff --git a/provision/roles/configure_ochami/tasks/main.yml b/provision/roles/configure_ochami/tasks/main.yml index 19f98e96c1..2c99d7a958 100644 --- a/provision/roles/configure_ochami/tasks/main.yml +++ b/provision/roles/configure_ochami/tasks/main.yml @@ -32,6 +32,20 @@ ansible.builtin.include_tasks: create_groups_common.yml loop: "{{ common_cloud_init_groups }}" + - name: Validate additional cloud-init configuration + ansible.builtin.include_tasks: validate_additional_cloud_init.yml + + - name: Create SMD group for additional_cloud_init common + ansible.builtin.include_tasks: create_groups_common.yml + loop: + - "{{ additional_cloud_init_common_group_name }}" + when: additional_cloud_init_common_enabled | default(false) | bool + + - name: Create SMD groups for per-FG additional cloud-init + ansible.builtin.include_tasks: create_groups_additional_fg.yml + with_items: 
"{{ additional_cloud_init_fg_names | default([]) }}" + when: additional_cloud_init_groups_enabled | default(false) | bool + - name: Configure bss and cloud-init ansible.builtin.include_tasks: configure_bss_cloud_init.yml diff --git a/provision/roles/configure_ochami/tasks/validate_additional_cloud_init.yml b/provision/roles/configure_ochami/tasks/validate_additional_cloud_init.yml new file mode 100644 index 0000000000..65747e39a4 --- /dev/null +++ b/provision/roles/configure_ochami/tasks/validate_additional_cloud_init.yml @@ -0,0 +1,79 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# Loads user-provided additional cloud-init config and sets facts for +# configure_cloud_init_additional.yml and create_groups_additional_fg.yml. +# Input validation (prohibited keys, allowed keys, types, FG names) is +# performed earlier in the L2 validation flow (provision_validation.py). +# This file only parses the YAML and sets enabled/data facts. 
+ +- name: Read additional_cloud_init_config_file path + ansible.builtin.set_fact: + additional_cloud_init_file_path: >- + {{ hostvars['localhost']['additional_cloud_init_config_file'] | default('') | trim }} + +- name: Set defaults when additional cloud-init is disabled + ansible.builtin.set_fact: + additional_cloud_init_enabled: false + additional_cloud_init_common_enabled: false + additional_cloud_init_groups_enabled: false + additional_cloud_init_common_data: {} + additional_cloud_init_groups_data: {} + additional_cloud_init_fg_names: [] + when: additional_cloud_init_file_path == '' + +- name: Load additional cloud-init config + when: additional_cloud_init_file_path != '' + block: + - name: Set OIM-local path for additional cloud-init config + ansible.builtin.set_fact: + additional_cloud_init_oim_path: "{{ cloud_init_dir }}/additional_cloud_init.yml" + + - name: Copy additional cloud-init config file to OIM workdir + ansible.builtin.copy: + src: "{{ additional_cloud_init_file_path }}" + dest: "{{ additional_cloud_init_oim_path }}" + mode: "{{ hostvars['localhost']['file_permissions_644'] }}" + + - name: Load additional cloud-init config file + ansible.builtin.slurp: + src: "{{ additional_cloud_init_oim_path }}" + register: _aci_file_content + + - name: Parse additional cloud-init config YAML + ansible.builtin.set_fact: + _aci_config: "{{ _aci_file_content.content | b64decode | from_yaml | default({}) }}" + + - name: Extract common and groups data + ansible.builtin.set_fact: + additional_cloud_init_common_data: "{{ _aci_config.common | default({}) }}" + additional_cloud_init_groups_data: "{{ _aci_config.groups | default({}) }}" + + - name: Set enabled flags and FG name list + ansible.builtin.set_fact: + additional_cloud_init_common_enabled: "{{ additional_cloud_init_common_data | length > 0 }}" + additional_cloud_init_groups_enabled: "{{ additional_cloud_init_groups_data | length > 0 }}" + additional_cloud_init_fg_names: "{{ 
additional_cloud_init_groups_data.keys() | list }}" + additional_cloud_init_enabled: >- + {{ (additional_cloud_init_common_data | length > 0) or + (additional_cloud_init_groups_data | length > 0) }} + + - name: Log additional cloud-init load result + ansible.builtin.debug: + msg: >- + additional_cloud_init loaded. + File: {{ additional_cloud_init_oim_path }}. + Common enabled: {{ additional_cloud_init_common_enabled }}. + Groups enabled: {{ additional_cloud_init_groups_enabled }}. + FG names: {{ additional_cloud_init_fg_names }}. diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-additional_cloud_init.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-additional_cloud_init.yaml.j2 new file mode 100644 index 0000000000..327a0c7bef --- /dev/null +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-additional_cloud_init.yaml.j2 @@ -0,0 +1,38 @@ +- name: {{ ci_additional_group_name }} + description: "User-provided additional cloud-init (config and final stages only)" + file: + encoding: plain + content: | + ## template: jinja + #cloud-config + merge_how: + - name: list + settings: [append] + - name: dict + settings: [no_replace, recurse_list] +{% if ci_additional_data.write_files is defined and ci_additional_data.write_files | length > 0 %} + write_files: +{% for wf in ci_additional_data.write_files %} + - path: {{ wf.path }} +{% if wf.permissions is defined %} + permissions: '{{ wf.permissions }}' +{% endif %} +{% if wf.owner is defined %} + owner: {{ wf.owner }} +{% endif %} +{% if wf.append is defined %} + append: {{ wf.append | lower }} +{% endif %} +{% if wf.encoding is defined %} + encoding: {{ wf.encoding }} +{% endif %} + content: | + {{ wf.content | indent(12) }} +{% endfor %} +{% endif %} +{% if ci_additional_data.runcmd is defined and ci_additional_data.runcmd | length > 0 %} + runcmd: +{% for cmd in ci_additional_data.runcmd %}{# to_json quotes each entry so YAML metacharacters in a user command cannot break the document #} + - {{ cmd | to_json }} +{% endfor %} +{% endif %} diff --git
a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 0ac3d9f7a4..dfdaa37111 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -198,6 +198,12 @@ runcmd: - /usr/local/bin/set-ssh.sh +{% if dns_enabled | default(false) | bool %} + - | + # Restore CoreDNS resolv.conf after set-ssh.sh (nmcli con up may overwrite it) + printf 'search {{ domain_name }}\nnameserver {{ admin_nic_ip }}\noptions timeout:1 attempts:2\n' > /etc/resolv.conf + chattr +i /etc/resolv.conf || true +{% endif %} - /usr/local/bin/configure_vast_installation.sh # DOCA prerequisites - mount /cert and prepare for DOCA installation - mkdir -p {{ client_mount_path }}/slurm/ssh diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 7cedbbbc00..0c727b0c01 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -198,6 +198,12 @@ runcmd: - /usr/local/bin/set-ssh.sh +{% if dns_enabled | default(false) | bool %} + - | + # Restore CoreDNS resolv.conf after set-ssh.sh (nmcli con up may overwrite it) + printf 'search {{ domain_name }}\nnameserver {{ admin_nic_ip }}\noptions timeout:1 attempts:2\n' > /etc/resolv.conf + chattr +i /etc/resolv.conf || true +{% endif %} - /usr/local/bin/configure_vast_installation.sh # DOCA prerequisites - mount /cert and prepare for DOCA installation - mkdir -p {{ client_mount_path }}/slurm/ssh diff --git 
a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index 5410ac2d5a..c617a46219 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -154,6 +154,12 @@ runcmd: - /usr/local/bin/set-ssh.sh +{% if dns_enabled | default(false) | bool %} + - | + # Restore CoreDNS resolv.conf after set-ssh.sh (nmcli con up may overwrite it) + printf 'search {{ domain_name }}\nnameserver {{ admin_nic_ip }}\noptions timeout:1 attempts:2\n' > /etc/resolv.conf + chattr +i /etc/resolv.conf || true +{% endif %} - /usr/local/bin/configure_vast_installation.sh # DOCA prerequisites - mount /cert and prepare for DOCA installation - mkdir -p {{ client_mount_path }}/slurm/ssh diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index 93cac94165..653efdbbd2 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -153,6 +153,12 @@ runcmd: - /usr/local/bin/set-ssh.sh +{% if dns_enabled | default(false) | bool %} + - | + # Restore CoreDNS resolv.conf after set-ssh.sh (nmcli con up may overwrite it) + printf 'search {{ domain_name }}\nnameserver {{ admin_nic_ip }}\noptions timeout:1 attempts:2\n' > /etc/resolv.conf + chattr +i /etc/resolv.conf || true +{% endif %} - /usr/local/bin/configure_vast_installation.sh # DOCA prerequisites - mount /cert and prepare for DOCA installation - mkdir -p {{ client_mount_path }}/slurm/ssh diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 
b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index db721a3d86..f9d28fd065 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -524,7 +524,7 @@ cp {{ k8s_client_mount_path }}/calico/{{ calico_package }}.yml {{ k8s_client_mount_path }}/calico/updated-{{ calico_package }}.yml CALICO_YAML="{{ k8s_client_mount_path }}/calico/updated-{{ calico_package }}.yml" - ADMIN_NIC_CIDR="{{ admin_nic_cidr }}" + ADMIN_NIC_CIDR="{{ calico_cidr }}" # Only add if not already present if ! grep -q 'name: IP_AUTODETECTION_METHOD' "$CALICO_YAML"; then @@ -640,22 +640,22 @@ {% if dns_enabled | default(false) | bool %} # Forward cluster-internal DNS domain to OIM CoreDNS # This allows K8s pods to resolve Slurm/MPI hostnames via CoreDNS - python3 -c ' - import sys, yaml - cfg_path = sys.argv[1] - with open(cfg_path) as f: - doc = yaml.safe_load(f) - corefile = doc["data"]["Corefile"] - fwd_block = "{{ domain_name }}:53 {\n errors\n cache 30\n forward . {{ admin_nic_ip }}\n}\n" - if "{{ domain_name }}:53" not in corefile: - corefile = fwd_block + corefile - doc["data"]["Corefile"] = corefile - with open(cfg_path, "w") as f: - yaml.dump(doc, f, default_flow_style=False) - print("Added {{ domain_name }} forward zone to K8s CoreDNS") - else: - print("{{ domain_name }} forward zone already present in K8s CoreDNS") - ' "$cfg" + python3 - "$cfg" << 'PYEOF' + import sys, yaml + cfg_path = sys.argv[1] + with open(cfg_path) as f: + doc = yaml.safe_load(f) + corefile = doc["data"]["Corefile"] + fwd_block = "{{ domain_name }}:53 {\n errors\n cache 30\n forward . 
{{ admin_nic_ip }}\n}\n" + if "{{ domain_name }}:53" not in corefile: + corefile = fwd_block + corefile + doc["data"]["Corefile"] = corefile + with open(cfg_path, "w") as f: + yaml.dump(doc, f, default_flow_style=False) + print("Added {{ domain_name }} forward zone to K8s CoreDNS") + else: + print("{{ domain_name }} forward zone already present in K8s CoreDNS") + PYEOF {% endif %} # Apply the patched ConfigMap diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 index d79471ce96..4849f8bd88 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 @@ -535,7 +535,22 @@ systemctl daemon-reload systemctl restart kubelet - kubectl -n kube-system wait pod/kube-controller-manager-{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %} --for=condition=Ready --timeout=300s + # Wait for kube-controller-manager pod to exist before checking readiness + echo "Waiting for kube-controller-manager pod to be created..." + KCM_POD="kube-controller-manager-{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}" + TIMEOUT=120 + ELAPSED=0 + while ! kubectl -n kube-system get pod "$KCM_POD" >/dev/null 2>&1; do + if [ $ELAPSED -ge $TIMEOUT ]; then + echo "ERROR: Timed out waiting for $KCM_POD to be created" + exit 1 + fi + echo "Pod $KCM_POD not found yet, waiting 5s... ($ELAPSED/$TIMEOUT seconds)" + sleep 5 + ELAPSED=$((ELAPSED + 5)) + done + echo "Pod $KCM_POD found, waiting for it to be Ready..." + kubectl -n kube-system wait pod/"$KCM_POD" --for=condition=Ready --timeout=300s systemctl restart nfs-client.target systemctl restart rpcbind # Mark initialization complete so all of above is skipped on reboot! 
diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index e442c2b5f8..ae0714f27c 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -286,6 +286,12 @@ runcmd: - /usr/local/bin/set-ssh.sh +{% if dns_enabled | default(false) | bool %} + - | + # Restore CoreDNS resolv.conf after set-ssh.sh (nmcli con up may overwrite it) + printf 'search {{ domain_name }}\nnameserver {{ admin_nic_ip }}\noptions timeout:1 attempts:2\n' > /etc/resolv.conf + chattr +i /etc/resolv.conf || true +{% endif %} # DOCA prerequisites - moved early to ensure RDMA is ready before vendor_data mounts - mkdir -p {{ client_mount_path }}/slurm/ssh - mkdir -p {{ slurm_ctld_log_dir_effective }} {{ slurmdbd_log_dir_effective }} {{ slurm_ctld_pid_dir_effective }} {{ slurmdbd_pid_dir_effective }} {{ slurm_state_save_location_effective }} {% if slurm_sched_log_dir_effective %}{{ slurm_sched_log_dir_effective }} {% endif %}/etc/slurm {{ home_dir }} /etc/my.cnf.d /etc/munge /var/lib/mysql /var/log/mariadb /cert /var/log/track /var/lib/packages diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 096829024f..cdce20193e 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -408,6 +408,12 @@ runcmd: - /usr/local/bin/set-ssh.sh +{% if dns_enabled | default(false) | bool %} + - | + # Restore CoreDNS resolv.conf after set-ssh.sh (nmcli con up may overwrite it) + printf 'search {{ domain_name }}\nnameserver {{ 
admin_nic_ip }}\noptions timeout:1 attempts:2\n' > /etc/resolv.conf + chattr +i /etc/resolv.conf || true +{% endif %} - /usr/local/bin/configure_vast_installation.sh # DOCA prerequisites - moved early to ensure RDMA is ready before vendor_data mounts - /usr/local/bin/configure_dirs_and_mounts.sh diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 427703257c..ee33e0ff6f 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -414,6 +414,12 @@ runcmd: - /usr/local/bin/set-ssh.sh +{% if dns_enabled | default(false) | bool %} + - | + # Restore CoreDNS resolv.conf after set-ssh.sh (nmcli con up may overwrite it) + printf 'search {{ domain_name }}\nnameserver {{ admin_nic_ip }}\noptions timeout:1 attempts:2\n' > /etc/resolv.conf + chattr +i /etc/resolv.conf || true +{% endif %} - /usr/local/bin/configure_vast_installation.sh # DOCA prerequisites - moved early to ensure RDMA is ready before vendor_data mounts - /usr/local/bin/configure_dirs_and_mounts.sh diff --git a/provision/roles/configure_ochami/templates/nodes/groups_additional_fg.yaml.j2 b/provision/roles/configure_ochami/templates/nodes/groups_additional_fg.yaml.j2 new file mode 100644 index 0000000000..4c2338b41d --- /dev/null +++ b/provision/roles/configure_ochami/templates/nodes/groups_additional_fg.yaml.j2 @@ -0,0 +1,8 @@ +- label: {{ additional_cloud_init_fg_smd_group_name }} + members: + ids: +{% for item in nodes | sort(attribute='value.XNAME') %} +{% if item.value.FUNCTIONAL_GROUP_NAME == functional_group_name %} + - {{ item.value.XNAME }} +{% endif %} +{% endfor %} diff --git a/provision/roles/configure_ochami/templates/powerscale/deploy_powerscale_csi.sh.j2 
b/provision/roles/configure_ochami/templates/powerscale/deploy_powerscale_csi.sh.j2 index b6d9d6c6b0..c8f451dc89 100644 --- a/provision/roles/configure_ochami/templates/powerscale/deploy_powerscale_csi.sh.j2 +++ b/provision/roles/configure_ochami/templates/powerscale/deploy_powerscale_csi.sh.j2 @@ -122,14 +122,6 @@ else echo "Snapshot Controller deployment failed (expected). Continuing..." } - echo "Updating Snapshot Controller image to v8.4.0..." - kubectl set image deployment/snapshot-controller snapshot-controller=registry.k8s.io/sig-storage/snapshot-controller:v8.4.0 -n kube-system >/dev/null 2>&1 || true - - echo "Waiting for Snapshot Controller rollout to finish (timeout: 5 minutes)..." - kubectl rollout status deployment/snapshot-controller -n kube-system --timeout=300s >/dev/null 2>&1 || { - echo "Snapshot Controller rollout did not complete in time." - } - sleep 10 echo "Waiting for Snapshot Controller pods to reach Running state..." MAX_ATTEMPTS=60 WAIT_TIME=5 @@ -165,7 +157,7 @@ else INSTALL_SCRIPT="/opt/omnia/{{ csi_powerscale_dir }}/dell-csi-helm-installer/csi-install.sh" if [ -x "$INSTALL_SCRIPT" ]; then cd "$(dirname "$INSTALL_SCRIPT")" || true - ./csi-install.sh --namespace isilon --values /opt/omnia/{{ csi_powerscale_dir }}/values.yaml --skip-verify & + ./csi-install.sh --namespace isilon --values /opt/omnia/{{ csi_powerscale_dir }}/values.yaml & CSI_PID=$! echo "Waiting for CSI install script (PID $CSI_PID) to complete..." 
wait $CSI_PID diff --git a/provision/roles/configure_ochami/vars/main.yml b/provision/roles/configure_ochami/vars/main.yml index 982a164460..b99e59cd07 100644 --- a/provision/roles/configure_ochami/vars/main.yml +++ b/provision/roles/configure_ochami/vars/main.yml @@ -56,6 +56,12 @@ ssh_private_key_path: /root/.ssh/oim_rsa ci_group_common_template: cloud_init/ci-group-common.yaml.j2 ci_group_common_dest: "{{ cloud_init_dir }}/ci-group-common.yaml" +# Usage: configure_cloud_init_additional.yml +ci_group_additional_template: cloud_init/ci-group-additional_cloud_init.yaml.j2 +additional_cloud_init_common_group_name: additional_cloud_init +additional_cloud_init_fg_group_prefix: "additional_cloud_init_" +additional_cloud_init_fg_groups_template: "{{ role_path }}/templates/nodes/groups_additional_fg.yaml.j2" + # Usage: provision_completion.yml provision_completion_msg: | The provision.yml playbook has completed successfully. diff --git a/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml b/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml index 47a48f976d..87f29eb7bf 100644 --- a/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml +++ b/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml @@ -66,6 +66,40 @@ ansible.builtin.set_fact: admin_nic_cidr: "{{ (admin_nic_ip + '/' + admin_netmask_bits) | ansible.utils.ipaddr('network/prefix') }}" +- name: Read PXE mapping to find control plane node subnet + community.general.read_csv: + path: "{{ hostvars['localhost']['pxe_mapping_file_path'] }}" + register: pxe_data_for_cidr + delegate_to: localhost + connection: local + run_once: true + +- name: Determine Calico CIDR from control plane nodes subnet + ansible.builtin.set_fact: + calico_cidr: >- + {%- set kcp_ips = pxe_data_for_cidr.list + | selectattr('FUNCTIONAL_GROUP_NAME', 'match', '^service_kube_control_plane') + | map(attribute='ADMIN_IP') | list -%} + {%- if kcp_ips | length > 0 -%} + {%- set ref_ip = kcp_ips[0] -%} + {%- set primary_net = 
(admin_nic_ip + '/' + admin_netmask_bits) | ansible.utils.ipaddr('network/prefix') -%} + {%- if ref_ip | ansible.utils.ipaddr(primary_net) -%} + {{ primary_net }} + {%- else -%} + {%- set additional = hostvars['localhost']['network_data']['admin_network']['additional_subnets'] | default([]) -%} + {%- set ns = namespace(found='') -%} + {%- for s in additional if not ns.found -%} + {%- set subnet_cidr = (s.subnet + '/' + s.netmask_bits) | ansible.utils.ipaddr('network/prefix') -%} + {%- if ref_ip | ansible.utils.ipaddr(subnet_cidr) -%} + {%- set ns.found = subnet_cidr -%} + {%- endif -%} + {%- endfor -%} + {{ ns.found if ns.found else primary_net }} + {%- endif -%} + {%- else -%} + {{ (admin_nic_ip + '/' + admin_netmask_bits) | ansible.utils.ipaddr('network/prefix') }} + {%- endif -%} + - name: Fetch server_ip and server_share_path from list when nfs sever is localhost ansible.builtin.set_fact: nfs_server_ip: "{{ hostvars['127.0.0.1']['admin_nic_ip'] }}" diff --git a/rollback/ansible.cfg b/rollback/ansible.cfg index 051ecdb550..c4795ac63f 100644 --- a/rollback/ansible.cfg +++ b/rollback/ansible.cfg @@ -8,6 +8,8 @@ executable = /bin/bash interpreter_python = /usr/bin/python3 deprecation_warnings = false show_task_path_on_failure = false +stdout_callback = omnia_default +callback_plugins = ../common/callback_plugins roles_path = roles:../upgrade/roles:../utils/roles library = ../common/library/modules module_utils = ../common/library/module_utils diff --git a/rollback/playbooks/rollback_k8s.yml b/rollback/playbooks/rollback_k8s.yml index 2708c9b658..2811ae69c7 100644 --- a/rollback/playbooks/rollback_k8s.yml +++ b/rollback/playbooks/rollback_k8s.yml @@ -36,23 +36,49 @@ when: - rollback_manifest.component_status[component_name] | default('pending') == 'completed' - - name: "Mark as skipped — BuildStream terminal gate active (C-24)" - ansible.builtin.copy: - content: >- - {{ rollback_manifest | combine({ - 'component_status': rollback_manifest.component_status | 
combine({ - component_name: 'skipped' - }) - }) | to_nice_yaml }} - dest: "{{ rollback_manifest_path }}" - mode: '0644' + - name: "BuildStream terminal gate active (C-24)" when: - hostvars['localhost']['build_stream_terminal'] | default(false) | bool + - hostvars['localhost']['upgrade_manifest'] is defined + - hostvars['localhost']['upgrade_manifest'].component_status is defined + - hostvars['localhost']['upgrade_manifest'].component_status.k8s | default('pending') not in ['completed', 'in-progress', 'failed'] + - hostvars['localhost']['upgrade_manifest'].component_status.telemetry | default('pending') not in ['completed', 'in-progress', 'failed'] + block: + - name: "Mark as skipped — BuildStream terminal gate active (C-24)" + ansible.builtin.copy: + content: >- + {{ rollback_manifest | combine({ + 'component_status': rollback_manifest.component_status | combine({ + component_name: 'skipped' + }) + }) | to_nice_yaml }} + dest: "{{ rollback_manifest_path }}" + mode: '0644' - - name: "Skip — BuildStream terminal gate active (C-24)" - ansible.builtin.meta: end_play + - name: "Skip — BuildStream terminal gate active (C-24)" + ansible.builtin.meta: end_play + + # ── Gate 3: Skip rollback if k8s/telemetry were never started during upgrade ── + - name: "K8s/telemetry never started during upgrade" when: - - hostvars['localhost']['build_stream_terminal'] | default(false) | bool + - hostvars['localhost']['upgrade_manifest'] is defined + - hostvars['localhost']['upgrade_manifest'].component_status is defined + - hostvars['localhost']['upgrade_manifest'].component_status.k8s | default('pending') not in ['completed', 'in-progress', 'failed'] + - hostvars['localhost']['upgrade_manifest'].component_status.telemetry | default('pending') not in ['completed', 'in-progress', 'failed'] + block: + - name: "Mark as skipped — k8s/telemetry never started during upgrade" + ansible.builtin.copy: + content: >- + {{ rollback_manifest | combine({ + 'component_status': 
rollback_manifest.component_status | combine({ + component_name: 'skipped' + }) + }) | to_nice_yaml }} + dest: "{{ rollback_manifest_path }}" + mode: '0644' + + - name: "Skip — k8s/telemetry never started during upgrade" + ansible.builtin.meta: end_play # ── Pre-check: Skip rollback if service_k8s is not in software_config.json ── - name: "Load software_config.json" diff --git a/rollback/playbooks/rollback_slurm.yml b/rollback/playbooks/rollback_slurm.yml index 9cb4b01833..821f54d2c9 100644 --- a/rollback/playbooks/rollback_slurm.yml +++ b/rollback/playbooks/rollback_slurm.yml @@ -22,6 +22,7 @@ gather_facts: false vars: rollback_manifest_path: /opt/omnia/.data/rollback_manifest.yml + upgrade_manifest_path: /opt/omnia/.data/upgrade_manifest.yml component_name: slurm tasks: - name: Check if rollback manifest exists @@ -40,42 +41,47 @@ - name: Read rollback_manifest.yml ansible.builtin.include_vars: file: "{{ rollback_manifest_path }}" - name: manifest + name: rollback_manifest + + - name: Read upgrade_manifest.yml + ansible.builtin.include_vars: + file: "{{ upgrade_manifest_path }}" + name: upgrade_manifest + failed_when: false + + - name: Set upgrade_manifest to empty dict if not found + ansible.builtin.set_fact: + slurm_skip: false + upgrade_manifest: "{{ hostvars['localhost']['upgrade_manifest'] }}" - name: Read software_config.json ansible.builtin.include_vars: - file: "{{ manifest.backup_dir }}/input/project_default/software_config.json" + file: "{{ rollback_manifest.backup_dir }}/input/project_default/software_config.json" name: software_config - name: Determine slurm_skip status ansible.builtin.set_fact: - slurm_skip: >- - {{ - (manifest.component_status[component_name] | default('pending') == 'completed') - or (software_config.softwares | selectattr('name', 'equalto', 'slurm_custom') | list | length == 0) - }} + slurm_skip: true cacheable: true + when: > + (rollback_manifest.component_status.slurm | default('pending') == 'completed') + or 
(software_config.softwares | selectattr('name', 'equalto', 'slurm_custom') | list | length == 0) + or (build_stream_terminal | default(false) | bool) + or (upgrade_manifest.component_status.slurm | default('pending') not in ['completed', 'in-progress', 'failed']) - name: "Handle BuildStream terminal gate (C-24)" - when: ((build_stream_terminal | default(false) | bool) or - (manifest.component_status.build_stream | default('pending') == 'completed')) + when: slurm_skip block: - name: "Mark as skipped — BuildStream terminal gate active" ansible.builtin.copy: content: >- - {{ manifest | combine({ - 'component_status': manifest.component_status | combine({ + {{ rollback_manifest | combine({ + 'component_status': rollback_manifest.component_status | combine({ component_name: 'skipped' }) }) | to_nice_yaml }} dest: "{{ rollback_manifest_path }}" mode: '0644' - when: not slurm_skip - - - name: "Set slurm_skip — BuildStream terminal gate active" - ansible.builtin.set_fact: - slurm_skip: true - cacheable: true - name: "Skip — BuildStream terminal gate active" ansible.builtin.meta: end_play @@ -86,8 +92,8 @@ - name: Set slurm rollback status to in-progress ansible.builtin.copy: content: >- - {{ manifest | combine({ - 'component_status': manifest.component_status | combine({ + {{ rollback_manifest | combine({ + 'component_status': rollback_manifest.component_status | combine({ component_name: 'in-progress' }) }) | to_nice_yaml }} diff --git a/rollback/rollback.yml b/rollback/rollback.yml index 24c812f713..671d487147 100644 --- a/rollback/rollback.yml +++ b/rollback/rollback.yml @@ -256,15 +256,25 @@ {{ (build_stream_config.enable_build_stream | default(false) | bool) or ((upgrade_manifest is defined) and (upgrade_manifest.component_status is defined) - and (upgrade_manifest.component_status.build_stream | default('pending') in ['completed', 'skipped'])) }} + and (upgrade_manifest.component_status.build_stream | default('pending') in ['completed'])) }} cacheable: true - name: 
Identify components skipped by BuildStream terminal gate ansible.builtin.set_fact: bs_rollback_skipped: >- - {{ ['slurm', 'k8s-telemetry'] - if (build_stream_terminal | bool) - else [] }} + {%- set skipped = [] -%} + {%- if build_stream_terminal | bool -%} + {%- if upgrade_manifest is defined and upgrade_manifest.component_status is defined -%} + {%- if upgrade_manifest.component_status.slurm | default('pending') not in ['completed', 'in-progress', 'failed'] -%} + {%- set _ = skipped.append('slurm') -%} + {%- endif -%} + {%- if (upgrade_manifest.component_status.k8s | default('pending') not in ['completed', 'in-progress', 'failed']) + and (upgrade_manifest.component_status.telemetry | default('pending') not in ['completed', 'in-progress', 'failed']) -%} + {%- set _ = skipped.append('k8s-telemetry') -%} + {%- endif -%} + {%- endif -%} + {%- endif -%} + {{ skipped }} - name: Report BuildStream terminal gate activation (rollback) ansible.builtin.debug: @@ -275,19 +285,46 @@ Only build_stream and oim will be rolled back. when: build_stream_terminal | bool - - name: Report already-rolled-back components (will be skipped) + - name: Report component rollback actions based on upgrade manifest ansible.builtin.debug: - msg: "Component '{{ item }}' already rolled back — will be skipped." + msg: >- + {%- if rollback_manifest.component_status[item] is defined and rollback_manifest.component_status[item] == 'completed' -%} + Component '{{ item }}' already rolled back — will be skipped. + {%- elif rollback_manifest.component_status[item] is defined and rollback_manifest.component_status[item] == 'skipped' -%} + Component '{{ item }}' was skipped during rollback — will be skipped. 
+ {%- elif upgrade_manifest is defined and upgrade_manifest.component_status is defined -%} + {%- if item == 'k8s-telemetry' -%} + {%- set k8s_status = upgrade_manifest.component_status.k8s | default('pending') -%} + {%- set telemetry_status = upgrade_manifest.component_status.telemetry | default('pending') -%} + {%- if k8s_status == 'skipped' and telemetry_status == 'skipped' -%} + Component '{{ item }}' was skipped during upgrade — will be skipped during rollback. + {%- elif k8s_status in ['completed', 'in-progress', 'failed'] or telemetry_status in ['completed', 'in-progress', 'failed'] -%} + Component '{{ item }}' will be rolled back (upgrade status: k8s={{ k8s_status }}, telemetry={{ telemetry_status }}). + {%- else -%} + Component '{{ item }}' was not upgraded — will be skipped during rollback. + {%- endif -%} + {%- elif upgrade_manifest.component_status[item] is defined -%} + {%- set upgrade_status = upgrade_manifest.component_status[item] -%} + {%- if upgrade_status == 'skipped' -%} + Component '{{ item }}' was skipped during upgrade — will be skipped during rollback. + {%- elif upgrade_status in ['completed', 'in-progress', 'failed'] -%} + Component '{{ item }}' will be rolled back (upgrade status: {{ upgrade_status }}). + {%- else -%} + Component '{{ item }}' was not upgraded — will be skipped during rollback. + {%- endif -%} + {%- endif -%} + {%- endif -%} loop: "{{ requested_tags }}" when: - - rollback_manifest.component_status[item] is defined - - rollback_manifest.component_status[item] in ['completed', 'skipped'] + - (rollback_manifest.component_status[item] is defined and rollback_manifest.component_status[item] in ['completed', 'skipped']) + or (upgrade_manifest is defined and upgrade_manifest.component_status is defined) # ────────────────────────────────────────────────────────────────────── # Load credentials (provision_password etc.) 
from omnia_config_credentials.yml # Mirrors upgrade pattern: ../utils/credential_utility/get_config_credentials.yml # Read-only — no interactive prompts, no re-encryption. # ────────────────────────────────────────────────────────────────────── + - name: Load rollback credentials ansible.builtin.import_playbook: playbooks/load_rollback_credentials.yml tags: [build_stream, buildstream, oim] diff --git a/telemetry/ansible.cfg b/telemetry/ansible.cfg index e952b30779..4e21bb4a8a 100644 --- a/telemetry/ansible.cfg +++ b/telemetry/ansible.cfg @@ -8,6 +8,8 @@ executable = /bin/bash interpreter_python = /usr/bin/python3 deprecation_warnings = false show_task_path_on_failure = false +stdout_callback = omnia_default +callback_plugins = ../common/callback_plugins library = ../common/library/modules module_utils = ../common/library/module_utils diff --git a/upgrade/ansible.cfg b/upgrade/ansible.cfg index b52a289123..48e778585b 100644 --- a/upgrade/ansible.cfg +++ b/upgrade/ansible.cfg @@ -8,6 +8,8 @@ executable = /bin/bash interpreter_python = /usr/bin/python3 deprecation_warnings = false show_task_path_on_failure = false +stdout_callback = omnia_default +callback_plugins = ../common/callback_plugins roles_path = roles:../utils/roles:../prepare_oim/roles library = ../common/library/modules module_utils = ../common/library/module_utils diff --git a/upgrade/playbooks/upgrade_build_stream.yml b/upgrade/playbooks/upgrade_build_stream.yml index 9f2c202a58..1b86021f32 100644 --- a/upgrade/playbooks/upgrade_build_stream.yml +++ b/upgrade/playbooks/upgrade_build_stream.yml @@ -123,33 +123,6 @@ _backup_enable_build_stream: "{{ (_backup_bs_config_slurp.content | b64decode | from_yaml).enable_build_stream | default(false) | bool }}" when: _backup_bs_config_slurp is not failed - - name: "Abort — disabling BuildStream during upgrade is not supported" - ansible.builtin.fail: - msg: | - ══════════════════════════════════════════════════════════════ - UNSUPPORTED TOPOLOGY CHANGE DETECTED - 
══════════════════════════════════════════════════════════════ - - BuildStream was ENABLED in the pre-upgrade (2.1) configuration - but is now DISABLED in the current build_stream_config.yml. - - Disabling BuildStream during an upgrade is not a supported - topology. Once BuildStream is enabled, it cannot be disabled - through the upgrade process. - - To proceed, restore enable_build_stream: true in: - {{ input_project_dir }}/build_stream_config.yml - - Pre-upgrade config (backup): - enable_build_stream: {{ _backup_enable_build_stream }} - Current config: - enable_build_stream: {{ enable_build_stream }} - - ══════════════════════════════════════════════════════════════ - when: - - _backup_enable_build_stream | default(false) | bool - - not enable_build_stream | bool - - name: Mark build_stream as skipped when not enabled ansible.builtin.copy: content: >- diff --git a/upgrade/playbooks/upgrade_validations.yml b/upgrade/playbooks/upgrade_validations.yml new file mode 100644 index 0000000000..31d31b92a2 --- /dev/null +++ b/upgrade/playbooks/upgrade_validations.yml @@ -0,0 +1,106 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +# ============================================================================ +# upgrade_validations.yml — Global Pre-flight Validations +# ============================================================================ +# Runs BEFORE any component upgrade sub-flow to enforce fail-fast semantics. +# All topology and configuration validations that could leave the cluster in +# a partially upgraded state if they fail late MUST be placed here. +# +# Current validations: +# 1. BuildStream topology change detection (enabled→disabled is blocked) +# +# This playbook reads the upgrade manifest and backup configuration to +# detect unsupported topology changes before any component is modified. +# ============================================================================ + +- name: "Global pre-flight validations — fail-fast topology checks" + hosts: localhost + connection: local + gather_facts: false + vars: + manifest_path: /opt/omnia/.data/upgrade_manifest.yml + tasks: + # ── Read manifest for backup_dir and current state ── + - name: Read upgrade_manifest.yml + ansible.builtin.slurp: + src: "{{ manifest_path }}" + register: _val_raw_manifest + + - name: Parse upgrade manifest + ansible.builtin.set_fact: + _val_manifest: "{{ _val_raw_manifest.content | b64decode | from_yaml }}" + + - name: Set backup directory from manifest + ansible.builtin.set_fact: + _val_backup_dir: "{{ _val_manifest.backup_dir }}" + + # ══════════════════════════════════════════════════════════════════ + # Validation 1: BuildStream topology change detection + # ══════════════════════════════════════════════════════════════════ + # Disabling BuildStream during upgrade (enabled in 2.1 → disabled + # in 2.2) is an unsupported topology change. This MUST be caught + # before OIM upgrade starts to prevent a partially upgraded cluster. 
+ + - name: Read current build_stream_config.yml + ansible.builtin.include_vars: + file: "{{ input_project_dir }}/build_stream_config.yml" + failed_when: false + + - name: Set current enable_build_stream fact + ansible.builtin.set_fact: + _val_current_bs: "{{ enable_build_stream | default(false) | bool }}" + + - name: Read backup build_stream_config.yml (pre-upgrade state) + ansible.builtin.slurp: + src: "{{ _val_backup_dir }}/input/project_default/build_stream_config.yml" + register: _val_backup_bs_slurp + failed_when: false + + - name: Parse backup build_stream_config.yml + ansible.builtin.set_fact: + _val_backup_bs: "{{ (_val_backup_bs_slurp.content | b64decode | from_yaml).enable_build_stream | default(false) | bool }}" + when: _val_backup_bs_slurp is not failed + + - name: "Abort — disabling BuildStream during upgrade is not supported" + ansible.builtin.fail: + msg: | + ══════════════════════════════════════════════════════════════ + UNSUPPORTED TOPOLOGY CHANGE DETECTED + ══════════════════════════════════════════════════════════════ + + BuildStream was ENABLED in the pre-upgrade (2.1) configuration + but is now DISABLED in the current build_stream_config.yml. + + Disabling BuildStream during an upgrade is not a supported + topology. Once BuildStream is enabled, it cannot be disabled + through the upgrade process. + + To proceed, restore enable_build_stream: true in: + {{ input_project_dir }}/build_stream_config.yml + + Pre-upgrade config (backup): + enable_build_stream: {{ _val_backup_bs | default(false) }} + Current config: + enable_build_stream: {{ _val_current_bs }} + + ══════════════════════════════════════════════════════════════ + when: + - _val_backup_bs | default(false) | bool + - not _val_current_bs | bool + + - name: "Pre-flight validations passed" + ansible.builtin.debug: + msg: "All global pre-flight topology validations passed." 
diff --git a/upgrade/roles/import_input_parameters/templates/provision_config.j2 b/upgrade/roles/import_input_parameters/templates/provision_config.j2 index 004f8e1359..375493df22 100644 --- a/upgrade/roles/import_input_parameters/templates/provision_config.j2 +++ b/upgrade/roles/import_input_parameters/templates/provision_config.j2 @@ -49,6 +49,15 @@ dns_enabled: false #### Optional # Pin a specific kernel version for boot image selection. +# The specified version applies to both x86_64 and aarch64 architectures. # Leave empty ("") to auto-select the latest available image from S3. -# Example: kernel_version_override: "6.12.0-55.76.1.el10_0.x86_64" +# Example: kernel_version_override: "6.12.0-55.76.1.el10_0" kernel_version_override: "{{ provision_kernel_version_override }}" + +#### Optional +# Path to additional cloud-init configuration file for stateless node provisioning. +# The file supports both common (all nodes) and per-functional-group cloud-init sections. +# An example file is provided at omnia/examples/additional_cloud_init.yml +# Leave empty ("") to disable additional cloud-init. +# Default: "" (disabled) +additional_cloud_init_config_file: "" diff --git a/upgrade/roles/upgrade_k8s/tasks/validate_cluster_nodes.yml b/upgrade/roles/upgrade_k8s/tasks/validate_cluster_nodes.yml index af98d72a13..c87774ecc5 100644 --- a/upgrade/roles/upgrade_k8s/tasks/validate_cluster_nodes.yml +++ b/upgrade/roles/upgrade_k8s/tasks/validate_cluster_nodes.yml @@ -312,10 +312,13 @@ ansible.builtin.debug: msg: "Checking all nodes are in Ready state..." 
-- name: Get node Ready status from cluster - ansible.builtin.command: - cmd: >- - kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}|{range .status.conditions[?(@.type=="Ready")]}{.status}{end}{"\n"}{end}' +- name: Get node status from cluster + ansible.builtin.shell: + cmd: | + set -o pipefail + kubectl get nodes --no-headers | awk '{print $1"|"$2}' + args: + executable: /bin/bash delegate_to: "{{ kube_vip }}" register: node_ready_status changed_when: false @@ -324,15 +327,20 @@ ansible.builtin.set_fact: nodes_not_ready: >- {{ nodes_not_ready | default([]) + ( - [item.split('|')[0]] if (item.split('|')[1] | default('False')) != 'True' else [] + [item.split('|')[0]] if item.split('|')[1] != 'Ready' else [] ) }} loop: "{{ node_ready_status.stdout_lines | select('match', '.+') | list }}" loop_control: label: "{{ item.split('|')[0] }}" +- name: Convert not-ready IPs to hostnames + ansible.builtin.set_fact: + nodes_not_ready_hostnames: >- + {{ nodes_not_ready | default([]) | map('extract', ip_to_hostname) | select('defined') | list }} + - name: Filter not-ready nodes to only validated nodes ansible.builtin.set_fact: - validated_nodes_not_ready: "{{ nodes_not_ready | default([]) | intersect(all_validated_nodes) }}" + validated_nodes_not_ready: "{{ nodes_not_ready_hostnames | intersect(all_validated_nodes) }}" - name: Display node Ready status results ansible.builtin.debug: diff --git a/upgrade/upgrade.yml b/upgrade/upgrade.yml index b101d0d2e2..7bf744ca69 100644 --- a/upgrade/upgrade.yml +++ b/upgrade/upgrade.yml @@ -590,6 +590,15 @@ ansible.builtin.import_playbook: ../input_validation/validate_config.yml tags: always +# ────────────────────────────────────────────────────────────────────── +# Global pre-flight validations — topology checks that MUST pass +# before any component upgrade begins. Catches unsupported changes +# (e.g., BuildStream enabled→disabled) to prevent partial upgrades. 
+# ──────────────────────────────────────────────────────────────────────
+- name: Global pre-flight validations (fail-fast topology checks)
+  ansible.builtin.import_playbook: playbooks/upgrade_validations.yml
+  tags: always
+
 - name: Load Omnia credential utility
   ansible.builtin.import_playbook: ../utils/credential_utility/get_config_credentials.yml
   tags: always
diff --git a/utils/ansible.cfg b/utils/ansible.cfg
index 7b1befb966..e22c6ad50f 100644
--- a/utils/ansible.cfg
+++ b/utils/ansible.cfg
@@ -8,6 +8,8 @@ executable = /bin/bash
 interpreter_python = /usr/bin/python3
 deprecation_warnings = false
 show_task_path_on_failure = false
+stdout_callback = omnia_default
+callback_plugins = ../common/callback_plugins
 library = ../common/library/modules
 module_utils = ../common/library/module_utils
 
diff --git a/utils/credential_utility/ansible.cfg b/utils/credential_utility/ansible.cfg
index 92cfff7494..cfa0e80cc2 100644
--- a/utils/credential_utility/ansible.cfg
+++ b/utils/credential_utility/ansible.cfg
@@ -8,6 +8,8 @@ executable = /bin/bash
 interpreter_python = /usr/bin/python3
 deprecation_warnings = false
 show_task_path_on_failure = false
+stdout_callback = omnia_default
+callback_plugins = ../../common/callback_plugins
 library = ../../common/library/modules
 module_utils = ../../common/library/module_utils
 
diff --git a/utils/roles/idrac_pxe_boot/tasks/main.yml b/utils/roles/idrac_pxe_boot/tasks/main.yml
index cdfa1adff8..bfa2246b3f 100644
--- a/utils/roles/idrac_pxe_boot/tasks/main.yml
+++ b/utils/roles/idrac_pxe_boot/tasks/main.yml
@@ -28,26 +28,41 @@
     - name: End play for this host
       ansible.builtin.meta: end_host
 
-- name: Show status of the Lifecycle Controller
-  dellemc.openmanage.idrac_lifecycle_controller_status_info:
-    idrac_ip: "{{ inventory_hostname }}"
-    idrac_user: "{{ bmc_username | default(hostvars['localhost']['bmc_username']) }}"
-    idrac_password: "{{ bmc_password | default(hostvars['localhost']['bmc_password']) }}"
+- name: Get LC status of iDRAC
+  ansible.builtin.uri:
+    url: "https://{{ inventory_hostname }}/redfish/v1/Managers/iDRAC.Embedded.1/Oem/Dell/DellLCService/Actions/DellLCService.GetRemoteServicesAPIStatus"
+    user: "{{ bmc_username | default(hostvars['localhost']['bmc_username']) }}"
+    password: "{{ bmc_password | default(hostvars['localhost']['bmc_password']) }}"
+    method: POST
+    force_basic_auth: true
     validate_certs: false
-  register: lc_check_status
-  until:
-    - lc_check_status.lc_status_info is defined
-    - lc_check_status.lc_status_info.LCReady is defined
-    - lc_check_status.lc_status_info.LCReady
+    return_content: true
+    body_format: json
+    body: {}
+    timeout: 60
+    headers:
+      Accept: "application/json"
+      Content-Type: "application/json"
+      OData-Version: "4.0"
+    status_code: [200, 201, 202]
+  register: idrac_status
+  until: idrac_status.status in [200, 201, 202]
   retries: 3
   delay: 5
   ignore_errors: true
-  ignore_unreachable: true
 
 - name: IDRAC ops when ready
   when:
-    - lc_check_status is success
-    - lc_check_status.lc_status_info.LCReady
+    - idrac_status is success
+    - idrac_status.json is defined
+    - idrac_status.json.LCStatus | default('') | lower == "ready"  # a reply without LCStatus counts as not ready
+  module_defaults:
+    dellemc.openmanage.redfish_powerstate:
+      baseuri: "{{ inventory_hostname }}"
+      username: "{{ bmc_username | default(hostvars['localhost']['bmc_username']) }}"
+      password: "{{ bmc_password | default(hostvars['localhost']['bmc_password']) }}"
+      validate_certs: false
+      resource_id: "System.Embedded.1"
   block:
     - name: Set boot option from pxe
       dellemc.openmanage.idrac_boot:
@@ -59,16 +74,13 @@
         boot_source_override_target: "{{ boot_source_override_target }}"
         boot_source_override_enabled: "{{ boot_source_override_enabled }}"
         reset_type: "none" # Dont Restart here as to Handle poweroff case
+        resource_id: "System.Embedded.1"
       register: pxe_provisioning
       ignore_errors: true
       ignore_unreachable: true
 
     - name: Try ForceRestart
       dellemc.openmanage.redfish_powerstate:
-        baseuri: "{{ inventory_hostname }}"
-        username: "{{ bmc_username | default(hostvars['localhost']['bmc_username']) }}"
-        password: "{{ bmc_password | default(hostvars['localhost']['bmc_password']) }}"
-        validate_certs: false
         reset_type: "{{ 'ForceRestart' if force_restart else 'GracefulRestart' }}"
       when: restart_host
       register: restart_op
@@ -76,10 +88,6 @@
 
     - name: Try On if ForceRestart did not change
       dellemc.openmanage.redfish_powerstate:
-        baseuri: "{{ inventory_hostname }}"
-        username: "{{ bmc_username | default(hostvars['localhost']['bmc_username']) }}"
-        password: "{{ bmc_password | default(hostvars['localhost']['bmc_password']) }}"
-        validate_certs: false
         reset_type: "On"
       register: power_on_op
       failed_when: false
@@ -91,7 +99,7 @@
   ansible.builtin.set_fact:
     reboot_failed: true
     reboot_status: "{{ lc_check_fail_msg }}"
-  when: lc_check_status is unreachable or lc_check_status is failed or not (lc_check_status.lc_status_info.LCReady | default(false))
+  when: idrac_status is failed or not (idrac_status.json.LCStatus | default('') | lower == "ready")  # default('') avoids a templating error when the reply has no JSON/LCStatus
 
 - name: Fail if PXE provisioning failed
   ansible.builtin.set_fact: