Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
5589247
task failure ansible.cfg update
abhishek-sa1 Jun 9, 2026
ffb1e74
fix(validation): validate HA VIP against service_kube_control_plane s…
sujit-jadhav Jun 10, 2026
69baa32
fix: wait for kube-controller-manager pod before checking readiness
Katakam-Rakesh Jun 10, 2026
5143237
Merge branch 'dell:pub/q2_upgrade' into pub/q2_upgrade
Katakam-Rakesh Jun 10, 2026
8b45dc1
Fix nodes not ready logic
Katakam-Rakesh Jun 10, 2026
62156aa
Merge branch 'pub/q2_upgrade' of github.com:Katakam-Rakesh/omnia into…
Katakam-Rakesh Jun 10, 2026
2aa8ed9
update logic
Katakam-Rakesh Jun 10, 2026
34a8503
Merge pull request #4725 from Katakam-Rakesh/pub/q2_upgrade
abhishek-sa1 Jun 10, 2026
71e2b8b
Merge pull request #4719 from abhishek-sa1/pub/q2_upgrade
abhishek-sa1 Jun 10, 2026
2ea288d
Update MinIO S3 credential variable mapping to use s3_access_id and s…
Venu-p1 Jun 10, 2026
1458c7f
Merge pull request #4731 from Venu-p1/fix/cleanup-image-2
abhishek-sa1 Jun 10, 2026
6dd2bb9
Set PXE boot replace lc check moduel with POST call
jagadeeshnv Jun 10, 2026
aa1771c
fix(provision): fix DNS resolution on slurm/login nodes when dns_enab…
sujit-jadhav Jun 10, 2026
e5d6052
csi version change from 2.16 to 2.17
sakshi-singla-1735 Jun 10, 2026
e9b182b
minor fix
sakshi-singla-1735 Jun 10, 2026
84016a0
Merge pull request #4732 from dell/fix/slurm-dns-resolv-conf-protection
snarthan Jun 10, 2026
07482f3
fix(provision): fix Python script embedding in CoreDNS cloud-init tem…
sujit-jadhav Jun 10, 2026
89985f1
fix(provision): use control plane nodes subnet for Calico IP autodete…
sujit-jadhav Jun 10, 2026
f4adb44
Merge pull request #4723 from dell/fix/OMN01D-2534-ha-vip-multisubnet
abhishek-sa1 Jun 10, 2026
162f6ab
defct fix for input valdition and pxe mapping check
Kratika-P Jun 10, 2026
202cb27
adding connection for task
Kratika-P Jun 10, 2026
e4ba546
Merge pull request #4736 from Kratika-P/pub/q2_upgrade
sujit-jadhav Jun 10, 2026
842f214
Merge pull request #4739 from Rajeshkumar-s2/pub/q2_upgrade
Rajeshkumar-s2 Jun 11, 2026
ea40a4b
Merge pull request #4734 from sakshi-singla-1735/pub/q2_upgrade
snarthan Jun 11, 2026
c32b296
Update image builder container tag for vulnerability (#4728)
abhishek-sa1 Jun 11, 2026
571d82c
feat(provision): Add user-defined cloud-init config support with cent…
abhishek-sa1 Jun 11, 2026
7e0d6ef
Merge pull request #4733 from jagadeeshnv/pub/q2_upgrade
jagadeeshnv Jun 11, 2026
21f6be6
fix rollback: Check upgrade manifest status before skipping component…
Katakam-Rakesh Jun 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ansible.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ executable = /bin/bash
display_skipped_hosts = false
deprecation_warnings = false
show_task_path_on_failure = false
stdout_callback = omnia_default
callback_plugins = common/callback_plugins
library = common/library/modules
module_utils = common/library/module_utils

Expand Down
2 changes: 2 additions & 0 deletions build_image_aarch64/ansible.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ executable = /bin/bash
interpreter_python = /usr/bin/python3
deprecation_warnings = false
show_task_path_on_failure = false
stdout_callback = omnia_default
callback_plugins = ../common/callback_plugins
library = ../common/library/modules
module_utils = ../common/library/module_utils

Expand Down
2 changes: 2 additions & 0 deletions build_image_x86_64/ansible.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ executable = /bin/bash
interpreter_python = /usr/bin/python3
deprecation_warnings = false
show_task_path_on_failure = false
stdout_callback = omnia_default
callback_plugins = ../common/callback_plugins
library = ../common/library/modules
module_utils = ../common/library/module_utils

Expand Down
2 changes: 1 addition & 1 deletion build_image_x86_64/roles/image_creation/vars/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
---
pulp_x86_64_image_name: "dellhpcomniaaisolution/image-build-el10:1.1"
pulp_x86_64_image_name: "dellhpcomniaaisolution/image-build-el10:1.2"
x86_64_local_tag: "x86_64-image-builder/ochami"
pull_image_retries: "5"
pull_image_delay: "10"
Expand Down
6 changes: 3 additions & 3 deletions build_stream/core/catalog/test_fixtures/catalog_rhel.json
Original file line number Diff line number Diff line change
Expand Up @@ -3272,8 +3272,8 @@
"x86_64"
],
"Type": "image",
"Tag": "1.1",
"Version": "1.1"
"Tag": "1.2",
"Version": "1.2"
},
"os_package_id_45": {
"Name": "which",
Expand Down Expand Up @@ -4734,4 +4734,4 @@
}
}
}
}
}
56 changes: 55 additions & 1 deletion build_stream/orchestrator/upload/use_cases/upload_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,15 @@
"""Upload files use case implementation."""

import hashlib
import shutil
from datetime import datetime, timezone
from pathlib import Path
from typing import List

import yaml

from api.logging_utils import log_secure_info
from common.config import BuildStreamConfig
from common.config import BuildStreamConfig, load_config
from core.artifacts.entities import ArtifactRecord
from core.artifacts.exceptions import ArtifactAlreadyExistsError
from core.artifacts.interfaces import ArtifactMetadataRepository, ArtifactStore
Expand Down Expand Up @@ -167,6 +168,13 @@ def execute(self, command: UploadFilesCommand) -> UploadFilesResult:
# Always emit audit event with file details (for all uploads)
self._emit_upload_files_audit_event(command, uploaded_files)

# Copy software_config.json from job artifacts to shared input directory.
# During build pipeline, generate-input-files has not run yet so the
# file won't exist — the copy is safely skipped.
# During deploy pipeline, the file was generated during the prior build
# and must be synced so the deploy uses the correct software config.
self._copy_software_config_from_artifacts(str(command.job_id))

# Build result
summary = UploadSummary(
total_files=len(uploaded_files),
Expand Down Expand Up @@ -574,6 +582,52 @@ def _emit_upload_files_audit_event(
f"Files uploaded: job_id={command.job_id}, total={len(uploaded_files)}, changed={changed_count}, unchanged={unchanged_count}"
)

def _copy_software_config_from_artifacts(self, job_id: str) -> None:
    """Sync a job's software_config.json into the shared playbook input directory.

    Looks for ``<file_store.base_path>/<job_id>/software_config.json`` (the
    base path comes from ``load_config()``) and, when it is present, copies
    it with ``shutil.copy2`` to ``DEFAULT_PLAYBOOK_INPUT_DIR``, creating that
    directory first if needed. An existing destination file is overwritten.

    This is a best-effort step: a missing source file is logged at debug
    level and skipped, and any exception raised while loading the config or
    copying is logged as a warning instead of being propagated, so the
    caller never fails because of it.

    NOTE(review): the source path has no ``input/`` component; confirm this
    matches the location where the generate-input-files stage writes the
    file.

    Args:
        job_id: Job identifier, used as the directory name under the
            artifact store base path.
    """
    file_name = "software_config.json"
    try:
        source = Path(load_config().file_store.base_path) / job_id / file_name

        if source.exists():
            target_dir = Path(DEFAULT_PLAYBOOK_INPUT_DIR)
            target_dir.mkdir(parents=True, exist_ok=True)
            dest = target_dir / file_name

            shutil.copy2(source, dest)
            log_secure_info(
                'info',
                f"Copied software_config.json from {source} to {dest}",
                job_id=job_id,
            )
        else:
            # Nothing to sync yet (e.g. the file has not been generated for
            # this job) - this is an expected situation, not an error.
            log_secure_info(
                'debug',
                "software_config.json not found in job artifacts, skipping copy",
                job_id=job_id,
            )
    except Exception as exc:
        # Deliberately broad: the upload itself has already completed and
        # must not be failed by this follow-up copy.
        log_secure_info(
            'warning',
            f"Failed to copy software_config.json from job artifacts: {exc}",
            job_id=job_id,
            exc_info=True,
        )

def _emit_audit_event(
self,
command: UploadFilesCommand,
Expand Down
171 changes: 171 additions & 0 deletions common/callback_plugins/omnia_default.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Custom Ansible stdout callback plugin for Omnia.

Extends the built-in ``default`` callback to suppress the ``[ERROR]``
source-context block introduced in ansible-core 2.19/2.20 (Data Tagging).
Renders multiline ``msg`` fields with real newlines on failure.
All other output (task banners, ok/changed/skipped lines, play recaps,
etc.) is unchanged.

Usage — add to every ``ansible.cfg``::

[defaults]
stdout_callback = omnia_default
callback_plugins = <relative-path-to>/common/callback_plugins
"""
from __future__ import annotations

import json
import re

from ansible import constants as C # pylint: disable=no-name-in-module
from ansible.plugins.callback.default import CallbackModule as DefaultCallback

# Plugin metadata (name, type, descriptions and the documentation fragment it
# extends) kept as a raw YAML string. The text is data read at runtime, so its
# content must not be reworded or re-indented casually.
DOCUMENTATION = r"""
name: omnia_default
type: stdout
short_description: Omnia default stdout callback
version_added: "2.1"
description:
- Inherits every behaviour of the built-in C(default) callback.
- Suppresses the C([ERROR]) source-context block added in
ansible-core 2.19/2.20.
- Renders multiline C(msg) fields with real newlines on failure.
- Produces only the classic single-line C(fatal:) output.
extends_documentation_fragment:
- default_callback
"""

# Matches any line belonging to the 2.19/2.20 [ERROR] task-failure context
# block. The alternatives are, in order: the "Task failed" header, the
# "Action failed" header, the "Origin: <file>.yml:<line>:<col>" pointer and
# the "^ column N" caret marker.
_ERROR_CONTEXT_PATTERN = re.compile(
    "|".join(
        (
            r"\[ERROR\]:\s*Task failed:",
            r"\[ERROR\]:\s*Action failed:",
            r"Origin:\s+\S+\.ya?ml:\d+:\d+",
            r"\s+\^\s+column\s+\d+",
        )
    )
)


class CallbackModule(DefaultCallback):  # pylint: disable=too-many-ancestors
    """
    Omnia stdout callback plugin.

    Extends the built-in default callback in two ways:

    * ``Display.display`` is wrapped so that any message matching
      ``_ERROR_CONTEXT_PATTERN`` (the ``[ERROR]`` source-context block of
      ansible-core 2.19/2.20) is dropped.
    * Task failures are printed as a ``fatal:`` line; when ``msg`` contains
      newlines it is rendered with real line breaks.

    All other callbacks are inherited unchanged.
    """

    CALLBACK_VERSION = 2.0
    CALLBACK_TYPE = "stdout"
    CALLBACK_NAME = "omnia_default"

    def __init__(self):
        super().__init__()
        # True once Display.display has been wrapped by this instance.
        self._patched = False
        # The unwrapped Display.display, captured when the patch is installed.
        # Used for output that must never be filtered (the fatal line).
        self._unfiltered_display = None

    def _patch_display(self):
        """Monkey-patch Display.display to drop [ERROR] context blocks.

        Idempotent per plugin instance: the wrapper is installed only on the
        first call; later calls return immediately.
        """
        if self._patched:
            return
        self._patched = True

        original_display = self._display.display
        self._unfiltered_display = original_display

        def filtered_display(msg, *args, **kwargs):
            # msg may not be a plain str, so normalise before matching.
            if _ERROR_CONTEXT_PATTERN.search(str(msg)):
                return
            original_display(msg, *args, **kwargs)

        self._display.display = filtered_display

    def set_options(self, task_keys=None, var_options=None, direct=None):
        """Load options and apply the display patch."""
        super().set_options(task_keys=task_keys, var_options=var_options, direct=direct)
        self._patch_display()

    def v2_playbook_on_play_start(self, play):
        """Ensure patch is active before the first play."""
        self._patch_display()
        super().v2_playbook_on_play_start(play)

    def _format_result_msg(self, result_dict):
        """
        Format a task result dict for the ``fatal:`` line.

        When ``msg`` is a string containing newlines, every key except
        ``msg`` is serialised first and ``msg`` is appended below it as a
        ``msg: |-`` block with real line breaks instead of escaped ``\\n``
        sequences. Otherwise the whole dict is serialised as usual.

        Both paths serialise through ``_dump_results`` so they agree on how
        the result is rendered; a plain ``json.dumps`` would raise
        ``TypeError`` on values that are not JSON-native and would print keys
        the standard path removes.

        Args:
            result_dict: The task result mapping.

        Returns:
            The text to place after ``FAILED! =>``.
        """
        msg = result_dict.get("msg", "")
        if isinstance(msg, str) and "\n" in msg:
            remainder = {k: v for k, v in result_dict.items() if k != "msg"}
            indented_msg = msg.replace("\n", "\n ")
            return f"{self._dump_results(remainder)}\nmsg: |-\n {indented_msg}"
        return self._dump_results(result_dict)

    def v2_runner_on_failed(self, result, ignore_errors=False):
        """
        Render a task failure as a ``fatal:`` message.

        The ``[ERROR]`` block is suppressed by the ``Display.display`` patch.
        Multiline ``msg`` values are rendered with real newlines. Looped
        tasks that carry per-item ``results`` are delegated to
        ``_process_items``.

        Args:
            result: The task result object passed by Ansible.
            ignore_errors: When true, ``...ignoring`` is printed afterwards.
        """
        # pylint: disable=protected-access
        self._patch_display()
        delegated_vars = result._result.get("_ansible_delegated_vars", None)
        self._clean_results(result._result, result._task.action)

        if self._last_task_banner != result._task._uuid:
            self._print_task_banner(result._task)

        stderr_opt = self.get_option("display_failed_stderr")
        self._handle_exception(result._result, use_stderr=stderr_opt)
        self._handle_warnings(result._result)

        if result._task.loop and "results" in result._result:
            self._process_items(result)
        else:
            formatted = self._format_result_msg(result._result)
            host_label = result._host.get_name()
            if delegated_vars:
                host_label = f"{host_label} -> {delegated_vars['ansible_host']}"

            # Emit the fatal line through the unfiltered display: the failure
            # text comes from the task itself and may legitimately contain
            # fragments matched by _ERROR_CONTEXT_PATTERN (e.g. output of a
            # nested playbook run). Filtering it would hide the failure.
            emit = self._unfiltered_display or self._display.display
            emit(
                f"fatal: [{host_label}]: FAILED! => {formatted}",
                color=getattr(C, "COLOR_ERROR", "red"),
                stderr=stderr_opt,
            )

        if ignore_errors:
            color_skip = getattr(C, "COLOR_SKIP", "cyan")
            self._display.display("...ignoring", color=color_skip)
        # pylint: enable=protected-access

    def v2_playbook_on_stats(self, stats):
        """Ensure patch is active during PLAY RECAP to suppress replayed errors."""
        self._patch_display()
        super().v2_playbook_on_stats(stats)

    def _display_error_context(self, *args, **kwargs):
        """Intentionally suppressed — prevents [ERROR] source-context rendering."""
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@
"intel_benchmarks": "2024.1.0",
"ucx": "1.19.0",
"openmpi": "5.0.8",
"csi_driver_powerscale": "v2.16.0",
"csi_driver_powerscale": "v2.17.0",
"rocm": "6.3.1",
"service_k8s": "1.35.1"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,46 @@
)
CLUSTER_OS_FAIL_MSG = (
    "Cluster OS must be 'rhel' "
    "for RHEL Omnia Infrastructure Manager"
)

# additional_cloud_init
# -- file-level problems: missing file, unparsable YAML, wrong top-level type
ADDITIONAL_CLOUD_INIT_FILE_NOT_FOUND_MSG = (
    "File not found. "
    "Verify additional_cloud_init_config_file in provision_config.yml "
    "points to a valid file."
)
ADDITIONAL_CLOUD_INIT_YAML_SYNTAX_MSG = "YAML syntax error in additional cloud-init config file."
ADDITIONAL_CLOUD_INIT_NOT_DICT_MSG = "additional cloud-init config file must contain a YAML mapping."

# -- key checks: top-level keys, prohibited keys, unknown keys
ADDITIONAL_CLOUD_INIT_UNKNOWN_TOP_KEY_MSG = "Unknown top-level key. Only 'common' and 'groups' are allowed."
ADDITIONAL_CLOUD_INIT_PROHIBITED_KEY_MSG = (
    "Prohibited key found. "
    "The keys 'bootcmd', 'network', 'network-config', and 'packages' "
    "are platform-managed and must NOT be overridden."
)
ADDITIONAL_CLOUD_INIT_UNKNOWN_KEY_MSG = (
    "Unknown key found. "
    "Only 'write_files' and 'runcmd' are allowed."
)

# -- write_files / runcmd shape checks
ADDITIONAL_CLOUD_INIT_WRITE_FILES_NOT_LIST_MSG = "'write_files' must be a list."
ADDITIONAL_CLOUD_INIT_WRITE_FILES_MISSING_PATH_MSG = "write_files entry is missing the required 'path' field."
ADDITIONAL_CLOUD_INIT_RUNCMD_NOT_LIST_MSG = (
    "'runcmd' must be a list."
)
ADDITIONAL_CLOUD_INIT_RUNCMD_NOT_STRING_MSG = "runcmd entry is not a string."

# -- 'groups' section checks
ADDITIONAL_CLOUD_INIT_INVALID_FG_MSG = "is not a valid functional group name in the 'groups' section."
ADDITIONAL_CLOUD_INIT_SECTION_NOT_DICT_MSG = "Section must be a mapping/dict."

# local_repo.yml
REPO_STORE_PATH_MSG = (
    "Please provide a valid repo_store_path value."
)
OMNIA_REPO_URL_MSG = (
    "Repo urls are empty. "
    "Please provide a url and corresponding key."
)
Expand Down Expand Up @@ -619,7 +659,9 @@ def tls_ext_fail_msg(valid_extensions):
"Check telemetry_config.yml and network_spec.yml")

# high_availability
VIRTUAL_IP_NOT_IN_ADMIN_SUBNET = ("virtual ip address provided is not in admin subnet. "
VIRTUAL_IP_NOT_IN_ADMIN_SUBNET = ("virtual ip address provided is not in a valid subnet. "
"The VIP must be in either the admin subnet or the "
"additional subnet where the Kubernetes control plane nodes are configured. "
"Check high_availability_config.yml and network_spec.yml")
VIRTUAL_IP_NOT_VALID = ("should be outside the admin static and dynamic ranges. "
"Check high_availability_config.yml and network_spec.yml")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@
"description": "Optional kernel version to pin for boot image selection. Leave empty to auto-select latest.",
"pattern": "^(|[0-9]+\\.[0-9]+\\.[0-9]+-.+)$",
"default": ""
},
"additional_cloud_init_config_file": {
"type": "string",
"description": "Path to additional cloud-init configuration file for stateless node provisioning. Leave empty to disable.",
"default": ""
}
},
"required": [
Expand Down
Loading
Loading