From cd107a7b2225928a98d51d0529d732155530b36b Mon Sep 17 00:00:00 2001 From: Mithilesh Reddy Date: Mon, 20 Apr 2026 16:44:17 +0530 Subject: [PATCH 01/15] Merge pull request #4294 from mithileshreddy04/pub/q2_dev OpenCHAMI upgrade changes in prepare_oim and oim_cleanup --- .../roles/deploy_containers/openchami/vars/main.yml | 12 ++++++------ .../oim_cleanup/oim_container_cleanup/vars/main.yml | 4 ++++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml index 2d7db2ca85..f7234139f7 100644 --- a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml @@ -17,7 +17,7 @@ openchami_git_repo: https://github.com/OpenCHAMI/deployment-recipes.git openchami_share_dir: /opt/omnia/openchami openchami_clone_path: "{{ openchami_share_dir }}/deployment-recipes" -openchami_git_version: main +openchami_git_version: 92310bd537c05c201d5156137357bb742b09d5e0 clone_retry: "5" clone_delay: "10" dir_permissions_755: "0755" @@ -43,10 +43,10 @@ pull_image_delay: 10 # OpenCHAMI image tags openchami_local_ca_tag: "v0.2.2" openchami_opaal_tag: "v0.3.10" -openchami_smd_tag: "v2.18.0" -openchami_bss_tag: "v1.32.0" -openchami_cloud_init_tag: "v1.2.3" -openchami_coredhcp_tag: "v0.3.0" +openchami_smd_tag: "v2.19.0" +openchami_bss_tag: "v1.32.1" +openchami_cloud_init_tag: "v1.3.0" +openchami_coresmd_tag: "v0.4.0" # Third-party image tags for OpenCHAMI minio_tag: "latest" postgres_tag: "11.5-alpine" @@ -63,7 +63,7 @@ openchami_images: - "ghcr.io/openchami/smd:{{ openchami_smd_tag }}" - "ghcr.io/openchami/bss:{{ openchami_bss_tag }}" - "ghcr.io/openchami/cloud-init:{{ openchami_cloud_init_tag }}" - - "ghcr.io/openchami/coredhcp:{{ openchami_coredhcp_tag }}" + - "ghcr.io/openchami/coresmd:{{ openchami_coresmd_tag }}" - "docker.io/minio/minio:{{ minio_tag }}" - "docker.io/library/postgres:{{ postgres_tag 
}}" - "docker.io/oryd/hydra:{{ hydra_tag }}" diff --git a/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml b/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml index be275fd870..ae2a86d511 100644 --- a/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml +++ b/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml @@ -59,6 +59,8 @@ openchami_containers: - cloud-init-server - haproxy - coresmd + - coresmd-coredhcp + - coresmd-coredns openchami_volumes: - haproxy-certs @@ -78,6 +80,7 @@ openchami_secrets: - bss_postgres_password tcp_ports: + - 53 - 9000 - 9001 - 5000 @@ -88,6 +91,7 @@ tcp_ports: - 8443 udp_ports: + - 53 - 69 - 67 - 68 From 4f0d824ebc81cdb0d992a3f1f8719d3e85bc2598 Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Tue, 21 Apr 2026 12:26:25 +0530 Subject: [PATCH 02/15] Feature branch sync - pub/telemetry to pub/q2_dev (#4293) * Update openchami git version (#4251) Co-authored-by: mithileshreddy04 Co-authored-by: priti-parate <140157516+priti-parate@users.noreply.github.com> * powerscale telemetry support with direct authentication mode * use existing vmagent * update messages in vars * merge Pub/q2 dev to pub/telemetry (#4254) * removing input template * Fix for pulp remote RemoteArtifacts is 0 after repo migration Signed-off-by: pullan1 --------- Signed-off-by: pullan1 Co-authored-by: pullan1 Co-authored-by: snarthan * Powerscale telemetry support using helm * deploy powerscale telemetry using cloud-init * offline deployment of powerscale telemetry * fix for cert-manager failure * fix for cert manager failure * powerscale telemetry deployment with telemetry namespace * sync q2_dev changes (#4263) * removing input template * Fix for pulp remote RemoteArtifacts is 0 after repo migration Signed-off-by: pullan1 * Feature/ome discovery pxe mapping enhancements (#4245) * feat(discovery): OME static group extraction, PXE mapping IP/SU/parent tag enhancements ome_server_inventory.py: - Fix static group extraction: find 'Static 
Groups' container by name and select only direct children via ParentId; avoids picking system/nested groups - Emit module.warn() for static groups that exist but have no devices assigned - Fix idrac_hostname: read InstrumentationName/DnsName from DeviceManagement ManagementType==2 entry instead of DeviceName which returns the IP address generate_pxe_mapping.py: - ADMIN_IP: derive from first 2 octets of admin_network.subnet + last 2 of BMC IP - IB_IP: derive from first 2 octets of ib_network.subnet + last 2 of BMC IP - Skip IB_IP/IB_MAC when server has no IB NIC (ib_nic_mac is empty) - Add extract_su_from_hostname() with regex (SU[A-Z]?\d+)(?=R\d+) to parse Scalable Unit from BMC hostname; rejects service-tag-only hostnames (idrac-JCGT033) and falls back to grp0 when no SU pattern is found - Set GROUP_NAME to extracted SU identifier (fallback: grp0) - Post-process rows to assign PARENT_SERVICE_TAG from the service_kube_control_plane_x86_64 node within the same SU group - Remove BMC_HOSTNAME from CSV headers and output rows - Lint: remove dead try/except in calculate_admin_ip/calculate_ib_ip, reuse ib_mac variable, suppress broad-except pylint warning generate_pxe_mapping.yml: - Load network_spec.yml via include_vars - Set admin_subnet and ib_subnet using selectattr on Networks list - Pass both subnets as parameters to the generate_pxe_mapping module defaults/main.yml: - Add admin_subnet and ib_subnet default variables (empty string) provision_validation.py: - Comment out validate_admin_ips_against_network_spec function and its call site; ADMIN_IPs are now derived from subnet octets + BMC IP and will not necessarily fall within primary_oim_admin_ip/netmask_bits range * refactor: rename discovery directory to provision, update network_spec.yml - Renamed discovery/ to provision/ (git detected as rename, no content loss) - Updated input/network_spec.yml with latest network configuration changes * Update discovery.yml * refactor: unify OME credentials into 
get_config_credentials flow - Added ome_ip, ome_username, ome_password to omnia_credential.j2 template - Added 'discovery' service entry to omnia_credentials in update_config/vars/main.yml - Added 'discovery' to the hardcoded service key trigger list in fetch_credentials.yml - Replaced custom vault logic in get_ome_credentials.yml with unified decrypt_include_encrypt.yml call against omnia_config_credentials.yml - Updated ome_discovery/vars/main.yml to reference omnia_config_credentials_file and omnia_config_credentials_vault_key instead of the separate .vault/ paths - Deleted .vault/ome_credentials.yml and .vault/.vault_password (no longer needed) * chore: update copyright year from 2025 to 2026 in modified files Updated copyright header in all ome_discovery files modified during this feature branch: - library/generate_pxe_mapping.py - library/ome_server_inventory.py - tasks/generate_pxe_mapping.yml - tasks/get_ome_credentials.yml - defaults/main.yml - vars/main.yml * fix: restore discovery_validations role missed during discovery-to-provision rename discovery/roles/discovery_validations/ was accidentally dropped when renaming the discovery/ directory to provision/. Add it back under provision/roles/discovery_validations/ to resolve the PR merge conflict. * chore: update copyright year to 2026 in provision/roles/discovery_validations files * fix: remove duplicate discovery_validations role (provision_validations already exists) provision/roles/provision_validations/ is the correct renamed equivalent of discovery/roles/discovery_validations/. The discovery_validations copy added to provision/ was redundant. 
* feat: apply upstream telemetry upgrade changes from dell/omnia pub/q2_dev - Replace kubectl command with kubernetes.core.k8s module for iDRAC StatefulSet - Preserve existing replica count during iDRAC StatefulSet upgrade - Add LDMS store daemon check, restart, and readiness wait tasks * fix: quote build_stream_job_id_absent message in provision_validations vars * feat: add discovery/roles/discovery_validations and telemetry files - Add discovery/roles/discovery_validations/vars/main.yml with task definitions for validation flow - Add discovery/roles/telemetry/tasks/apply_telemetry_on_upgrade.yml with upstream telemetry upgrade logic (replica preservation + LDMS store) * fix: wrap long line in fetch_credentials.yml to satisfy yaml[line-length] lint * refactor: move ome_ip from credentials to discovery_config.yml - Create input/discovery_config.yml for non-credential discovery settings (ome_ip, future Magellan config) - Remove ome_ip from omnia_credential.j2 and credential update vars - Load ome_ip via include_vars from discovery_config.yml in get_ome_credentials.yml - Add discovery_config.yml to provision_validations discovery_inputs - Remove redundant ib_subnet/admin_subnet defaults from ome_discovery * fix: add newline at end of ome_discovery/defaults/main.yml * fix: override role_path to absolute path for decrypt_include_encrypt.yml role_path resolves to ome_discovery role path, causing encrypt_files_vars.yml to be looked up incorrectly. Override to playbook_dir dirname (/opt/omnia/omnia). * fix: inline credential loading to avoid role_path resolution issue role_path cannot be overridden in include_tasks vars. Replace the call to decrypt_include_encrypt.yml with direct include_vars using stat checks for encrypted vs unencrypted credential file handling. 
* fix: skip load-failure rule in ansible-lint to avoid CI false positives ansible-lint fails to resolve role_path relative paths during static analysis in GitHub Actions, causing false load-failure errors for files that exist and work at runtime. * Update ansible.cfg * Update ansible.cfg * refactor: rename discovery references to provision and add discovery_config variable - Rename discover_mapping_nodes.yml to provision_mapping_nodes.yml - Replace "discovery" terminology with "provision" across playbooks, vars, READMEs, and task names in provision roles - Add subnet as required field with IP pattern validation in network_spec schema - Define discovery_config variable in ome_discovery vars and use it in get_ome_credentials.yml (consistent with provision_config pattern) - Rename discovery_inputs to provision_inputs in validation vars - Rename discovery_mech_mapping to provision_mech_mapping - Update user-facing messages to reference provision.yml Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> * fix: credential rules, vault handling, GROUP_NAME validation, and discovery playbook improvements - Add ome_username and ome_password validation rules to credential_rules.json - Add 'discovery' tag to prepare_oim omnia_run_tags so OME credentials are prompted - Fix vault-encrypted credential loading in get_ome_credentials.yml (use decrypt-include-reencrypt pattern instead of unsupported vault_password_file) - Add include_input_dir.yml import to discovery.yml so input_project_dir is set - Accept SU1-SU100 (case-insensitive) in addition to grp0-grp100 for GROUP_NAME - Fix Magellan message to use list format (avoids \n in debug output) - Remove escaped quotes from discovery usage examples Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> * fix: extend SU group name support to build_image validation and 
schemas - Add build_aarch_image tag to input_file_inventory so build_image_aarch64.yml runs provision_config validation (was missing, causing no validation to run for aarch64 builds) - Update GROUP_NAME patterns in functional_groups_config.json and omnia_config.json schemas to accept SU1-SU100 format alongside grp0-grp100 - Update INVALID_GROUP_NAME_MSG to reflect both accepted formats Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --------- Signed-off-by: Sujit Jadhav Co-authored-by: Super User Co-authored-by: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> * Cleanup discovery roles: move library modules, remove unused roles (#4261) * Cleanup discovery roles: move library modules, remove unused roles - Move ome_server_inventory.py and generate_pxe_mapping.py from discovery/roles/ome_discovery/library/ to common/library/modules/ so they are shared via the common module search path already configured in discovery/ansible.cfg - Remove unused discovery/roles/telemetry/ directory - Remove unused discovery/roles/discovery_validations/ directory - Load discovery_config.yml at playbook level in discovery.yml (consistent with how build_stream_config.yml is loaded in provision.yml) - Fix discovery_complete_msg formatting for readable Ansible output Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> * Remove unused discovery_validations role Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --------- Co-authored-by: Super User Co-authored-by: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> * fix for set_pxe_boot.yml when custom inventory given (#4260) * Update generate_bmc_inventory.yml Signed-off-by: SOWJANYAJAGADISH123 * Update pre_checks.yml Signed-off-by: SOWJANYAJAGADISH123 * 
lint issue Signed-off-by: SOWJANYAJAGADISH123 --------- Signed-off-by: SOWJANYAJAGADISH123 --------- Signed-off-by: pullan1 Signed-off-by: Sujit Jadhav Signed-off-by: SOWJANYAJAGADISH123 Co-authored-by: pullan1 Co-authored-by: snarthan Co-authored-by: Sujit Jadhav Co-authored-by: Super User Co-authored-by: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: SOWJANYAJAGADISH123 * resolving merge conflict * revert openchami commit id * resolving review comments * addressing review comments * fix for vmagent scraping powerscale metrics * cleanup script correction for powerscale telemetry cleanup * victoria operator and victoria log input validation * victoria log input and input validation * removing L2 validation for victoria log which is not required * input validation and review comment addressing * change idrac_telemetry_collection_type to telemetry_collection_type * Remove invisible Unicode LRM (U+200E) characters from victoria-operator template filenames * VictoriaLogs container image references and default variable * port check * resolve merge conflict * correction for schema * Update telemetry_config.json * Update validate_input.py * merge conflict telemetry_prereq.yml * change victoria_configurations to victoria_metrics_configurations * remove deployment mode input variable * update for upgrade scenarios * update comments * update comment * resolving issues due to merge conflict * victoria log changes * victoria log cluster component and VLAgent deployment * updating pod name * removing the changes of adding cert * victoria log changes * removing victoria log pod validation playbook * cleanup changes for victoria log * Update ansible-lint.yml and pylint for pub/telemetry (#4296) * Update ansible-lint.yml Signed-off-by: Kratika Patidar * Update pylint.yml Signed-off-by: Kratika Patidar * fixing ansible-lint * lint * line-length --------- Signed-off-by: Kratika Patidar --------- Signed-off-by: pullan1 Signed-off-by: Sujit Jadhav 
Signed-off-by: SOWJANYAJAGADISH123 Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> Signed-off-by: Kratika Patidar Co-authored-by: mithileshreddy04 Co-authored-by: priti-parate <140157516+priti-parate@users.noreply.github.com> Co-authored-by: pullan1 Co-authored-by: snarthan Co-authored-by: Sujit Jadhav Co-authored-by: Super User Co-authored-by: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: SOWJANYAJAGADISH123 Co-authored-by: Kratika_Patidar --- .github/workflows/ansible-lint.yml | 1 + .github/workflows/pylint.yml | 1 + .../common_utils/en_us_validation_msg.py | 71 ++++- .../schema/telemetry_config.json | 100 +++++-- .../validation_flows/common_validation.py | 265 +++++++++++++++++- common/library/modules/validate_input.py | 2 +- .../rhel/10.0/csi_driver_powerscale.json | 2 +- .../config/x86_64/rhel/10.0/service_k8s.json | 36 +-- input/telemetry_config.yml | 157 +++++++++-- .../ci-group-default_x86_64.yaml.j2 | 18 ++ ...ce_kube_control_plane_first_x86_64.yaml.j2 | 4 + .../deploy_powerscale_telemetry.sh.j2 | 178 ++++++++++++ .../verify_powerscale_telemetry.sh.j2 | 182 ++++++++++++ .../templates/telemetry/telemetry.sh.j2 | 17 +- .../tasks/check_kube_vip_reachability.yml | 52 ++++ .../tasks/deploy_powerscale_metrics.yml | 124 ++++++++ .../tasks/generate_telemetry_deployments.yml | 29 +- .../get_powerscale_telemetry_dependencies.yml | 162 +++++++++++ provision/roles/telemetry/tasks/main.yml | 17 +- .../telemetry/tasks/telemetry_prereq.yml | 6 +- .../telemetry/cleanup_telemetry.sh.j2 | 263 +++++++++++++---- .../common/telemetry_secret_creation.yaml.j2 | 2 +- .../idrac_telemetry_statefulset.yaml.j2 | 2 +- .../kafka/kafka.kafkapump_user.yaml.j2 | 2 +- .../kafka/kafka.tls_test_job.yaml.j2 | 6 +- .../templates/telemetry/kustomization.yaml.j2 | 41 +-- .../csm-metrics-deployment-direct.yaml.j2 | 82 ++++++ .../victoria/gen_victoria_certs.sh.j2 | 65 +++-- .../victoria-agent-deployment.yaml.j2 | 8 + 
.../victoria-cluster-vminsert.yaml.j2 | 161 ----------- .../victoria-cluster-vmselect.yaml.j2 | 160 ----------- .../victoria-cluster-vmstorage.yaml.j2 | 183 ------------ .../victoria-operator-vmagent.yaml.j2 | 72 +++++ .../victoria-operator-vmcluster.yaml.j2 | 241 ++++++++++++++++ .../victoria-operator-vmpodscrape.yaml.j2 | 46 +++ .../victoria-operator-vmsingle.yaml.j2 | 86 ++++++ .../victoria/victoria-statefulset.yaml.j2 | 4 +- .../victoria/victoria-tls-test-job.yaml.j2 | 20 +- .../victoria/victoria-vmagent-rbac.yaml.j2 | 6 +- .../victorialogs-operator-vlagent.yaml.j2 | 208 ++++++++++++++ .../victorialogs-operator-vlcluster.yaml.j2 | 231 +++++++++++++++ .../victorialogs-vlagent-config.yaml.j2 | 155 ++++++++++ .../victoria/vmagent-scrape-config.yaml.j2 | 17 +- provision/roles/telemetry/vars/main.yml | 249 ++++++++++++++-- .../tasks/transform_telemetry_config.yml | 8 +- .../templates/telemetry_config.j2 | 31 +- .../tasks/main.yml | 34 +-- .../vars/main.yml | 11 +- 48 files changed, 3057 insertions(+), 761 deletions(-) create mode 100644 provision/roles/configure_ochami/templates/cloud_init/ci-group-default_x86_64.yaml.j2 create mode 100644 provision/roles/configure_ochami/templates/powerscale/deploy_powerscale_telemetry.sh.j2 create mode 100644 provision/roles/configure_ochami/templates/powerscale/verify_powerscale_telemetry.sh.j2 create mode 100644 provision/roles/telemetry/tasks/check_kube_vip_reachability.yml create mode 100644 provision/roles/telemetry/tasks/deploy_powerscale_metrics.yml create mode 100644 provision/roles/telemetry/tasks/get_powerscale_telemetry_dependencies.yml create mode 100644 provision/roles/telemetry/templates/telemetry/powerscale/csm-metrics-deployment-direct.yaml.j2 delete mode 100644 provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vminsert.yaml.j2 delete mode 100644 provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vmselect.yaml.j2 delete mode 100644 
provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vmstorage.yaml.j2 create mode 100644 provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmagent.yaml.j2 create mode 100644 provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmcluster.yaml.j2 create mode 100644 provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2 create mode 100644 provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmsingle.yaml.j2 create mode 100644 provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 create mode 100644 provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2 create mode 100644 provision/roles/telemetry/templates/telemetry/victoria/victorialogs-vlagent-config.yaml.j2 diff --git a/.github/workflows/ansible-lint.yml b/.github/workflows/ansible-lint.yml index 9d08d236a9..621bd0b930 100644 --- a/.github/workflows/ansible-lint.yml +++ b/.github/workflows/ansible-lint.yml @@ -8,6 +8,7 @@ on: - release_1.7.1 - pub/build_stream - pub/q2_dev + - pub/telemetry jobs: build: diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index be748d1fe1..c979ce72ca 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -8,6 +8,7 @@ on: - release_1.7.1 - pub/build_stream - pub/q2_dev + - pub/telemetry jobs: build: diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index 18694dcbec..fe1baa69e2 100644 --- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -70,7 +70,7 @@ "and Ports fields.") SWITCH_DETAILS_NO_BMC_DETAILS_MSG = ("If switch details are provided then bmc_detail's " "static_range must also be 
provided.") -INVALID_GROUP_NAME_MSG = "Groups must be defined in the form of grp where n is 0-100, or SU where n is 1-100." +INVALID_GROUP_NAME_MSG = "Groups must be defined in the form of grp where n is 0-99." INVALID_LOCATION_ID_MSG = ("location_id must follow the format SU-.RACK- where n is 0-99. " "This input is case-sensitive. Please use uppercase letters only.") INVALID_ATTRIBUTES_ROLE_MSG = ("Please provide valid attributes for the role, " @@ -264,6 +264,75 @@ def switch_snmp3_username_fail_msg(min_username_length, max_length): "service k8s/slurm roles in the mapping file or remove ldms from " "software_config.json and rerun the playbook.") +# PowerScale telemetry validation messages +POWERSCALE_VICTORIA_REQUIRED_MSG = ( + "PowerScale telemetry requires VictoriaMetrics to be deployed. " + "When powerscale_configurations.powerscale_telemetry_support is true, 'victoria' must be included in " + "telemetry_collection_type (e.g., 'victoria' or 'victoria,kafka')." +) +POWERSCALE_CSI_DRIVER_MISSING_MSG = ( + "csi_driver_powerscale is not configured in software_config.json. " + "PowerScale telemetry requires the CSI driver for PowerScale to be configured." +) +POWERSCALE_SERVICE_CLUSTER_MISSING_MSG = ( + "service cluster is not defined in functional_groups_config.yml. " + "PowerScale telemetry requires a service cluster." +) +POWERSCALE_CONFIGURATIONS_MISSING_MSG = ( + "powerscale_configurations section is required and must contain powerscale_telemetry_support." +) +POWERSCALE_OTEL_STORAGE_SIZE_INVALID_MSG = ( + "must be a non-empty string in format 'XGi' (e.g., '5Gi')" +) +POWERSCALE_CSM_VALUES_PATH_REQUIRED_MSG = ( + "csm_observability_values_file_path is required when powerscale_configurations.powerscale_telemetry_support is true. " + "Please provide the path to the CSM Observability values.yaml file." 
+) +POWERSCALE_AUTH_PROXY_HOST_MISSING_MSG = ( + "karaviMetricsPowerscale.authorization.proxyHost is required in the CSM Observability values file " + "when karaviMetricsPowerscale.authorization.enabled is true. " + "Please provide the hostname or IP of the CSM Authorization Proxy server." +) +def powerscale_csm_values_not_found_msg(path): + """Returns error message when CSM Observability values.yaml file is not found.""" + return ( + f"CSM Observability values.yaml file not found at '{path}'. " + "Please verify the file path is correct." + ) +POWERSCALE_CSM_VALUES_INVALID_YAML_MSG = ( + "CSM Observability values.yaml must contain a valid YAML dictionary." +) +def powerscale_csm_values_parse_error_msg(error): + """Returns error message when CSM Observability values.yaml fails to parse.""" + return f"Failed to parse CSM Observability values.yaml: {error}" +POWERSCALE_CSM_VALUES_MISSING_KARAVI_SECTION_MSG = ( + "CSM Observability values.yaml is missing 'karaviMetricsPowerscale' section." +) +POWERSCALE_CSM_METRICS_IMAGE_MISSING_MSG = ( + "CSM Metrics PowerScale image is required in CSM Observability values.yaml." +) +POWERSCALE_OTEL_COLLECTOR_IMAGE_MISSING_MSG = ( + "OTEL Collector image is required in CSM Observability values.yaml." +) +POWERSCALE_ADDITIONAL_ENDPOINTS_URL_EMPTY_MSG = ( + "Each additional_remote_write_endpoint must have a non-empty 'url' field." +) +POWERSCALE_ADDITIONAL_ENDPOINTS_URL_INVALID_MSG = ( + "URL must start with 'http://' or 'https://'." +) +def powerscale_image_version_mismatch_msg(image_name, values_image, service_k8s_image): + """Returns error message when CSM values.yaml image version doesn't match service_k8s.json.""" + return ( + f"Image version mismatch for '{image_name}': " + f"CSM Observability values.yaml has '{values_image}' but " + f"service_k8s.json has '{service_k8s_image}'. " + f"Please update service_k8s.json to match the values.yaml version " + f"and re-run local_repo.yml to mirror the correct image to Pulp." 
+ ) +POWERSCALE_SERVICE_K8S_JSON_NOT_FOUND_MSG = ( + "service_k8s.json not found. Cannot validate PowerScale telemetry image versions. " + "Please ensure local_repo.yml has been executed." +) def boolean_fail_msg(value): """Returns a formatted message indicating boolean_fail_msg.""" return f"{value} must be set to either true or false." diff --git a/common/library/module_utils/input_validation/schema/telemetry_config.json b/common/library/module_utils/input_validation/schema/telemetry_config.json index a6523462e8..6b511a5f12 100644 --- a/common/library/module_utils/input_validation/schema/telemetry_config.json +++ b/common/library/module_utils/input_validation/schema/telemetry_config.json @@ -10,7 +10,7 @@ "type": "boolean", "description": "Enable or disable NVIDIA DCGM (Data Center GPU Manager) on GPU compute nodes. When true, nvidia-dcgm.service is started during cloud-init provisioning. Default: true" }, - "idrac_telemetry_collection_type": { + "telemetry_collection_type": { "anyOf": [ { "type": "string", @@ -43,6 +43,53 @@ "default": 10001, "description": "LDMS sampler port on compute nodes. Valid range: 10001-10100. Default: 10001" }, + "powerscale_configurations": { + "type": "object", + "properties": { + "powerscale_telemetry_support": { + "type": "boolean", + "default": true, + "description": "Enable or disable PowerScale telemetry support. Requires csi_driver_powerscale in software_config.json." + }, + "powerscale_log_enabled": { + "type": "boolean", + "default": false, + "description": "Enable or disable PowerScale log collection (syslog to VictoriaLogs). Requires powerscale_telemetry_support: true." + }, + "otel_collector_storage_size": { + "type": "string", + "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei)$", + "default": "5Gi", + "description": "PVC size for OTEL Collector metric batching and buffering." + }, + "csm_observability_values_file_path": { + "type": "string", + "description": "Path to the user-provided Helm values file for karavi-observability chart. 
Required when powerscale_telemetry_support is true." + }, + "additional_remote_write_endpoints": { + "type": "array", + "default": [], + "items": { + "type": "object", + "properties": { + "url": { + "type": "string", + "pattern": "^https?://", + "description": "VictoriaMetrics remote_write endpoint URL." + }, + "tls_insecure_skip_verify": { + "type": "boolean", + "default": false, + "description": "Skip TLS certificate verification for this endpoint." + } + }, + "required": ["url"] + }, + "description": "Additional VictoriaMetrics remote_write endpoints. vmagent writes to all configured endpoints." + } + }, + "required": ["powerscale_telemetry_support", "otel_collector_storage_size", "csm_observability_values_file_path"] + }, "ldms_sampler_configurations": { "anyOf": [ { @@ -126,7 +173,7 @@ ] } }, - "required": ["idrac_telemetry_support", "dcgm_support", "idrac_telemetry_collection_type", "ldms_sampler_configurations", "ldms_agg_port", "ldms_store_port", "ldms_sampler_port" ], + "required": ["idrac_telemetry_support", "dcgm_support", "telemetry_collection_type", "ldms_sampler_configurations", "ldms_agg_port", "ldms_store_port", "ldms_sampler_port", "powerscale_configurations" ], "$defs": { "kafka_configurations": { "type": "object", @@ -177,7 +224,7 @@ } }, "uniqueItems": true, - "description": "IMPORTANT: At least one Kafka topic must be defined. Topic names 'idrac' and 'ldms' are CONSTANTS. 'idrac' is required if idrac_telemetry_support is true and kafka is in idrac_telemetry_collection_type. 'ldms' is required if LDMS software is configured in software_config.json (automatic detection). Only partition counts can be changed.", + "description": "IMPORTANT: At least one Kafka topic must be defined. Topic names 'idrac' and 'ldms' are CONSTANTS. 'idrac' is required if idrac_telemetry_support is true and kafka is in telemetry_collection_type. 'ldms' is required if LDMS software is configured in software_config.json (automatic detection). 
Only partition counts can be changed.", "errorMessage": { "minItems": "At least 1 Kafka topic must be defined. Configure based on enabled features.", "maxItems": "Maximum 2 topics allowed: 'idrac' and 'ldms'", @@ -194,18 +241,9 @@ ], "additionalProperties": false }, - "victoria_configurations": { + "victoria_metrics_configurations": { "type": "object", "properties": { - "deployment_mode": { - "type": "string", - "enum": ["single-node", "cluster"], - "default": "cluster", - "description": "VictoriaMetrics deployment mode. 'single-node' for simple deployment (1 pod), 'cluster' for high-availability deployment (7 pods). Default: 'cluster'", - "errorMessage": { - "enum": "deployment_mode must be either 'single-node' or 'cluster'" - } - }, "persistence_size": { "type": "string", "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei)$" @@ -216,11 +254,29 @@ } }, "required": [ - "deployment_mode", "persistence_size", "retention_period" ], "additionalProperties": false + }, + "victoria_logs_configurations": { + "type": "object", + "description": "VictoriaLogs cluster mode configuration. Deployed alongside VictoriaMetrics when 'victoria' is in collection type.", + "properties": { + "storage_size": { + "type": "string", + "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei)$", + "default": "8Gi", + "description": "Storage size per vlstorage replica PVC. 
Total = storage_size x 3 replicas.", + "errorMessage": "storage_size must be a valid Kubernetes PVC size (e.g., '8Gi', '50Gi', '1Ti')" + }, + "retention_period": { + "type": "integer", + "minimum": 24 + } + }, + "required": ["storage_size", "retention_period"], + "additionalProperties": false } }, "allOf": [ @@ -228,7 +284,7 @@ "if": { "properties": { "idrac_telemetry_support": { "const": true }, - "idrac_telemetry_collection_type": { "pattern": "(?i)^kafka$" } + "telemetry_collection_type": { "pattern": "(?i)^kafka$" } } }, "then": { @@ -242,13 +298,14 @@ "if": { "properties": { "idrac_telemetry_support": { "const": true }, - "idrac_telemetry_collection_type": { "pattern": "(?i)^victoria$" } + "telemetry_collection_type": { "pattern": "(?i)^victoria$" } } }, "then": { - "required": ["victoria_configurations"], + "required": ["victoria_metrics_configurations", "victoria_logs_configurations"], "properties": { - "victoria_configurations": { "$ref": "#/$defs/victoria_configurations" } + "victoria_metrics_configurations": { "$ref": "#/$defs/victoria_metrics_configurations" }, + "victoria_logs_configurations": { "$ref": "#/$defs/victoria_logs_configurations" } } } }, @@ -256,16 +313,17 @@ "if": { "properties": { "idrac_telemetry_support": { "const": true }, - "idrac_telemetry_collection_type": { + "telemetry_collection_type": { "pattern": "(?i)^(victoria,kafka|kafka,victoria)$" } } }, "then": { - "required": ["kafka_configurations", "victoria_configurations"], + "required": ["kafka_configurations", "victoria_metrics_configurations", "victoria_logs_configurations"], "properties": { "kafka_configurations": { "$ref": "#/$defs/kafka_configurations" }, - "victoria_configurations": { "$ref": "#/$defs/victoria_configurations" } + "victoria_metrics_configurations": { "$ref": "#/$defs/victoria_metrics_configurations" }, + "victoria_logs_configurations": { "$ref": "#/$defs/victoria_logs_configurations" } } } } diff --git 
a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index cf4e74d4f9..2ceb8e6335 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -21,6 +21,7 @@ import ipaddress import json import os +import re from collections import Counter import yaml @@ -1476,7 +1477,7 @@ def validate_telemetry_config( # Validate topic_partitions configuration kafka_config = data.get("kafka_configurations", {}) topic_partitions = kafka_config.get("topic_partitions", []) - idrac_telemetry_collection_type = data.get("idrac_telemetry_collection_type", "") + telemetry_collection_type = data.get("telemetry_collection_type", "") # Check if LDMS software is configured but kafka_configurations is missing entirely if ldms_support_from_software_config and not kafka_config: @@ -1537,12 +1538,12 @@ def validate_telemetry_config( # Validate required topics based on feature flags # If iDRAC telemetry is enabled with Kafka, idrac topic is required - if idrac_telemetry_support and 'kafka' in idrac_telemetry_collection_type.split(','): + if idrac_telemetry_support and 'kafka' in telemetry_collection_type.split(','): if 'idrac' not in present_topics: errors.append(create_error_msg( "kafka_configurations.topic_partitions", "missing 'idrac' topic", - "idrac topic is required when idrac_telemetry_support is true and 'kafka' is in idrac_telemetry_collection_type" + "idrac topic is required when idrac_telemetry_support is true and 'kafka' is in telemetry_collection_type" )) # If LDMS software is configured in software_config.json, ldms topic is required @@ -1597,7 +1598,263 @@ def validate_telemetry_config( f"'{plugin_name}'", "plugin_name cannot be empty. 
Must be one of: meminfo, procstat2, vmstat, loadavg, slurm_sampler, procnetdev2" )) - + + # Validate PowerScale telemetry configuration + powerscale_config = data.get("powerscale_configurations") + if not powerscale_config: + errors.append(create_error_msg( + "powerscale_configurations", + "not defined", + en_us_validation_msg.POWERSCALE_CONFIGURATIONS_MISSING_MSG + )) + else: + powerscale_telemetry_support = powerscale_config.get("powerscale_telemetry_support", False) + + if powerscale_telemetry_support: + logger.info("PowerScale telemetry support is enabled, performing PowerScale validation") + + # Check victoria is in telemetry_collection_type + # PowerScale telemetry pipeline requires VictoriaMetrics (writes to vminsert via shared vmagent) + collection_types = [t.strip() for t in telemetry_collection_type.split(',')] + if 'victoria' not in collection_types: + errors.append(create_error_msg( + "telemetry_collection_type", + telemetry_collection_type, + en_us_validation_msg.POWERSCALE_VICTORIA_REQUIRED_MSG + )) + + # Check CSI driver PowerScale is in software_config.json + csi_powerscale_found = False + if os.path.exists(software_config_file_path): + try: + with open(software_config_file_path, 'r', encoding='utf-8') as f: + software_config = json.load(f) + softwares = software_config.get("softwares", []) + csi_powerscale_found = any( + software.get("name") == "csi_driver_powerscale" for software in softwares + ) + except (json.JSONDecodeError, IOError) as e: + logger.warn(f"Could not load software_config.json for PowerScale validation: {e}") + + if not csi_powerscale_found: + errors.append(create_error_msg( + "powerscale_configurations.powerscale_telemetry_support", + powerscale_telemetry_support, + en_us_validation_msg.POWERSCALE_CSI_DRIVER_MISSING_MSG + )) + + # Check service cluster is defined + if not is_service_cluster_defined: + errors.append(create_error_msg( + "powerscale_configurations.powerscale_telemetry_support", + powerscale_telemetry_support, + 
en_us_validation_msg.POWERSCALE_SERVICE_CLUSTER_MISSING_MSG + )) + + # Validate otel_collector_storage_size + otel_storage = powerscale_config.get("otel_collector_storage_size", "") + if not otel_storage or not isinstance(otel_storage, str): + errors.append(create_error_msg( + "powerscale_configurations.otel_collector_storage_size", + otel_storage, + en_us_validation_msg.POWERSCALE_OTEL_STORAGE_SIZE_INVALID_MSG + )) + + # Validate csm_observability_values_file_path + csm_values_path = powerscale_config.get("csm_observability_values_file_path", "") + if not csm_values_path or not isinstance(csm_values_path, str) or csm_values_path.strip() == "": + errors.append(create_error_msg( + "powerscale_configurations.csm_observability_values_file_path", + csm_values_path, + en_us_validation_msg.POWERSCALE_CSM_VALUES_PATH_REQUIRED_MSG + )) + elif not os.path.exists(csm_values_path): + errors.append(create_error_msg( + "powerscale_configurations.csm_observability_values_file_path", + csm_values_path, + en_us_validation_msg.powerscale_csm_values_not_found_msg(csm_values_path) + )) + else: + # Validate the CSM Observability values.yaml content + try: + with open(csm_values_path, 'r', encoding='utf-8') as f: + csm_values = yaml.safe_load(f) + if not isinstance(csm_values, dict): + errors.append(create_error_msg( + "powerscale_configurations.csm_observability_values_file_path", + csm_values_path, + en_us_validation_msg.POWERSCALE_CSM_VALUES_INVALID_YAML_MSG + )) + else: + # Validate required keys + karavi_metrics = csm_values.get("karaviMetricsPowerscale", {}) + if not karavi_metrics: + errors.append(create_error_msg( + "csm_observability_values_file_path", + csm_values_path, + en_us_validation_msg.POWERSCALE_CSM_VALUES_MISSING_KARAVI_SECTION_MSG + )) + else: + # Validate image reference exists + if not karavi_metrics.get("image"): + errors.append(create_error_msg( + "karaviMetricsPowerscale.image", + "not defined", + en_us_validation_msg.POWERSCALE_CSM_METRICS_IMAGE_MISSING_MSG + 
)) + + otel_config = csm_values.get("otelCollector", {}) + if not otel_config or not otel_config.get("image"): + errors.append(create_error_msg( + "otelCollector.image", + "not defined", + en_us_validation_msg.POWERSCALE_OTEL_COLLECTOR_IMAGE_MISSING_MSG + )) + + # Validate Karavi Authorization config in Helm values + karavi_auth = karavi_metrics.get("authorization", {}) if karavi_metrics else {} + if karavi_auth.get("enabled", False): + proxy_host = karavi_auth.get("proxyHost", "") + if not proxy_host or not isinstance(proxy_host, str) or proxy_host.strip() == "": + errors.append(create_error_msg( + "karaviMetricsPowerscale.authorization.proxyHost", + proxy_host, + en_us_validation_msg.POWERSCALE_AUTH_PROXY_HOST_MISSING_MSG + )) + + # Cross-validate image versions between values.yaml and service_k8s.json + service_k8s_json_path = os.path.join( + input_dir, "config", "x86_64", + data.get("cluster_os_type", "rhel") if "cluster_os_type" in data else "rhel", + data.get("cluster_os_version", "10.0") if "cluster_os_version" in data else "10.0", + "service_k8s.json" + ) + # Try reading cluster_os_type/version from software_config.json + if os.path.exists(software_config_file_path): + try: + with open(software_config_file_path, 'r', encoding='utf-8') as scf: + sc_data = json.load(scf) + sc_os_type = sc_data.get("cluster_os_type", "rhel") + sc_os_version = sc_data.get("cluster_os_version", "10.0") + service_k8s_json_path = os.path.join( + input_dir, "config", "x86_64", + sc_os_type, sc_os_version, "service_k8s.json" + ) + except (json.JSONDecodeError, IOError): + pass + + if os.path.exists(service_k8s_json_path): + try: + with open(service_k8s_json_path, 'r', encoding='utf-8') as sk8s_f: + service_k8s_data = json.load(sk8s_f) + + # Build lookup: package -> tag from service_k8s.json + sk8s_images = {} + for entry in service_k8s_data.get("service_k8s", {}).get("cluster", []): + if entry.get("type") == "image" and "tag" in entry: + sk8s_images[entry["package"]] = entry["tag"] 
+ + # Images to cross-validate: (description, values.yaml image, service_k8s package key) + images_to_check = [] + + if karavi_metrics and karavi_metrics.get("image"): + images_to_check.append(( + "csm-metrics-powerscale", + karavi_metrics["image"], + "quay.io/dell/container-storage-modules/csm-metrics-powerscale" + )) + if otel_config and otel_config.get("image"): + images_to_check.append(( + "opentelemetry-collector", + otel_config["image"], + "ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector" + )) + karavi_auth = karavi_metrics.get("authorization", {}) if karavi_metrics else {} + sidecar_proxy = karavi_auth.get("sidecarProxy", {}) + if sidecar_proxy and sidecar_proxy.get("image"): + # csm-authorization-sidecar is in csi_driver_powerscale.json, not service_k8s.json + csi_ps_json_path = os.path.join( + os.path.dirname(service_k8s_json_path), "csi_driver_powerscale.json" + ) + if os.path.exists(csi_ps_json_path): + try: + with open(csi_ps_json_path, 'r', encoding='utf-8') as csi_f: + csi_ps_data = json.load(csi_f) + for entry in csi_ps_data.get("csi_driver_powerscale", {}).get("cluster", []): + if (entry.get("type") == "image" and + entry.get("package") == "quay.io/dell/container-storage-modules/csm-authorization-sidecar"): + sidecar_values_tag = sidecar_proxy["image"].split(":")[-1] if ":" in sidecar_proxy["image"] else "" + if sidecar_values_tag and sidecar_values_tag != entry["tag"]: + errors.append(create_error_msg( + "powerscale image: csm-authorization-sidecar", + sidecar_proxy["image"], + en_us_validation_msg.powerscale_image_version_mismatch_msg( + "csm-authorization-sidecar", + sidecar_proxy["image"], + f"{entry['package']}:{entry['tag']}" + ) + )) + else: + logger.info(f"Image version match for csm-authorization-sidecar: {sidecar_values_tag}") + break + except (json.JSONDecodeError, IOError) as csi_err: + logger.warn(f"Could not read csi_driver_powerscale.json: {csi_err}") + + for img_name, values_image, sk8s_key in 
images_to_check: + if sk8s_key in sk8s_images: + # Extract tag from values.yaml image (format: registry/repo:tag) + values_tag = values_image.split(":")[-1] if ":" in values_image else "" + sk8s_tag = sk8s_images[sk8s_key] + if values_tag and values_tag != sk8s_tag: + sk8s_full = f"{sk8s_key}:{sk8s_tag}" + errors.append(create_error_msg( + f"powerscale image: {img_name}", + values_image, + en_us_validation_msg.powerscale_image_version_mismatch_msg( + img_name, values_image, sk8s_full + ) + )) + else: + logger.info(f"Image version match for {img_name}: {values_tag}") + else: + logger.warn(f"Image {sk8s_key} not found in service_k8s.json, skipping version check") + + except (json.JSONDecodeError, IOError) as sk8s_err: + logger.warn(f"Could not read service_k8s.json for image version validation: {sk8s_err}") + else: + logger.warn(f"service_k8s.json not found at {service_k8s_json_path}, skipping image version validation") + + logger.info("CSM Observability values.yaml validation passed") + except (yaml.YAMLError, IOError) as e: + errors.append(create_error_msg( + "powerscale_configurations.csm_observability_values_file_path", + csm_values_path, + en_us_validation_msg.powerscale_csm_values_parse_error_msg(str(e)) + )) + + # Validate additional_remote_write_endpoints + additional_endpoints = powerscale_config.get("additional_remote_write_endpoints", []) + if additional_endpoints and isinstance(additional_endpoints, list): + if len(additional_endpoints) > 5: + logger.warn(f"More than 5 additional_remote_write_endpoints configured ({len(additional_endpoints)}). 
" + "This may impact performance.") + for idx, endpoint in enumerate(additional_endpoints): + if not isinstance(endpoint, dict): + continue + url = endpoint.get("url", "") + if not url or not isinstance(url, str): + errors.append(create_error_msg( + f"powerscale_configurations.additional_remote_write_endpoints[{idx}].url", + url, + en_us_validation_msg.POWERSCALE_ADDITIONAL_ENDPOINTS_URL_EMPTY_MSG + )) + elif not url.startswith("http://") and not url.startswith("https://"): + errors.append(create_error_msg( + f"powerscale_configurations.additional_remote_write_endpoints[{idx}].url", + url, + en_us_validation_msg.POWERSCALE_ADDITIONAL_ENDPOINTS_URL_INVALID_MSG + )) + return errors def validate_additional_software( diff --git a/common/library/modules/validate_input.py b/common/library/modules/validate_input.py index 022a87aaee..21ce2920c6 100644 --- a/common/library/modules/validate_input.py +++ b/common/library/modules/validate_input.py @@ -139,7 +139,7 @@ def main(): if input_file_path is None: error_message = ( - f"file not found in directory: {omnia_base_dir}/{project_name}" + f"{fname} file not found in directory: {omnia_base_dir}/{project_name}" ) logger.error(error_message) module.fail_json(msg=error_message) diff --git a/input/config/x86_64/rhel/10.0/csi_driver_powerscale.json b/input/config/x86_64/rhel/10.0/csi_driver_powerscale.json index ac7953a0dc..15ed7a3bed 100644 --- a/input/config/x86_64/rhel/10.0/csi_driver_powerscale.json +++ b/input/config/x86_64/rhel/10.0/csi_driver_powerscale.json @@ -66,7 +66,7 @@ }, { "package": "quay.io/dell/container-storage-modules/csm-authorization-sidecar", - "tag": "v2.3.0", + "tag": "v2.4.0", "type": "image" }, { diff --git a/input/config/x86_64/rhel/10.0/service_k8s.json b/input/config/x86_64/rhel/10.0/service_k8s.json index 6deed2309b..64f4c8bdff 100644 --- a/input/config/x86_64/rhel/10.0/service_k8s.json +++ b/input/config/x86_64/rhel/10.0/service_k8s.json @@ -3,7 +3,7 @@ "cluster": [ { "package": 
"docker.io/library/busybox", "type": "image", "tag": "1.36" }, { "package": "firewalld", "type": "rpm", "repo_name": "baseos" }, - { "package": "python3-firewall", "type": "rpm", "repo_name": "baseos" }, + { "package": "python3-firewall", "type": "rpm", "repo_name": "baseos" }, { "package": "git", "type": "rpm", "repo_name": "appstream"}, { "package": "vim-enhanced", "type": "rpm", "repo_name": "appstream"}, { "package": "fuse-overlayfs", "type": "rpm", "repo_name": "appstream"}, @@ -17,6 +17,8 @@ { "package": "docker.io/victoriametrics/vmstorage", "type": "image", "tag": "v1.128.0-cluster" }, { "package": "docker.io/victoriametrics/vminsert", "type": "image", "tag": "v1.128.0-cluster" }, { "package": "docker.io/victoriametrics/vmselect", "type": "image", "tag": "v1.128.0-cluster" }, + { "package": "docker.io/victoriametrics/victoria-logs", "type": "image", "tag": "v1.49.0" }, + { "package": "docker.io/victoriametrics/vlagent", "type": "image", "tag": "v1.49.0" }, { "package": "docker.io/alpine/kubectl", "tag": "1.34.1", "type": "image" }, { "package": "docker.io/curlimages/curl", "type": "image", "tag": "8.17.0" }, { "package": "docker.io/rmohr/activemq", "type": "image", "tag": "5.15.9" }, @@ -29,11 +31,20 @@ { "package": "cffi==1.17.1", "type": "pip_module" }, { "package": "quay.io/strimzi/operator", "tag": "0.48.0", "type": "image" }, { "package": "quay.io/strimzi/kafka", "tag": "0.48.0-kafka-4.1.0", "type": "image" }, - { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.0", "type": "image" }, + { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.0", "type": "image" }, + { "package": "quay.io/dell/container-storage-modules/csm-metrics-powerscale", "tag": "v1.11.0", "type": "image" }, + { "package": "ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector", "tag": "0.148.0", "type": "image" }, + { "package": "docker.io/nginxinc/nginx-unprivileged", "tag": "1.29", "type": "image" }, + { "package": 
"karavi-observability", "type": "git", "url": "https://github.com/dell/karavi-observability.git", "version": "v1.12.0" }, + { "package": "helm-charts", "type": "git", "url": "https://github.com/dell/helm-charts.git", "version": "container-storage-modules-1.9.2" }, + { "package": "cert-manager-v1.10.0", "type": "tarball", "url": "https://charts.jetstack.io/charts/cert-manager-v1.10.0.tgz" }, { "package": "strimzi-kafka-operator-helm-3-chart-0.48.0", "type": "tarball", "url": "https://github.com/strimzi/strimzi-kafka-operator/releases/download/0.48.0/strimzi-kafka-operator-helm-3-chart-0.48.0.tgz" }, { "package": "quay.io/strimzi/kafka-bridge", "tag": "0.33.1", "type": "image" }, + { "package": "docker.io/victoriametrics/operator", "tag": "v0.68.3", "type": "image" }, + { "package": "docker.io/victoriametrics/operator", "tag": "config-reloader-v0.68.3", "type": "image" }, + { "package": "victoria-metrics-operator-0.59.3", "type": "tarball", "url": "https://github.com/VictoriaMetrics/helm-charts/releases/download/victoria-metrics-operator-0.59.3/victoria-metrics-operator-0.59.3.tgz" }, { "package": "apptainer", "type": "rpm", "repo_name": "epel" }, - { "package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } + { "package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] }, "service_kube_control_plane": { @@ -57,7 +68,6 @@ { "package": "git", "type": "rpm", "repo_name": "appstream"}, { "package": "kubernetes==33.1.0", "type": "pip_module" }, { "package": "PyMySQL==1.1.2", "type": "pip_module" } - ] }, "service_kube_control_plane_first": { @@ -75,33 +85,23 @@ { "package": "docker.io/calico/kube-controllers", "tag": "v3.30.3", "type": "image" }, { "package": "docker.io/calico/node", "tag": "v3.30.3", "type": "image" }, { "package": "quay.io/metallb/speaker", "tag": "v0.15.2", "type": "image" }, - { - "package": "calico-v3.30.3", - "type": "manifest", - "url": "https://raw.githubusercontent.com/projectcalico/calico/v3.30.3/manifests/calico.yaml" - }, - { 
- "package": "metallb-native-v0.15.2", - "type": "manifest", - "url": "https://raw.githubusercontent.com/metallb/metallb/v0.15.2/config/manifests/metallb-native.yaml" - }, + { "package": "calico-v3.30.3", "type": "manifest", "url": "https://raw.githubusercontent.com/projectcalico/calico/v3.30.3/manifests/calico.yaml" }, + { "package": "metallb-native-v0.15.2", "type": "manifest", "url": "https://raw.githubusercontent.com/metallb/metallb/v0.15.2/config/manifests/metallb-native.yaml" }, { "package": "helm-v3.19.0-amd64", "type": "tarball", "url": "https://get.helm.sh/helm-v3.19.0-linux-amd64.tar.gz" }, - { "package": "nfs-subdir-external-provisioner-4.0.18", "type": "tarball", "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" }, + { "package": "nfs-subdir-external-provisioner-4.0.18", "type": "tarball", "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" }, { "package": "kubectl-1.34.1", "type": "rpm", "repo_name": "kubernetes"}, { "package": "prettytable==3.14.0", "type": "pip_module" }, { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" }, { "package": "git", "type": "rpm", "repo_name": "appstream"}, { "package": "kubernetes==33.1.0", "type": "pip_module" }, { "package": "PyMySQL==1.1.2", "type": "pip_module" } - ] }, - "service_kube_node": { "cluster": [ { "package": "registry.k8s.io/sig-storage/nfs-subdir-external-provisioner", "tag": "v4.0.2", "type": "image" }, { "package": "quay.io/metallb/speaker", "tag": "v0.15.2", "type": "image" }, - { "package": "quay.io/metallb/controller", "tag": "v0.15.2", "type": "image" } + { "package": "quay.io/metallb/controller", "tag": "v0.15.2", "type": "image" } ] } } diff --git a/input/telemetry_config.yml b/input/telemetry_config.yml index 397806c594..36eb0844ec 
100644 --- a/input/telemetry_config.yml +++ b/input/telemetry_config.yml @@ -38,8 +38,6 @@ # ┌─────────────────┬──────────────────┬─────────────────┬──────────────────┐ # │ Deployment Mode │ Per-Pod Storage │ Number of Pods │ Total Storage │ # ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤ -# │ Single-node │ persistence_size │ 1 pod │ 1× storage │ -# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤ # │ Cluster │ persistence_size │ 3 vmstorage │ 3× storage │ # └─────────────────┴──────────────────┴─────────────────┴──────────────────┘ # Example: 8Gi per pod → Single-node: 8Gi total, Cluster: 24Gi total @@ -58,7 +56,6 @@ # # COMBINED STORAGE EXAMPLES: # Default (8Gi each): VictoriaMetrics Cluster (24Gi) + Kafka (48Gi) = 72Gi total -# Single-node mode: VictoriaMetrics Single (8Gi) + Kafka (48Gi) = 56Gi total # # STORAGE OPTIONS: # - VictoriaMetrics: Store iDRAC telemetry in time-series database @@ -83,7 +80,7 @@ idrac_telemetry_support: true # - "kafka" : Store in Kafka only # - "victoria,kafka" : Store in both (recommended) # Default: "victoria,kafka" -idrac_telemetry_collection_type: "victoria,kafka" +telemetry_collection_type: "victoria,kafka" # ============================================================================ # NVIDIA DCGM (Data Center GPU Manager) CONFIGURATION @@ -111,34 +108,14 @@ dcgm_support: true # VICTORIAMETRICS CONFIGURATION # ============================================================================ # VictoriaMetrics is a time-series database for storing telemetry metrics. -# Used for iDRAC telemetry when 'victoria' is enabled in idrac_telemetry_collection_type. +# Used for iDRAC telemetry when 'victoria' is enabled in telemetry_collection_type. 
# # DEPLOYMENT MODES: -# - single-node: Simple deployment with one pod (suitable for small deployments) # - cluster: High-availability deployment with multiple components # (recommended for production and large-scale deployments) -victoria_configurations: - # VictoriaMetrics deployment mode - # Supported values: - # - "single-node" : Simple deployment (1 pod, suitable for dev/test) - # - "cluster" : High-availability deployment (7 pods, recommended for production) - # Default: "cluster" - # - # Cluster Mode Benefits: - # - High availability (no single point of failure) - # - Horizontal scalability (scale components independently) - # - Better performance (4x ingestion, 2x query speed) - # - Production-ready architecture - # - # Single-Node Benefits: - # - Simple setup (fewer resources) - # - Suitable for small deployments (<10 nodes) - # - Lower resource usage (~4Gi memory vs ~10Gi for cluster) - deployment_mode: "cluster" - +victoria_metrics_configurations: # The amount of storage allocated for EACH VictoriaMetrics persistent volume. # IMPORTANT: Total VictoriaMetrics storage depends on deployment mode: - # - Single-node mode: Total storage = persistence_size × 1 pod # - Cluster mode: Total storage = persistence_size × 3 vmstorage pods # - Example (cluster): 8Gi × 3 = 24Gi total VictoriaMetrics storage # Accepted values: in the form of "X[Ki|Mi|Gi|Ti|Pi|Ei]" @@ -149,11 +126,43 @@ victoria_configurations: # Default: 168 (7 days) retention_period: 168 +# ============================================================================ +# VICTORIALOGS CONFIGURATION +# ============================================================================ +# VictoriaLogs provides centralized log storage and querying (cluster mode only). +# Deployed alongside VictoriaMetrics when 'victoria' is in telemetry_collection_type. 
+# +# DEPLOYMENT: +# - Always cluster mode (vlstorage, vlinsert, vlselect, VLAgent) +# - Co-deployed with VictoriaMetrics — same deployment gate +# - Shares TLS infrastructure with VictoriaMetrics +# +# STORAGE REQUIREMENTS: +# ┌─────────────────┬──────────────────┬─────────────────┬──────────────────┐ +# │ Component │ Per-Pod Storage │ Number of Pods │ Total Storage │ +# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤ +# │ vlstorage │ storage_size │ 3 pods │ 3× storage │ +# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤ +# │ VLAgent buffer │ 5Gi (fixed) │ 1 pod │ 5Gi │ +# └─────────────────┴──────────────────┴─────────────────┴──────────────────┘ +# Example: 8Gi × 3 vlstorage = 24Gi + 5Gi VLAgent = 29Gi total +victoria_logs_configurations: + # Storage size per vlstorage replica PVC + # IMPORTANT: Total VictoriaLogs storage = storage_size × 3 vlstorage pods + # Accepted values: in the form of "X[Ki|Mi|Gi|Ti|Pi|Ei]" + # Default: 8Gi (results in 24Gi total storage) + storage_size: "8Gi" + + # Log retention period in hours (integer, minimum 24) + # Logs older than this period are automatically purged by vlstorage. + # Default: 168 (7 days) + retention_period: 168 + # ============================================================================ # KAFKA CONFIGURATION # ============================================================================ # Apache Kafka is a distributed streaming platform for storing telemetry data. -# Used for iDRAC telemetry when 'kafka' is enabled in idrac_telemetry_collection_type. +# Used for iDRAC telemetry when 'kafka' is enabled in telemetry_collection_type. # Also used for LDMS telemetry when LDMS software is configured. 
# # NOTE: Kafka topics are auto-generated based on enabled features: @@ -281,3 +290,97 @@ ldms_sampler_configurations: - plugin_name: procnetdev2 config_parameters: "" # Monitor all interfaces activation_parameters: "interval=30000000 offset=0" # interval=30000000 microseconds (30 seconds), offset=0 + +# ============================================================================ +# POWERSCALE TELEMETRY CONFIGURATION +# ============================================================================ +# PowerScale telemetry collects storage metrics from Dell PowerScale (OneFS) +# clusters using the CSM (Container Storage Modules) Metrics PowerScale exporter. +# +# DATA PIPELINE: +# CSM Metrics PowerScale → OTEL Collector → vmagent-powerscale → VictoriaMetrics +# +# Metrics collected: capacity, performance, topology, and quota metrics via +# OneFS REST API. Metrics are processed through an OpenTelemetry Collector +# and ingested into the shared VictoriaMetrics cluster (same as iDRAC telemetry). 
+# +# AUTHENTICATION MODES (auto-detected from CSM Observability values.yaml): +# - Direct Authentication: CSM Metrics connects directly to PowerScale using +# credentials from the CSI driver secret.yaml (isilon-creds) +# - Karavi Authorization: CSM Metrics connects via Karavi Authorization Proxy +# sidecar with token-based authentication +# +# PREREQUISITES: +# - csi_driver_powerscale must be configured in software_config.json +# - Service cluster must be defined in functional_groups_config.yml +# - VictoriaMetrics must be enabled: 'victoria' must be included in +# telemetry_collection_type (e.g., "victoria" or "victoria,kafka") +# - CSI driver secret.yaml with valid isilonClusters credentials +# - CSM Observability values.yaml must be provided +# +# STORAGE REQUIREMENTS: +# ┌─────────────────────┬──────────────┬──────────────┬──────────────────┐ +# │ Component │ Per-Pod PVC │ Pods │ Total Storage │ +# ├─────────────────────┼──────────────┼──────────────┼──────────────────┤ +# │ OTEL Collector │ 5Gi │ 1 per cluster│ 5Gi per cluster │ +# └─────────────────────┴──────────────┴──────────────┴──────────────────┘ + +powerscale_configurations: + # Enable or disable PowerScale telemetry support + # Accepted values: true or false + # Default: true + powerscale_telemetry_support: true + + # Enable or disable PowerScale log collection (syslog → VictoriaLogs) + # Requires powerscale_telemetry_support: true + # Accepted values: true or false + # Default: false + powerscale_log_enabled: false + + # PVC size for OTEL Collector metric batching and buffering. + # Adjust based on cluster scale and metric volume. + # Accepted values: in the form of "X[Ki|Mi|Gi|Ti|Pi|Ei]" + # Default: "5Gi" + otel_collector_storage_size: "5Gi" + + # Path to the CSM Observability (Karavi Observability) values.yaml file. + # This file provides: + # - Container image versions for CSM Metrics, OTEL Collector, and Karavi sidecar + # - Authentication settings (Direct vs. 
Karavi Authorization) + # - Poll frequencies for each metric domain + # - ISI client options for PowerScale OneFS API + # + # At minimum, configure the karaviMetricsPowerscale section (set enabled: true, + # disable other storage backends like powerflex/powerstore/powermax): + # karaviMetricsPowerscale: + # enabled: true + # ... + # + # AUTHENTICATION MODE (configured in the Helm values file): + # Mode A - Direct Authentication (default): + # karaviMetricsPowerscale.authorization.enabled: false (or omit) + # Only isilon-creds secret is copied to the telemetry namespace. + # + # Mode B - Karavi Authorization: + # karaviMetricsPowerscale.authorization.enabled: true + # karaviMetricsPowerscale.authorization.proxyHost: "" + # Requires CSI driver deployed with CSM Authorization enabled. + # Additional resources are automatically copied to the telemetry namespace: + # - isilon-config-params ConfigMap + # - isilon-proxy-server-root-certificate Secret + # - isilon-proxy-authz-tokens Secret + # + # Required when powerscale_configurations.powerscale_telemetry_support: true + # Reference: https://github.com/dell/helm-charts/blob/main/charts/karavi-observability/values.yaml + csm_observability_values_file_path: "" + + # Additional VictoriaMetrics remote_write endpoints (optional) + # vmagent will write the same PowerScale metrics to ALL configured endpoints. + # Each endpoint receives an identical copy of all metrics. + # The primary Omnia VictoriaMetrics endpoint (vminsert) is always included automatically. 
+ # Default: [] (empty — only the primary Omnia VictoriaMetrics endpoint is used) + # Example: + # additional_remote_write_endpoints: + # - url: "https://external-victoria.example.com:8480/insert/0/prometheus/api/v1/write" + # tls_insecure_skip_verify: true + additional_remote_write_endpoints: [] diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-default_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-default_x86_64.yaml.j2 new file mode 100644 index 0000000000..383c3f3506 --- /dev/null +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-default_x86_64.yaml.j2 @@ -0,0 +1,18 @@ +- name: {{ functional_group_name }} + description: "{{ functional_group_name }} config" + file: + encoding: plain + content: | + ## template: jinja + #cloud-config + merge_how: + - name: list + settings: [append] + - name: dict + settings: [no_replace, recurse_list] + users: + - name: root + ssh_authorized_keys: "{{ read_ssh_key.stdout }}" + lock_passwd: false + hashed_passwd: "{{ hashed_password_output.stdout }}" + disable_root: false diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index b98df53d7d..43c6866ab0 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -1041,6 +1041,10 @@ systemctl restart nfs-client.target systemctl restart rpcbind +{% if hostvars['localhost']['powerscale_configurations']['powerscale_telemetry_support'] | default(false) | bool %} +{% include 'powerscale/deploy_powerscale_telemetry.sh.j2' %} +{% endif %} + {% if hostvars['localhost']['idrac_telemetry_support'] or hostvars['localhost']['ldms_support'] %} echo "Applying 
Telemetry Kubernetes deployments" /root/telemetry.sh diff --git a/provision/roles/configure_ochami/templates/powerscale/deploy_powerscale_telemetry.sh.j2 b/provision/roles/configure_ochami/templates/powerscale/deploy_powerscale_telemetry.sh.j2 new file mode 100644 index 0000000000..d1017bc76c --- /dev/null +++ b/provision/roles/configure_ochami/templates/powerscale/deploy_powerscale_telemetry.sh.j2 @@ -0,0 +1,178 @@ +{# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#} +{# PowerScale Telemetry - CSM Observability Helm Deployment (cloud-init fragment) + This template is included by ci-group-service_kube_control_plane_first_x86_64.yaml.j2 + when powerscale_telemetry_support is enabled. +#} +{% set csm_ns = hostvars['localhost']['csm_observability_namespace'] | default('telemetry') %} + # ===== PowerScale Telemetry - CSM Observability Helm Deployment ===== + echo "===== Starting PowerScale Telemetry (CSM Observability) deployment =====" + PS_TEL_FAILED=0 + CSM_NS="{{ csm_ns }}" + + # Step 1: Ensure namespace exists (shared with iDRAC telemetry) + echo "Ensuring ${CSM_NS} namespace exists..." + if kubectl get namespace "${CSM_NS}" >/dev/null 2>&1; then + echo "${CSM_NS} namespace already exists." + else + kubectl create namespace "${CSM_NS}" || { + echo "ERROR: Failed to create ${CSM_NS} namespace." 
+ PS_TEL_FAILED=1 + } + fi + + # Step 2: Apply cert-manager CRDs (required before Helm install per official guide) + if [ "$PS_TEL_FAILED" -eq 0 ]; then + HELM_CHART_PATH="{{ k8s_client_mount_path }}/karavi-observability/helm-charts/charts/karavi-observability" + CRDS_FILE="${HELM_CHART_PATH}/crds/cert-manager.crds.yaml" + if [ -f "$CRDS_FILE" ]; then + echo "Applying cert-manager CRDs from chart crds/ directory..." + kubectl apply --validate=false -f "$CRDS_FILE" || { + echo "WARNING: Failed to apply cert-manager CRDs from crds/ directory." + } + else + echo "No cert-manager CRDs file found at ${CRDS_FILE}, Helm will handle CRDs." + fi + fi + + # Step 3: Copy isilon-creds secret from CSI driver namespace to ${CSM_NS} namespace + if [ "$PS_TEL_FAILED" -eq 0 ]; then + echo "Copying isilon-creds secret to ${CSM_NS} namespace..." + kubectl delete secret isilon-creds -n "${CSM_NS}" --ignore-not-found=true 2>/dev/null + kubectl get secret isilon-creds -n isilon -o json \ + | jq 'del(.metadata.resourceVersion,.metadata.uid,.metadata.creationTimestamp,.metadata.annotations,.metadata.managedFields)' \ + | jq ".metadata.namespace = \"${CSM_NS}\"" \ + | kubectl create -f - || { + echo "ERROR: Failed to copy isilon-creds secret to ${CSM_NS} namespace." + PS_TEL_FAILED=1 + } + fi + + # Step 4: Copy Karavi Authorization resources (if authorization enabled in Helm values) + if [ "$PS_TEL_FAILED" -eq 0 ]; then + HELM_VALUES_FILE="{{ k8s_client_mount_path }}/karavi-observability/csm_metrics_values.yaml" + KARAVI_AUTH_ENABLED="false" + if [ -f "$HELM_VALUES_FILE" ]; then + # Parse authorization.enabled from the Helm values YAML + KARAVI_AUTH_ENABLED=$(grep -A5 'authorization:' "$HELM_VALUES_FILE" | grep 'enabled:' | head -1 | awk '{print $2}' | tr -d ' "') + fi + if [ "$KARAVI_AUTH_ENABLED" = "true" ]; then + echo "Karavi Authorization enabled (Mode B) - copying authorization resources..." 
+ kubectl get configmap isilon-config-params -n isilon -o yaml \ + | sed "s/namespace: isilon/namespace: ${CSM_NS}/" \ + | kubectl apply -f - || echo "WARNING: Failed to copy isilon-config-params." + + kubectl get secret proxy-server-root-certificate proxy-authz-tokens -n isilon -o yaml \ + | sed "s/namespace: isilon/namespace: ${CSM_NS}/" \ + | sed 's/name: proxy-server-root-certificate/name: isilon-proxy-server-root-certificate/' \ + | sed 's/name: proxy-authz-tokens/name: isilon-proxy-authz-tokens/' \ + | kubectl apply -f - || echo "WARNING: Failed to copy proxy secrets." + else + echo "Direct Authentication (Mode A) - skipping Karavi authorization resources." + fi + fi + + # Step 5: Install karavi-observability Helm chart directly from NFS share + if [ "$PS_TEL_FAILED" -eq 0 ]; then + HELM_CHART_PATH="{{ k8s_client_mount_path }}/karavi-observability/helm-charts/charts/karavi-observability" + HELM_VALUES_FILE="{{ k8s_client_mount_path }}/karavi-observability/csm_metrics_values.yaml" + + # Verify cert-manager subchart is available (directory or archive) + if [ -d "${HELM_CHART_PATH}/charts/cert-manager" ]; then + echo "cert-manager subchart found as directory in charts/." + elif ls "${HELM_CHART_PATH}/charts/cert-manager"*.tgz 1>/dev/null 2>&1 || \ + ls "${HELM_CHART_PATH}/charts/cert-manager"*.tar.gz 1>/dev/null 2>&1; then + echo "cert-manager subchart found as archive in charts/." + else + echo "WARNING: cert-manager subchart not found in ${HELM_CHART_PATH}/charts/." + fi + + if [ -d "$HELM_CHART_PATH" ] && [ -f "$HELM_VALUES_FILE" ]; then + echo "Installing karavi-observability Helm chart from NFS share..." + if helm list -n "${CSM_NS}" --filter karavi-observability -q 2>/dev/null | grep -q karavi-observability; then + echo "Upgrading existing karavi-observability release..." + helm upgrade karavi-observability "$HELM_CHART_PATH" \ + -n "${CSM_NS}" \ + -f "$HELM_VALUES_FILE" \ + --wait --timeout 10m || { + echo "ERROR: Helm upgrade failed." 
+ PS_TEL_FAILED=1 + } + else + echo "Fresh install of karavi-observability..." + helm install karavi-observability "$HELM_CHART_PATH" \ + -n "${CSM_NS}" \ + -f "$HELM_VALUES_FILE" \ + --wait --timeout 10m || { + echo "ERROR: Helm install failed." + PS_TEL_FAILED=1 + } + fi + else + echo "ERROR: Helm chart or values file not found on NFS share." + echo " Chart path: $HELM_CHART_PATH" + echo " Values file: $HELM_VALUES_FILE" + PS_TEL_FAILED=1 + fi + fi + + if [ "$PS_TEL_FAILED" -eq 0 ]; then + echo "===== PowerScale Telemetry (CSM Observability) deployed successfully =====" + + # Step 6: Patch OTEL Collector service to expose Prometheus metrics port + echo "Patching OTEL Collector service to expose port 8889 for Prometheus metrics..." + kubectl patch svc otel-collector -n "${CSM_NS}" --patch '{"spec":{"ports":[{"name":"prometheus","port":8889,"targetPort":8889,"protocol":"TCP"}]}}' || { + echo "WARNING: Failed to patch OTEL Collector service for Prometheus metrics." + } + + # Step 7: Create PVC for OTEL Collector persistent buffering + OTEL_PVC_SIZE="{{ hostvars['localhost']['telemetry_config']['powerscale_configurations']['otel_collector_storage_size'] | default('5Gi') }}" + echo "Creating OTEL Collector PVC (${OTEL_PVC_SIZE}) for metric buffering..." 
+ OTEL_PVC_FILE=$(mktemp /tmp/otel-pvc-XXXXXX.yaml) + echo "apiVersion: v1" > "$OTEL_PVC_FILE" + echo "kind: PersistentVolumeClaim" >> "$OTEL_PVC_FILE" + echo "metadata:" >> "$OTEL_PVC_FILE" + echo " name: otel-collector-data" >> "$OTEL_PVC_FILE" + echo " namespace: ${CSM_NS}" >> "$OTEL_PVC_FILE" + echo " labels:" >> "$OTEL_PVC_FILE" + echo " app.kubernetes.io/name: otel-collector" >> "$OTEL_PVC_FILE" + echo " app.kubernetes.io/instance: karavi-observability" >> "$OTEL_PVC_FILE" + echo "spec:" >> "$OTEL_PVC_FILE" + echo " accessModes:" >> "$OTEL_PVC_FILE" + echo " - ReadWriteOnce" >> "$OTEL_PVC_FILE" + echo " resources:" >> "$OTEL_PVC_FILE" + echo " requests:" >> "$OTEL_PVC_FILE" + echo " storage: ${OTEL_PVC_SIZE}" >> "$OTEL_PVC_FILE" + kubectl apply -f "$OTEL_PVC_FILE" + rm -f "$OTEL_PVC_FILE" + + echo "Waiting for OTEL Collector PVC to be bound..." + kubectl wait --for=jsonpath='{.status.phase}'=Bound pvc/otel-collector-data -n "${CSM_NS}" --timeout=120s || { + echo "WARNING: OTEL Collector PVC not bound yet. Continuing..." + } + + # Step 8: Patch OTEL Collector deployment to mount the PVC + echo "Patching OTEL Collector deployment with persistent volume..." + kubectl patch deployment otel-collector -n "${CSM_NS}" --type='json' -p='[{"op":"add","path":"/spec/template/spec/volumes/-","value":{"name":"otel-collector-data","persistentVolumeClaim":{"claimName":"otel-collector-data"}}},{"op":"add","path":"/spec/template/spec/containers/0/volumeMounts/-","value":{"name":"otel-collector-data","mountPath":"/data"}}]' || { + echo "WARNING: Failed to patch OTEL Collector with PVC. Metrics will use in-memory only." + } + + echo "Waiting for OTEL Collector rollout..." + kubectl rollout status deployment/otel-collector -n "${CSM_NS}" --timeout=120s || { + echo "WARNING: OTEL Collector rollout not complete yet." 
+ } + else + echo "===== PowerScale Telemetry deployment had errors (see above) =====" + fi diff --git a/provision/roles/configure_ochami/templates/powerscale/verify_powerscale_telemetry.sh.j2 b/provision/roles/configure_ochami/templates/powerscale/verify_powerscale_telemetry.sh.j2 new file mode 100644 index 0000000000..e6f3412462 --- /dev/null +++ b/provision/roles/configure_ochami/templates/powerscale/verify_powerscale_telemetry.sh.j2 @@ -0,0 +1,182 @@ +#!/bin/bash +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# PowerScale Telemetry Verification Script +# Generated by Omnia provision playbook +# Validates Helm-based CSM Observability (karavi-observability) deployment + +NAMESPACE="{{ telemetry_namespace }}" +HELM_RELEASE="karavi-observability" +PASS=0 +FAIL=0 +WARN=0 + +echo "==============================================" +echo " PowerScale Telemetry Verification" +echo "==============================================" +echo "" + +# 1. Check Kubernetes connectivity +echo "[1/9] Checking Kubernetes connectivity..." +if kubectl cluster-info &>/dev/null; then + echo " PASS: Kubernetes cluster is reachable" + ((PASS++)) +else + echo " FAIL: Cannot connect to Kubernetes cluster" + ((FAIL++)) +fi + +# 2. Check namespace +echo "[2/9] Checking telemetry namespace..." 
+if kubectl get namespace "$NAMESPACE" &>/dev/null; then + echo " PASS: Namespace '$NAMESPACE' exists" + ((PASS++)) +else + echo " FAIL: Namespace '$NAMESPACE' does not exist" + ((FAIL++)) +fi + +# 3. Check Helm release +echo "[3/9] Checking Helm release..." +HELM_STATUS=$(helm status "$HELM_RELEASE" -n "$NAMESPACE" -o json 2>/dev/null | grep -o '"status":"[^"]*"' | head -1) +if echo "$HELM_STATUS" | grep -q "deployed"; then + echo " PASS: Helm release '$HELM_RELEASE' is deployed" + ((PASS++)) +else + echo " FAIL: Helm release '$HELM_RELEASE' not found or not deployed" + ((FAIL++)) +fi + +# 4. Check PowerScale CSI secret (copied from isilon namespace) +echo "[4/9] Checking PowerScale credentials secret..." +if kubectl get secret isilon-creds -n "$NAMESPACE" &>/dev/null; then + echo " PASS: Secret 'isilon-creds' exists in '$NAMESPACE'" + ((PASS++)) +else + echo " FAIL: Secret 'isilon-creds' not found in namespace '$NAMESPACE'" + ((FAIL++)) +fi + +# 5. Check CSM Metrics PowerScale deployment (Helm-managed) +echo "[5/9] Checking CSM Metrics PowerScale deployment..." +CSM_DEPLOY=$(kubectl get deployment -n "$NAMESPACE" -l app.kubernetes.io/name=karavi-metrics-powerscale -o name 2>/dev/null) +if [ -n "$CSM_DEPLOY" ]; then + for dep in $CSM_DEPLOY; do + DEP_NAME=$(echo "$dep" | sed 's|deployment.apps/||') + READY=$(kubectl get "$dep" -n "$NAMESPACE" -o jsonpath='{.status.readyReplicas}' 2>/dev/null) + EXPECTED=$(kubectl get "$dep" -n "$NAMESPACE" -o jsonpath='{.spec.replicas}' 2>/dev/null) + if [ "${READY:-0}" == "$EXPECTED" ]; then + echo " PASS: $DEP_NAME is ready ($READY/$EXPECTED)" + ((PASS++)) + else + echo " FAIL: $DEP_NAME is not ready (${READY:-0}/$EXPECTED)" + ((FAIL++)) + fi + done +else + echo " FAIL: No CSM Metrics PowerScale deployment found" + ((FAIL++)) +fi + +# 6. Check OTEL Collector deployment (Helm-managed) +echo "[6/9] Checking OTEL Collector deployment..." 
+OTEL_DEPLOY=$(kubectl get deployment -n "$NAMESPACE" -l app.kubernetes.io/name=otel-collector -o name 2>/dev/null) +if [ -n "$OTEL_DEPLOY" ]; then + for dep in $OTEL_DEPLOY; do + DEP_NAME=$(echo "$dep" | sed 's|deployment.apps/||') + READY=$(kubectl get "$dep" -n "$NAMESPACE" -o jsonpath='{.status.readyReplicas}' 2>/dev/null) + EXPECTED=$(kubectl get "$dep" -n "$NAMESPACE" -o jsonpath='{.spec.replicas}' 2>/dev/null) + if [ "${READY:-0}" == "$EXPECTED" ]; then + echo " PASS: $DEP_NAME is ready ($READY/$EXPECTED)" + ((PASS++)) + else + echo " FAIL: $DEP_NAME is not ready (${READY:-0}/$EXPECTED)" + ((FAIL++)) + fi + done +else + echo " FAIL: No OTEL Collector deployment found" + ((FAIL++)) +fi + +# 7. Check cert-manager pods (Helm sub-chart): PASS only when every pod is Running with all containers ready +echo "[7/9] Checking cert-manager pods..." +CM_PODS=$(kubectl get pods -n "$NAMESPACE" -l app=cert-manager --no-headers 2>/dev/null | wc -l) +CM_READY=$(kubectl get pods -n "$NAMESPACE" -l app=cert-manager --no-headers 2>/dev/null | awk '{ split($2, r, "/") } $3=="Running" && r[1]==r[2]' | wc -l) +if [ "$CM_PODS" -ge 1 ] && [ "$CM_READY" -eq "$CM_PODS" ]; then + echo " PASS: cert-manager pods running ($CM_READY/$CM_PODS)" + ((PASS++)) +else + echo " WARN: cert-manager pods missing or not ready ($CM_READY/$CM_PODS; may be disabled in values)" + ((WARN++)) +fi + +# 8. Check OTEL Collector PVC +echo "[8/9] Checking OTEL Collector PVC..." 
+PVC_STATUS=$(kubectl get pvc otel-collector-data -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null) +if [ "$PVC_STATUS" == "Bound" ]; then + PVC_SC=$(kubectl get pvc otel-collector-data -n "$NAMESPACE" -o jsonpath='{.spec.storageClassName}' 2>/dev/null) + PVC_SIZE=$(kubectl get pvc otel-collector-data -n "$NAMESPACE" -o jsonpath='{.spec.resources.requests.storage}' 2>/dev/null) + echo " PASS: PVC 'otel-collector-data' is Bound (${PVC_SIZE}, StorageClass: ${PVC_SC:-default})" + ((PASS++)) +elif [ -n "$PVC_STATUS" ]; then + echo " WARN: PVC 'otel-collector-data' status: $PVC_STATUS (not yet Bound)" + ((WARN++)) +else + echo " WARN: PVC 'otel-collector-data' not found (OTEL Collector using in-memory only)" + ((WARN++)) +fi + +# 9. Check metrics flow +echo "[9/9] Checking metrics flow..." +METRICS_LOG=$(kubectl logs -n "$NAMESPACE" -l app.kubernetes.io/name=karavi-metrics-powerscale --tail=5 2>/dev/null | grep -c "function duration") +OTEL_LOG=$(kubectl logs -n "$NAMESPACE" -l app.kubernetes.io/name=otel-collector --all-containers --tail=5 2>/dev/null | grep -c "Metrics") +if [ "$METRICS_LOG" -gt 0 ] && [ "$OTEL_LOG" -gt 0 ]; then + echo " PASS: Metrics flowing (CSM Metrics -> OTEL Collector)" + ((PASS++)) +elif [ "$METRICS_LOG" -gt 0 ]; then + echo " WARN: CSM Metrics collecting but OTEL Collector not logging metrics" + ((WARN++)) +else + echo " FAIL: No metrics activity detected" + ((FAIL++)) +fi + +# Summary +echo "" +echo "==============================================" +echo " Verification Summary" +echo "==============================================" +echo " PASSED: $PASS" +echo " FAILED: $FAIL" +echo " WARNINGS: $WARN" +echo "" + +if [ $FAIL -eq 0 ]; then + echo " STATUS: ALL CHECKS PASSED" +else + echo " STATUS: SOME CHECKS FAILED" + echo "" + echo " Debug Commands:" + echo " kubectl get pods -n $NAMESPACE -l app.kubernetes.io/instance=$HELM_RELEASE" + echo " kubectl logs -n $NAMESPACE -l app.kubernetes.io/name=karavi-metrics-powerscale --tail=50" + 
echo " kubectl logs -n $NAMESPACE -l app.kubernetes.io/name=otel-collector --all-containers --tail=50" + echo " helm status $HELM_RELEASE -n $NAMESPACE" + echo " kubectl get events -n $NAMESPACE --sort-by='.lastTimestamp' | tail -20" +fi + +echo "" +echo "==============================================" +exit $FAIL diff --git a/provision/roles/configure_ochami/templates/telemetry/telemetry.sh.j2 b/provision/roles/configure_ochami/templates/telemetry/telemetry.sh.j2 index fd3ccbacfb..352671ad4c 100644 --- a/provision/roles/configure_ochami/templates/telemetry/telemetry.sh.j2 +++ b/provision/roles/configure_ochami/templates/telemetry/telemetry.sh.j2 @@ -2,9 +2,24 @@ kubectl apply -f {{ k8s_client_mount_path }}/telemetry/deployments/telemetry_nam {% if kafka_support %} helm -n telemetry install strimzi-cluster-operator {{ k8s_client_mount_path }}/telemetry/{{ strimzi_kafka_pkg }}.tar.gz {% endif %} +{% if 'victoria' in hostvars['localhost']['telemetry_collection_type'].split(',') %} +helm -n telemetry install victoria-metrics-operator {{ k8s_client_mount_path }}/telemetry/{{ victoria_operator_pkg }}.tar.gz +echo "Waiting for victoria-metrics-operator to be ready..." +kubectl wait --for=condition=available --timeout=300s deployment -l app.kubernetes.io/name=victoria-metrics-operator -n telemetry || true +echo "Waiting for VictoriaLogs CRDs to be registered..." +for i in {1..30}; do + if kubectl get crd vlclusters.operator.victoriametrics.com >/dev/null 2>&1 && \ + kubectl get crd vlagents.operator.victoriametrics.com >/dev/null 2>&1; then + echo "VictoriaLogs CRDs are ready" + break + fi + echo "Waiting for VictoriaLogs CRDs... (attempt $i/30)" + sleep 2 +done +{% endif %} kubectl apply -k {{ k8s_client_mount_path }}/telemetry/deployments/. 
{% if hostvars['localhost']['ldms_support'] %} kubectl create secret generic nersc-ldms-ovis-auth --from-file=ldmsauth.conf={{ k8s_client_mount_path }}/telemetry/ldms/ldmsauth.conf --dry-run=client -o yaml | kubectl apply -f - -n telemetry kubectl create secret generic nersc-munge-key --from-file=munge.key={{ k8s_client_mount_path }}/telemetry/ldms/munge.key --dry-run=client -o yaml | kubectl apply -f - -n telemetry cd {{ k8s_client_mount_path }}/telemetry/ldms/nersc-ldms-aggr && helm install -n telemetry nersc-ldms-aggr nersc-ldms-aggr --values values.yaml -{% endif %} \ No newline at end of file +{% endif %} diff --git a/provision/roles/telemetry/tasks/check_kube_vip_reachability.yml b/provision/roles/telemetry/tasks/check_kube_vip_reachability.yml new file mode 100644 index 0000000000..015150abc6 --- /dev/null +++ b/provision/roles/telemetry/tasks/check_kube_vip_reachability.yml @@ -0,0 +1,52 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Check kube_vip reachability + when: + - kube_vip is defined + - kube_vip | length > 0 + tags: telemetry_deployment + block: + - name: Set kube_vip reachability fact to false initially + ansible.builtin.set_fact: + kube_vip_reachable: false + + - name: Test SSH connectivity to kube_vip + ansible.builtin.wait_for: + host: "{{ kube_vip }}" + port: 22 + timeout: 5 + state: started + register: kube_vip_ssh_check + ignore_errors: true + changed_when: false + + - name: Set kube_vip reachable fact if SSH successful + ansible.builtin.set_fact: + kube_vip_reachable: true + when: + - kube_vip_ssh_check is defined + - kube_vip_ssh_check.state is defined + - kube_vip_ssh_check.state == 'started' + + - name: Log kube_vip reachability status + ansible.builtin.debug: + msg: | + kube_vip Reachability Check: + - Host: {{ kube_vip }} + - Port: 22 + - Reachable: {{ kube_vip_reachable }} + - Status: {% if kube_vip_reachable %}✅ REACHABLE{% else %}❌ NOT REACHABLE{% endif %} + tags: telemetry_deployment diff --git a/provision/roles/telemetry/tasks/deploy_powerscale_metrics.yml b/provision/roles/telemetry/tasks/deploy_powerscale_metrics.yml new file mode 100644 index 0000000000..463152977f --- /dev/null +++ b/provision/roles/telemetry/tasks/deploy_powerscale_metrics.yml @@ -0,0 +1,124 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Validate CSI driver PowerScale is configured + ansible.builtin.assert: + that: + - hostvars['localhost']['csi_driver_powerscale_support'] | default(false) | bool + fail_msg: "{{ ps_csi_driver_not_configured_msg }}" + +- name: Set PowerScale configuration facts + ansible.builtin.set_fact: + ps_csi_secret_path: "{{ hostvars['localhost']['service_cluster_info'].csi_powerscale_driver_secret_file_path | default('') }}" + powerscale_configurations: "{{ telemetry_config.powerscale_configurations }}" + ps_helm_values_file: "{{ telemetry_config.powerscale_configurations.csm_observability_values_file_path }}" + csm_observability_namespace: "{{ csm_namespace }}" + +- name: Validate user-provided Helm values file path + ansible.builtin.assert: + that: + - ps_helm_values_file | length > 0 + fail_msg: "{{ ps_helm_values_path_missing_msg }}" + +- name: Verify user-provided Helm values file exists + ansible.builtin.stat: + path: "{{ ps_helm_values_file }}" + register: helm_values_stat + delegate_to: localhost + +- name: Fail if user-provided Helm values file does not exist + ansible.builtin.fail: + msg: "{{ ps_helm_values_file_not_found_msg }}" + when: not helm_values_stat.stat.exists + +# --- Read user values file to detect auth mode for secret copying --- + +- name: Load user-provided Helm values to detect auth mode + block: + - name: Read user Helm values file + ansible.builtin.include_vars: + file: "{{ ps_helm_values_file }}" + name: user_helm_values + no_log: true + rescue: + - name: Failed to read user Helm values file + ansible.builtin.fail: + msg: "{{ ps_helm_values_parse_fail_msg }}" + +- name: Detect authentication mode from CSM Observability values + ansible.builtin.set_fact: + karavi_enabled: >- + {{ user_helm_values.karaviMetricsPowerscale.authorization.enabled | default(false) | bool }} + +- name: Display authentication mode + ansible.builtin.debug: + msg: "{% if karavi_enabled | bool %}{{ ps_auth_mode_karavi_msg }}{% else %}{{ ps_auth_mode_direct_msg 
}}{% endif %}" + verbosity: 2 + +# --- Validate cert-manager is enabled (required for TLS) --- + +- name: Validate cert-manager is enabled in Helm values + ansible.builtin.assert: + that: + - user_helm_values['cert-manager']['enabled'] | default(false) | bool + fail_msg: "{{ ps_cert_manager_disabled_msg }}" + +# --- Load CSI PowerScale credentials from vault --- + +- name: Load CSI PowerScale credentials from vault + block: + - name: Read CSI driver secret.yaml + ansible.builtin.include_vars: + file: "{{ ps_csi_secret_path }}" + name: csi_powerscale_secret + no_log: true + + - name: Extract isilon clusters from secret + ansible.builtin.set_fact: + ps_clusters: "{{ csi_powerscale_secret.isilonClusters | default([]) }}" + no_log: true + rescue: + - name: Failed to load CSI PowerScale credentials + ansible.builtin.fail: + msg: "{{ ps_csi_secret_read_fail_msg }}" + +- name: Fail if no PowerScale clusters defined + ansible.builtin.fail: + msg: "{{ ps_no_clusters_found_msg }}" + when: ps_clusters | length == 0 + +- name: Display PowerScale clusters found + ansible.builtin.debug: + msg: "{{ ps_clusters_found_msg }}" + verbosity: 2 + +# --- Air-gapped: Download and extract karavi-observability dependencies from Pulp to NFS --- + +- name: Get PowerScale telemetry offline dependencies + ansible.builtin.include_tasks: get_powerscale_telemetry_dependencies.yml + +# --- Copy user-provided values file to NFS share --- +# Actual helm install happens in cloud-init during PXE boot. 
+ +- name: Copy user-provided Helm values file to NFS share + ansible.builtin.copy: + src: "{{ ps_helm_values_file }}" + dest: "{{ k8s_client_mount_path }}/karavi-observability/csm_metrics_values.yaml" + mode: '0600' + +- name: Display PowerScale telemetry preparation status + ansible.builtin.debug: + msg: "{{ ps_telemetry_prepared_msg }}" + verbosity: 2 diff --git a/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml b/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml index 72ce7f8707..d1247287dd 100644 --- a/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml +++ b/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml @@ -33,12 +33,22 @@ dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/{{ item.dest }}" mode: "{{ hostvars['localhost']['file_permissions_644'] }}" loop: "{{ victoria_templates }}" - when: "'victoria' in hostvars['localhost']['idrac_telemetry_collection_type'].split(',')" + when: "'victoria' in hostvars['localhost']['telemetry_collection_type'].split(',')" tags: telemetry_deployment - # NOTE: victoria_templates is automatically set based on deployment_mode in telemetry_config.yml + # NOTE: victoria_templates is automatically set based on victoria_deployment_mode in vars/main.yml # - cluster mode: includes vmstorage, vminsert, vmselect templates # - single-node mode: includes victoria-statefulset template +- name: Populate VictoriaLogs deployment configs + ansible.builtin.template: + src: "{{ item.src }}" + dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/{{ item.dest }}" + mode: "{{ hostvars['localhost']['file_permissions_644'] }}" + loop: "{{ victorialogs_templates }}" + when: "'victoria' in hostvars['localhost']['telemetry_collection_type'].split(',')" + tags: telemetry_deployment + # NOTE: victorialogs_templates includes VLCluster CR, VLAgent CR, and VLAgent ConfigMap + - name: Kafka configurations when: kafka_support block: 
@@ -72,7 +82,7 @@ }] }} when: - hostvars['localhost']['idrac_telemetry_support'] - - "'kafka' in hostvars['localhost']['idrac_telemetry_collection_type'].split(',')" + - "'kafka' in hostvars['localhost']['telemetry_collection_type'].split(',')" - "kafka.topics.idrac.name in kafka_topic_partitions" - name: Add ldms topic if enabled @@ -115,6 +125,19 @@ dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/{{ strimzi_kafka_pkg }}.tar.gz" mode: "{{ hostvars['localhost']['file_permissions_644'] }}" +- name: Victoria Metrics operator configuration + when: "'victoria' in hostvars['localhost']['telemetry_collection_type'].split(',')" + block: + - name: Extract and set facts for tarball URLs for victoria metrics operator + ansible.builtin.set_fact: + victoria_operator_pkg: "{{ k8s_packages_json['service_k8s']['cluster'] | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'search', 'victoria-metrics-operator') | map(attribute='package') | join }}" # noqa: yaml[line-length] + + - name: Download victoria metrics operator tarball + ansible.builtin.get_url: + url: "{{ victoria_operator_tarball_url }}" + dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/{{ victoria_operator_pkg }}.tar.gz" + mode: "{{ hostvars['localhost']['file_permissions_644'] }}" + - name: Populate common telemetry deployment configs ansible.builtin.template: src: "{{ item.src }}" diff --git a/provision/roles/telemetry/tasks/get_powerscale_telemetry_dependencies.yml b/provision/roles/telemetry/tasks/get_powerscale_telemetry_dependencies.yml new file mode 100644 index 0000000000..b5f1bdd008 --- /dev/null +++ b/provision/roles/telemetry/tasks/get_powerscale_telemetry_dependencies.yml @@ -0,0 +1,162 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# get_powerscale_telemetry_dependencies.yml +# Downloads and extracts karavi-observability git repository, Helm chart, +# and cert-manager Helm chart dependency from Pulp local mirror for +# air-gapped deployment. +# All files are downloaded to the NFS share at {{ k8s_client_mount_path }}/karavi-observability/ +# +# cert-manager CRDs are included inside the karavi-observability repo +# (installer/cert-manager.crds.yaml) -- no separate download needed. +# +# cert-manager Helm chart (from Jetstack) is pre-downloaded to Pulp via +# service_k8s.json tarball entry. The package name and version are read +# dynamically from service_k8s.json -- not hardcoded. 
+ +- name: Extract cert-manager package name from service_k8s.json + ansible.builtin.set_fact: + cert_manager_package: >- + {{ telemetry_packages['service_k8s']['cluster'] + | selectattr('type', 'equalto', 'tarball') + | selectattr('package', 'search', 'cert-manager') + | map(attribute='package') + | first }} + +- name: Set cert-manager chart tarball filename (Pulp stores tarballs as .tar.gz) + ansible.builtin.set_fact: + cert_manager_chart_tgz: "{{ cert_manager_package }}.tar.gz" + +- name: Display cert-manager package read from service_k8s.json + ansible.builtin.debug: + msg: "{{ ps_cert_manager_pkg_msg }}" + verbosity: 2 + +- name: Get karavi-observability offline dependencies from Pulp to NFS share + block: + - name: Create karavi-observability directory on NFS share + ansible.builtin.file: + path: "{{ k8s_client_mount_path }}/karavi-observability" + state: directory + mode: '0755' + + - name: Get karavi-observability git tar from Pulp + ansible.builtin.get_url: + url: "{{ offline_git_path }}/karavi-observability/{{ karavi_observability_git }}" + dest: "{{ k8s_client_mount_path }}/karavi-observability/{{ karavi_observability_git }}" + mode: "{{ permission_644 }}" + + - name: Extract karavi-observability tar file on NFS share + ansible.builtin.unarchive: + src: "{{ k8s_client_mount_path }}/karavi-observability/{{ karavi_observability_git }}" + dest: "{{ k8s_client_mount_path }}/karavi-observability/" + remote_src: true + + - name: Get dell/helm-charts git tar from Pulp + ansible.builtin.get_url: + url: "{{ offline_git_path }}/helm-charts/{{ karavi_helm_charts_git }}" + dest: "{{ k8s_client_mount_path }}/karavi-observability/{{ karavi_helm_charts_git }}" + mode: "{{ permission_644 }}" + + - name: Extract dell/helm-charts tar file on NFS share + ansible.builtin.unarchive: + src: "{{ k8s_client_mount_path }}/karavi-observability/{{ karavi_helm_charts_git }}" + dest: "{{ k8s_client_mount_path }}/karavi-observability/" + remote_src: true + + - name: Set 
karavi-observability Helm chart path on NFS share + ansible.builtin.set_fact: + karavi_helm_chart_path: "{{ k8s_client_mount_path }}/karavi-observability/helm-charts/charts/karavi-observability" + + - name: Create charts/ directory for Helm dependencies + ansible.builtin.file: + path: "{{ karavi_helm_chart_path }}/charts" + state: directory + mode: '0755' + + - name: Check if cert-manager is disabled in values file + ansible.builtin.set_fact: + cert_manager_disabled: >- + {%- if csm_observability_values_file_path | default('') != '' -%} + {%- set values_content = lookup('file', csm_observability_values_file_path, errors='ignore') -%} + {%- if values_content is not none and 'cert-manager:' in values_content and 'enabled: false' in values_content -%} + true{%- else -%} + false{%- endif -%} + {%- else -%} + false{%- endif -%} + + - name: Display cert-manager dependency status + ansible.builtin.debug: + msg: "{% if cert_manager_disabled %}{{ ps_cert_manager_skipped_msg }}{% else %}{{ ps_cert_manager_required_msg }}{% endif %}" + verbosity: 2 + + - name: Download cert-manager Helm chart from Pulp (Jetstack repo pre-staged offline) + ansible.builtin.get_url: + url: "{{ offline_tarball_path }}/{{ cert_manager_package }}/{{ cert_manager_chart_tgz }}" + dest: "{{ karavi_helm_chart_path }}/charts/{{ cert_manager_chart_tgz }}" + mode: "{{ permission_644 }}" + when: not cert_manager_disabled + + - name: Verify cert-manager chart archive was downloaded + ansible.builtin.stat: + path: "{{ karavi_helm_chart_path }}/charts/{{ cert_manager_chart_tgz }}" + register: cert_manager_chart_stat + when: not cert_manager_disabled + + - name: Fail if cert-manager chart archive is missing + ansible.builtin.fail: + msg: "{{ ps_cert_manager_missing_msg }}" + when: not cert_manager_disabled and not cert_manager_chart_stat.stat.exists + + - name: Extract cert-manager chart as directory for Helm compatibility + ansible.builtin.unarchive: + src: "{{ karavi_helm_chart_path }}/charts/{{ 
cert_manager_chart_tgz }}" + dest: "{{ karavi_helm_chart_path }}/charts/" + remote_src: true + when: not cert_manager_disabled and cert_manager_chart_stat.stat.exists + + - name: Remove cert-manager archive after extraction + ansible.builtin.file: + path: "{{ karavi_helm_chart_path }}/charts/{{ cert_manager_chart_tgz }}" + state: absent + when: not cert_manager_disabled and cert_manager_chart_stat.stat.exists + + - name: Verify cert-manager chart directory was extracted + ansible.builtin.stat: + path: "{{ karavi_helm_chart_path }}/charts/cert-manager/Chart.yaml" + register: cert_manager_dir_stat + when: not cert_manager_disabled + + - name: Fail if cert-manager chart directory extraction failed + ansible.builtin.fail: + msg: "{{ ps_cert_manager_extract_fail_msg }}" + when: not cert_manager_disabled and not cert_manager_dir_stat.stat.exists + + - name: Display cert-manager dependency staged successfully + ansible.builtin.debug: + msg: "{{ ps_cert_manager_staged_msg }}" + verbosity: 2 + when: not cert_manager_disabled and cert_manager_dir_stat.stat.exists + + - name: Display cert-manager dependency skipped + ansible.builtin.debug: + msg: "{{ ps_cert_manager_skipped_detail_msg }}" + verbosity: 2 + when: cert_manager_disabled + + rescue: + - name: Handle dependency download failure + ansible.builtin.fail: + msg: "{{ ps_dependency_fail_msg }}" diff --git a/provision/roles/telemetry/tasks/main.yml b/provision/roles/telemetry/tasks/main.yml index 2e9c3ac0da..1c5bcb7dc8 100644 --- a/provision/roles/telemetry/tasks/main.yml +++ b/provision/roles/telemetry/tasks/main.yml @@ -24,9 +24,19 @@ - name: Load service images from service_k8s.json ansible.builtin.include_tasks: load_service_images.yml +- name: Check kube_vip reachability for validation + ansible.builtin.include_tasks: check_kube_vip_reachability.yml + when: + - "'victoria' in hostvars['localhost']['telemetry_collection_type'].split(',')" + - kube_vip is defined + - kube_vip | length > 0 + - name: Configure of k8s 
telemetry service when: - - hostvars['localhost']['idrac_telemetry_support'] or hostvars['localhost']['ldms_support'] + - >- + hostvars['localhost']['idrac_telemetry_support'] or + hostvars['localhost']['ldms_support'] or + hostvars['localhost']['powerscale_configurations']['powerscale_telemetry_support'] | default(false) | bool block: - name: Set NFS info fact ansible.builtin.set_fact: @@ -35,6 +45,11 @@ - name: Service cluster prerequisite ansible.builtin.include_tasks: telemetry_prereq.yml + - name: Deploy PowerScale telemetry metrics + ansible.builtin.include_tasks: deploy_powerscale_metrics.yml + when: + - hostvars['localhost']['powerscale_configurations']['powerscale_telemetry_support'] | default(false) | bool + - name: Generate telemetry deployments ansible.builtin.include_tasks: generate_telemetry_deployments.yml diff --git a/provision/roles/telemetry/tasks/telemetry_prereq.yml b/provision/roles/telemetry/tasks/telemetry_prereq.yml index 7eb45a89ab..c41462c609 100644 --- a/provision/roles/telemetry/tasks/telemetry_prereq.yml +++ b/provision/roles/telemetry/tasks/telemetry_prereq.yml @@ -69,7 +69,7 @@ - name: Set kafka_support to true ansible.builtin.set_fact: kafka_support: true - when: "'kafka' in hostvars['localhost']['idrac_telemetry_collection_type'].split(',') or hostvars['localhost']['ldms_support']" + when: "'kafka' in hostvars['localhost']['telemetry_collection_type'].split(',') or hostvars['localhost']['ldms_support']" - name: Configure TLS certificate and secrets for kafka when: kafka_support @@ -100,7 +100,9 @@ when: not cluster_id_present | default(false) - name: Configure TLS certificate for VictoriaMetrics - when: "'victoria' in hostvars['localhost']['idrac_telemetry_collection_type']" + when: + - "'victoria' in hostvars['localhost']['telemetry_collection_type']" + - victoria_cluster.tls_enabled | default(false) | bool block: - name: Create VictoriaMetrics certificate directory ansible.builtin.file: diff --git 
a/provision/roles/telemetry/templates/telemetry/cleanup_telemetry.sh.j2 b/provision/roles/telemetry/templates/telemetry/cleanup_telemetry.sh.j2 index bdfc894bef..a4b391519f 100644 --- a/provision/roles/telemetry/templates/telemetry/cleanup_telemetry.sh.j2 +++ b/provision/roles/telemetry/templates/telemetry/cleanup_telemetry.sh.j2 @@ -18,12 +18,14 @@ # Telemetry Stack Cleanup Script # Removes Kafka, LDMS, iDRAC telemetry, and monitoring resources from the {{ telemetry_namespace }} namespace # -# Usage: ./cleanup_telemetry.sh [kafka] [ldms] [idrac] [victoria] [all] -# kafka - Delete Kafka cluster, users, and bridge -# ldms - Delete LDMS aggregator and store -# idrac - Delete iDRAC telemetry -# victoria - Delete VictoriaMetrics monitoring -# all - Delete everything (default if no arguments) +# Usage: ./cleanup_telemetry.sh [kafka] [ldms] [idrac] [victoria] [victorialogs] [powerscale] [all] +# kafka - Delete Kafka cluster, users, and bridge +# ldms - Delete LDMS aggregator and store +# idrac - Delete iDRAC telemetry +# victoria - Delete VictoriaMetrics monitoring (vmcluster, vmagent) +# victorialogs - Delete VictoriaLogs only (vlagent, vlcluster) without affecting VictoriaMetrics +# powerscale - Delete PowerScale telemetry (karavi-observability Helm release, CSM Metrics, OTEL Collector) +# all - Delete everything (default if no arguments) # set -e @@ -35,6 +37,8 @@ CLEAN_KAFKA=false CLEAN_LDMS=false CLEAN_IDRAC=false CLEAN_VICTORIA=false +CLEAN_VICTORIALOGS=false +CLEAN_POWERSCALE=false CLEAN_ALL=false if [ $# -eq 0 ]; then @@ -54,24 +58,34 @@ else victoria) CLEAN_VICTORIA=true ;; + victorialogs) + CLEAN_VICTORIALOGS=true + ;; + powerscale) + CLEAN_POWERSCALE=true + ;; all) CLEAN_ALL=true ;; -h|--help) - echo "Usage: $0 [kafka] [ldms] [idrac] [victoria] [all]" + echo "Usage: $0 [kafka] [ldms] [idrac] [victoria] [victorialogs] [powerscale] [all]" echo "" echo "Options:" - echo " kafka - Delete Kafka cluster, users, and bridge" - echo " ldms - Delete LDMS aggregator and 
store" - echo " idrac - Delete iDRAC telemetry" - echo " victoria - Delete VictoriaMetrics monitoring" - echo " all - Delete everything (default if no arguments)" + echo " kafka - Delete Kafka cluster, users, and bridge" + echo " ldms - Delete LDMS aggregator and store" + echo " idrac - Delete iDRAC telemetry" + echo " victoria - Delete VictoriaMetrics monitoring (vmcluster, vmagent)" + echo " victorialogs - Delete VictoriaLogs only (vlagent, vlcluster) without affecting VictoriaMetrics" + echo " powerscale - Delete PowerScale telemetry (karavi-observability Helm release)" + echo " all - Delete everything (default if no arguments)" echo "" echo "Examples:" echo " $0 # Delete everything" echo " $0 all # Delete everything" echo " $0 kafka ldms # Delete only Kafka and LDMS" - echo " $0 idrac victoria # Delete only iDRAC and Victoria" + echo " $0 idrac victoria # Delete only iDRAC and VictoriaMetrics" + echo " $0 victorialogs # Delete only VictoriaLogs (keeps VictoriaMetrics running)" + echo " $0 powerscale # Delete only PowerScale telemetry" exit 0 ;; *) @@ -89,6 +103,8 @@ if [ "$CLEAN_ALL" = true ]; then CLEAN_LDMS=true CLEAN_IDRAC=true CLEAN_VICTORIA=true + CLEAN_VICTORIALOGS=true + CLEAN_POWERSCALE=true fi echo "==========================================" @@ -96,11 +112,13 @@ echo " Telemetry Stack Cleanup" echo "==========================================" echo "" echo "Components to clean:" -echo " Kafka Bridge: $([ "$CLEAN_KAFKA" = true ] && echo "YES" || echo "NO")" -echo " Kafka Cluster: $([ "$CLEAN_KAFKA" = true ] && echo "YES" || echo "NO")" -echo " LDMS: $([ "$CLEAN_LDMS" = true ] && echo "YES" || echo "NO")" -echo " iDRAC Telemetry: $([ "$CLEAN_IDRAC" = true ] && echo "YES" || echo "NO")" -echo " Victoria Metrics:$([ "$CLEAN_VICTORIA" = true ] && echo "YES" || echo "NO")" +echo " Kafka Bridge: $([ "$CLEAN_KAFKA" = true ] && echo "YES" || echo "NO")" +echo " Kafka Cluster: $([ "$CLEAN_KAFKA" = true ] && echo "YES" || echo "NO")" +echo " LDMS: $([ 
"$CLEAN_LDMS" = true ] && echo "YES" || echo "NO")" +echo " iDRAC Telemetry: $([ "$CLEAN_IDRAC" = true ] && echo "YES" || echo "NO")" +echo " VictoriaMetrics: $([ "$CLEAN_VICTORIA" = true ] && echo "YES" || echo "NO")" +echo " VictoriaLogs: $([ "$CLEAN_VICTORIALOGS" = true ] && echo "YES" || echo "NO")" +echo " PowerScale Tel.: $([ "$CLEAN_POWERSCALE" = true ] && echo "YES" || echo "NO")" echo "" read -p "Continue? (y/N): " -n 1 -r echo @@ -220,7 +238,7 @@ if [ "$CLEAN_KAFKA" = true ]; then echo "" fi -if [ "$CLEAN_KAFKA" = true ] || [ "$CLEAN_LDMS" = true ] || [ "$CLEAN_IDRAC" = true ] || [ "$CLEAN_VICTORIA" = true ]; then +if [ "$CLEAN_KAFKA" = true ] || [ "$CLEAN_LDMS" = true ] || [ "$CLEAN_IDRAC" = true ] || [ "$CLEAN_VICTORIA" = true ] || [ "$CLEAN_VICTORIALOGS" = true ] || [ "$CLEAN_POWERSCALE" = true ]; then echo "Step 7: Delete Persistent Volume Claims" echo "----------------------------------------" if [ "$CLEAN_KAFKA" = true ]; then @@ -236,12 +254,26 @@ if [ "$CLEAN_KAFKA" = true ] || [ "$CLEAN_LDMS" = true ] || [ "$CLEAN_IDRAC" = t # Delete single-node PVCs delete_all pvc "app=victoria-metric" delete_resource pvc victoria-metrics-pvc-victoria-metric-0 - # Delete cluster mode PVCs (vmstorage StatefulSet PVCs) - delete_all pvc "app=vmstorage" + # Delete cluster mode PVCs (operator-managed vmstorage StatefulSet PVCs) + delete_all pvc "app.kubernetes.io/instance=victoria-cluster" + for i in {0..9}; do + delete_resource pvc vmstorage-data-vmstorage-victoria-cluster-$i + done + fi + if [ "$CLEAN_VICTORIALOGS" = true ]; then + # Delete VictoriaLogs PVCs (vlstorage StatefulSet PVCs + vlagent PVC) + delete_all pvc "app.kubernetes.io/name=vlstorage" + delete_all pvc "app.kubernetes.io/name=vlagent" for i in {0..9}; do - delete_resource pvc vmstorage-data-vmstorage-$i + delete_resource pvc vlstorage-data-vlstorage-victoria-logs-cluster-$i + delete_resource pvc vlagent-data-vlagent-vlagent-$i done fi + if [ "$CLEAN_POWERSCALE" = true ]; then + # Delete OTEL 
Collector PVC (created by cloud-init post-Helm-install) + delete_resource pvc otel-collector-data + delete_all pvc "app.kubernetes.io/name=otel-collector" + fi sleep 2 echo "" fi @@ -277,7 +309,7 @@ if [ "$CLEAN_IDRAC" = true ]; then echo "" fi -if [ "$CLEAN_KAFKA" = true ] || [ "$CLEAN_LDMS" = true ] || [ "$CLEAN_IDRAC" = true ] || [ "$CLEAN_VICTORIA" = true ]; then +if [ "$CLEAN_KAFKA" = true ] || [ "$CLEAN_LDMS" = true ] || [ "$CLEAN_IDRAC" = true ] || [ "$CLEAN_VICTORIA" = true ] || [ "$CLEAN_VICTORIALOGS" = true ]; then echo "Step 9: Delete ConfigMaps" echo "-------------------------" if [ "$CLEAN_KAFKA" = true ]; then @@ -293,6 +325,9 @@ if [ "$CLEAN_KAFKA" = true ] || [ "$CLEAN_LDMS" = true ] || [ "$CLEAN_IDRAC" = t if [ "$CLEAN_VICTORIA" = true ]; then delete_resource configmap victoria-tls-test-script fi + if [ "$CLEAN_VICTORIALOGS" = true ]; then + delete_resource configmap vlagent-config + fi sleep 2 echo "" fi @@ -320,30 +355,77 @@ if [ "$CLEAN_KAFKA" = true ] || [ "$CLEAN_VICTORIA" = true ]; then fi if [ "$CLEAN_VICTORIA" = true ]; then delete_resource service victoria-metric - delete_resource service vmselect - delete_resource service vminsert - delete_resource service vmstorage - delete_resource service vmagent + # Operator-managed cluster services + delete_resource service vmselect-victoria-cluster + delete_resource service vminsert-victoria-cluster + delete_resource service vmstorage-victoria-cluster + delete_resource service vmagent-victoria-cluster fi sleep 2 echo "" fi +if [ "$CLEAN_VICTORIALOGS" = true ]; then + echo "Step 12: Delete VictoriaLogs Resources" + echo "---------------------------------------" + + # Delete VictoriaLogs operator CRD resources (operator cascades deletion) + echo "Deleting VictoriaLogs operator CRD resources..." 
+ kubectl -n $NAMESPACE delete vlcluster victoria-logs-cluster --ignore-not-found=true 2>/dev/null || true + kubectl -n $NAMESPACE delete vlagent --all --ignore-not-found=true 2>/dev/null || true + sleep 5 + + # Delete VictoriaLogs cluster components (operator-managed) + echo "Deleting VictoriaLogs cluster components..." + delete_resource deployment vlinsert-victoria-logs-cluster + delete_resource deployment vlselect-victoria-logs-cluster + delete_resource statefulset vlstorage-victoria-logs-cluster + delete_resource service vlinsert-victoria-logs-cluster + delete_resource service vlselect-victoria-logs-cluster + delete_resource service vlstorage-victoria-logs-cluster + delete_all pod "app.kubernetes.io/component=vlinsert" + delete_all pod "app.kubernetes.io/component=vlselect" + delete_all pod "app.kubernetes.io/component=vlstorage" + + # Delete VLAgent components + echo "Deleting VLAgent..." + delete_resource statefulset vlagent-vlagent + delete_resource service vlagent-vlagent + delete_all pod "app.kubernetes.io/name=vlagent" + + # Delete VictoriaLogs shared resources + echo "Deleting VictoriaLogs shared resources..." + delete_resource configmap vlagent-config + # Note: victoria-tls-certs secret is shared with VictoriaMetrics - only delete if VictoriaMetrics is also being removed + if [ "$CLEAN_VICTORIA" != true ]; then + echo " Keeping victoria-tls-certs secret (shared with VictoriaMetrics)" + fi + + sleep 2 + echo "" +fi + if [ "$CLEAN_VICTORIA" = true ]; then - echo "Step 12: Delete Monitoring Resources" - echo "-------------------------------------" + echo "Step 13: Delete VictoriaMetrics Resources" + echo "------------------------------------------" - # Delete VictoriaMetrics cluster components (if cluster mode is deployed) + # Delete VictoriaMetrics operator CRD resources (operator cascades deletion) + echo "Deleting VictoriaMetrics operator CRD resources..." 
+ kubectl -n $NAMESPACE delete vmcluster victoria-cluster --ignore-not-found=true 2>/dev/null || true + kubectl -n $NAMESPACE delete vmagent --all --ignore-not-found=true 2>/dev/null || true + kubectl -n $NAMESPACE delete vmpodscrape --all --ignore-not-found=true 2>/dev/null || true + kubectl -n $NAMESPACE delete vmsingle --all --ignore-not-found=true 2>/dev/null || true + sleep 5 + + # Delete any remaining operator-managed cluster components echo "Deleting VictoriaMetrics cluster components..." - delete_resource deployment vmselect - delete_resource deployment vminsert - delete_resource statefulset vmstorage - delete_resource service vmselect - delete_resource service vminsert - delete_resource service vmstorage - delete_all pod "app=vmselect" - delete_all pod "app=vminsert" - delete_all pod "app=vmstorage" + delete_resource deployment vmselect-victoria-cluster + delete_resource deployment vminsert-victoria-cluster + delete_resource statefulset vmstorage-victoria-cluster + delete_resource service vmselect-victoria-cluster + delete_resource service vminsert-victoria-cluster + delete_resource service vmstorage-victoria-cluster + delete_all pod "app.kubernetes.io/instance=victoria-cluster" # Delete VictoriaMetrics single-node components (if single-node mode is deployed) echo "Deleting VictoriaMetrics single-node components..." @@ -357,6 +439,7 @@ if [ "$CLEAN_VICTORIA" = true ]; then delete_resource deployment vmagent delete_resource service vmagent delete_all pod "app=vmagent" + delete_all pod "app.kubernetes.io/name=vmagent" # Delete shared resources echo "Deleting VictoriaMetrics shared resources..." @@ -372,8 +455,56 @@ if [ "$CLEAN_VICTORIA" = true ]; then echo "" fi +if [ "$CLEAN_POWERSCALE" = true ]; then + echo "Step: Delete PowerScale Telemetry" + echo "----------------------------------" + + # Uninstall karavi-observability Helm release (primary deployment method) + echo "Uninstalling karavi-observability Helm release..." 
+ if helm list -n $NAMESPACE --filter karavi-observability -q 2>/dev/null | grep -q karavi-observability; then + helm uninstall karavi-observability -n $NAMESPACE --wait --timeout 5m 2>/dev/null || true + echo "Helm release karavi-observability uninstalled." + sleep 5 + else + echo "No karavi-observability Helm release found." + fi + + # Delete Helm-managed resources by label (karavi-observability Helm chart) + echo "Deleting CSM Metrics PowerScale (Helm-managed)..." + delete_all deployment "app.kubernetes.io/name=karavi-metrics-powerscale" + delete_all service "app.kubernetes.io/name=karavi-metrics-powerscale" + delete_all configmap "app.kubernetes.io/name=karavi-metrics-powerscale" + delete_all pod "app.kubernetes.io/name=karavi-metrics-powerscale" + + echo "Deleting OTEL Collector (Helm-managed)..." + delete_all deployment "app.kubernetes.io/name=otel-collector" + delete_all service "app.kubernetes.io/name=otel-collector" + delete_all configmap "app.kubernetes.io/name=otel-collector" + delete_all pod "app.kubernetes.io/name=otel-collector" + + # Delete cert-manager resources deployed by karavi-observability sub-chart + echo "Deleting cert-manager resources (Helm sub-chart)..." + delete_all deployment "app.kubernetes.io/instance=karavi-observability,app.kubernetes.io/name=cert-manager" + delete_all pod "app.kubernetes.io/instance=karavi-observability,app.kubernetes.io/name=cert-manager" + + # Note: vmagent is shared with iDRAC telemetry - not deleted here. + # PowerScale scrape targets are removed from vmagent config on next deployment. + + # Delete Karavi-specific resources (if deployed) + echo "Deleting Karavi Authorization resources..." + delete_resource configmap karavi-authorization-config + delete_resource secret karavi-authorization-ca-cert + + # Delete PowerScale credentials + echo "Deleting PowerScale credentials..." 
+ delete_resource secret isilon-creds + + sleep 2 + echo "" +fi + echo "" -echo "Step 13: Force Delete Any Remaining Component Pods" +echo "Step 14: Force Delete Any Remaining Component Pods" echo "---------------------------------------------------" # Only force delete pods from components being cleaned if [ "$CLEAN_KAFKA" = true ]; then @@ -387,18 +518,28 @@ fi if [ "$CLEAN_IDRAC" = true ]; then kubectl -n $NAMESPACE delete pod -l app=idrac-telemetry --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true fi +if [ "$CLEAN_VICTORIALOGS" = true ]; then + kubectl -n $NAMESPACE delete pod -l app.kubernetes.io/component=vlinsert --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true + kubectl -n $NAMESPACE delete pod -l app.kubernetes.io/component=vlselect --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true + kubectl -n $NAMESPACE delete pod -l app.kubernetes.io/component=vlstorage --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true + kubectl -n $NAMESPACE delete pod -l app.kubernetes.io/name=vlagent --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true +fi if [ "$CLEAN_VICTORIA" = true ]; then kubectl -n $NAMESPACE delete pod -l app=victoria-metric --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true - kubectl -n $NAMESPACE delete pod -l app=vmselect --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true - kubectl -n $NAMESPACE delete pod -l app=vminsert --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true - kubectl -n $NAMESPACE delete pod -l app=vmstorage --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true + kubectl -n $NAMESPACE delete pod -l app.kubernetes.io/instance=victoria-cluster --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true + kubectl -n $NAMESPACE delete pod -l app.kubernetes.io/name=vmagent --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true kubectl -n $NAMESPACE delete pod -l 
app=vmagent --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true kubectl -n $NAMESPACE delete pod -l app=victoria-tls-test --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true fi +if [ "$CLEAN_POWERSCALE" = true ]; then + kubectl -n $NAMESPACE delete pod -l app.kubernetes.io/name=karavi-metrics-powerscale --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true + kubectl -n $NAMESPACE delete pod -l app.kubernetes.io/name=otel-collector --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true + kubectl -n $NAMESPACE delete pod -l app.kubernetes.io/instance=karavi-observability,app.kubernetes.io/name=cert-manager --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true +fi sleep 5 echo "" -echo "Step 14: Check for Remaining Resources" +echo "Step 15: Check for Remaining Resources" echo "---------------------------------------" if [ "$CLEAN_KAFKA" = true ]; then echo "Remaining Kafka resources:" @@ -415,20 +556,40 @@ if [ "$CLEAN_IDRAC" = true ]; then kubectl -n $NAMESPACE get statefulset,pod,configmap -l app=idrac-telemetry 2>/dev/null || echo " None" echo "" fi +if [ "$CLEAN_VICTORIALOGS" = true ]; then + echo "Remaining VictoriaLogs resources:" + echo " VLCluster CR:" + kubectl -n $NAMESPACE get vlcluster 2>/dev/null || echo " None" + echo " VLAgent CR:" + kubectl -n $NAMESPACE get vlagent 2>/dev/null || echo " None" + echo " Pods:" + kubectl -n $NAMESPACE get pod -l app.kubernetes.io/component=vlinsert 2>/dev/null || echo " None" + kubectl -n $NAMESPACE get pod -l app.kubernetes.io/component=vlselect 2>/dev/null || echo " None" + kubectl -n $NAMESPACE get pod -l app.kubernetes.io/component=vlstorage 2>/dev/null || echo " None" + kubectl -n $NAMESPACE get pod -l app.kubernetes.io/name=vlagent 2>/dev/null || echo " None" + echo "" +fi if [ "$CLEAN_VICTORIA" = true ]; then - echo "Remaining Victoria Metrics resources:" + echo "Remaining VictoriaMetrics resources:" echo " Single-node:" kubectl -n 
$NAMESPACE get statefulset,deployment,pod,configmap -l app=victoria-metric 2>/dev/null || echo " None" - echo " Cluster (vmselect):" - kubectl -n $NAMESPACE get deployment,pod -l app=vmselect 2>/dev/null || echo " None" - echo " Cluster (vminsert):" - kubectl -n $NAMESPACE get deployment,pod -l app=vminsert 2>/dev/null || echo " None" - echo " Cluster (vmstorage):" - kubectl -n $NAMESPACE get statefulset,pod -l app=vmstorage 2>/dev/null || echo " None" + echo " Operator-managed cluster:" + kubectl -n $NAMESPACE get vmcluster,deployment,statefulset,pod -l app.kubernetes.io/instance=victoria-cluster 2>/dev/null || echo " None" echo " vmagent:" + kubectl -n $NAMESPACE get deployment,pod -l app.kubernetes.io/name=vmagent 2>/dev/null || echo " None" kubectl -n $NAMESPACE get deployment,pod -l app=vmagent 2>/dev/null || echo " None" echo "" fi +if [ "$CLEAN_POWERSCALE" = true ]; then + echo "Remaining PowerScale telemetry resources:" + echo " CSM Metrics:" + kubectl -n $NAMESPACE get deployment,pod -l app.kubernetes.io/name=karavi-metrics-powerscale 2>/dev/null || echo " None" + echo " OTEL Collector:" + kubectl -n $NAMESPACE get deployment,pod -l app.kubernetes.io/name=otel-collector 2>/dev/null || echo " None" + echo " Helm release:" + helm list -n $NAMESPACE --filter karavi-observability 2>/dev/null || echo " None" + echo "" +fi echo "Remaining PVCs:" kubectl -n $NAMESPACE get pvc 2>/dev/null || echo " None" echo "" diff --git a/provision/roles/telemetry/templates/telemetry/common/telemetry_secret_creation.yaml.j2 b/provision/roles/telemetry/templates/telemetry/common/telemetry_secret_creation.yaml.j2 index 7b817c7a22..0135593cc0 100644 --- a/provision/roles/telemetry/templates/telemetry/common/telemetry_secret_creation.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/common/telemetry_secret_creation.yaml.j2 @@ -9,7 +9,7 @@ data: mysqldb_password: "{{ hostvars['localhost']['mysqldb_password'] | b64encode }}" mysqldb_root_password: "{{ 
hostvars['localhost']['mysqldb_root_password'] | b64encode }}" -{% set types = hostvars['localhost']['idrac_telemetry_collection_type'].split(',') %} +{% set types = hostvars['localhost']['telemetry_collection_type'].split(',') %} {% if 'kafka' in types %} --- apiVersion: v1 diff --git a/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 b/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 index b300029920..80994fc5da 100644 --- a/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 @@ -47,7 +47,7 @@ spec: app: {{ idrac_telemetry_k8s_name }} spec: volumes: -{% set types = hostvars['localhost']['idrac_telemetry_collection_type'].split(',') %} +{% set types = hostvars['localhost']['telemetry_collection_type'].split(',') %} {% if 'kafka' in types %} # Mount Kafka cluster CA certificate for TLS verification - name: kafka-cluster-ca-cert diff --git a/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafkapump_user.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafkapump_user.yaml.j2 index 01ef142084..70790b75a4 100644 --- a/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafkapump_user.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafkapump_user.yaml.j2 @@ -59,7 +59,7 @@ spec: - Describe host: "*" -{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['idrac_telemetry_collection_type'].split(',') %} +{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['telemetry_collection_type'].split(',') %} # Producer and consumer permissions for idrac topic - resource: type: topic diff --git a/provision/roles/telemetry/templates/telemetry/kafka/kafka.tls_test_job.yaml.j2 
b/provision/roles/telemetry/templates/telemetry/kafka/kafka.tls_test_job.yaml.j2 index 0a80304e89..1b58cd5811 100644 --- a/provision/roles/telemetry/templates/telemetry/kafka/kafka.tls_test_job.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kafka/kafka.tls_test_job.yaml.j2 @@ -32,7 +32,7 @@ data: echo "Bootstrap Server: kafka-kafka-bootstrap:9093" echo "Certificates: kafkapump (for all TLS topics)" echo "Testing topics based on enabled telemetry support:" -{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['idrac_telemetry_collection_type'].split(',') %} +{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['telemetry_collection_type'].split(',') %} echo " - iDRAC telemetry topic ({{ kafka.topics.idrac.name }})" {% endif %} {% if hostvars['localhost']['ldms_support'] %} @@ -80,7 +80,7 @@ data: echo "✓ mTLS connection successful" echo "" -{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['idrac_telemetry_collection_type'].split(',') %} +{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['telemetry_collection_type'].split(',') %} # Test iDRAC telemetry topic consumer echo "Step 5: Testing consumer on {{ kafka.topics.idrac.name }} topic (kafkapump user)..." 
timeout 30 /opt/kafka/bin/kafka-console-consumer.sh \ @@ -114,7 +114,7 @@ data: echo " ✓ kafkapump keystore created" echo " ✓ mTLS connection established" echo " ✓ Topics listed successfully" -{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['idrac_telemetry_collection_type'].split(',') %} +{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['telemetry_collection_type'].split(',') %} echo " ✓ {{ kafka.topics.idrac.name }} topic tested (kafkapump user)" {% endif %} {% if hostvars['localhost']['ldms_support'] %} diff --git a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 index 19c722fb7a..e25d434918 100644 --- a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 @@ -1,29 +1,36 @@ resources: - telemetry_secret_creation.yaml -{% set types = hostvars['localhost']['idrac_telemetry_collection_type'].split(',') %} +{% set types = hostvars['localhost']['telemetry_collection_type'].split(',') %} {% if 'victoria' in types %} - # VictoriaMetrics Common Resources - - victoria-tls-secret.yaml +# VictoriaMetrics Common Resources (RBAC) - victoria-vmagent-rbac.yaml - - vmagent-scrape-config.yaml - - victoria-agent-deployment.yaml - # VictoriaMetrics Deployment (mode: {{ hostvars['localhost']['victoria_configurations']['deployment_mode'] }}) -{% if hostvars['localhost']['victoria_configurations']['deployment_mode'] == 'cluster' %} - # Cluster Mode: High-availability deployment - - victoria-cluster-vmstorage.yaml - - victoria-cluster-vminsert.yaml - - victoria-cluster-vmselect.yaml +{% if victoria_cluster.tls_enabled | default(false) %} + # TLS secret for VictoriaMetrics cluster components + - victoria-tls-secret.yaml +{% endif %} + # VictoriaMetrics Operator-based Deployment (mode: {{ victoria_deployment_mode }}) +{% if 
victoria_deployment_mode == 'cluster' %} + # Cluster Mode: VMCluster CR (operator manages StatefulSets) + - victoria-operator-vmcluster.yaml {% else %} - # Single-Node Mode: Simple deployment - - victoria-statefulset.yaml + # Single-Node Mode: VMSingle CR (operator manages StatefulSet) + - victoria-operator-vmsingle.yaml {% endif %} - # Uncomment to deploy VictoriaMetrics TLS test job - # - test/victoria-tls-test-job.yaml + # VMAgent CR (operator-managed scraper) + - victoria-operator-vmagent.yaml + # VMPodScrape CR (native operator-based pod discovery) + - victoria-operator-vmpodscrape.yaml + # VictoriaLogs Cluster Mode: VLCluster CR (operator manages vlstorage StatefulSet, vlinsert/vlselect Deployments) + # VLAgent CR (operator-managed log collection agent) + # VLAgent ConfigMap (syslog receiver and remoteWrite configuration) + - victorialogs-operator-vlcluster.yaml + - victorialogs-operator-vlagent.yaml + - victorialogs-vlagent-config.yaml {% endif %} {% if kafka_support %} - kafka.kafka.yaml - kafka.kafkapump_user.yaml -{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['idrac_telemetry_collection_type'].split(',') %} +{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['telemetry_collection_type'].split(',') %} - kafka.topic_idrac.yaml {% endif %} {% if hostvars['localhost']['ldms_support'] %} @@ -38,4 +45,4 @@ resources: - idrac_telemetry_statefulset.yaml - telemetry_cleaner_rbac.yaml - telemetry_pod_cleanup.yaml -{% endif %} \ No newline at end of file +{% endif %} diff --git a/provision/roles/telemetry/templates/telemetry/powerscale/csm-metrics-deployment-direct.yaml.j2 b/provision/roles/telemetry/templates/telemetry/powerscale/csm-metrics-deployment-direct.yaml.j2 new file mode 100644 index 0000000000..c08bbd2f46 --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/powerscale/csm-metrics-deployment-direct.yaml.j2 @@ -0,0 +1,82 @@ +# Copyright 2026 Dell Inc. 
or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: csm-metrics-powerscale-cluster{{ cluster_index }} + namespace: {{ telemetry_namespace }} + labels: + app: csm-metrics-powerscale + cluster-index: "{{ cluster_index }}" +spec: + replicas: 1 + selector: + matchLabels: + app: csm-metrics-powerscale + cluster-index: "{{ cluster_index }}" + template: + metadata: + labels: + app: csm-metrics-powerscale + cluster-index: "{{ cluster_index }}" + spec: + containers: + - name: csm-metrics-powerscale + image: {{ csm_metrics_powerscale_image }} + envFrom: + - configMapRef: + name: csm-metrics-powerscale-config-cluster{{ cluster_index }} + env: + - name: POWERSCALE_CLUSTER_ENDPOINT + valueFrom: + secretKeyRef: + name: isilon-creds + key: endpoint + - name: POWERSCALE_CLUSTER_USERNAME + valueFrom: + secretKeyRef: + name: isilon-creds + key: username + - name: POWERSCALE_CLUSTER_PASSWORD + valueFrom: + secretKeyRef: + name: isilon-creds + key: password + - name: POWERSCALE_CLUSTER_NAME + valueFrom: + secretKeyRef: + name: isilon-creds + key: clusterName + resources: + requests: + cpu: {{ csm_metrics_powerscale_resources.requests.cpu }} + memory: {{ csm_metrics_powerscale_resources.requests.memory }} + limits: + cpu: {{ csm_metrics_powerscale_resources.limits.cpu }} + memory: {{ csm_metrics_powerscale_resources.limits.memory }} + volumeMounts: + - name: isilon-creds + mountPath: 
/etc/isilon-creds + readOnly: true + - name: csm-metrics-config + mountPath: /etc/config/karavi-metrics-powerscale.yaml + subPath: karavi-metrics-powerscale.yaml + volumes: + - name: isilon-creds + secret: + secretName: isilon-creds + - name: csm-metrics-config + configMap: + name: csm-metrics-powerscale-config-cluster{{ cluster_index }} diff --git a/provision/roles/telemetry/templates/telemetry/victoria/gen_victoria_certs.sh.j2 b/provision/roles/telemetry/templates/telemetry/victoria/gen_victoria_certs.sh.j2 index ef2086c831..bfb894d58d 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/gen_victoria_certs.sh.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/gen_victoria_certs.sh.j2 @@ -52,26 +52,57 @@ DNS.5 = victoria-metric-0 DNS.6 = victoria-metric-0.{{ telemetry_namespace }} DNS.7 = victoria-metric-0.{{ telemetry_namespace }}.svc DNS.8 = victoria-metric-0.{{ telemetry_namespace }}.svc.cluster.local -# Cluster deployment names -DNS.9 = vminsert -DNS.10 = vminsert.{{ telemetry_namespace }} -DNS.11 = vminsert.{{ telemetry_namespace }}.svc -DNS.12 = vminsert.{{ telemetry_namespace }}.svc.cluster.local -DNS.13 = vmselect -DNS.14 = vmselect.{{ telemetry_namespace }} -DNS.15 = vmselect.{{ telemetry_namespace }}.svc -DNS.16 = vmselect.{{ telemetry_namespace }}.svc.cluster.local -DNS.17 = vmstorage -DNS.18 = vmstorage.{{ telemetry_namespace }} -DNS.19 = vmstorage.{{ telemetry_namespace }}.svc -DNS.20 = vmstorage.{{ telemetry_namespace }}.svc.cluster.local -# VMStorage StatefulSet pods -DNS.21 = vmstorage-0.vmstorage.{{ telemetry_namespace }}.svc.cluster.local -DNS.22 = vmstorage-1.vmstorage.{{ telemetry_namespace }}.svc.cluster.local -DNS.23 = vmstorage-2.vmstorage.{{ telemetry_namespace }}.svc.cluster.local +# Cluster deployment names (operator-managed) +DNS.9 = vminsert-victoria-cluster +DNS.10 = vminsert-victoria-cluster.{{ telemetry_namespace }} +DNS.11 = vminsert-victoria-cluster.{{ telemetry_namespace }}.svc +DNS.12 = 
vminsert-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local +DNS.13 = vmselect-victoria-cluster +DNS.14 = vmselect-victoria-cluster.{{ telemetry_namespace }} +DNS.15 = vmselect-victoria-cluster.{{ telemetry_namespace }}.svc +DNS.16 = vmselect-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local +DNS.17 = vmstorage-victoria-cluster +DNS.18 = vmstorage-victoria-cluster.{{ telemetry_namespace }} +DNS.19 = vmstorage-victoria-cluster.{{ telemetry_namespace }}.svc +DNS.20 = vmstorage-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local +# VMStorage StatefulSet pods (operator-managed) +DNS.21 = vmstorage-victoria-cluster-0.vmstorage-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local +DNS.22 = vmstorage-victoria-cluster-1.vmstorage-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local +DNS.23 = vmstorage-victoria-cluster-2.vmstorage-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local +# VictoriaLogs cluster deployment names (operator-managed) +DNS.24 = vlinsert-victoria-logs-cluster +DNS.25 = vlinsert-victoria-logs-cluster.{{ telemetry_namespace }} +DNS.26 = vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc +DNS.27 = vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local +DNS.28 = vlselect-victoria-logs-cluster +DNS.29 = vlselect-victoria-logs-cluster.{{ telemetry_namespace }} +DNS.30 = vlselect-victoria-logs-cluster.{{ telemetry_namespace }}.svc +DNS.31 = vlselect-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local +DNS.32 = vlstorage-victoria-logs-cluster +DNS.33 = vlstorage-victoria-logs-cluster.{{ telemetry_namespace }} +DNS.34 = vlstorage-victoria-logs-cluster.{{ telemetry_namespace }}.svc +DNS.35 = vlstorage-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local +# vlstorage StatefulSet pod FQDNs (operator-managed, 3 replicas) +DNS.36 = vlstorage-victoria-logs-cluster-0.vlstorage-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local +DNS.37 = 
vlstorage-victoria-logs-cluster-1.vlstorage-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local +DNS.38 = vlstorage-victoria-logs-cluster-2.vlstorage-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local IP.1 = 127.0.0.1 EOF +# Check if existing cert has the required operator-managed SANs +# If SANs are stale (missing operator-managed names), force server cert regeneration +# CA is preserved so external clients do not need to re-import it +if [ -f "$CERT_FILE" ]; then + REQUIRED_SAN="vminsert-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local" + REQUIRED_VL_SAN="vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local" + if ! openssl x509 -in "$CERT_FILE" -text -noout 2>/dev/null | grep -q "$REQUIRED_SAN" || \ + ! openssl x509 -in "$CERT_FILE" -text -noout 2>/dev/null | grep -q "$REQUIRED_VL_SAN"; then + echo "Existing certificate missing required SAN: $REQUIRED_SAN" + echo "Removing stale server cert/key/csr to force regeneration..." + rm -f "$CERT_KEY" "$CSR_FILE" "$CERT_FILE" + fi +fi + # Generate CA key if [ ! -f "$CA_KEY" ]; then echo "Generating CA key..." 
diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-agent-deployment.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-agent-deployment.yaml.j2 index 48296183c2..dafce0aada 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-agent-deployment.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-agent-deployment.yaml.j2 @@ -43,6 +43,14 @@ spec: - -remoteWrite.url={{ vmagent.remote_write_url }} - -remoteWrite.tlsCAFile=/etc/victoria/certs/ca.crt - -remoteWrite.tlsInsecureSkipVerify=false +{% endif %} +{% if hostvars['localhost']['powerscale_configurations']['powerscale_telemetry_support'] | default(false) | bool %} +{% for endpoint in telemetry_config.powerscale_configurations.additional_remote_write_endpoints | default([]) %} + - -remoteWrite.url={{ endpoint.url }} +{% if endpoint.tls_insecure_skip_verify | default(false) %} + - -remoteWrite.tlsInsecureSkipVerify=true +{% endif %} +{% endfor %} {% endif %} volumeMounts: - name: scrape-config diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vminsert.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vminsert.yaml.j2 deleted file mode 100644 index b39dda8e39..0000000000 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vminsert.yaml.j2 +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -# VMInsert - Insert component for VictoriaMetrics cluster -# Accepts data ingestion and routes to vmstorage nodes - -apiVersion: v1 -kind: Service -metadata: - name: vminsert - namespace: {{ telemetry_namespace }} - labels: - app: vminsert -spec: - type: LoadBalancer - selector: - app: vminsert - ports: - - port: 8480 - targetPort: 8480 -{% if victoria_cluster.tls_enabled %} - name: https -{% else %} - name: http -{% endif %} ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: vminsert - namespace: {{ telemetry_namespace }} - labels: - app: vminsert -spec: - replicas: {{ victoria_cluster.vminsert.replicas }} - selector: - matchLabels: - app: vminsert - template: - metadata: - labels: - app: vminsert - spec: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - podAffinityTerm: - labelSelector: - matchExpressions: - - key: app - operator: In - values: - - vminsert - topologyKey: "kubernetes.io/hostname" - terminationGracePeriodSeconds: 30 - tolerations: - - effect: NoExecute - key: node.kubernetes.io/not-ready - operator: Exists - tolerationSeconds: 5 - - effect: NoExecute - key: node.kubernetes.io/unreachable - operator: Exists - tolerationSeconds: 5 -{% if victoria_cluster.tls_enabled %} - volumes: - - name: victoria-tls-certs - secret: - secretName: victoria-tls-certs - items: - - key: tls.crt - path: server.crt - - key: tls.key - path: server.key - - key: ca.crt - path: ca.crt -{% endif %} - containers: - - name: vminsert - image: {{ victoria_cluster.vminsert.image }} - imagePullPolicy: IfNotPresent - args: - - --storageNode=vmstorage-0.vmstorage.{{ telemetry_namespace }}.svc.cluster.local:8400 -{% for i in range(1, victoria_cluster.vmstorage.replicas) %} - - --storageNode=vmstorage-{{ i }}.vmstorage.{{ telemetry_namespace }}.svc.cluster.local:8400 -{% endfor %} - - --httpListenAddr=:8480 -{% if 
victoria_cluster.tls_enabled %} - - -tls - - -tlsCertFile=/etc/victoria/certs/server.crt - - -tlsKeyFile=/etc/victoria/certs/server.key -{% endif %} - - --maxLabelsPerTimeseries=60 - ports: - - containerPort: 8480 -{% if victoria_cluster.tls_enabled %} - name: https -{% else %} - name: http -{% endif %} - startupProbe: - httpGet: - path: /health - port: 8480 -{% if victoria_cluster.tls_enabled %} - scheme: HTTPS -{% else %} - scheme: HTTP -{% endif %} - initialDelaySeconds: 10 - periodSeconds: 5 - timeoutSeconds: 3 - failureThreshold: 30 - livenessProbe: - httpGet: - path: /health - port: 8480 -{% if victoria_cluster.tls_enabled %} - scheme: HTTPS -{% else %} - scheme: HTTP -{% endif %} - initialDelaySeconds: 30 - periodSeconds: 30 - timeoutSeconds: 5 - readinessProbe: - httpGet: - path: /health - port: 8480 -{% if victoria_cluster.tls_enabled %} - scheme: HTTPS -{% else %} - scheme: HTTP -{% endif %} - initialDelaySeconds: 5 - periodSeconds: 15 - resources: - requests: - memory: {{ victoria_cluster.vminsert.resources.requests.memory }} - cpu: {{ victoria_cluster.vminsert.resources.requests.cpu }} - limits: - memory: {{ victoria_cluster.vminsert.resources.limits.memory }} - cpu: {{ victoria_cluster.vminsert.resources.limits.cpu }} -{% if victoria_cluster.tls_enabled %} - volumeMounts: - - name: victoria-tls-certs - mountPath: /etc/victoria/certs - readOnly: true -{% endif %} diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vmselect.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vmselect.yaml.j2 deleted file mode 100644 index 63649b1068..0000000000 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vmselect.yaml.j2 +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# VMSelect - Query component for VictoriaMetrics cluster -# Performs queries against vmstorage nodes and returns results - -apiVersion: v1 -kind: Service -metadata: - name: vmselect - namespace: {{ telemetry_namespace }} - labels: - app: vmselect -spec: - type: LoadBalancer - selector: - app: vmselect - ports: - - port: 8481 - targetPort: 8481 -{% if victoria_cluster.tls_enabled %} - name: https -{% else %} - name: http -{% endif %} ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: vmselect - namespace: {{ telemetry_namespace }} - labels: - app: vmselect -spec: - replicas: {{ victoria_cluster.vmselect.replicas }} - selector: - matchLabels: - app: vmselect - template: - metadata: - labels: - app: vmselect - spec: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - podAffinityTerm: - labelSelector: - matchExpressions: - - key: app - operator: In - values: - - vmselect - topologyKey: "kubernetes.io/hostname" - terminationGracePeriodSeconds: 30 - tolerations: - - effect: NoExecute - key: node.kubernetes.io/not-ready - operator: Exists - tolerationSeconds: 5 - - effect: NoExecute - key: node.kubernetes.io/unreachable - operator: Exists - tolerationSeconds: 5 - volumes: -{% if victoria_cluster.tls_enabled %} - - name: victoria-tls-certs - secret: - secretName: victoria-tls-certs - items: - - key: tls.crt - path: server.crt - - key: tls.key - path: server.key - - key: ca.crt - path: ca.crt -{% endif %} -{% if victoria_cluster.vmselect.cache_data_path %} - - name: cache - emptyDir: {} -{% endif %} - 
containers: - - name: vmselect - image: {{ victoria_cluster.vmselect.image }} - imagePullPolicy: IfNotPresent - args: - - --storageNode=vmstorage-0.vmstorage.{{ telemetry_namespace }}.svc.cluster.local:8401 -{% for i in range(1, victoria_cluster.vmstorage.replicas) %} - - --storageNode=vmstorage-{{ i }}.vmstorage.{{ telemetry_namespace }}.svc.cluster.local:8401 -{% endfor %} - - --httpListenAddr=:8481 -{% if victoria_cluster.tls_enabled %} - - -tls - - -tlsCertFile=/etc/victoria/certs/server.crt - - -tlsKeyFile=/etc/victoria/certs/server.key -{% endif %} - - --search.maxQueryDuration={{ victoria_cluster.vmselect.max_query_duration }} - - --search.maxConcurrentRequests={{ victoria_cluster.vmselect.max_concurrent_requests }} -{% if victoria_cluster.vmselect.cache_data_path %} - - --cacheDataPath=/cache -{% endif %} - ports: - - containerPort: 8481 -{% if victoria_cluster.tls_enabled %} - name: https -{% else %} - name: http -{% endif %} - livenessProbe: - httpGet: - path: /health - port: 8481 -{% if victoria_cluster.tls_enabled %} - scheme: HTTPS -{% else %} - scheme: HTTP -{% endif %} - initialDelaySeconds: 30 - periodSeconds: 30 - timeoutSeconds: 5 - readinessProbe: - httpGet: - path: /health - port: 8481 -{% if victoria_cluster.tls_enabled %} - scheme: HTTPS -{% else %} - scheme: HTTP -{% endif %} - initialDelaySeconds: 5 - periodSeconds: 15 - resources: - requests: - memory: {{ victoria_cluster.vmselect.resources.requests.memory }} - cpu: {{ victoria_cluster.vmselect.resources.requests.cpu }} - limits: - memory: {{ victoria_cluster.vmselect.resources.limits.memory }} - cpu: {{ victoria_cluster.vmselect.resources.limits.cpu }} - volumeMounts: -{% if victoria_cluster.tls_enabled %} - - name: victoria-tls-certs - mountPath: /etc/victoria/certs - readOnly: true -{% endif %} -{% if victoria_cluster.vmselect.cache_data_path %} - - name: cache - mountPath: /cache -{% endif %} diff --git 
a/provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vmstorage.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vmstorage.yaml.j2 deleted file mode 100644 index 9d79cda60b..0000000000 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vmstorage.yaml.j2 +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# VMStorage - Storage component for VictoriaMetrics cluster -# Stores raw data and returns query results to vmselect - -apiVersion: v1 -kind: Service -metadata: - name: vmstorage - namespace: {{ telemetry_namespace }} - labels: - app: vmstorage -spec: - clusterIP: None # Headless service for StatefulSet - selector: - app: vmstorage - ports: - - port: 8482 - targetPort: 8482 -{% if victoria_cluster.tls_enabled %} - name: https -{% else %} - name: http -{% endif %} - - port: 8400 - targetPort: 8400 - name: vminsert - - port: 8401 - targetPort: 8401 - name: vmselect ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: vmstorage - namespace: {{ telemetry_namespace }} - labels: - app: vmstorage -spec: - serviceName: vmstorage - replicas: {{ victoria_cluster.vmstorage.replicas }} - selector: - matchLabels: - app: vmstorage - template: - metadata: - labels: - app: vmstorage - spec: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - 
podAffinityTerm: - labelSelector: - matchExpressions: - - key: app - operator: In - values: - - vmstorage - topologyKey: "kubernetes.io/hostname" - terminationGracePeriodSeconds: 30 - tolerations: - - effect: NoExecute - key: node.kubernetes.io/not-ready - operator: Exists - tolerationSeconds: 5 - - effect: NoExecute - key: node.kubernetes.io/unreachable - operator: Exists - tolerationSeconds: 5 -{% if victoria_cluster.tls_enabled %} - volumes: - - name: victoria-tls-certs - secret: - secretName: victoria-tls-certs - items: - - key: tls.crt - path: server.crt - - key: tls.key - path: server.key -{% endif %} - initContainers: - # Clean up stale VictoriaMetrics lock files from previous ungraceful shutdowns - - name: cleanup-victoria-locks - image: {{ victoria_cluster.vmstorage.image }} - command: - - /bin/sh - - -c - - | - echo "Checking for stale VictoriaMetrics lock files..." - rm -f /vmstorage-data/flock.lock 2>/dev/null || true - echo "Lock file cleanup complete" - volumeMounts: - - name: vmstorage-data - mountPath: /vmstorage-data - containers: - - name: vmstorage - image: {{ victoria_cluster.vmstorage.image }} - imagePullPolicy: IfNotPresent - args: - - --storageDataPath=/vmstorage-data - - --retentionPeriod={{ hostvars['localhost']['victoria_configurations']['retention_period'] }} - - --httpListenAddr=:8482 -{% if victoria_cluster.tls_enabled %} - - -tls - - -tlsCertFile=/etc/victoria/certs/server.crt - - -tlsKeyFile=/etc/victoria/certs/server.key -{% endif %} - - --vminsertAddr=:8400 - - --vmselectAddr=:8401 -{% if victoria_cluster.vmstorage.dedup_min_scrape_interval %} - - --dedup.minScrapeInterval={{ victoria_cluster.vmstorage.dedup_min_scrape_interval }} -{% endif %} - ports: - - containerPort: 8482 -{% if victoria_cluster.tls_enabled %} - name: https -{% else %} - name: http -{% endif %} - - containerPort: 8400 - name: vminsert - - containerPort: 8401 - name: vmselect - livenessProbe: - httpGet: - path: /health - port: 8482 -{% if 
victoria_cluster.tls_enabled %} - scheme: HTTPS -{% else %} - scheme: HTTP -{% endif %} - initialDelaySeconds: 30 - periodSeconds: 30 - timeoutSeconds: 5 - readinessProbe: - httpGet: - path: /health - port: 8482 -{% if victoria_cluster.tls_enabled %} - scheme: HTTPS -{% else %} - scheme: HTTP -{% endif %} - initialDelaySeconds: 5 - periodSeconds: 15 - resources: - requests: - memory: {{ victoria_cluster.vmstorage.resources.requests.memory }} - cpu: {{ victoria_cluster.vmstorage.resources.requests.cpu }} - limits: - memory: {{ victoria_cluster.vmstorage.resources.limits.memory }} - cpu: {{ victoria_cluster.vmstorage.resources.limits.cpu }} - volumeMounts: - - name: vmstorage-data - mountPath: /vmstorage-data -{% if victoria_cluster.tls_enabled %} - - name: victoria-tls-certs - mountPath: /etc/victoria/certs - readOnly: true -{% endif %} - volumeClaimTemplates: - - metadata: - name: vmstorage-data - spec: - accessModes: ["ReadWriteOnce"] - resources: - requests: - storage: {{ hostvars['localhost']['victoria_configurations']['persistence_size'] }} diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmagent.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmagent.yaml.j2 new file mode 100644 index 0000000000..970c9b20aa --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmagent.yaml.j2 @@ -0,0 +1,72 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# VMAgent - VictoriaMetrics agent for scraping metrics via operator +# Managed by victoria-metrics-operator + +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMAgent +metadata: + name: vmagent + namespace: {{ telemetry_namespace }} +spec: + # Service account for kubernetes service discovery + serviceAccountName: {{ vmagent.service_account_name }} + + # Replica count + replicaCount: 1 + + # Image configuration + image: + repository: {{ vmagent.image.split(':')[0] }} + tag: {{ vmagent.image.split(':')[1] }} + pullPolicy: IfNotPresent + + # Remote write configuration - depends on deployment mode + remoteWrite: +{% if victoria_cluster.enabled %} + - url: {{ vmagent.remote_write_url_cluster }} +{% if victoria_cluster.tls_enabled %} + tlsConfig: + ca: + secret: + name: victoria-tls-certs + key: ca.crt + insecureSkipVerify: false +{% endif %} +{% else %} + - url: {{ vmagent.remote_write_url }} + tlsConfig: + insecureSkipVerify: true +{% endif %} + + # Resource limits + resources: + requests: + memory: "{{ victoria_cluster.vmagent.resources.requests.memory}}" + cpu: "{{ victoria_cluster.vmagent.resources.requests.cpu}}" + limits: + memory: "{{ victoria_cluster.vmagent.resources.limits.memory}}" + cpu: "{{ victoria_cluster.vmagent.resources.limits.cpu}}" + + # Service discovery configs - operator uses VMServiceScrape/VMPodScrape CRDs + serviceScrapeNamespaceSelector: {} + serviceScrapeSelector: {} + podScrapeNamespaceSelector: {} + podScrapeSelector: {} + + # Extra args + extraArgs: + promscrape.streamParse: "true" + promscrape.maxScrapeSize: "16MB" diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmcluster.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmcluster.yaml.j2 new file mode 100644 index 0000000000..b986ae0af8 --- /dev/null +++ 
b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmcluster.yaml.j2 @@ -0,0 +1,241 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# VMCluster - VictoriaMetrics cluster deployment via operator +# Managed by victoria-metrics-operator + +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMCluster +metadata: + name: victoria-cluster + namespace: {{ telemetry_namespace }} +spec: + # Retention period from telemetry_config.yml + retentionPeriod: "{{ hostvars['localhost']['victoria_metrics_configurations']['retention_period'] }}h" + + # VMStorage configuration + vmstorage: + replicaCount: {{ victoria_cluster.vmstorage.replicas }} + image: + repository: {{ victoria_cluster.vmstorage.image.split(':')[0] }} + tag: {{ victoria_cluster.vmstorage.image.split(':')[1] }} + pullPolicy: IfNotPresent + + # Storage configuration per pod + storageDataPath: /vmstorage-data + storage: + volumeClaimTemplate: + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: {{ hostvars['localhost']['victoria_metrics_configurations']['persistence_size'] }} + + # Resource limits + resources: + requests: + memory: {{ victoria_cluster.vmstorage.resources.requests.memory }} + cpu: {{ victoria_cluster.vmstorage.resources.requests.cpu }} + limits: + memory: {{ victoria_cluster.vmstorage.resources.limits.memory }} + cpu: {{ victoria_cluster.vmstorage.resources.limits.cpu }} + + # Pod 
anti-affinity + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - vmstorage + topologyKey: "kubernetes.io/hostname" + + # Tolerations + tolerations: + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 5 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 5 +{% if victoria_cluster.vmstorage.dedup_min_scrape_interval or victoria_cluster.tls_enabled %} + + extraArgs: +{% if victoria_cluster.vmstorage.dedup_min_scrape_interval %} + dedup.minScrapeInterval: {{ victoria_cluster.vmstorage.dedup_min_scrape_interval }} +{% endif %} +{% if victoria_cluster.tls_enabled %} + tls: "true" + tlsCertFile: "/etc/victoria/certs/server.crt" + tlsKeyFile: "/etc/victoria/certs/server.key" +{% endif %} +{% endif %} +{% if victoria_cluster.tls_enabled %} + + volumes: + - name: victoria-tls-certs + secret: + secretName: victoria-tls-certs + items: + - key: tls.crt + path: server.crt + - key: tls.key + path: server.key + volumeMounts: + - name: victoria-tls-certs + mountPath: /etc/victoria/certs + readOnly: true +{% endif %} + + # VMSelect configuration + vmselect: + replicaCount: {{ victoria_cluster.vmselect.replicas }} + image: + repository: {{ victoria_cluster.vmselect.image.split(':')[0] }} + tag: {{ victoria_cluster.vmselect.image.split(':')[1] }} + pullPolicy: IfNotPresent + + # External access via LoadBalancer (useAsDefault merges into the main service) + serviceSpec: + useAsDefault: true + spec: + type: LoadBalancer + + # Resource limits + resources: + requests: + memory: {{ victoria_cluster.vmselect.resources.requests.memory }} + cpu: {{ victoria_cluster.vmselect.resources.requests.cpu }} + limits: + memory: {{ victoria_cluster.vmselect.resources.limits.memory }} + cpu: {{ victoria_cluster.vmselect.resources.limits.cpu }} + + # 
Pod anti-affinity + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - vmselect + topologyKey: "kubernetes.io/hostname" + + # Extra args for query optimization + extraArgs: +{% if victoria_cluster.vmselect.max_query_duration %} + search.maxQueryDuration: {{ victoria_cluster.vmselect.max_query_duration }} +{% endif %} +{% if victoria_cluster.vmselect.max_concurrent_requests %} + search.maxConcurrentRequests: "{{ victoria_cluster.vmselect.max_concurrent_requests }}" +{% endif %} +{% if victoria_cluster.vmselect.cache_data_path %} + cacheDataPath: /cache +{% endif %} +{% if victoria_cluster.tls_enabled %} + tls: "true" + tlsCertFile: "/etc/victoria/certs/server.crt" + tlsKeyFile: "/etc/victoria/certs/server.key" +{% endif %} +{% if victoria_cluster.tls_enabled %} + + volumes: + - name: victoria-tls-certs + secret: + secretName: victoria-tls-certs + items: + - key: tls.crt + path: server.crt + - key: tls.key + path: server.key + - key: ca.crt + path: ca.crt + volumeMounts: + - name: victoria-tls-certs + mountPath: /etc/victoria/certs + readOnly: true +{% endif %} + + # VMInsert configuration + vminsert: + replicaCount: {{ victoria_cluster.vminsert.replicas }} + image: + repository: {{ victoria_cluster.vminsert.image.split(':')[0] }} + tag: {{ victoria_cluster.vminsert.image.split(':')[1] }} + pullPolicy: IfNotPresent + + # Resource limits + resources: + requests: + memory: {{ victoria_cluster.vminsert.resources.requests.memory }} + cpu: {{ victoria_cluster.vminsert.resources.requests.cpu }} + limits: + memory: {{ victoria_cluster.vminsert.resources.limits.memory }} + cpu: {{ victoria_cluster.vminsert.resources.limits.cpu }} + + # Pod anti-affinity + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: 
app.kubernetes.io/name + operator: In + values: + - vminsert + topologyKey: "kubernetes.io/hostname" +{% if victoria_cluster.tls_enabled %} + + extraArgs: + tls: "true" + tlsCertFile: "/etc/victoria/certs/server.crt" + tlsKeyFile: "/etc/victoria/certs/server.key" + maxLabelsPerTimeseries: "60" + + volumes: + - name: victoria-tls-certs + secret: + secretName: victoria-tls-certs + items: + - key: tls.crt + path: server.crt + - key: tls.key + path: server.key + - key: ca.crt + path: ca.crt + volumeMounts: + - name: victoria-tls-certs + mountPath: /etc/victoria/certs + readOnly: true +{% endif %} +{% if victoria_cluster.vminsert.external_access %} + + # External access via LoadBalancer (useAsDefault merges into the main service) + serviceSpec: + useAsDefault: true + spec: + type: LoadBalancer +{% endif %} diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2 new file mode 100644 index 0000000000..4ed5c9c72d --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2 @@ -0,0 +1,46 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# VMPodScrape - Native operator-based pod discovery for idrac-telemetry +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMPodScrape +metadata: + name: idrac-telemetry-scrape + namespace: {{ telemetry_namespace }} +spec: + # Target pod selector + selector: + matchLabels: + app: {{ vmagent.target_pod_label }} + + # Namespace selector + namespaceSelector: + matchNames: + - {{ vmagent.kubernetes_sd_namespace }} + + # Pod metrics endpoints + podMetricsEndpoints: + - port: "victoriapump" + interval: {{ vmagent.global.scrape_interval }} + honorLabels: true + + # Only scrape the metrics container + relabelConfigs: + - sourceLabels: [__meta_kubernetes_pod_container_name] + regex: {{ vmagent.metrics_container_name }} + action: keep + + # Add pod IP label + - sourceLabels: [__meta_kubernetes_pod_ip] + targetLabel: pod_ip diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmsingle.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmsingle.yaml.j2 new file mode 100644 index 0000000000..41af43489e --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmsingle.yaml.j2 @@ -0,0 +1,86 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# VMSingle - VictoriaMetrics single-node deployment via operator +# Managed by victoria-metrics-operator + +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMSingle +metadata: + name: victoria-single + namespace: {{ telemetry_namespace }} +spec: + # Replica count for single-node (always 1) + replicaCount: 1 + + # Retention period from telemetry_config.yml + retentionPeriod: "{{ hostvars['localhost']['victoria_metrics_configurations']['retention_period'] }}h" + + # Storage configuration + storageDataPath: /victoria-metrics-data + storage: + volumeClaimTemplate: + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: {{ hostvars['localhost']['victoria_metrics_configurations']['persistence_size'] }} + + # Image configuration + image: + repository: {{ victoria.image.split(':')[0] }} + tag: {{ victoria.image.split(':')[1] }} + pullPolicy: IfNotPresent + + # Port configuration + port: "8428" + + # Resource limits + resources: + requests: + memory: "2Gi" + cpu: "500m" + limits: + memory: "4Gi" + cpu: "2000m" + + # Pod anti-affinity for HA + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - vmsingle + topologyKey: "kubernetes.io/hostname" + + # Tolerations for node failures + tolerations: + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 5 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 5 + + # Service configuration + extraArgs: + selfScrapeInterval: "5s" diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-statefulset.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-statefulset.yaml.j2 index 90344c3dd9..8c8af09972 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-statefulset.yaml.j2 +++ 
b/provision/roles/telemetry/templates/telemetry/victoria/victoria-statefulset.yaml.j2 @@ -99,7 +99,7 @@ spec: args: - "--selfScrapeInterval=5s" - "--storageDataPath=/victoria-metrics-data" - - "--retentionPeriod={{ hostvars['localhost']['victoria_configurations']['retention_period'] }}" + - "--retentionPeriod={{ hostvars['localhost']['victoria_metrics_configurations']['retention_period'] }}" - "--httpListenAddr=:8443" - "-tls" - "-tlsCertFile=/etc/victoria/certs/server.crt" @@ -121,4 +121,4 @@ spec: accessModes: ["ReadWriteOnce"] resources: requests: - storage: "{{ hostvars['localhost']['victoria_configurations']['persistence_size'] }}" + storage: "{{ hostvars['localhost']['victoria_metrics_configurations']['persistence_size'] }}" diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-tls-test-job.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-tls-test-job.yaml.j2 index ad4241f135..6de4c42882 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-tls-test-job.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-tls-test-job.yaml.j2 @@ -26,18 +26,18 @@ data: echo "==========================================" echo " VictoriaMetrics TLS Connection Test" echo "==========================================" -{% if hostvars['localhost']['victoria_configurations']['deployment_mode'] == 'cluster' %} +{% if victoria_deployment_mode == 'cluster' %} echo "Deployment Mode: Cluster" {% if victoria_cluster.tls_enabled %} - echo "VictoriaMetrics URL: https://vmselect:8481" + echo "VictoriaMetrics URL: https://vmselect-victoria-cluster:8481" echo "Testing with CA certificate verification" - VICTORIA_URL="https://vmselect:8481" + VICTORIA_URL="https://vmselect-victoria-cluster:8481" CA_CERT="/etc/victoria/certs/ca.crt" USE_TLS="true" {% else %} - echo "VictoriaMetrics URL: http://vmselect:8481" + echo "VictoriaMetrics URL: http://vmselect-victoria-cluster:8481" echo "Testing cluster without 
TLS (HTTP)" - VICTORIA_URL="http://vmselect:8481" + VICTORIA_URL="http://vmselect-victoria-cluster:8481" CA_CERT="" USE_TLS="false" {% endif %} @@ -125,7 +125,7 @@ data: echo "" # Test 5: Test API query endpoint -{% if hostvars['localhost']['victoria_configurations']['deployment_mode'] == 'cluster' %} +{% if victoria_deployment_mode == 'cluster' %} echo "Step 5: Testing /select/0/prometheus/api/v1/query endpoint..." if [ "$USE_TLS" = "true" ]; then QUERY_RESPONSE=$(curl -s --max-time 30 --cacert "$CA_CERT" "${VICTORIA_URL}/select/0/prometheus/api/v1/query?query=up" || echo "failed") @@ -316,8 +316,8 @@ data: if [ "$USE_TLS" = "true" ]; then echo "Step 7: Checking server certificate details..." if command -v openssl > /dev/null 2>&1; then -{% if hostvars['localhost']['victoria_configurations']['deployment_mode'] == 'cluster' %} - echo | openssl s_client -connect vmselect:8481 -CAfile "$CA_CERT" 2>/dev/null | \ +{% if victoria_deployment_mode == 'cluster' %} + echo | openssl s_client -connect vmselect-victoria-cluster:8481 -CAfile "$CA_CERT" 2>/dev/null | \ openssl x509 -noout -subject -issuer -dates 2>/dev/null | sed 's/^/ /' || \ echo " ⚠ Could not retrieve server certificate details" {% else %} @@ -374,7 +374,7 @@ spec: spec: restartPolicy: Never volumes: -{% if hostvars['localhost']['victoria_configurations']['deployment_mode'] == 'single-node' or victoria_cluster.tls_enabled %} +{% if victoria_deployment_mode == 'single-node' or victoria_cluster.tls_enabled %} - name: victoria-tls-certs secret: secretName: victoria-tls-certs @@ -388,7 +388,7 @@ spec: image: curlimages/curl:8.17.0 imagePullPolicy: IfNotPresent volumeMounts: -{% if hostvars['localhost']['victoria_configurations']['deployment_mode'] == 'single-node' or victoria_cluster.tls_enabled %} +{% if victoria_deployment_mode == 'single-node' or victoria_cluster.tls_enabled %} - mountPath: /etc/victoria/certs name: victoria-tls-certs readOnly: true diff --git 
a/provision/roles/telemetry/templates/telemetry/victoria/victoria-vmagent-rbac.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-vmagent-rbac.yaml.j2 index 501328c1c8..e84877af56 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-vmagent-rbac.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-vmagent-rbac.yaml.j2 @@ -27,6 +27,10 @@ rules: - apiGroups: [""] resources: ["pods", "services", "endpoints"] verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch"] + --- apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding @@ -40,4 +44,4 @@ subjects: roleRef: kind: Role name: "{{ vmagent.role_name }}" - apiGroup: rbac.authorization.k8s.io \ No newline at end of file + apiGroup: rbac.authorization.k8s.io diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 new file mode 100644 index 0000000000..22b4ecef11 --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 @@ -0,0 +1,208 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# VLAgent - VictoriaLogs log collection agent via operator +# Managed by victoria-metrics-operator (>= v0.59.0) +# +# Purpose: Platform-managed log forwarding agent providing: +# - Syslog reception (RFC 3164/5424) on ports 514 (plaintext) and 6514 (TLS) +# - JSON Lines forwarding to vlinsert ingestion endpoint +# - Client-side buffering with PVC-backed persistence for retry during vlinsert unavailability +# +# Design: Source-neutral base configuration with no source-specific relabel rules. +# Downstream capabilities extend this configuration for specific log sources. + +apiVersion: operator.victoriametrics.com/v1 +kind: VLAgent +metadata: + name: vlagent + namespace: {{ telemetry_namespace }} + labels: + app: vlagent + component: victorialogs +spec: + # ======================================== + # Container Image Configuration + # ======================================== + image: + repository: {{ victoria_logs_cluster.vlagent.image.rsplit(':', 1)[0] }} + tag: {{ victoria_logs_cluster.vlagent.image.rsplit(':', 1)[1] }} + pullPolicy: IfNotPresent + + # ======================================== + # Replica Configuration + # ======================================== + # Single replica (DD-A1): Platform-managed log forwarding agent + # HA provided by PVC buffer persistence and pod restart + replicaCount: {{ victoria_logs_cluster.vlagent.replicas }} + + # ======================================== + # Configuration Management + # ======================================== + # NOTE(review): vlagent-config is rendered as a ConfigMap, not a Secret - confirm the VLAgent CRD accepts it here + configSecret: vlagent-config + + # ======================================== + # Remote Write Configuration + # ======================================== + # Forward logs to VictoriaLogs vlinsert endpoint + # Supports JSON Lines format with optional TLS + remoteWrite: +{% if victoria_logs_cluster.tls_enabled %} + - url: https://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9481/insert/jsonline + tlsConfig: + caFile:
/etc/victoria/certs/ca.crt +{% else %} + - url: http://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9481/insert/jsonline +{% endif %} + + # ======================================== + # Resource Allocation + # ======================================== + # CPU: 50m request, 250m limit (I/O-bound, low CPU usage) + # Memory: 128Mi request, 512Mi limit (in-memory batch buffers + disk-backed WAL) + resources: + requests: + memory: {{ victoria_logs_cluster.vlagent.resources.requests.memory }} + cpu: {{ victoria_logs_cluster.vlagent.resources.requests.cpu }} + limits: + memory: {{ victoria_logs_cluster.vlagent.resources.limits.memory }} + cpu: {{ victoria_logs_cluster.vlagent.resources.limits.cpu }} + + # ======================================== + # Persistent Storage Configuration + # ======================================== + # PVC buffer for client-side log persistence and retry during vlinsert unavailability + # Default: 5Gi (sufficient for ~24-48 hours of log accumulation at typical syslog rates) + # Configurable via victoria_logs_cluster.vlagent.pvc_size in vars/main.yml + storage: + volumeClaimTemplate: + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: {{ victoria_logs_cluster.vlagent.pvc_size | default('5Gi') }} + + # ======================================== + # TLS Certificate Configuration + # ======================================== + # Shared with VictoriaMetrics cluster + # Used for: + # - Syslog TLS receiver (:6514) — server certificate + # - remoteWrite to vlinsert — CA certificate validation +{% if victoria_logs_cluster.tls_enabled %} + volumes: + - name: victoria-tls-certs + secret: + secretName: victoria-tls-certs + items: + - key: tls.crt + path: server.crt + - key: tls.key + path: server.key + - key: ca.crt + path: ca.crt + volumeMounts: + - name: victoria-tls-certs + mountPath: /etc/victoria/certs + readOnly: true +{% endif %} + + # ======================================== + # Service Exposure
Configuration + # ======================================== + # Service type: LoadBalancer (MetalLB) or NodePort (fallback) + # Exposes syslog receivers (:514 TCP+UDP, :6514 TLS) for external log sources +{% if metalLB_deployed | default(false) %} + serviceSpec: + useAsDefault: true + spec: + type: LoadBalancer +{% else %} + serviceSpec: + useAsDefault: true + spec: + type: NodePort +{% endif %} + + # ======================================== + # Port Configuration + # ======================================== + # Syslog receivers (platform-provided scrape targets) + # - :514 TCP+UDP — plaintext syslog (RFC 3164/5424) + # - :6514 TCP — TLS syslog (RFC 5425) + # Health check endpoint + # - :9429 — HTTP health checks (distinct from vmagent 8429) + ports: + - name: syslog + port: 514 + targetPort: 514 + protocol: TCP + - name: syslog-udp + port: 514 + targetPort: 514 + protocol: UDP + - name: syslog-tls + port: 6514 + targetPort: 6514 + protocol: TCP + - name: health + port: 9429 + targetPort: 9429 + protocol: TCP + + # ======================================== + # Health Probes + # ======================================== + # Liveness probe: Restart pod if health check fails + # Readiness probe: Route traffic only to ready pods + livenessProbe: + httpGet: + path: /health + port: 9429 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + failureThreshold: 3 + + readinessProbe: + httpGet: + path: /health + port: 9429 + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + + # ======================================== + # Pod Scheduling and Affinity + # ======================================== + # No pod anti-affinity required (single replica) + # Tolerations allow scheduling on nodes with taints + tolerations: + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 5 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 5 + + #
======================================== + # Termination Grace Period + # ======================================== + # Allow time for graceful shutdown and buffer flush + terminationGracePeriodSeconds: 30 diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2 new file mode 100644 index 0000000000..ec289f4e5e --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2 @@ -0,0 +1,231 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +# VLCluster - VictoriaLogs cluster deployment via operator +# Managed by victoria-metrics-operator (>= v0.59.0) + +apiVersion: operator.victoriametrics.com/v1 +kind: VLCluster +metadata: + name: victoria-logs-cluster + namespace: {{ telemetry_namespace }} +spec: + # Single image version for all cluster components (tag = text after the LAST ':' of the image reference) + # Operator determines component role (vlstorage, vlinsert, vlselect) internally + clusterVersion: {{ victoria_logs_cluster.vlstorage.image.rsplit(':', 1)[1] }} + + # ======================== + # vlstorage — Persistent log storage (StatefulSet, 3 replicas) + # ======================== + vlstorage: + replicaCount: {{ victoria_logs_cluster.vlstorage.replicas }} + image: + repository: {{ victoria_logs_cluster.vlstorage.image.rsplit(':', 1)[0] }} + tag: {{ victoria_logs_cluster.vlstorage.image.rsplit(':', 1)[1] }} + pullPolicy: IfNotPresent + + # Ports are managed by operator defaults (9491, 9400, 9401) + + storageDataPath: /vlstorage-data + storage: + volumeClaimTemplate: + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: {{ hostvars['localhost']['victoria_logs_configurations']['storage_size'] }} + + resources: + requests: + memory: {{ victoria_logs_cluster.vlstorage.resources.requests.memory }} + cpu: {{ victoria_logs_cluster.vlstorage.resources.requests.cpu }} + limits: + memory: {{ victoria_logs_cluster.vlstorage.resources.limits.memory }} + cpu: {{ victoria_logs_cluster.vlstorage.resources.limits.cpu }} + + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - vlstorage + topologyKey: "kubernetes.io/hostname" + + tolerations: + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 5 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 5 + + extraArgs: + retentionPeriod: "{{
hostvars['localhost']['victoria_logs_configurations']['retention_period'] }}h" +{% if victoria_logs_cluster.tls_enabled %} + tls: "true" + tlsCertFile: "/etc/victoria/certs/server.crt" + tlsKeyFile: "/etc/victoria/certs/server.key" +{% endif %} +{% if victoria_logs_cluster.tls_enabled %} + + volumes: + - name: victoria-tls-certs + secret: + secretName: victoria-tls-certs + items: + - key: tls.crt + path: server.crt + - key: tls.key + path: server.key + volumeMounts: + - name: victoria-tls-certs + mountPath: /etc/victoria/certs + readOnly: true +{% endif %} + + # ======================== + # vlinsert — Log ingestion gateway (Deployment, 2 replicas) + # ======================== + vlinsert: + replicaCount: {{ victoria_logs_cluster.vlinsert.replicas }} + image: + repository: {{ victoria_logs_cluster.vlinsert.image.rsplit(':', 1)[0] }} + tag: {{ victoria_logs_cluster.vlinsert.image.rsplit(':', 1)[1] }} + pullPolicy: IfNotPresent + + # Ports are managed by operator defaults (9481) + + resources: + requests: + memory: {{ victoria_logs_cluster.vlinsert.resources.requests.memory }} + cpu: {{ victoria_logs_cluster.vlinsert.resources.requests.cpu }} + limits: + memory: {{ victoria_logs_cluster.vlinsert.resources.limits.memory }} + cpu: {{ victoria_logs_cluster.vlinsert.resources.limits.cpu }} + + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - vlinsert + topologyKey: "kubernetes.io/hostname" +{% if victoria_logs_cluster.tls_enabled %} + + extraArgs: + tls: "true" + tlsCertFile: "/etc/victoria/certs/server.crt" + tlsKeyFile: "/etc/victoria/certs/server.key" + storageNode.tls: "true" + storageNode.tlsCAFile: "/etc/victoria/certs/ca.crt" + + volumes: + - name: victoria-tls-certs + secret: + secretName: victoria-tls-certs + items: + - key: tls.crt + path: server.crt + - key: tls.key + path: server.key + - key: ca.crt + path:
ca.crt + volumeMounts: + - name: victoria-tls-certs + mountPath: /etc/victoria/certs + readOnly: true +{% endif %} +{% if victoria_logs_cluster.vlinsert.external_access %} + + serviceSpec: + useAsDefault: true + spec: + type: LoadBalancer +{% endif %} + + # ======================== + # vlselect — Log query gateway (Deployment, 2 replicas) + # ======================== + vlselect: + replicaCount: {{ victoria_logs_cluster.vlselect.replicas }} + image: + repository: {{ victoria_logs_cluster.vlselect.image.rsplit(':', 1)[0] }} + tag: {{ victoria_logs_cluster.vlselect.image.rsplit(':', 1)[1] }} + pullPolicy: IfNotPresent + + # Ports are managed by operator defaults (9471) + + serviceSpec: + useAsDefault: true + spec: + type: LoadBalancer + + resources: + requests: + memory: {{ victoria_logs_cluster.vlselect.resources.requests.memory }} + cpu: {{ victoria_logs_cluster.vlselect.resources.requests.cpu }} + limits: + memory: {{ victoria_logs_cluster.vlselect.resources.limits.memory }} + cpu: {{ victoria_logs_cluster.vlselect.resources.limits.cpu }} + + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - vlselect + topologyKey: "kubernetes.io/hostname" +{% if victoria_logs_cluster.tls_enabled %} + + extraArgs: + tls: "true" + tlsCertFile: "/etc/victoria/certs/server.crt" + tlsKeyFile: "/etc/victoria/certs/server.key" + storageNode.tls: "true" + storageNode.tlsCAFile: "/etc/victoria/certs/ca.crt" + + volumes: + - name: victoria-tls-certs + secret: + secretName: victoria-tls-certs + items: + - key: tls.crt + path: server.crt + - key: tls.key + path: server.key + - key: ca.crt + path: ca.crt + volumeMounts: + - name: victoria-tls-certs + mountPath: /etc/victoria/certs + readOnly: true +{% endif %} diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-vlagent-config.yaml.j2
b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-vlagent-config.yaml.j2 new file mode 100644 index 0000000000..819ca03670 --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-vlagent-config.yaml.j2 @@ -0,0 +1,155 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# VLAgent Configuration ConfigMap +# Mounted as vlagent.yml in the VLAgent pod +# Defines syslog receivers and log forwarding pipeline to vlinsert + +apiVersion: v1 +kind: ConfigMap +metadata: + name: vlagent-config + namespace: {{ telemetry_namespace }} + labels: + app: vlagent + component: victorialogs +data: + vlagent.yml: | + # ============================================================================ + # VLAgent Platform Base Configuration + # ============================================================================ + # Source-neutral: no source-specific relabel rules or external labels + # Downstream capabilities extend this configuration for specific log sources + # + # Pipeline: + # 1. RECEIVE: Syslog messages on :514 (plaintext) and :6514 (TLS) + # 2. PARSE: Convert syslog to JSON Lines format + # 3. FORWARD: Send to vlinsert via HTTPS remoteWrite + # 4. 
BUFFER: PVC-backed persistence for retry during vlinsert unavailability + + # ============================================================================ + # SYSLOG RECEIVERS (Platform-provided scrape targets) + # ============================================================================ + # Listens for incoming syslog messages from external sources + # Supports RFC 3164 (BSD) and RFC 5424 (structured) formats + # + # Plaintext receiver (:514): + # - TCP and UDP support + # - Standard syslog port + # - Suitable for trusted internal networks + # + # TLS receiver (:6514): + # - TCP only (RFC 5425) + # - Encrypted syslog transport + # - Requires server certificate from victoria-tls-certs secret + syslog: + # Plaintext syslog receiver (RFC 3164/5424) + # Listens on all interfaces, TCP and UDP + listenAddr: "0.0.0.0:514" + + # TLS syslog receiver (RFC 5425) + # Listens on all interfaces, TCP only + tlsListenAddr: "0.0.0.0:6514" + +{% if victoria_logs_cluster.tls_enabled %} + # TLS certificate and key for syslog TLS receiver + # Sourced from shared victoria-tls-certs secret + tlsCertFile: "/etc/victoria/certs/server.crt" + tlsKeyFile: "/etc/victoria/certs/server.key" +{% endif %} + + # ============================================================================ + # LOG FORWARDING PIPELINE (remoteWrite to vlinsert) + # ============================================================================ + # Forwards parsed logs to vlinsert ingestion endpoint + # Format: JSON Lines (NDJSON) over HTTPS + # Stream identification: hostname and app_name fields for consistent-hash sharding + # + # Behavior: + # - Batches logs and sends HTTP POST to vlinsert + # - Batch size: ~1 MB or 10,000 entries (whichever comes first) + # - Flush interval: 1 second (default) + # - Retry: Exponential backoff (1s → 60s) on vlinsert errors + # - Persistence: Unsent batches buffered to PVC on vlinsert unavailability + remoteWrite: + # vlinsert endpoint (in-cluster FQDN) + # Operator creates service 
as vlinsert-victoria-logs-cluster + # Port 9481: ingestion endpoint (HTTPS when tls_enabled, HTTP otherwise) + # Path: /insert/jsonline — primary VictoriaLogs ingestion format + # Query params: + # _stream_fields=hostname,app_name — stream identification for sharding + url: "{{ 'https' if victoria_logs_cluster.tls_enabled else 'http' }}://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9481/insert/jsonline?_stream_fields=hostname,app_name" + +{% if victoria_logs_cluster.tls_enabled %} + # TLS configuration for remoteWrite client + # Validates vlinsert's server certificate using CA cert from victoria-tls-certs secret + tls_config: + ca_file: "/etc/victoria/certs/ca.crt" + insecure_skip_verify: false +{% endif %} + + # ============================================================================ + # PERSISTENT QUEUE CONFIGURATION (Client-side buffering) + # ============================================================================ + # Disk-backed write-ahead log (WAL) for log persistence + # Prevents log loss during vlinsert unavailability (pod restart, rolling update) + # + # Behavior: + # - VLAgent writes incoming logs to disk before forwarding + # - On vlinsert error: Batches persisted to PVC, retried with exponential backoff + # - On VLAgent restart: Unsent batches read from PVC and retried + # - On PVC full: Oldest buffered entries evicted (FIFO) to make room + # + # Sizing: + # - Default: 5Gi (sufficient for ~24-48 hours of log accumulation) + # - Configurable via victoria_logs_cluster.vlagent.pvc_size in vars/main.yml + persistentQueue: + # Mount path for PVC buffer storage + # Corresponds to storage.volumeClaimTemplate in VLAgent CR + dir: "/vlagent-data" + + # Maximum buffer size before eviction + # Matches PVC size (5Gi default) + maxPendingBytes: "5GiB" + + # ============================================================================ + # EXTENSION POINTS FOR DOWNSTREAM CAPABILITIES + # 
============================================================================ + # This platform base configuration is source-neutral. + # Downstream capabilities (separate epics) extend this configuration with: + # + # 1. Source-specific relabel rules + # Example: Add labels for PowerScale syslog sources + # relabeling: + # - source_labels: [hostname] + # regex: "powerscale-.*" + # target_label: source + # replacement: powerscale + # + # 2. External labels for source identification + # Example: Add cluster identifier + # external_labels: + # cluster: "production" + # environment: "prod" + # + # 3. Additional remoteWrite destinations + # Example: Dual-write to secondary vlinsert + # remoteWrite: + # - url: "https://secondary-vlinsert:9480/insert/jsonline" + # + # 4. Parsing and enrichment rules + # Example: Extract fields from syslog message + # parsing: + # - type: json + # field: message diff --git a/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 index d653bbcef6..caa70954a6 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 @@ -59,4 +59,19 @@ data: # Add Pod IP label - source_labels: [__meta_kubernetes_pod_ip] - target_label: pod_ip \ No newline at end of file + target_label: pod_ip +{% if hostvars['localhost']['powerscale_configurations']['powerscale_telemetry_support'] | default(false) | bool %} + + # PowerScale OTEL Collector scrape targets (per cluster) +{% for cluster in ps_clusters %} + - job_name: "otel-collector-powerscale-cluster{{ loop.index0 }}" + honor_labels: true + scrape_interval: {{ vmagent.global.scrape_interval }} + static_configs: + - targets: ['otel-collector.{{ telemetry_namespace }}.svc.cluster.local:8889'] + labels: + source: powerscale + cluster: "{{ cluster.clusterName }}" + 
cluster_endpoint: "{{ cluster.endpoint }}" +{% endfor %} +{% endif %} diff --git a/provision/roles/telemetry/vars/main.yml b/provision/roles/telemetry/vars/main.yml index b393423728..7476953350 100644 --- a/provision/roles/telemetry/vars/main.yml +++ b/provision/roles/telemetry/vars/main.yml @@ -99,12 +99,13 @@ victoria: image: "{{ telemetry_images['victoriametrics/victoria-metrics'] | default('victoriametrics/victoria-metrics:v1.128.0') }}" # VictoriaMetrics Cluster Configuration -# Deployment mode is controlled by victoria_configurations.deployment_mode in telemetry_config.yml +# Deployment mode is controlled by victoria_deployment_mode variable (default: cluster) # Supported modes: "single-node" or "cluster" +victoria_deployment_mode: "cluster" # Default deployment mode for VictoriaMetrics victoria_cluster: - # Auto-configured based on telemetry_config.yml + # Auto-configured based on victoria_deployment_mode variable # true = cluster mode, false = single-node mode - enabled: "{{ true if hostvars['localhost']['victoria_configurations']['deployment_mode'] == 'cluster' else false }}" + enabled: "{{ true if victoria_deployment_mode == 'cluster' else false }}" tls_enabled: true # Set to true to enable TLS for cluster components # VMStorage: Stores raw data and returns query results vmstorage: @@ -146,6 +147,89 @@ victoria_cluster: memory: "1Gi" cpu: "1000m" + vmagent: + replicas: 1 + image: "{{ telemetry_images['victoriametrics/vmagent'] | default('victoriametrics/vmagent:v1.128.0') }}" + resources: + requests: + memory: "128Mi" + cpu: "50m" + limits: + memory: "512Mi" + cpu: "250m" + +# ============================================================================ +# VictoriaLogs Cluster Configuration +# ============================================================================ +# Deployed alongside VictoriaMetrics when 'victoria' is in telemetry_collection_type. +# Managed by the same VictoriaMetrics operator via VLCluster and VLAgent CRs. 
+# See VL_cluster_component_spec.md and VL_Agent_component_spec.md for full design. +victoria_logs_cluster: + tls_enabled: true # Enable TLS for all inter-component communication (shared victoria-tls-certs secret) + + # vlstorage: Persistent log storage nodes (StatefulSet managed by operator via VLCluster CR) + vlstorage: + replicas: 3 + image: "{{ telemetry_images['victoriametrics/victoria-logs'] | default('docker.io/victoriametrics/victoria-logs:v1.49.0') }}" + resources: + requests: + memory: "1Gi" + cpu: "250m" + limits: + memory: "2Gi" + cpu: "1000m" + + # vlinsert: Log ingestion gateway (Deployment managed by operator via VLCluster CR) + vlinsert: + replicas: 2 + image: "{{ telemetry_images['victoriametrics/victoria-logs'] | default('docker.io/victoriametrics/victoria-logs:v1.49.0') }}" + external_access: true # Expose vlinsert via LoadBalancer service for external log ingestion + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "1Gi" + cpu: "1000m" + + # vlselect: Log query gateway (Deployment managed by operator via VLCluster CR) + vlselect: + replicas: 2 + image: "{{ telemetry_images['victoriametrics/victoria-logs'] | default('docker.io/victoriametrics/victoria-logs:v1.49.0') }}" + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "1Gi" + cpu: "1000m" + + # VLAgent: Platform-managed log forwarding agent (Deployment managed by operator via VLAgent CR) + vlagent: + replicas: 1 + image: "{{ telemetry_images['victoriametrics/vlagent'] | default('docker.io/victoriametrics/vlagent:v1.49.0') }}" + pvc_size: "5Gi" # Buffer storage for retry during vlinsert unavailability + resources: + requests: + memory: "128Mi" + cpu: "50m" + limits: + memory: "512Mi" + cpu: "250m" + +# VictoriaLogs ports (operator v0.66.1 defaults) +# Note: VictoriaMetrics operator uses these default ports and ignores custom port specifications +# Will uncomment after verifying that the Victoria operator does not take the default ports +#
victoria_logs_ports: +# vlinsert_http: 9481 # Ingestion clients → vlinsert (HTTPS, LoadBalancer) - operator default +# vlselect_http: 9471 # Query clients → vlselect (HTTPS, LoadBalancer) - operator default +# vlstorage_http: 9491 # Health checks and admin API (internal) - operator default +# vlstorage_insert: 9400 # vlinsert → vlstorage data sharding (HTTPS, internal) +# vlstorage_select: 9401 # vlselect → vlstorage query fan-out (HTTPS, internal) +# vlagent_syslog: 514 # Syslog receiver plaintext (TCP+UDP) +# vlagent_syslog_tls: 6514 # Syslog receiver TLS (TCP, RFC 5425) +# vlagent_http: 9429 # VLAgent health checks (distinct from vmagent 8429) + # Telemetry shared path configuration telemetry_share_path: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/telemetry" @@ -173,12 +257,20 @@ vmagent: # Single-node URL remote_write_url: "https://victoria-loadbalancer.telemetry.svc.cluster.local:8443/api/v1/write" # Cluster URL (used when victoria_cluster.enabled: true) - remote_write_url_cluster: > - {% if victoria_cluster.tls_enabled %}https{% else %} - http{% endif %}://vminsert.{{ telemetry_namespace }}.svc.cluster.local:8480/insert/0/prometheus/api/v1/write + # Operator creates service as vminsert-victoria-cluster (not vminsert) + # Protocol depends on tls_enabled: https when TLS is on, http otherwise + remote_write_url_cluster: >- + {{ 'https' if victoria_cluster.tls_enabled else 'http' }}://vminsert-victoria-cluster.{{ + telemetry_namespace }}.svc.cluster.local:8480/insert/0/prometheus/api/v1/write strmzi_kafka_tarball_url: "{{ offline_tarball_path }}/{{ strimzi_kafka_pkg }}/{{ strimzi_kafka_pkg }}.tar.gz" +# Victoria Metrics operator tarball configuration +# Version must match the Helm chart entry in service_k8s.json (victoria-metrics-operator-0.59.3) +# Required for VLCluster and VLAgent CRD support (minimum v0.59.0) +victoria_operator_pkg: "victoria-metrics-operator-0.59.3" +victoria_operator_tarball_url: "{{ offline_tarball_path }}/{{ 
victoria_operator_pkg }}/{{ victoria_operator_pkg }}.tar.gz" + # Usage: validate_idrac_inventory.yml bmc_group_data_filename: "/opt/omnia/telemetry/bmc_group_data.csv" bmc_group_data_headers: "BMC_IP,GROUP_NAME,PARENT" @@ -193,30 +285,42 @@ common_mode: "0755" # Usage: generate_telemetry_deployments.yml - Template lists for different components # Victoria templates - conditional based on victoria_cluster.enabled victoria_templates_common: - - src: 'telemetry/victoria/victoria-tls-secret.yaml.j2' - dest: 'victoria-tls-secret.yaml' - src: 'telemetry/victoria/victoria-vmagent-rbac.yaml.j2' dest: 'victoria-vmagent-rbac.yaml' - - src: 'telemetry/victoria/vmagent-scrape-config.yaml.j2' - dest: 'vmagent-scrape-config.yaml' - - src: 'telemetry/victoria/victoria-agent-deployment.yaml.j2' - dest: 'victoria-agent-deployment.yaml' + +# Operator-based templates (new default) +# Single-node operator template (used when victoria_cluster.enabled: false) +victoria_templates_operator_single: + - src: 'telemetry/victoria/victoria-operator-vmsingle.yaml.j2' + dest: 'victoria-operator-vmsingle.yaml' + - src: 'telemetry/victoria/victoria-operator-vmagent.yaml.j2' + dest: 'victoria-operator-vmagent.yaml' + - src: 'telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2' + dest: 'victoria-operator-vmpodscrape.yaml' + +# Cluster operator template (used when victoria_cluster.enabled: true) +victoria_templates_operator_cluster: + - src: 'telemetry/victoria/victoria-operator-vmcluster.yaml.j2' + dest: 'victoria-operator-vmcluster.yaml' + - src: 'telemetry/victoria/victoria-operator-vmagent.yaml.j2' + dest: 'victoria-operator-vmagent.yaml' + - src: 'telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2' + dest: 'victoria-operator-vmpodscrape.yaml' + +# Legacy manual deployment templates (removed - use operator-based templates above) +# Raw victoria-cluster-vminsert/vmselect/vmstorage.yaml.j2 files have been removed +# in favor of the operator-managed victoria-operator-vmcluster.yaml.j2 
idrac_telemetry_statefulset_path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/idrac_telemetry_statefulset.yaml" # Single-node templates (used when victoria_cluster.enabled: false) victoria_templates_single: + - src: 'telemetry/victoria/victoria-tls-secret.yaml.j2' + dest: 'victoria-tls-secret.yaml' - src: 'telemetry/victoria/victoria-statefulset.yaml.j2' dest: 'victoria-statefulset.yaml' - -# Cluster templates (used when victoria_cluster.enabled: true) -victoria_templates_cluster: - - src: 'telemetry/victoria/victoria-cluster-vmstorage.yaml.j2' - dest: 'victoria-cluster-vmstorage.yaml' - - src: 'telemetry/victoria/victoria-cluster-vminsert.yaml.j2' - dest: 'victoria-cluster-vminsert.yaml' - - src: 'telemetry/victoria/victoria-cluster-vmselect.yaml.j2' - dest: 'victoria-cluster-vmselect.yaml' + - src: 'telemetry/victoria/victoria-agent-deployment.yaml.j2' + dest: 'victoria-agent-deployment.yaml' # Test job template (optional) victoria_templates_test: @@ -227,9 +331,22 @@ victoria_templates_test: # Note: victoria_templates_test is commented out by default in kustomization.yaml.j2 victoria_templates: > {{ victoria_templates_common + - (victoria_templates_cluster if victoria_cluster.enabled else victoria_templates_single) + + (victoria_templates_operator_cluster if victoria_cluster.enabled else victoria_templates_operator_single) + victoria_templates_test }} +# ============================================================================ +# VictoriaLogs Template Lists +# ============================================================================ +# VictoriaLogs operator CR templates (used when 'victoria' in telemetry_collection_type) +# These are applied alongside VictoriaMetrics templates in the same kustomize deployment. 
+victorialogs_templates: + - src: 'telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2' + dest: 'victorialogs-operator-vlcluster.yaml' + - src: 'telemetry/victoria/victorialogs-operator-vlagent.yaml.j2' + dest: 'victorialogs-operator-vlagent.yaml' + - src: 'telemetry/victoria/victorialogs-vlagent-config.yaml.j2' + dest: 'victorialogs-vlagent-config.yaml' + kafka_templates: - src: 'telemetry/kafka/kafka.kafka.yaml.j2' dest: 'kafka.kafka.yaml' @@ -278,3 +395,91 @@ ldms_pod_not_ready_msg: "WARNING: LDMS aggregator pod did not become ready withi ldms_store_pod_ready_msg: "LDMS store daemon pod restarted successfully and is ready" ldms_store_pod_not_ready_msg: "LDMS store daemon pod restart failed or not ready within timeout" ldms_store_restart_wait_seconds: 10 + +# ============================================================================ +# PowerScale Telemetry Configuration +# ============================================================================ +# Usage: deploy_powerscale_metrics.yml +# PowerScale telemetry is deployed via Helm chart (karavi-observability). +# The Helm chart deploys CSM Metrics PowerScale, OTEL Collector, and all +# associated Kubernetes resources. No per-cluster Kustomize manifests needed. +# PowerScale metrics are scraped by the existing shared vmagent. 
+ +# Karavi Observability offline installer variables +# All files are on the NFS share at {{ k8s_client_mount_path }}/karavi-observability/ +karavi_observability_git: "karavi-observability.tar.gz" +karavi_helm_charts_git: "helm-charts.tar.gz" +karavi_helm_chart_path: "{{ k8s_client_mount_path }}/karavi-observability/helm-charts/charts/karavi-observability" +permission_644: "0644" + +# Internal namespace constants (not user-configurable) +csm_namespace: "telemetry" + +# CSM Metrics PowerScale resource limits +csm_metrics_powerscale_resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "500m" + memory: "512Mi" + +# Usage: deploy_powerscale_metrics.yml - messages +ps_csi_driver_not_configured_msg: >- + PowerScale telemetry requires csi_driver_powerscale to be configured in software_config.json. + Please add csi_driver_powerscale to software_config.json and re-run. +ps_helm_values_path_missing_msg: >- + csm_observability_values_file_path is required in telemetry_config.yml when powerscale_configurations.powerscale_telemetry_support is true. + Provide the path to your customized karavi-observability Helm values file. +ps_helm_values_file_not_found_msg: >- + Helm values file not found at '{{ ps_helm_values_file | default('') }}'. + Please create a values file following the Dell CSM Observability documentation + and set the path in telemetry_config.yml (csm_observability_values_file_path). +ps_helm_values_parse_fail_msg: >- + Failed to parse Helm values file at '{{ ps_helm_values_file | default('') }}'. + Please verify the file contains valid YAML. +ps_cert_manager_disabled_msg: >- + cert-manager must be enabled in the CSM Observability Helm values file. + Set 'cert-manager.enabled: true' in {{ ps_helm_values_file | default('') }}. + cert-manager is required for automatic TLS certificate management (otel-collector-tls secret). 
+ps_auth_mode_direct_msg: "PowerScale telemetry authentication mode: Direct Authentication (Mode A)" +ps_auth_mode_karavi_msg: "PowerScale telemetry authentication mode: Karavi Authorization (Mode B)" +ps_csi_secret_read_fail_msg: >- + Failed to load CSI PowerScale driver secret from '{{ ps_csi_secret_path | default('') }}'. + Please verify the file exists and contains valid isilonClusters configuration. +ps_no_clusters_found_msg: >- + No PowerScale clusters found in CSI driver secret.yaml. + Please define at least one cluster in isilonClusters. +ps_clusters_found_msg: >- + Found {{ ps_clusters | default([]) | length }} PowerScale cluster(s) for telemetry deployment. +ps_telemetry_prepared_msg: >- + PowerScale telemetry files prepared on NFS share. + Helm chart: {{ karavi_helm_chart_path }} + Values file: {{ k8s_client_mount_path }}/karavi-observability/csm_metrics_values.yaml + Deployment will occur during cloud-init (PXE boot) on the control plane node. +ps_cert_manager_pkg_msg: >- + cert-manager package from service_k8s.json: {{ cert_manager_package | default('') }} ({{ cert_manager_chart_tgz | default('') }}) +ps_cert_manager_required_msg: "cert-manager dependency download: REQUIRED" +ps_cert_manager_skipped_msg: "cert-manager dependency download: SKIPPED (disabled in values file)" +ps_cert_manager_staged_msg: >- + cert-manager ({{ cert_manager_package | default('') }}) extracted to + {{ karavi_helm_chart_path }}/charts/cert-manager/. + Helm will resolve this dependency during install on the control plane node. +ps_cert_manager_skipped_detail_msg: >- + cert-manager dependency skipped (disabled in values file). + CRDs are bundled in karavi-observability chart and will be applied automatically. +ps_cert_manager_extract_fail_msg: >- + cert-manager chart was not extracted correctly to + {{ karavi_helm_chart_path }}/charts/cert-manager/. + The archive {{ cert_manager_chart_tgz | default('') }} may be corrupt. 
+ps_cert_manager_missing_msg: >- + cert-manager Helm chart ({{ cert_manager_chart_tgz | default('') }}) was not found in + {{ karavi_helm_chart_path }}/charts/. This chart is required for TLS certificate + management. Ensure the Jetstack cert-manager entry exists in service_k8s.json + and run local_repo.yml to download it to Pulp. +ps_dependency_fail_msg: >- + Failed to get karavi-observability dependencies from Pulp. + Ensure the following entries exist in service_k8s.json and run local_repo.yml: + - karavi-observability (git) + - helm-charts (git) + - {{ cert_manager_package | default('cert-manager') }} (tarball from Jetstack Helm repo) diff --git a/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml index 1aa095e66b..9e431f6671 100644 --- a/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml +++ b/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml @@ -50,12 +50,11 @@ - name: Normalize telemetry_config.yml values ansible.builtin.set_fact: telemetry_idrac_telemetry_support: "{{ backup_telemetry_config.idrac_telemetry_support | default(true) }}" - telemetry_idrac_telemetry_collection_type: >- + telemetry_telemetry_collection_type: >- {{ - backup_telemetry_config.idrac_telemetry_collection_type + backup_telemetry_config.telemetry_collection_type | default('victoria,kafka') }} - telemetry_victoria_deployment_mode: "{{ backup_telemetry_victoria_config.deployment_mode | default('cluster') }}" telemetry_victoria_persistence_size: "{{ backup_telemetry_victoria_config.persistence_size | default('8Gi') }}" telemetry_victoria_retention_period: "{{ backup_telemetry_victoria_config.retention_period | default(168) }}" telemetry_kafka_persistence_size: "{{ backup_telemetry_kafka_config.persistence_size | default('8Gi') }}" @@ -112,8 +111,7 @@ mode: "{{ default_file_mode }}" vars: telemetry_idrac_telemetry_support: "{{ 
telemetry_idrac_telemetry_support }}" - telemetry_idrac_telemetry_collection_type: "{{ telemetry_idrac_telemetry_collection_type }}" - telemetry_victoria_deployment_mode: "{{ telemetry_victoria_deployment_mode }}" + telemetry_telemetry_collection_type: "{{ telemetry_telemetry_collection_type }}" telemetry_victoria_persistence_size: "{{ telemetry_victoria_persistence_size }}" telemetry_victoria_retention_period: "{{ telemetry_victoria_retention_period }}" telemetry_kafka_persistence_size: "{{ telemetry_kafka_persistence_size }}" diff --git a/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 b/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 index cb89944e1c..ae57457882 100644 --- a/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 +++ b/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 @@ -82,40 +82,17 @@ idrac_telemetry_support: {{ telemetry_idrac_telemetry_support | default(true) | # - "kafka" : Store in Kafka only # - "victoria,kafka" : Store in both (recommended) # Default: "victoria,kafka" -idrac_telemetry_collection_type: {{ telemetry_idrac_telemetry_collection_type | default('victoria,kafka') | to_json }} +telemetry_collection_type: {{ telemetry_telemetry_collection_type | default('victoria,kafka') | to_json }} # ============================================================================ # VICTORIAMETRICS CONFIGURATION # ============================================================================ # VictoriaMetrics is a time-series database for storing telemetry metrics. -# Used for iDRAC telemetry when 'victoria' is enabled in idrac_telemetry_collection_type. +# Used for iDRAC telemetry when 'victoria' is enabled in telemetry_collection_type. 
# -# DEPLOYMENT MODES: -# - single-node: Simple deployment with one pod (suitable for small deployments) -# - cluster: High-availability deployment with multiple components -# (recommended for production and large-scale deployments) -victoria_configurations: - # VictoriaMetrics deployment mode - # Supported values: - # - "single-node" : Simple deployment (1 pod, suitable for dev/test) - # - "cluster" : High-availability deployment (7 pods, recommended for production) - # Default: "cluster" - # - # Cluster Mode Benefits: - # - High availability (no single point of failure) - # - Horizontal scalability (scale components independently) - # - Better performance (4x ingestion, 2x query speed) - # - Production-ready architecture - # - # Single-Node Benefits: - # - Simple setup (fewer resources) - # - Suitable for small deployments (<10 nodes) - # - Lower resource usage (~4Gi memory vs ~10Gi for cluster) - deployment_mode: {{ telemetry_victoria_deployment_mode | default('cluster') | to_json }} - +victoria_metrics_configurations: # The amount of storage allocated for EACH VictoriaMetrics persistent volume. # IMPORTANT: Total VictoriaMetrics storage depends on deployment mode: - # - Single-node mode: Total storage = persistence_size × 1 pod # - Cluster mode: Total storage = persistence_size × 3 vmstorage pods # - Example (cluster): 8Gi × 3 = 24Gi total VictoriaMetrics storage # Accepted values: in the form of "X[Ki|Mi|Gi|Ti|Pi|Ei]" @@ -130,7 +107,7 @@ victoria_configurations: # KAFKA CONFIGURATION # ============================================================================ # Apache Kafka is a distributed streaming platform for storing telemetry data. -# Used for iDRAC telemetry when 'kafka' is enabled in idrac_telemetry_collection_type. +# Used for iDRAC telemetry when 'kafka' is enabled in telemetry_collection_type. # Also used for LDMS telemetry when LDMS software is configured. 
# # NOTE: Kafka topics are auto-generated based on enabled features: diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml index 260c8376fd..c7a19b3210 100644 --- a/utils/roles/external_victoria_connect_details/tasks/main.yml +++ b/utils/roles/external_victoria_connect_details/tasks/main.yml @@ -33,8 +33,8 @@ ansible.builtin.command: >- kubectl get svc {{ item }} -n {{ victoria_namespace }} -o name loop: - - vminsert - - vmselect + - "{{ victoria_vminsert_svc }}" + - "{{ victoria_vmselect_svc }}" register: victoria_cluster_svcs changed_when: false failed_when: false @@ -63,7 +63,7 @@ - name: Get Victoria pods status ansible.builtin.command: >- kubectl get pods -n {{ victoria_namespace }} - -l "app in (vminsert,vmselect,vmstorage,victoriametrics)" + -l "app.kubernetes.io/instance=victoria-cluster" -o wide register: victoria_pods_wide changed_when: false @@ -111,7 +111,7 @@ - name: Get vminsert service LoadBalancer IP ansible.builtin.command: >- - kubectl get svc vminsert -n {{ victoria_namespace }} + kubectl get svc {{ victoria_vminsert_svc }} -n {{ victoria_namespace }} -o jsonpath='{.status.loadBalancer.ingress[0].ip}' register: vminsert_lb_ip changed_when: false @@ -119,7 +119,7 @@ - name: Get vminsert service LoadBalancer hostname ansible.builtin.command: >- - kubectl get svc vminsert -n {{ victoria_namespace }} + kubectl get svc {{ victoria_vminsert_svc }} -n {{ victoria_namespace }} -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' register: vminsert_lb_hostname changed_when: false @@ -127,7 +127,7 @@ - name: Get vminsert service external port ansible.builtin.command: >- - kubectl get svc vminsert -n {{ victoria_namespace }} + kubectl get svc {{ victoria_vminsert_svc }} -n {{ victoria_namespace }} -o jsonpath='{.spec.ports[0].port}' register: vminsert_lb_port changed_when: false @@ -135,7 +135,7 @@ - name: Get vmselect service LoadBalancer IP ansible.builtin.command: >- - 
kubectl get svc vmselect -n {{ victoria_namespace }} + kubectl get svc {{ victoria_vmselect_svc }} -n {{ victoria_namespace }} -o jsonpath='{.status.loadBalancer.ingress[0].ip}' register: vmselect_lb_ip changed_when: false @@ -143,7 +143,7 @@ - name: Get vmselect service LoadBalancer hostname ansible.builtin.command: >- - kubectl get svc vmselect -n {{ victoria_namespace }} + kubectl get svc {{ victoria_vmselect_svc }} -n {{ victoria_namespace }} -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' register: vmselect_lb_hostname changed_when: false @@ -151,7 +151,7 @@ - name: Get vmselect service external port ansible.builtin.command: >- - kubectl get svc vmselect -n {{ victoria_namespace }} + kubectl get svc {{ victoria_vmselect_svc }} -n {{ victoria_namespace }} -o jsonpath='{.spec.ports[0].port}' register: vmselect_lb_port changed_when: false @@ -185,7 +185,7 @@ ansible.builtin.set_fact: victoria_sfm_hosts_entry: >- {{ - 'echo ' ~ (vminsert_lb_ip.stdout | trim) ~ ' vminsert.' ~ victoria_namespace ~ '.svc.cluster.local >> /etc/hosts' + 'echo ' ~ (vminsert_lb_ip.stdout | trim) ~ ' ' ~ victoria_vminsert_svc ~ '.' ~ victoria_namespace ~ '.svc.cluster.local >> /etc/hosts' if (vminsert_lb_ip.stdout | trim | length) > 0 else '' }} @@ -194,7 +194,7 @@ ansible.builtin.set_fact: victoria_sfm_hosts_entry_vmselect: >- {{ - 'echo ' ~ (vmselect_lb_ip.stdout | trim) ~ ' vmselect.' ~ victoria_namespace ~ '.svc.cluster.local >> /etc/hosts' + 'echo ' ~ (vmselect_lb_ip.stdout | trim) ~ ' ' ~ victoria_vmselect_svc ~ '.' 
~ victoria_namespace ~ '.svc.cluster.local >> /etc/hosts' if (vmselect_lb_ip.stdout | trim | length) > 0 else '' }} @@ -202,11 +202,11 @@ - name: Set endpoint urls and SFM note strings ansible.builtin.set_fact: victoria_vminsert_write_url: >- - https://vminsert.{{ victoria_namespace }}.svc.cluster.local:8480/insert/0/prometheus/api/v1/write + {{ victoria_url_scheme }}://{{ victoria_vminsert_svc }}.{{ victoria_namespace }}.svc.cluster.local:8480/insert/0/prometheus/api/v1/write victoria_vmselect_query_url: >- - https://vmselect.{{ victoria_namespace }}.svc.cluster.local:8481/select/0/prometheus/api/v1/query + {{ victoria_url_scheme }}://{{ victoria_vmselect_svc }}.{{ victoria_namespace }}.svc.cluster.local:8481/select/0/prometheus/api/v1/query victoria_vmselect_ui_url: >- - https://vmselect.{{ victoria_namespace }}.svc.cluster.local:8481/select/0/vmui + {{ victoria_url_scheme }}://{{ victoria_vmselect_svc }}.{{ victoria_namespace }}.svc.cluster.local:8481/select/0/vmui victoria_sfm_hosts_entry_vminsert_display: >- {{ victoria_sfm_hosts_entry @@ -234,7 +234,7 @@ namespace: "{{ victoria_namespace }}" deployment_mode: "{{ victoria_deployment_mode }}" pod_status: "{{ victoria_pods_wide.stdout }}" - base_url: "https://{{ vminsert_host }}:{{ vminsert_port }}" + base_url: "{{ victoria_url_scheme }}://{{ vminsert_host }}:{{ vminsert_port }}" endpoints: vminsert: host: "{{ vminsert_host }}" @@ -243,8 +243,8 @@ vmselect: host: "{{ vmselect_host }}" port: "{{ vmselect_port | int }}" - query_endpoint: "https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/prometheus/api/v1/query" - ui_url: "https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/vmui" + query_endpoint: "{{ victoria_url_scheme }}://{{ vmselect_host }}:{{ vmselect_port }}/select/0/prometheus/api/v1/query" + ui_url: "{{ victoria_url_scheme }}://{{ vmselect_host }}:{{ vmselect_port }}/select/0/vmui" tls: ca_crt: "{{ victoria_tls_ca }}" notes: diff --git 
a/utils/roles/external_victoria_connect_details/vars/main.yml b/utils/roles/external_victoria_connect_details/vars/main.yml index f9a1fb72dd..38833d807a 100644 --- a/utils/roles/external_victoria_connect_details/vars/main.yml +++ b/utils/roles/external_victoria_connect_details/vars/main.yml @@ -17,6 +17,15 @@ victoria_namespace: "telemetry" victoria_output_file: "/opt/omnia/telemetry/external_victoria_connect_details.yml" victoria_tls_cert_dir: "/opt/omnia/telemetry/victoria-certs" +# Operator-managed service names (derived from VMCluster CR name "victoria-cluster") +victoria_vminsert_svc: "vminsert-victoria-cluster" +victoria_vmselect_svc: "vmselect-victoria-cluster" +victoria_vmstorage_svc: "vmstorage-victoria-cluster" + +# TLS configuration (must match victoria_cluster.tls_enabled in telemetry role) +victoria_tls_enabled: true +victoria_url_scheme: "{{ 'https' if victoria_tls_enabled else 'http' }}" + victoria_err_mode_not_supported: >- Victoria deployment mode detected: {{ victoria_deployment_mode }}. External integration is supported only for Victoria cluster mode (vminsert/vmselect/vmstorage). @@ -27,7 +36,7 @@ victoria_err_pods_not_running: "One or more Victoria pods are not in Running sta victoria_err_pods_not_ready: "One or more Victoria pods are not Ready." victoria_err_lb_missing: >- - Failed to fetch Victoria LoadBalancer IP(s). Ensure services 'vminsert' and 'vmselect' + Failed to fetch Victoria LoadBalancer IP(s). Ensure services '{{ victoria_vminsert_svc }}' and '{{ victoria_vmselect_svc }}' exist in namespace '{{ victoria_namespace }}' and have external IPs assigned. 
victoria_preflight_err_ha_config_missing: >- From 772d7b147679b0f3202ed6189cc7723072dd23b1 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Wed, 22 Apr 2026 05:10:56 +0000 Subject: [PATCH 03/15] IB nic ip assignment --- .../validation_flows/provision_validation.py | 67 +++++++++++++++++++ .../pxe_mapping_file.csv | 12 ++-- .../catalog_rhel_json/pxe_mapping_file.csv | 24 ++++--- .../pxe_mapping_file.csv | 22 +++--- .../pxe_mapping_file.csv | 12 ++-- examples/pxe_mapping_file.csv | 22 +++--- input/pxe_mapping_file.csv | 27 ++++---- ...-group-login_compiler_node_aarch64.yaml.j2 | 3 +- ...i-group-login_compiler_node_x86_64.yaml.j2 | 3 +- .../ci-group-login_node_aarch64.yaml.j2 | 3 +- .../ci-group-login_node_x86_64.yaml.j2 | 3 +- ...ce_kube_control_plane_first_x86_64.yaml.j2 | 3 +- ...-service_kube_control_plane_x86_64.yaml.j2 | 3 +- .../ci-group-service_kube_node_x86_64.yaml.j2 | 3 +- ...ci-group-slurm_control_node_x86_64.yaml.j2 | 3 +- .../ci-group-slurm_node_aarch64.yaml.j2 | 3 +- .../ci-group-slurm_node_x86_64.yaml.j2 | 5 +- .../doca-ofed/configure-ib-network.sh.j2 | 58 ++++++++++------ 18 files changed, 188 insertions(+), 88 deletions(-) diff --git a/common/library/module_utils/input_validation/validation_flows/provision_validation.py b/common/library/module_utils/input_validation/validation_flows/provision_validation.py index 16263c7b48..48e40a16cf 100644 --- a/common/library/module_utils/input_validation/validation_flows/provision_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/provision_validation.py @@ -30,6 +30,7 @@ file_names = config.files create_error_msg = validation_utils.create_error_msg create_file_path = validation_utils.create_file_path +ib_mac_re = re.compile(r"^([0-9A-Fa-f]{2}:){7}[0-9A-Fa-f]{2}$") # Expected header columns (case-insensitive) required_headers = [ @@ -271,6 +272,52 @@ def validate_duplicate_admin_ips_in_mapping_file(pxe_mapping_file_path): raise ValueError(f"Duplicate ADMIN_IP found in PXE mapping 
file: {'; '.join(duplicates)}") +def validate_duplicate_ib_ips_in_mapping_file(pxe_mapping_file_path): + """Validates that IB_IP values in the mapping file are unique.""" + if not pxe_mapping_file_path or not os.path.isfile(pxe_mapping_file_path): + raise ValueError(f"PXE mapping file not found: {pxe_mapping_file_path}") + + with open(pxe_mapping_file_path, "r", encoding="utf-8") as fh: + raw_lines = fh.readlines() + + non_comment_lines = [ln for ln in raw_lines if ln.strip()] + reader = csv.DictReader(non_comment_lines) + + fieldname_map = {fn.strip().upper(): fn for fn in reader.fieldnames} + ib_ip_col = fieldname_map.get("IB_IP") + hostname_col = fieldname_map.get("HOSTNAME") + + if not ib_ip_col: + return + + seen_ib_ips = {} + duplicates = [] + + for row_idx, row in enumerate(reader, start=2): + ib_ip = row.get(ib_ip_col, "").strip() if row.get(ib_ip_col) else "" + hostname = "" + if hostname_col: + hostname = row.get(hostname_col, "").strip() if row.get(hostname_col) else "" + + if not ib_ip: + continue + + if ib_ip in seen_ib_ips: + first_row = seen_ib_ips[ib_ip]["row"] + first_host = seen_ib_ips[ib_ip]["hostname"] + dup_host = hostname or "" + first_host_disp = first_host or "" + duplicates.append( + f"'{ib_ip}' at CSV rows {first_row} ({first_host_disp}) and {row_idx} ({dup_host})" + ) + continue + + seen_ib_ips[ib_ip] = {"row": row_idx, "hostname": hostname} + + if duplicates: + raise ValueError(f"Duplicate IB_IP found in PXE mapping file: {'; '.join(duplicates)}") + + def validate_group_parent_service_tag_consistency_in_mapping_file(pxe_mapping_file_path): """Validates that GROUP_NAME has a consistent PARENT_SERVICE_TAG across the mapping file.""" if not pxe_mapping_file_path or not os.path.isfile(pxe_mapping_file_path): @@ -421,6 +468,25 @@ def validate_mapping_file_entries(mapping_file_path): if bmc_ip and not validation_utils.validate_ipv4(bmc_ip): raise ValueError(f"Invalid BMC_IP: '{bmc_ip}' at CSV row {row_idx} in mapping file.") + ib_mac_col = 
fieldname_map.get("IB_MAC") + ib_ip_col = fieldname_map.get("IB_IP") + ib_mac = row.get(ib_mac_col, "").strip() if ib_mac_col and row.get(ib_mac_col) else "" + ib_ip = row.get(ib_ip_col, "").strip() if ib_ip_col and row.get(ib_ip_col) else "" + + if bool(ib_mac) != bool(ib_ip): + raise ValueError( + f"IB_MAC and IB_IP must both be provided or both be empty at CSV row {row_idx} in mapping file." + ) + + if ib_mac and not ib_mac_re.match(ib_mac): + raise ValueError( + f"Invalid IB_MAC: '{ib_mac}' at CSV row {row_idx} in mapping file. " + "Expected format: xx:xx:xx:xx:xx:xx:xx:xx." + ) + + if ib_ip and not validation_utils.validate_ipv4(ib_ip): + raise ValueError(f"Invalid IB_IP: '{ib_ip}' at CSV row {row_idx} in mapping file.") + if not row_seen: raise ValueError("Please provide details in mapping file.") @@ -859,6 +925,7 @@ def validate_provision_config( validate_duplicate_service_tags_in_mapping_file(pxe_mapping_file_path) validate_duplicate_hostnames_in_mapping_file(pxe_mapping_file_path) validate_duplicate_admin_ips_in_mapping_file(pxe_mapping_file_path) + validate_duplicate_ib_ips_in_mapping_file(pxe_mapping_file_path) validate_group_parent_service_tag_consistency_in_mapping_file(pxe_mapping_file_path) validate_functional_groups_separation(pxe_mapping_file_path) validate_parent_service_tag_hierarchy(pxe_mapping_file_path) diff --git a/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv b/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv index 0a350bc72d..295e7615af 100644 --- a/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv +++ b/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv @@ -1,6 +1,6 @@ -FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP 
-slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 -slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 -slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 -login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 \ No newline at end of file +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,94:6d:ae:03:00:8c:12:2c,192.168.0.100 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 +slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 +login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 \ No newline at end of file diff --git a/examples/catalog/mapping_file_software_config/catalog_rhel_json/pxe_mapping_file.csv b/examples/catalog/mapping_file_software_config/catalog_rhel_json/pxe_mapping_file.csv index 6e3e4c6e63..5226b0a19e 100644 --- a/examples/catalog/mapping_file_software_config/catalog_rhel_json/pxe_mapping_file.csv +++ b/examples/catalog/mapping_file_software_config/catalog_rhel_json/pxe_mapping_file.csv @@ -1,11 +1,13 @@ 
-FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP -slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 -slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 -slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 -login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 -service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53 -service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 -service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 -service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 \ No newline at end of file +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,94:6d:ae:03:00:8c:12:2c,192.168.0.100 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 +slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 
+login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 +service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53,94:6d:ae:03:00:8c:12:5f,192.168.0.105 +service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54,94:6d:ae:03:00:8c:12:6a,192.168.0.106 +service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55,94:6d:ae:03:00:8c:12:7b,192.168.0.107 +service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56,94:6d:ae:03:00:8c:12:8c,192.168.0.108 +service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57,94:6d:ae:03:00:8c:12:9d,192.168.0.109 +os_x86_64,grp6,ABEF56,,os-node1,xx:yy:zz:aa:bb:ff,172.16.107.60,xx:yy:zz:aa:bb:ee,172.17.107.60,94:6d:ae:03:00:8c:12:ae,192.168.0.110 +os_aarch64,grp7,ABEF78,,os-node2,xx:yy:zz:aa:bb:ab,172.16.107.61,xx:yy:zz:aa:bb:ac,172.17.107.61,94:6d:ae:03:00:8c:12:bf,192.168.0.111 diff --git a/examples/catalog/mapping_file_software_config/catalog_rhel_with_ucx_openmpi_json/pxe_mapping_file.csv b/examples/catalog/mapping_file_software_config/catalog_rhel_with_ucx_openmpi_json/pxe_mapping_file.csv index 6e3e4c6e63..01360b424b 100644 --- a/examples/catalog/mapping_file_software_config/catalog_rhel_with_ucx_openmpi_json/pxe_mapping_file.csv +++ b/examples/catalog/mapping_file_software_config/catalog_rhel_with_ucx_openmpi_json/pxe_mapping_file.csv @@ -1,11 +1,11 @@ 
-FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP -slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 -slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 -slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 -login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 -service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53 -service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 -service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 -service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 \ No newline at end of file +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,94:6d:ae:03:00:8c:12:2c,192.168.0.100 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 +slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 
+login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 +service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53,94:6d:ae:03:00:8c:12:5f,192.168.0.105 +service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54,94:6d:ae:03:00:8c:12:6a,192.168.0.106 +service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55,94:6d:ae:03:00:8c:12:7b,192.168.0.107 +service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56,94:6d:ae:03:00:8c:12:8c,192.168.0.108 +service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57,94:6d:ae:03:00:8c:12:9d,192.168.0.109 diff --git a/examples/catalog/mapping_file_software_config/catalog_rhel_x86_64_with_slurm_only_json/pxe_mapping_file.csv b/examples/catalog/mapping_file_software_config/catalog_rhel_x86_64_with_slurm_only_json/pxe_mapping_file.csv index 98ad5ab134..65ceac6ada 100644 --- a/examples/catalog/mapping_file_software_config/catalog_rhel_x86_64_with_slurm_only_json/pxe_mapping_file.csv +++ b/examples/catalog/mapping_file_software_config/catalog_rhel_x86_64_with_slurm_only_json/pxe_mapping_file.csv @@ -1,6 +1,6 @@ -FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP -slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 
-slurm_node_x86_64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 -slurm_node_x86_64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 -login_compiler_node_x86_64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 \ No newline at end of file +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,94:6d:ae:03:00:8c:12:2c,192.168.0.100 +slurm_node_x86_64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 +slurm_node_x86_64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 +login_compiler_node_x86_64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 diff --git a/examples/pxe_mapping_file.csv b/examples/pxe_mapping_file.csv index f4d41e2a77..01360b424b 100644 --- a/examples/pxe_mapping_file.csv +++ b/examples/pxe_mapping_file.csv @@ -1,11 +1,11 @@ -FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP -slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 -slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 
-slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 -login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 -service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53 -service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 -service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 -service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,94:6d:ae:03:00:8c:12:2c,192.168.0.100 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 +slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 +login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 
+service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53,94:6d:ae:03:00:8c:12:5f,192.168.0.105 +service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54,94:6d:ae:03:00:8c:12:6a,192.168.0.106 +service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55,94:6d:ae:03:00:8c:12:7b,192.168.0.107 +service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56,94:6d:ae:03:00:8c:12:8c,192.168.0.108 +service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57,94:6d:ae:03:00:8c:12:9d,192.168.0.109 diff --git a/input/pxe_mapping_file.csv b/input/pxe_mapping_file.csv index abb6fc5fe8..e9b5a893f2 100644 --- a/input/pxe_mapping_file.csv +++ b/input/pxe_mapping_file.csv @@ -1,13 +1,14 @@ -FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP -slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 -slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 -slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 -login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 -service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53 
-service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 -service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 -service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 -os_x86_64,grp6,ABEF56,,os-node1,xx:yy:zz:aa:bb:ff,172.16.107.60,xx:yy:zz:aa:bb:ee,172.17.107.60 -os_aarch64,grp7,ABEF78,,os-node2,xx:yy:zz:aa:bb:ab,172.16.107.61,xx:yy:zz:aa:bb:ac,172.17.107.61 +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,, +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,, +slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,, +login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,, +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,, +service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53,, +service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54,, +service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55,, +service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56,, 
+service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57,, +os_x86_64,grp6,ABEF56,,os-node1,xx:yy:zz:aa:bb:ff,172.16.107.60,xx:yy:zz:aa:bb:ee,172.17.107.60,, +os_aarch64,grp7,ABEF78,,os-node2,xx:yy:zz:aa:bb:ab,172.16.107.61,xx:yy:zz:aa:bb:ac,172.17.107.61,, + diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index fe6966c4be..303baf5743 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -289,7 +289,8 @@ - /root/ldms_sampler.sh {% endif %} - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 1ee1fce5e1..fbf39d348c 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -291,7 +291,8 @@ - /root/ldms_sampler.sh {% endif %} - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ 
slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index cdea0cd340..406a50a5a0 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -145,7 +145,8 @@ - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index b744859381..50f85187b1 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -147,7 +147,8 @@ - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user 
}} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index 43c6866ab0..7eb3c72cc1 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -424,7 +424,8 @@ - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - mkdir -p /etc/containers/registries.conf.d - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 index 922f63f852..71f8be3033 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 @@ -332,7 +332,8 @@ - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - mkdir 
-p /etc/containers/registries.conf.d - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 index df98035baa..e363187b58 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 @@ -234,7 +234,8 @@ - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - mkdir -p /etc/containers/registries.conf.d - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index d5f9ef9ba6..d72541d774 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -494,7 +494,8 @@ - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh {% if powervault_config is defined %} - /usr/local/bin/setup_iscsi_storage.sh {% endif %} diff --git 
a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 49e5322195..145f79190d 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -560,7 +560,8 @@ - /usr/local/bin/configure_dirs_and_mounts.sh - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/configure_slurmd_setup.sh - /usr/local/bin/configure_munge_and_pam.sh diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index ccffc5cd9e..65ef5a8b0c 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -565,8 +565,9 @@ - /usr/local/bin/configure_dirs_and_mounts.sh - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/configure_slurmd_setup.sh - /usr/local/bin/configure_munge_and_pam.sh @@ -623,4 +624,4 @@ - /usr/local/bin/export_nvhpc_env.sh - systemctl restart slurmd - - echo "Cloud-Init has completed successfully." \ No newline at end of file + - echo "Cloud-Init has completed successfully." 
diff --git a/provision/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 b/provision/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 index 249b90b6a5..43a000c561 100644 --- a/provision/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 +++ b/provision/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 @@ -7,34 +7,54 @@ if ! lspci | grep -i 'mellanox'; then exit 0 fi +# Ensure IPoIB + Mellanox IB kernel modules are loaded before interface detection. +# This avoids boot-time races where the IB device exists (lspci) but no ib* link is present yet. +modprobe mlx5_ib || true +modprobe ib_ipoib || true +modprobe ib_umad || true +modprobe ib_uverbs || true + ADMIN_NIC_IP="{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}" NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" -IB_NETWORK_SUBNET="{{ hostvars['localhost']['ib_network_subnet'] }}" - -ip_to_int() { - local IFS=. - read -r a b c d <<< "$1" - echo $(( (a << 24) + (b << 16) + (c << 8) + d )) -} +declare -A IB_IP_MAP=( +{% for mac, node in hostvars['localhost']['read_mapping_file']['dict'].items() -%} +{% if node.IB_IP is defined and node.IB_IP | trim | length > 0 %} + ["{{ node.ADMIN_IP }}"]="{{ node.IB_IP }}" +{%- endif %} +{%- endfor %} +) + +IB_IP="${IB_IP_MAP[$ADMIN_NIC_IP]:-}" + +if [ -n "$IB_IP" ]; then + echo "Using explicit IB IP : $IB_IP/$NETMASK_BITS" +else + IB_NETWORK_SUBNET="{{ hostvars['localhost']['ib_network_subnet'] }}" -int_to_ip() { - local ip=$1 - echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" -} + ip_to_int() { + local IFS=. 
+ read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) + } + int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" + } -ADMIN_IP_INT=$(ip_to_int "$ADMIN_NIC_IP") -IB_NET_INT=$(ip_to_int "$IB_NETWORK_SUBNET") + ADMIN_IP_INT=$(ip_to_int "$ADMIN_NIC_IP") + IB_NET_INT=$(ip_to_int "$IB_NETWORK_SUBNET") -HOST_BITS=$(( 32 - NETMASK_BITS )) -HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + HOST_BITS=$(( 32 - NETMASK_BITS )) + HOST_MASK=$(( (1 << HOST_BITS) - 1 )) -HOST_OFFSET=$(( ADMIN_IP_INT & HOST_MASK )) -IB_IP_INT=$(( IB_NET_INT + HOST_OFFSET )) + HOST_OFFSET=$(( ADMIN_IP_INT & HOST_MASK )) + IB_IP_INT=$(( IB_NET_INT + HOST_OFFSET )) -IB_IP=$(int_to_ip "$IB_IP_INT") + IB_IP=$(int_to_ip "$IB_IP_INT") -echo "Derived IB IP : $IB_IP/$NETMASK_BITS" + echo "Derived IB IP : $IB_IP/$NETMASK_BITS" +fi MAX_WAIT=120 # total wait time in seconds (2 minutes) INTERVAL=10 # check every 10 seconds From 19d17ad25505a6a41c5a58d039ac9e4c8dd7f862 Mon Sep 17 00:00:00 2001 From: Mithilesh Reddy Date: Fri, 24 Apr 2026 10:30:12 +0530 Subject: [PATCH 04/15] update MinIO and registry images to fixed tagged versions, omnia core container tag and version to 2.2 and v2.2.0.0 (#4309) * Minimal OS-only functional group enablement for x86_64 and aarch64 * Update image_package_collector.py * Update provision_validation.py * Minimal OS functional group updates in provision * Minimal OS functional group upgrade * Fix os_* package cross-contamination and remove stale discovery templates * OpenCHAMI upgrade changes * Update openchami container tags * Update main.yml * Update main.yml * Update main.yml * Update omnia version and core tag --- omnia.sh | 39 ++++++++----------- .../deploy_containers/openchami/vars/main.yml | 8 ++-- 2 files changed, 21 insertions(+), 26 deletions(-) diff --git a/omnia.sh b/omnia.sh index 85736f9427..c4290b922f 100755 --- a/omnia.sh +++ b/omnia.sh @@ -105,7 +105,7 @@ get_metadata_version() { fi 
} -omnia_release=2.1.0.0 +omnia_release=2.2.0.0 core_container_status=false omnia_path="" @@ -128,7 +128,7 @@ is_local_ip() { } # Version configuration variables -OMNIA_CORE_CONTAINER_TAG="2.1" # Default container tag +OMNIA_CORE_CONTAINER_TAG="2.2" # Default container tag OMNIA_VERSION="" # Will be read from metadata TARGET_OMNIA_VERSION="" # Target version for upgrade TARGET_CONTAINER_TAG="" # Target container tag for upgrade @@ -136,10 +136,8 @@ TARGET_CONTAINER_TAG="" # Target container tag for upgrade # Centralized version list (in chronological order) # Note: Include RC milestones so upgrades from RC to RC/GA appear ALL_OMNIA_VERSIONS=( - "2.0.0.0" - "2.1.0.0-rc1" - "2.1.0.0-rc2" "2.1.0.0" + "2.2.0.0" ) # Container-side paths (used inside podman exec commands) @@ -315,13 +313,13 @@ validate_container_image() { echo -e "${BLUE}Build the required image using the following commands:${NC}" echo "" echo -e "git clone https://github.com/dell/omnia-artifactory.git -b omnia-container-" - echo -e "${YELLOW}Note: Replace with the target Omnia version (e.g., v2.1.0.0)${NC}" + echo -e "${YELLOW}Note: Replace with the target Omnia version (e.g., v2.2.0.0)${NC}" echo "" echo -e "cd omnia-artifactory" echo "" echo -e "./build_images.sh core core_tag= omnia_branch=" - echo -e "${YELLOW}Note: Replace with the target Omnia branch (e.g., v2.1.0.0)${NC}" - echo -e "${YELLOW}Note: core_tag will be the first 2 digits of the target Omnia version (e.g., 2.1 for v2.1.0.0)${NC}" + echo -e "${YELLOW}Note: Replace with the target Omnia branch (e.g., v2.2.0.0)${NC}" + echo -e "${YELLOW}Note: core_tag will be the first 2 digits of the target Omnia version (e.g., 2.2 for v2.2.0.0)${NC}" echo "" echo -e "${BLUE}After the image is built successfully, re-run:${NC}" echo -e "./omnia.sh --$operation" @@ -338,18 +336,15 @@ validate_container_image() { get_container_tag_from_version() { local version="$1" - # Explicit mapping: 2.1.0.0-rc1 stays on pre-GA tag 1.0 - if [[ "$version" == 
"2.1.0.0-rc1" ]]; then - echo "1.0" - return - fi - case "$version" in - 2.0.*) - echo "1.0" + 2.1.*) + echo "2.1" + ;; + 2.2.*) + echo "2.2" ;; *) - # All other versions (including rc2/GA) use major.minor as tag + # All other versions use major.minor as tag echo "$(echo "$version" | awk -F. '{print $1"."$2}')" ;; esac @@ -1167,7 +1162,7 @@ Description=${container_name^} Container [Container] ContainerName=${container_name} HostName=${container_name} -Image=${container_name}:2.1 +Image=${container_name}:${OMNIA_CORE_CONTAINER_TAG} Network=host # Capabilities @@ -1384,19 +1379,19 @@ show_help() { } install_omnia_core() { - # Detect existing Omnia 2.0 installation + # Check for existing installation if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then # Read version from metadata inside container current_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r') - if [ "$current_version" = "2.0.0.0" ]; then - echo -e "${RED}ERROR: Existing Omnia 2.0 installation detected.${NC}" + if [ "$current_version" = "2.1.0.0" ]; then + echo -e "${RED}ERROR: Existing Omnia 2.1 installation detected.${NC}" echo -e "${YELLOW}To upgrade, run: $0 --upgrade${NC}" echo -e "${YELLOW}For a fresh install, first run: $0 --uninstall${NC}" exit 1 fi fi - local omnia_core_tag="2.1" + local omnia_core_tag="$OMNIA_CORE_CONTAINER_TAG" local omnia_core_registry="" # Check if local omnia_core image exists using validate function diff --git a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml index f7234139f7..9f6254a0b3 100644 --- a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml @@ -17,7 +17,7 @@ openchami_git_repo: https://github.com/OpenCHAMI/deployment-recipes.git openchami_share_dir: /opt/omnia/openchami openchami_clone_path: "{{ openchami_share_dir 
}}/deployment-recipes" -openchami_git_version: 92310bd537c05c201d5156137357bb742b09d5e0 +openchami_git_version: bf1f6dfdc7f6107a4227568987faedb1b79b95fa clone_retry: "5" clone_delay: "10" dir_permissions_755: "0755" @@ -48,11 +48,11 @@ openchami_bss_tag: "v1.32.1" openchami_cloud_init_tag: "v1.3.0" openchami_coresmd_tag: "v0.4.0" # Third-party image tags for OpenCHAMI -minio_tag: "latest" +minio_release_tag: "RELEASE.2026-04-17T00-00-00Z" postgres_tag: "11.5-alpine" hydra_tag: "v2.3" haproxy_tag: "latest" -registry_tag: "latest" +registry_tag: "3.1.0" curl_tag: "latest" acme_tag: "3.1.1" @@ -64,7 +64,7 @@ openchami_images: - "ghcr.io/openchami/bss:{{ openchami_bss_tag }}" - "ghcr.io/openchami/cloud-init:{{ openchami_cloud_init_tag }}" - "ghcr.io/openchami/coresmd:{{ openchami_coresmd_tag }}" - - "docker.io/minio/minio:{{ minio_tag }}" + - "docker.io/pgsty/minio:{{ minio_release_tag }}" - "docker.io/library/postgres:{{ postgres_tag }}" - "docker.io/oryd/hydra:{{ hydra_tag }}" - "cgr.dev/chainguard/haproxy:{{ haproxy_tag }}" From 650c76a8222fab4ad52225ba168c38d1b1e2ce17 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Mon, 27 Apr 2026 11:25:02 +0000 Subject: [PATCH 05/15] vast client installation --- ...-group-login_compiler_node_aarch64.yaml.j2 | 74 ++++++++++++++++++- ...i-group-login_compiler_node_x86_64.yaml.j2 | 74 ++++++++++++++++++- .../ci-group-login_node_aarch64.yaml.j2 | 74 ++++++++++++++++++- .../ci-group-login_node_x86_64.yaml.j2 | 74 ++++++++++++++++++- .../ci-group-slurm_node_x86_64.yaml.j2 | 73 +++++++++++++++++- 5 files changed, 364 insertions(+), 5 deletions(-) diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 303baf5743..ad2d292268 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ 
b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -221,6 +221,77 @@ content: | {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure_vast_installation.sh + permissions: '{{ file_mode_755 }}' + content: | + #!/bin/bash + LOGFILE="/var/log/configure_vast_installation.log" + exec > >(tee -a "$LOGFILE") 2>&1 + + echo "[INFO] ===== Starting VAST NFS client installation =====" + + echo "[INFO] Disabling GPG check for dnf package installation" + sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + if [ $? -eq 0 ]; then + echo "[SUCCESS] GPG check disabled in dnf.conf" + else + echo "[ERROR] Failed to disable GPG check in dnf.conf" + exit 1 + fi + + echo "[INFO] Installing vastnfs package with SSL verification disabled" + dnf install -y --setopt=sslverify=0 vastnfs + if [ $? -eq 0 ]; then + echo "[SUCCESS] vastnfs package installed successfully" + else + echo "[ERROR] vastnfs package installation failed" + exit 1 + fi + + echo "[INFO] Running depmod to update module dependencies" + depmod -a + if [ $? 
-eq 0 ]; then + echo "[SUCCESS] Module dependencies updated successfully" + else + echo "[ERROR] Failed to update module dependencies" + exit 1 + fi + + echo "[INFO] Stopping gssproxy and rpcbind services" + systemctl stop gssproxy 2>/dev/null && echo "[INFO] gssproxy stopped" || echo "[WARN] gssproxy was not running or failed to stop" + systemctl stop rpcbind 2>/dev/null && echo "[INFO] rpcbind stopped" || echo "[WARN] rpcbind was not running or failed to stop" + + echo "[INFO] Unmounting RPC pipefs and removing NFS modules" + umount /var/lib/nfs/rpc_pipefs 2>/dev/null && echo "[INFO] RPC pipefs unmounted" || echo "[WARN] RPC pipefs was not mounted or failed to unmount" + rmmod compat_nfs_ssc 2>/dev/null && echo "[INFO] compat_nfs_ssc module removed" || echo "[WARN] compat_nfs_ssc module was not loaded or failed to remove" + rmmod sunrpc 2>/dev/null && echo "[INFO] sunrpc module removed" || echo "[WARN] sunrpc module was not loaded or failed to remove" + + echo "[INFO] Loading sunrpc module and starting rpcbind service" + modprobe sunrpc + if [ $? -eq 0 ]; then + echo "[SUCCESS] sunrpc module loaded successfully" + else + echo "[ERROR] Failed to load sunrpc module" + exit 1 + fi + + systemctl start rpcbind + if [ $? 
-eq 0 ]; then + echo "[SUCCESS] rpcbind service started successfully" + else + echo "[ERROR] Failed to start rpcbind service" + exit 1 + fi + + echo "[INFO] Verifying vastnfs installation" + if lsmod | grep -q vastnfs; then + echo "[SUCCESS] vastnfs module is loaded" + else + echo "[WARN] vastnfs module not found in lsmod, may load on demand" + fi + + echo "[INFO] ===== VAST NFS client installation completed successfully =====" + - path: /tmp/apptainer_mirror.conf permissions: '0644' content: | @@ -254,9 +325,10 @@ - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - /usr/local/bin/configure_vast_installation.sh + - mount -a {% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %} # Add NFS entry and mount diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index fbf39d348c..72af8005c5 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -221,6 +221,77 @@ content: | {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure_vast_installation.sh + permissions: '{{ file_mode_755 }}' + content: | + #!/bin/bash + LOGFILE="/var/log/configure_vast_installation.log" + exec > >(tee -a "$LOGFILE") 2>&1 + + echo "[INFO] ===== Starting VAST NFS client installation =====" + + echo "[INFO] Disabling GPG 
check for dnf package installation" + sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + if [ $? -eq 0 ]; then + echo "[SUCCESS] GPG check disabled in dnf.conf" + else + echo "[ERROR] Failed to disable GPG check in dnf.conf" + exit 1 + fi + + echo "[INFO] Installing vastnfs package with SSL verification disabled" + dnf install -y --setopt=sslverify=0 vastnfs + if [ $? -eq 0 ]; then + echo "[SUCCESS] vastnfs package installed successfully" + else + echo "[ERROR] vastnfs package installation failed" + exit 1 + fi + + echo "[INFO] Running depmod to update module dependencies" + depmod -a + if [ $? -eq 0 ]; then + echo "[SUCCESS] Module dependencies updated successfully" + else + echo "[ERROR] Failed to update module dependencies" + exit 1 + fi + + echo "[INFO] Stopping gssproxy and rpcbind services" + systemctl stop gssproxy 2>/dev/null && echo "[INFO] gssproxy stopped" || echo "[WARN] gssproxy was not running or failed to stop" + systemctl stop rpcbind 2>/dev/null && echo "[INFO] rpcbind stopped" || echo "[WARN] rpcbind was not running or failed to stop" + + echo "[INFO] Unmounting RPC pipefs and removing NFS modules" + umount /var/lib/nfs/rpc_pipefs 2>/dev/null && echo "[INFO] RPC pipefs unmounted" || echo "[WARN] RPC pipefs was not mounted or failed to unmount" + rmmod compat_nfs_ssc 2>/dev/null && echo "[INFO] compat_nfs_ssc module removed" || echo "[WARN] compat_nfs_ssc module was not loaded or failed to remove" + rmmod sunrpc 2>/dev/null && echo "[INFO] sunrpc module removed" || echo "[WARN] sunrpc module was not loaded or failed to remove" + + echo "[INFO] Loading sunrpc module and starting rpcbind service" + modprobe sunrpc + if [ $? -eq 0 ]; then + echo "[SUCCESS] sunrpc module loaded successfully" + else + echo "[ERROR] Failed to load sunrpc module" + exit 1 + fi + + systemctl start rpcbind + if [ $? 
-eq 0 ]; then + echo "[SUCCESS] rpcbind service started successfully" + else + echo "[ERROR] Failed to start rpcbind service" + exit 1 + fi + + echo "[INFO] Verifying vastnfs installation" + if lsmod | grep -q vastnfs; then + echo "[SUCCESS] vastnfs module is loaded" + else + echo "[WARN] vastnfs module not found in lsmod, may load on demand" + fi + + echo "[INFO] ===== VAST NFS client installation completed successfully =====" + - path: /tmp/apptainer_mirror.conf permissions: '0644' content: | @@ -256,9 +327,10 @@ - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - /usr/local/bin/configure_vast_installation.sh + - mount -a {% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %} # Add NFS entry and mount diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index 406a50a5a0..a78108a0af 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -121,6 +121,77 @@ content: | {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure_vast_installation.sh + permissions: '{{ file_mode_755 }}' + content: | + #!/bin/bash + LOGFILE="/var/log/configure_vast_installation.log" + exec > >(tee -a "$LOGFILE") 2>&1 + + echo "[INFO] ===== Starting VAST NFS client installation =====" + + echo "[INFO] Disabling GPG check for dnf package installation" + sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + if [ $? 
-eq 0 ]; then + echo "[SUCCESS] GPG check disabled in dnf.conf" + else + echo "[ERROR] Failed to disable GPG check in dnf.conf" + exit 1 + fi + + echo "[INFO] Installing vastnfs package with SSL verification disabled" + dnf install -y --setopt=sslverify=0 vastnfs + if [ $? -eq 0 ]; then + echo "[SUCCESS] vastnfs package installed successfully" + else + echo "[ERROR] vastnfs package installation failed" + exit 1 + fi + + echo "[INFO] Running depmod to update module dependencies" + depmod -a + if [ $? -eq 0 ]; then + echo "[SUCCESS] Module dependencies updated successfully" + else + echo "[ERROR] Failed to update module dependencies" + exit 1 + fi + + echo "[INFO] Stopping gssproxy and rpcbind services" + systemctl stop gssproxy 2>/dev/null && echo "[INFO] gssproxy stopped" || echo "[WARN] gssproxy was not running or failed to stop" + systemctl stop rpcbind 2>/dev/null && echo "[INFO] rpcbind stopped" || echo "[WARN] rpcbind was not running or failed to stop" + + echo "[INFO] Unmounting RPC pipefs and removing NFS modules" + umount /var/lib/nfs/rpc_pipefs 2>/dev/null && echo "[INFO] RPC pipefs unmounted" || echo "[WARN] RPC pipefs was not mounted or failed to unmount" + rmmod compat_nfs_ssc 2>/dev/null && echo "[INFO] compat_nfs_ssc module removed" || echo "[WARN] compat_nfs_ssc module was not loaded or failed to remove" + rmmod sunrpc 2>/dev/null && echo "[INFO] sunrpc module removed" || echo "[WARN] sunrpc module was not loaded or failed to remove" + + echo "[INFO] Loading sunrpc module and starting rpcbind service" + modprobe sunrpc + if [ $? -eq 0 ]; then + echo "[SUCCESS] sunrpc module loaded successfully" + else + echo "[ERROR] Failed to load sunrpc module" + exit 1 + fi + + systemctl start rpcbind + if [ $? 
-eq 0 ]; then + echo "[SUCCESS] rpcbind service started successfully" + else + echo "[ERROR] Failed to start rpcbind service" + exit 1 + fi + + echo "[INFO] Verifying vastnfs installation" + if lsmod | grep -q vastnfs; then + echo "[SUCCESS] vastnfs module is loaded" + else + echo "[WARN] vastnfs module not found in lsmod, may load on demand" + fi + + echo "[INFO] ===== VAST NFS client installation completed successfully =====" + - path: /tmp/apptainer_mirror.conf permissions: '0644' content: | @@ -142,9 +213,10 @@ - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - /usr/local/bin/configure_vast_installation.sh + - mount -a - bash /usr/local/bin/doca-install.sh || true - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/check_slurm_controller_status.sh diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index 50f85187b1..9090d93ed9 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -120,6 +120,77 @@ content: | {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure_vast_installation.sh + permissions: '{{ file_mode_755 }}' + content: | + #!/bin/bash + LOGFILE="/var/log/configure_vast_installation.log" + exec > >(tee -a "$LOGFILE") 2>&1 + + echo "[INFO] ===== Starting VAST NFS client installation =====" + + echo "[INFO] Disabling GPG check for dnf package installation" + sed -i 
's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + if [ $? -eq 0 ]; then + echo "[SUCCESS] GPG check disabled in dnf.conf" + else + echo "[ERROR] Failed to disable GPG check in dnf.conf" + exit 1 + fi + + echo "[INFO] Installing vastnfs package with SSL verification disabled" + dnf install -y --setopt=sslverify=0 vastnfs + if [ $? -eq 0 ]; then + echo "[SUCCESS] vastnfs package installed successfully" + else + echo "[ERROR] vastnfs package installation failed" + exit 1 + fi + + echo "[INFO] Running depmod to update module dependencies" + depmod -a + if [ $? -eq 0 ]; then + echo "[SUCCESS] Module dependencies updated successfully" + else + echo "[ERROR] Failed to update module dependencies" + exit 1 + fi + + echo "[INFO] Stopping gssproxy and rpcbind services" + systemctl stop gssproxy 2>/dev/null && echo "[INFO] gssproxy stopped" || echo "[WARN] gssproxy was not running or failed to stop" + systemctl stop rpcbind 2>/dev/null && echo "[INFO] rpcbind stopped" || echo "[WARN] rpcbind was not running or failed to stop" + + echo "[INFO] Unmounting RPC pipefs and removing NFS modules" + umount /var/lib/nfs/rpc_pipefs 2>/dev/null && echo "[INFO] RPC pipefs unmounted" || echo "[WARN] RPC pipefs was not mounted or failed to unmount" + rmmod compat_nfs_ssc 2>/dev/null && echo "[INFO] compat_nfs_ssc module removed" || echo "[WARN] compat_nfs_ssc module was not loaded or failed to remove" + rmmod sunrpc 2>/dev/null && echo "[INFO] sunrpc module removed" || echo "[WARN] sunrpc module was not loaded or failed to remove" + + echo "[INFO] Loading sunrpc module and starting rpcbind service" + modprobe sunrpc + if [ $? -eq 0 ]; then + echo "[SUCCESS] sunrpc module loaded successfully" + else + echo "[ERROR] Failed to load sunrpc module" + exit 1 + fi + + systemctl start rpcbind + if [ $? 
-eq 0 ]; then + echo "[SUCCESS] rpcbind service started successfully" + else + echo "[ERROR] Failed to start rpcbind service" + exit 1 + fi + + echo "[INFO] Verifying vastnfs installation" + if lsmod | grep -q vastnfs; then + echo "[SUCCESS] vastnfs module is loaded" + else + echo "[WARN] vastnfs module not found in lsmod, may load on demand" + fi + + echo "[INFO] ===== VAST NFS client installation completed successfully =====" + - path: /tmp/apptainer_mirror.conf permissions: '0644' content: | @@ -144,9 +215,10 @@ - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - /usr/local/bin/configure_vast_installation.sh + - mount -a - bash /usr/local/bin/doca-install.sh || true - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/check_slurm_controller_status.sh diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 65ef5a8b0c..bf5432e34f 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -320,6 +320,77 @@ content: | SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/configure_vast_installation.sh + permissions: '{{ file_mode_755 }}' + content: | + #!/bin/bash + LOGFILE="/var/log/configure_vast_installation.log" + exec > >(tee -a "$LOGFILE") 2>&1 + + echo "[INFO] ===== Starting VAST NFS client installation =====" + + echo "[INFO] Disabling GPG check for dnf package installation" + sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + if [ $? 
-eq 0 ]; then + echo "[SUCCESS] GPG check disabled in dnf.conf" + else + echo "[ERROR] Failed to disable GPG check in dnf.conf" + exit 1 + fi + + echo "[INFO] Installing vastnfs package with SSL verification disabled" + dnf install -y --setopt=sslverify=0 vastnfs + if [ $? -eq 0 ]; then + echo "[SUCCESS] vastnfs package installed successfully" + else + echo "[ERROR] vastnfs package installation failed" + exit 1 + fi + + echo "[INFO] Running depmod to update module dependencies" + depmod -a + if [ $? -eq 0 ]; then + echo "[SUCCESS] Module dependencies updated successfully" + else + echo "[ERROR] Failed to update module dependencies" + exit 1 + fi + + echo "[INFO] Stopping gssproxy and rpcbind services" + systemctl stop gssproxy 2>/dev/null && echo "[INFO] gssproxy stopped" || echo "[WARN] gssproxy was not running or failed to stop" + systemctl stop rpcbind 2>/dev/null && echo "[INFO] rpcbind stopped" || echo "[WARN] rpcbind was not running or failed to stop" + + echo "[INFO] Unmounting RPC pipefs and removing NFS modules" + umount /var/lib/nfs/rpc_pipefs 2>/dev/null && echo "[INFO] RPC pipefs unmounted" || echo "[WARN] RPC pipefs was not mounted or failed to unmount" + rmmod compat_nfs_ssc 2>/dev/null && echo "[INFO] compat_nfs_ssc module removed" || echo "[WARN] compat_nfs_ssc module was not loaded or failed to remove" + rmmod sunrpc 2>/dev/null && echo "[INFO] sunrpc module removed" || echo "[WARN] sunrpc module was not loaded or failed to remove" + + echo "[INFO] Loading sunrpc module and starting rpcbind service" + modprobe sunrpc + if [ $? -eq 0 ]; then + echo "[SUCCESS] sunrpc module loaded successfully" + else + echo "[ERROR] Failed to load sunrpc module" + exit 1 + fi + + systemctl start rpcbind + if [ $? 
-eq 0 ]; then + echo "[SUCCESS] rpcbind service started successfully" + else + echo "[ERROR] Failed to start rpcbind service" + exit 1 + fi + + echo "[INFO] Verifying vastnfs installation" + if lsmod | grep -q vastnfs; then + echo "[SUCCESS] vastnfs module is loaded" + else + echo "[WARN] vastnfs module not found in lsmod, may load on demand" + fi + + echo "[INFO] ===== VAST NFS client installation completed successfully =====" + - path: /usr/local/bin/configure_dirs_and_mounts.sh permissions: '{{ file_mode_755 }}' content: | @@ -561,7 +632,7 @@ - /usr/local/bin/setup_dcgm.sh {% endif %} # slurm user and group created in the users module - + - /usr/local/bin/configure_vast_installation.sh - /usr/local/bin/configure_dirs_and_mounts.sh - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf From 7fe563c7409db1355b503e9ab6500b3eef16ab51 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Mon, 27 Apr 2026 12:43:25 +0000 Subject: [PATCH 06/15] single template way --- ...-group-login_compiler_node_aarch64.yaml.j2 | 69 +-------------- ...i-group-login_compiler_node_x86_64.yaml.j2 | 69 +-------------- .../ci-group-login_node_aarch64.yaml.j2 | 69 +-------------- .../ci-group-login_node_x86_64.yaml.j2 | 69 +-------------- .../ci-group-slurm_node_aarch64.yaml.j2 | 7 ++ .../ci-group-slurm_node_x86_64.yaml.j2 | 69 +-------------- .../vast/configure_vast_installation.sh.j2 | 84 +++++++++++++++++++ 7 files changed, 101 insertions(+), 335 deletions(-) create mode 100644 provision/roles/configure_ochami/templates/vast/configure_vast_installation.sh.j2 diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index ad2d292268..710edfc39c 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ 
b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -222,75 +222,10 @@ {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} - path: /usr/local/bin/configure_vast_installation.sh + owner: root:root permissions: '{{ file_mode_755 }}' content: | - #!/bin/bash - LOGFILE="/var/log/configure_vast_installation.log" - exec > >(tee -a "$LOGFILE") 2>&1 - - echo "[INFO] ===== Starting VAST NFS client installation =====" - - echo "[INFO] Disabling GPG check for dnf package installation" - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - if [ $? -eq 0 ]; then - echo "[SUCCESS] GPG check disabled in dnf.conf" - else - echo "[ERROR] Failed to disable GPG check in dnf.conf" - exit 1 - fi - - echo "[INFO] Installing vastnfs package with SSL verification disabled" - dnf install -y --setopt=sslverify=0 vastnfs - if [ $? -eq 0 ]; then - echo "[SUCCESS] vastnfs package installed successfully" - else - echo "[ERROR] vastnfs package installation failed" - exit 1 - fi - - echo "[INFO] Running depmod to update module dependencies" - depmod -a - if [ $? 
-eq 0 ]; then - echo "[SUCCESS] Module dependencies updated successfully" - else - echo "[ERROR] Failed to update module dependencies" - exit 1 - fi - - echo "[INFO] Stopping gssproxy and rpcbind services" - systemctl stop gssproxy 2>/dev/null && echo "[INFO] gssproxy stopped" || echo "[WARN] gssproxy was not running or failed to stop" - systemctl stop rpcbind 2>/dev/null && echo "[INFO] rpcbind stopped" || echo "[WARN] rpcbind was not running or failed to stop" - - echo "[INFO] Unmounting RPC pipefs and removing NFS modules" - umount /var/lib/nfs/rpc_pipefs 2>/dev/null && echo "[INFO] RPC pipefs unmounted" || echo "[WARN] RPC pipefs was not mounted or failed to unmount" - rmmod compat_nfs_ssc 2>/dev/null && echo "[INFO] compat_nfs_ssc module removed" || echo "[WARN] compat_nfs_ssc module was not loaded or failed to remove" - rmmod sunrpc 2>/dev/null && echo "[INFO] sunrpc module removed" || echo "[WARN] sunrpc module was not loaded or failed to remove" - - echo "[INFO] Loading sunrpc module and starting rpcbind service" - modprobe sunrpc - if [ $? -eq 0 ]; then - echo "[SUCCESS] sunrpc module loaded successfully" - else - echo "[ERROR] Failed to load sunrpc module" - exit 1 - fi - - systemctl start rpcbind - if [ $? 
-eq 0 ]; then - echo "[SUCCESS] rpcbind service started successfully" - else - echo "[ERROR] Failed to start rpcbind service" - exit 1 - fi - - echo "[INFO] Verifying vastnfs installation" - if lsmod | grep -q vastnfs; then - echo "[SUCCESS] vastnfs module is loaded" - else - echo "[WARN] vastnfs module not found in lsmod, may load on demand" - fi - - echo "[INFO] ===== VAST NFS client installation completed successfully =====" + {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} - path: /tmp/apptainer_mirror.conf permissions: '0644' diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 72af8005c5..904f7f5da2 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -222,75 +222,10 @@ {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} - path: /usr/local/bin/configure_vast_installation.sh + owner: root:root permissions: '{{ file_mode_755 }}' content: | - #!/bin/bash - LOGFILE="/var/log/configure_vast_installation.log" - exec > >(tee -a "$LOGFILE") 2>&1 - - echo "[INFO] ===== Starting VAST NFS client installation =====" - - echo "[INFO] Disabling GPG check for dnf package installation" - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - if [ $? -eq 0 ]; then - echo "[SUCCESS] GPG check disabled in dnf.conf" - else - echo "[ERROR] Failed to disable GPG check in dnf.conf" - exit 1 - fi - - echo "[INFO] Installing vastnfs package with SSL verification disabled" - dnf install -y --setopt=sslverify=0 vastnfs - if [ $? 
-eq 0 ]; then - echo "[SUCCESS] vastnfs package installed successfully" - else - echo "[ERROR] vastnfs package installation failed" - exit 1 - fi - - echo "[INFO] Running depmod to update module dependencies" - depmod -a - if [ $? -eq 0 ]; then - echo "[SUCCESS] Module dependencies updated successfully" - else - echo "[ERROR] Failed to update module dependencies" - exit 1 - fi - - echo "[INFO] Stopping gssproxy and rpcbind services" - systemctl stop gssproxy 2>/dev/null && echo "[INFO] gssproxy stopped" || echo "[WARN] gssproxy was not running or failed to stop" - systemctl stop rpcbind 2>/dev/null && echo "[INFO] rpcbind stopped" || echo "[WARN] rpcbind was not running or failed to stop" - - echo "[INFO] Unmounting RPC pipefs and removing NFS modules" - umount /var/lib/nfs/rpc_pipefs 2>/dev/null && echo "[INFO] RPC pipefs unmounted" || echo "[WARN] RPC pipefs was not mounted or failed to unmount" - rmmod compat_nfs_ssc 2>/dev/null && echo "[INFO] compat_nfs_ssc module removed" || echo "[WARN] compat_nfs_ssc module was not loaded or failed to remove" - rmmod sunrpc 2>/dev/null && echo "[INFO] sunrpc module removed" || echo "[WARN] sunrpc module was not loaded or failed to remove" - - echo "[INFO] Loading sunrpc module and starting rpcbind service" - modprobe sunrpc - if [ $? -eq 0 ]; then - echo "[SUCCESS] sunrpc module loaded successfully" - else - echo "[ERROR] Failed to load sunrpc module" - exit 1 - fi - - systemctl start rpcbind - if [ $? 
-eq 0 ]; then - echo "[SUCCESS] rpcbind service started successfully" - else - echo "[ERROR] Failed to start rpcbind service" - exit 1 - fi - - echo "[INFO] Verifying vastnfs installation" - if lsmod | grep -q vastnfs; then - echo "[SUCCESS] vastnfs module is loaded" - else - echo "[WARN] vastnfs module not found in lsmod, may load on demand" - fi - - echo "[INFO] ===== VAST NFS client installation completed successfully =====" + {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} - path: /tmp/apptainer_mirror.conf permissions: '0644' diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index a78108a0af..0db88c90f9 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -122,75 +122,10 @@ {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} - path: /usr/local/bin/configure_vast_installation.sh + owner: root:root permissions: '{{ file_mode_755 }}' content: | - #!/bin/bash - LOGFILE="/var/log/configure_vast_installation.log" - exec > >(tee -a "$LOGFILE") 2>&1 - - echo "[INFO] ===== Starting VAST NFS client installation =====" - - echo "[INFO] Disabling GPG check for dnf package installation" - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - if [ $? -eq 0 ]; then - echo "[SUCCESS] GPG check disabled in dnf.conf" - else - echo "[ERROR] Failed to disable GPG check in dnf.conf" - exit 1 - fi - - echo "[INFO] Installing vastnfs package with SSL verification disabled" - dnf install -y --setopt=sslverify=0 vastnfs - if [ $? 
-eq 0 ]; then - echo "[SUCCESS] vastnfs package installed successfully" - else - echo "[ERROR] vastnfs package installation failed" - exit 1 - fi - - echo "[INFO] Running depmod to update module dependencies" - depmod -a - if [ $? -eq 0 ]; then - echo "[SUCCESS] Module dependencies updated successfully" - else - echo "[ERROR] Failed to update module dependencies" - exit 1 - fi - - echo "[INFO] Stopping gssproxy and rpcbind services" - systemctl stop gssproxy 2>/dev/null && echo "[INFO] gssproxy stopped" || echo "[WARN] gssproxy was not running or failed to stop" - systemctl stop rpcbind 2>/dev/null && echo "[INFO] rpcbind stopped" || echo "[WARN] rpcbind was not running or failed to stop" - - echo "[INFO] Unmounting RPC pipefs and removing NFS modules" - umount /var/lib/nfs/rpc_pipefs 2>/dev/null && echo "[INFO] RPC pipefs unmounted" || echo "[WARN] RPC pipefs was not mounted or failed to unmount" - rmmod compat_nfs_ssc 2>/dev/null && echo "[INFO] compat_nfs_ssc module removed" || echo "[WARN] compat_nfs_ssc module was not loaded or failed to remove" - rmmod sunrpc 2>/dev/null && echo "[INFO] sunrpc module removed" || echo "[WARN] sunrpc module was not loaded or failed to remove" - - echo "[INFO] Loading sunrpc module and starting rpcbind service" - modprobe sunrpc - if [ $? -eq 0 ]; then - echo "[SUCCESS] sunrpc module loaded successfully" - else - echo "[ERROR] Failed to load sunrpc module" - exit 1 - fi - - systemctl start rpcbind - if [ $? 
-eq 0 ]; then - echo "[SUCCESS] rpcbind service started successfully" - else - echo "[ERROR] Failed to start rpcbind service" - exit 1 - fi - - echo "[INFO] Verifying vastnfs installation" - if lsmod | grep -q vastnfs; then - echo "[SUCCESS] vastnfs module is loaded" - else - echo "[WARN] vastnfs module not found in lsmod, may load on demand" - fi - - echo "[INFO] ===== VAST NFS client installation completed successfully =====" + {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} - path: /tmp/apptainer_mirror.conf permissions: '0644' diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index 9090d93ed9..5be03cadce 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -121,75 +121,10 @@ {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} - path: /usr/local/bin/configure_vast_installation.sh + owner: root:root permissions: '{{ file_mode_755 }}' content: | - #!/bin/bash - LOGFILE="/var/log/configure_vast_installation.log" - exec > >(tee -a "$LOGFILE") 2>&1 - - echo "[INFO] ===== Starting VAST NFS client installation =====" - - echo "[INFO] Disabling GPG check for dnf package installation" - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - if [ $? -eq 0 ]; then - echo "[SUCCESS] GPG check disabled in dnf.conf" - else - echo "[ERROR] Failed to disable GPG check in dnf.conf" - exit 1 - fi - - echo "[INFO] Installing vastnfs package with SSL verification disabled" - dnf install -y --setopt=sslverify=0 vastnfs - if [ $? 
-eq 0 ]; then - echo "[SUCCESS] vastnfs package installed successfully" - else - echo "[ERROR] vastnfs package installation failed" - exit 1 - fi - - echo "[INFO] Running depmod to update module dependencies" - depmod -a - if [ $? -eq 0 ]; then - echo "[SUCCESS] Module dependencies updated successfully" - else - echo "[ERROR] Failed to update module dependencies" - exit 1 - fi - - echo "[INFO] Stopping gssproxy and rpcbind services" - systemctl stop gssproxy 2>/dev/null && echo "[INFO] gssproxy stopped" || echo "[WARN] gssproxy was not running or failed to stop" - systemctl stop rpcbind 2>/dev/null && echo "[INFO] rpcbind stopped" || echo "[WARN] rpcbind was not running or failed to stop" - - echo "[INFO] Unmounting RPC pipefs and removing NFS modules" - umount /var/lib/nfs/rpc_pipefs 2>/dev/null && echo "[INFO] RPC pipefs unmounted" || echo "[WARN] RPC pipefs was not mounted or failed to unmount" - rmmod compat_nfs_ssc 2>/dev/null && echo "[INFO] compat_nfs_ssc module removed" || echo "[WARN] compat_nfs_ssc module was not loaded or failed to remove" - rmmod sunrpc 2>/dev/null && echo "[INFO] sunrpc module removed" || echo "[WARN] sunrpc module was not loaded or failed to remove" - - echo "[INFO] Loading sunrpc module and starting rpcbind service" - modprobe sunrpc - if [ $? -eq 0 ]; then - echo "[SUCCESS] sunrpc module loaded successfully" - else - echo "[ERROR] Failed to load sunrpc module" - exit 1 - fi - - systemctl start rpcbind - if [ $? 
-eq 0 ]; then - echo "[SUCCESS] rpcbind service started successfully" - else - echo "[ERROR] Failed to start rpcbind service" - exit 1 - fi - - echo "[INFO] Verifying vastnfs installation" - if lsmod | grep -q vastnfs; then - echo "[SUCCESS] vastnfs module is loaded" - else - echo "[WARN] vastnfs module not found in lsmod, may load on demand" - fi - - echo "[INFO] ===== VAST NFS client installation completed successfully =====" + {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} - path: /tmp/apptainer_mirror.conf permissions: '0644' diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 145f79190d..ef32920faa 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -305,6 +305,12 @@ content: | {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} + - path: /usr/local/bin/configure_vast_installation.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure_dirs_and_mounts.sh permissions: '{{ file_mode_755 }}' content: | @@ -557,6 +563,7 @@ - /usr/local/bin/setup_dcgm.sh {% endif %} + - /usr/local/bin/configure_vast_installation.sh - /usr/local/bin/configure_dirs_and_mounts.sh - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index bf5432e34f..65cb98c9f1 100644 --- 
a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -321,75 +321,10 @@ SLURMD_OPTIONS="{{ conf_server }}" - path: /usr/local/bin/configure_vast_installation.sh + owner: root:root permissions: '{{ file_mode_755 }}' content: | - #!/bin/bash - LOGFILE="/var/log/configure_vast_installation.log" - exec > >(tee -a "$LOGFILE") 2>&1 - - echo "[INFO] ===== Starting VAST NFS client installation =====" - - echo "[INFO] Disabling GPG check for dnf package installation" - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - if [ $? -eq 0 ]; then - echo "[SUCCESS] GPG check disabled in dnf.conf" - else - echo "[ERROR] Failed to disable GPG check in dnf.conf" - exit 1 - fi - - echo "[INFO] Installing vastnfs package with SSL verification disabled" - dnf install -y --setopt=sslverify=0 vastnfs - if [ $? -eq 0 ]; then - echo "[SUCCESS] vastnfs package installed successfully" - else - echo "[ERROR] vastnfs package installation failed" - exit 1 - fi - - echo "[INFO] Running depmod to update module dependencies" - depmod -a - if [ $? 
-eq 0 ]; then - echo "[SUCCESS] Module dependencies updated successfully" - else - echo "[ERROR] Failed to update module dependencies" - exit 1 - fi - - echo "[INFO] Stopping gssproxy and rpcbind services" - systemctl stop gssproxy 2>/dev/null && echo "[INFO] gssproxy stopped" || echo "[WARN] gssproxy was not running or failed to stop" - systemctl stop rpcbind 2>/dev/null && echo "[INFO] rpcbind stopped" || echo "[WARN] rpcbind was not running or failed to stop" - - echo "[INFO] Unmounting RPC pipefs and removing NFS modules" - umount /var/lib/nfs/rpc_pipefs 2>/dev/null && echo "[INFO] RPC pipefs unmounted" || echo "[WARN] RPC pipefs was not mounted or failed to unmount" - rmmod compat_nfs_ssc 2>/dev/null && echo "[INFO] compat_nfs_ssc module removed" || echo "[WARN] compat_nfs_ssc module was not loaded or failed to remove" - rmmod sunrpc 2>/dev/null && echo "[INFO] sunrpc module removed" || echo "[WARN] sunrpc module was not loaded or failed to remove" - - echo "[INFO] Loading sunrpc module and starting rpcbind service" - modprobe sunrpc - if [ $? -eq 0 ]; then - echo "[SUCCESS] sunrpc module loaded successfully" - else - echo "[ERROR] Failed to load sunrpc module" - exit 1 - fi - - systemctl start rpcbind - if [ $? 
-eq 0 ]; then - echo "[SUCCESS] rpcbind service started successfully" - else - echo "[ERROR] Failed to start rpcbind service" - exit 1 - fi - - echo "[INFO] Verifying vastnfs installation" - if lsmod | grep -q vastnfs; then - echo "[SUCCESS] vastnfs module is loaded" - else - echo "[WARN] vastnfs module not found in lsmod, may load on demand" - fi - - echo "[INFO] ===== VAST NFS client installation completed successfully =====" + {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} - path: /usr/local/bin/configure_dirs_and_mounts.sh permissions: '{{ file_mode_755 }}' diff --git a/provision/roles/configure_ochami/templates/vast/configure_vast_installation.sh.j2 b/provision/roles/configure_ochami/templates/vast/configure_vast_installation.sh.j2 new file mode 100644 index 0000000000..627994ef47 --- /dev/null +++ b/provision/roles/configure_ochami/templates/vast/configure_vast_installation.sh.j2 @@ -0,0 +1,84 @@ +#!/bin/bash +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# VAST NFS Client Installation Script +# This script installs and configures the VAST NFS client for high-performance NFS mounts + +LOGFILE="/var/log/configure_vast_installation.log" +exec > >(tee -a "$LOGFILE") 2>&1 + +echo "[INFO] ===== Starting VAST NFS client installation =====" + +echo "[INFO] Disabling GPG check for dnf package installation" +sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf +if [ $? -eq 0 ]; then + echo "[SUCCESS] GPG check disabled in dnf.conf" +else + echo "[ERROR] Failed to disable GPG check in dnf.conf" + exit 1 +fi + +echo "[INFO] Installing vastnfs package with SSL verification disabled" +dnf install -y --setopt=sslverify=0 vastnfs +if [ $? -eq 0 ]; then + echo "[SUCCESS] vastnfs package installed successfully" +else + echo "[ERROR] vastnfs package installation failed" + exit 1 +fi + +echo "[INFO] Running depmod to update module dependencies" +depmod -a +if [ $? -eq 0 ]; then + echo "[SUCCESS] Module dependencies updated successfully" +else + echo "[ERROR] Failed to update module dependencies" + exit 1 +fi + +echo "[INFO] Stopping gssproxy and rpcbind services" +systemctl stop gssproxy 2>/dev/null && echo "[INFO] gssproxy stopped" || echo "[WARN] gssproxy was not running or failed to stop" +systemctl stop rpcbind 2>/dev/null && echo "[INFO] rpcbind stopped" || echo "[WARN] rpcbind was not running or failed to stop" + +echo "[INFO] Unmounting RPC pipefs and removing NFS modules" +umount /var/lib/nfs/rpc_pipefs 2>/dev/null && echo "[INFO] RPC pipefs unmounted" || echo "[WARN] RPC pipefs was not mounted or failed to unmount" +rmmod compat_nfs_ssc 2>/dev/null && echo "[INFO] compat_nfs_ssc module removed" || echo "[WARN] compat_nfs_ssc module was not loaded or failed to remove" +rmmod sunrpc 2>/dev/null && echo "[INFO] sunrpc module removed" || echo "[WARN] sunrpc module was not loaded or failed to remove" + +echo "[INFO] Loading sunrpc module and starting rpcbind service" +modprobe sunrpc +if [ $? 
-eq 0 ]; then + echo "[SUCCESS] sunrpc module loaded successfully" +else + echo "[ERROR] Failed to load sunrpc module" + exit 1 +fi + +systemctl start rpcbind +if [ $? -eq 0 ]; then + echo "[SUCCESS] rpcbind service started successfully" +else + echo "[ERROR] Failed to start rpcbind service" + exit 1 +fi + +echo "[INFO] Verifying vastnfs installation" +if lsmod | grep -q vastnfs; then + echo "[SUCCESS] vastnfs module is loaded" +else + echo "[WARN] vastnfs module not found in lsmod, may load on demand" +fi + +echo "[INFO] ===== VAST NFS client installation completed successfully =====" From e4fc4556596612c479995931ef7bf9d1e25f4d0a Mon Sep 17 00:00:00 2001 From: Sujit Jadhav Date: Tue, 28 Apr 2026 15:26:46 +0530 Subject: [PATCH 07/15] fix(OMN01D-2164): prompt OME credentials only when enable_bmc_discovery is true - Add enable_bmc_discovery flag (default: false) to discovery_config.yml - Load discovery_config.yml in prepare_oim.yml and set ome_discovery_enabled based on enable_bmc_discovery flag - Change discovery credentials from mandatory to conditional_mandatory gated on ome_discovery_enabled - When enable_bmc_discovery is false, OME username/password prompts are skipped during prepare_oim even if ome_ip is pre-filled --- input/discovery_config.yml | 6 +++++- prepare_oim/prepare_oim.yml | 19 +++++++++++++++++++ .../roles/update_config/vars/main.yml | 6 ++++-- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/input/discovery_config.yml b/input/discovery_config.yml index 1e8df8f2a6..3cc563b069 100644 --- a/input/discovery_config.yml +++ b/input/discovery_config.yml @@ -18,7 +18,11 @@ # SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. # *********************************************************************** -#### OME Discovery +#### BMC Discovery +# Set to true to enable BMC discovery via OME. +# When false, OME credentials will not be prompted during prepare_oim. 
+enable_bmc_discovery: false + # IP address of the Dell OpenManage Enterprise (OME) instance used for # server discovery and inventory collection. # Credentials (ome_username, ome_password) are managed separately via diff --git a/prepare_oim/prepare_oim.yml b/prepare_oim/prepare_oim.yml index 942ab57527..08e1d5dfcf 100644 --- a/prepare_oim/prepare_oim.yml +++ b/prepare_oim/prepare_oim.yml @@ -99,6 +99,25 @@ - telemetry_config_stat.stat.exists - telemetry_config.idrac_telemetry_support | default(false) | bool + - name: Check discovery configuration for OME + block: + - name: Check if discovery_config.yml exists + ansible.builtin.stat: + path: "{{ input_project_dir }}/discovery_config.yml" + register: discovery_config_stat + + - name: Load discovery_config.yml + ansible.builtin.include_vars: + file: "{{ input_project_dir }}/discovery_config.yml" + name: discovery_config + when: discovery_config_stat.stat.exists + failed_when: false + + - name: Set ome_discovery_enabled flag + ansible.builtin.set_fact: + ome_discovery_enabled: "{{ discovery_config_stat.stat.exists and (discovery_config.enable_bmc_discovery | default(false) | bool) }}" + cacheable: true + - name: Invoke validate_config.yml to perform L1 and L2 validations with prepare_oim tag ansible.builtin.import_playbook: ../input_validation/validate_config.yml tags: always diff --git a/utils/credential_utility/roles/update_config/vars/main.yml b/utils/credential_utility/roles/update_config/vars/main.yml index 97821e052d..10efcdefc1 100644 --- a/utils/credential_utility/roles/update_config/vars/main.yml +++ b/utils/credential_utility/roles/update_config/vars/main.yml @@ -111,5 +111,7 @@ omnia_credentials: mandatory: - { password: ldms_sampler_password } discovery: - mandatory: - - { username: ome_username, password: ome_password } + conditional_mandatory: + - username: ome_username + password: ome_password + condition: "{{ ome_discovery_enabled | default(false) | bool }}" From 
9430b64f0d4123e49f4cd283a70a83b35b66a718 Mon Sep 17 00:00:00 2001 From: Sujit Jadhav Date: Tue, 28 Apr 2026 15:59:10 +0530 Subject: [PATCH 08/15] fix(OMN01D-2168): fail explicitly when discovery_mechanism is not provided Replace meta: end_play with ansible.builtin.fail so the playbook exits with non-zero status and a clear error message when discovery_mechanism is missing, instead of silently succeeding. --- discovery/discovery.yml | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/discovery/discovery.yml b/discovery/discovery.yml index aa60caf33c..6255162e44 100644 --- a/discovery/discovery.yml +++ b/discovery/discovery.yml @@ -44,15 +44,15 @@ - name: Display usage information when no discovery_mechanism is specified when: discovery_mechanism == "" block: - - name: Show usage prompt - ansible.builtin.debug: + - name: Fail when no discovery_mechanism specified + ansible.builtin.fail: msg: - "============================================================" + - "ERROR: discovery_mechanism is required but not provided." + - "" - "Usage: ansible-playbook discovery.yml" - " -e discovery_mechanism=" - "" - - "Please specify a discovery_mechanism parameter." 
- - "" - "Supported discovery mechanisms:" - " - ome : Dell OpenManage Enterprise (OME)" - " - magellan : Magellan (upcoming, not yet supported)" @@ -62,9 +62,6 @@ - " ansible-playbook discovery.yml -e discovery_mechanism=magellan" - "============================================================" - - name: End play when no discovery_mechanism specified - ansible.builtin.meta: end_play - - name: Validate discovery_mechanism parameter ansible.builtin.fail: msg: | From 3ff9ac379df848988f135cb5d0a2bb3f9abb1d06 Mon Sep 17 00:00:00 2001 From: Sujit Jadhav Date: Wed, 29 Apr 2026 10:05:35 +0530 Subject: [PATCH 09/15] fix(OMN01D-2169): add L1 input validation for discovery.yml - Create discovery_config.json schema with: - enable_bmc_discovery (boolean, required) - ome_ip (string, required; must be valid IPv4 when enable_bmc_discovery is true) - Register discovery_config in config.py files dict and input_file_inventory - Add 'discovery' tag and invoke validate_config.yml in discovery.yml before role execution (consistent with provision.yml pattern) - Add explicit ome_ip check before OME role inclusion for clear fail-fast error when ome_ip is empty with discovery_mechanism=ome --- .../input_validation/common_utils/config.py | 4 +- .../schema/discovery_config.json | 38 +++++++++++++++++++ discovery/discovery.yml | 23 ++++++++++- 3 files changed, 63 insertions(+), 2 deletions(-) create mode 100644 common/library/module_utils/input_validation/schema/discovery_config.json diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py index 7f26f692e4..47990cafdc 100644 --- a/common/library/module_utils/input_validation/common_utils/config.py +++ b/common/library/module_utils/input_validation/common_utils/config.py @@ -58,7 +58,8 @@ "telemetry_config": "telemetry_config.yml", "high_availability_config": "high_availability_config.yml", "build_stream_config": "build_stream_config.yml", - 
"gitlab_config": "gitlab_config.yml" + "gitlab_config": "gitlab_config.yml", + "discovery_config": "discovery_config.yml" # "additional_software": "additional_software.json" } @@ -103,6 +104,7 @@ # "high_availability": [files["high_availability_config"]], # "additional_software": [files["additional_software"]], "build_stream": [files["build_stream_config"]], + "discovery": [files["discovery_config"]], "gitlab": [files["gitlab_config"], files["build_stream_config"]], "all": [ files["local_repo_config"], diff --git a/common/library/module_utils/input_validation/schema/discovery_config.json b/common/library/module_utils/input_validation/schema/discovery_config.json new file mode 100644 index 0000000000..10b41175ac --- /dev/null +++ b/common/library/module_utils/input_validation/schema/discovery_config.json @@ -0,0 +1,38 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Discovery Configuration", + "type": "object", + "properties": { + "enable_bmc_discovery": { + "type": "boolean", + "description": "Enable BMC discovery via OME. When true, ome_ip must be a valid IP address." + }, + "ome_ip": { + "type": "string", + "description": "IP address of the Dell OpenManage Enterprise (OME) instance." + } + }, + "required": [ + "enable_bmc_discovery", + "ome_ip" + ], + "allOf": [ + { + "if": { + "properties": { + "enable_bmc_discovery": { "const": true } + } + }, + "then": { + "properties": { + "ome_ip": { + "type": "string", + "minLength": 7, + "pattern": "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$", + "errorMessage": "ome_ip must be a valid IPv4 address when enable_bmc_discovery is true. 
Example: 192.168.1.100" + } + } + } + } + ] +} diff --git a/discovery/discovery.yml b/discovery/discovery.yml index 6255162e44..8adfa0e5d1 100644 --- a/discovery/discovery.yml +++ b/discovery/discovery.yml @@ -21,6 +21,19 @@ when: not project_dir_status | default(false) | bool ansible.builtin.import_playbook: ../utils/include_input_dir.yml +- name: Set discovery validation tags + hosts: localhost + connection: local + tasks: + - name: Set dynamic run tags for discovery validation + ansible.builtin.set_fact: + omnia_run_tags: "{{ (ansible_run_tags | default([]) + ['discovery']) | unique }}" + cacheable: true + +- name: Invoke validate_config.yml to perform L1 and L2 validations with discovery tag + ansible.builtin.import_playbook: ../input_validation/validate_config.yml + tags: always + - name: Load discovery configuration hosts: localhost connection: local @@ -86,9 +99,17 @@ - name: End play for magellan ansible.builtin.meta: end_play - - name: Handle OME discovery mechanism + - name: Validate OME inputs before discovery when: discovery_mechanism == 'ome' block: + - name: Fail when ome_ip is not configured + ansible.builtin.fail: + msg: >- + ome_ip must be provided in discovery_config.yml when using OME discovery. + Please set 'enable_bmc_discovery: true' and provide a valid 'ome_ip' in + {{ input_project_dir }}/discovery_config.yml. 
+ when: ome_ip | default('') | length == 0 + - name: Include OME discovery role ansible.builtin.include_role: name: ome_discovery From d3238ce2d81c20ab0d87459388f31d42ef476af0 Mon Sep 17 00:00:00 2001 From: Sujit Jadhav Date: Wed, 29 Apr 2026 10:11:39 +0530 Subject: [PATCH 10/15] fix(OMN01D-2225): improve OME authentication and reachability error messages - ome_server_inventory.py: auth failure now tells user to verify ome_username/ome_password in omnia_config_credentials.yml and rerun - collect_inventory.yml: wrap wait_for in block/rescue so timeout gives actionable message pointing to discovery_config.yml and network check --- .../library/modules/ome_server_inventory.py | 6 +++++- .../ome_discovery/tasks/collect_inventory.yml | 19 ++++++++++++++----- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/common/library/modules/ome_server_inventory.py b/common/library/modules/ome_server_inventory.py index a9ed7a3fb0..329f2a1513 100644 --- a/common/library/modules/ome_server_inventory.py +++ b/common/library/modules/ome_server_inventory.py @@ -360,7 +360,11 @@ def main(): try: if not client.authenticate(): - module.fail_json(msg=f"Failed to authenticate with OME at {ome_ip}") + module.fail_json(msg=( + f"Failed to authenticate with OME at {ome_ip}. " + "Please verify the ome_username and ome_password provided in " + "omnia_config_credentials.yml (managed via prepare_oim.yml) and rerun the playbook." 
+ )) devices = client.get_all_devices(device_type) device_group_map, group_debug = client.build_device_group_map() diff --git a/discovery/roles/ome_discovery/tasks/collect_inventory.yml b/discovery/roles/ome_discovery/tasks/collect_inventory.yml index e8651f5180..20a0c47301 100644 --- a/discovery/roles/ome_discovery/tasks/collect_inventory.yml +++ b/discovery/roles/ome_discovery/tasks/collect_inventory.yml @@ -14,11 +14,20 @@ --- - name: Verify OME is reachable - ansible.builtin.wait_for: - host: "{{ ome_ip }}" - port: 443 - timeout: 30 - register: ome_reachability + block: + - name: Wait for OME HTTPS port + ansible.builtin.wait_for: + host: "{{ ome_ip }}" + port: 443 + timeout: 30 + register: ome_reachability + rescue: + - name: Fail with actionable message when OME is unreachable + ansible.builtin.fail: + msg: >- + Unable to reach OME at {{ ome_ip }}:443 within 30 seconds. + Please verify that ome_ip in {{ input_project_dir | default('input') }}/discovery_config.yml + is correct and that the OME appliance is powered on and network-accessible. 
- name: Collect OME server inventory ome_server_inventory: From c0551deb341179a56e79e455e2b7782bf864f336 Mon Sep 17 00:00:00 2001 From: Sujit Jadhav Date: Wed, 29 Apr 2026 10:16:16 +0530 Subject: [PATCH 11/15] fix(OMN01D-2226): correct discovery completion message next steps - Replace misleading 'Rename or copy' instruction with guidance to update pxe_mapping_file_path in provision_config.yml - Show full absolute path of generated file throughout - Add spacing between steps for readability --- discovery/roles/ome_discovery/vars/main.yml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/discovery/roles/ome_discovery/vars/main.yml b/discovery/roles/ome_discovery/vars/main.yml index c2fcf8cc4e..936d95c4b2 100644 --- a/discovery/roles/ome_discovery/vars/main.yml +++ b/discovery/roles/ome_discovery/vars/main.yml @@ -42,12 +42,16 @@ discovery_complete_msg: - "Total servers discovered: {{ discovered_servers | length }}" - "" - "Next Steps:" - - "1. Review and edit the generated file:" + - "1. Review and edit the generated PXE mapping file:" - " {{ pxe_mapping_output_file }}" + - "" - "2. Update HOSTNAME, FUNCTIONAL_GROUP_NAME, GROUP_NAME as needed." - - "3. Rename or copy the file to:" - - " input/pxe_mapping_file.csv" - - "4. Run: ansible-playbook provision/provision.yml" + - "" + - "3. Update the following parameter in provision_config.yml:" + - " pxe_mapping_file_path: {{ pxe_mapping_output_file }}" + - "" + - "4. Run:" + - " ansible-playbook provision/provision.yml" - "============================================================" no_servers_found_msg: | From 4c0c0538bb9314d07e43742e2554bc136a1266dd Mon Sep 17 00:00:00 2001 From: Sujit Jadhav Date: Wed, 29 Apr 2026 10:20:13 +0530 Subject: [PATCH 12/15] fix(OMN01D-2227): escape backslash in docstring to suppress SyntaxWarning Python 3.12+ warns about invalid escape sequence '\d' in non-raw string literals. 
The docstring in extract_su_from_hostname() contained (?=R\d+) which triggered SyntaxWarning during discovery execution. Escaped the backslash to (?=R\\d+) in the docstring. --- common/library/modules/generate_pxe_mapping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/library/modules/generate_pxe_mapping.py b/common/library/modules/generate_pxe_mapping.py index b50cd0775a..d47579505e 100644 --- a/common/library/modules/generate_pxe_mapping.py +++ b/common/library/modules/generate_pxe_mapping.py @@ -111,7 +111,7 @@ def extract_su_from_hostname(bmc_hostname): idrac-SUA99R999OU30C2 -> SUA99 SU1R2OU1C5 -> SU1 idrac-JCGT033 -> '' (service tag pattern, not an SU hostname) - The lookahead (?=R\d+) ensures only genuine SU hostnames match; + The lookahead (?=R\\d+) ensures only genuine SU hostnames match; service-tag-only hostnames like idrac-JCGT033 are ignored. Returns empty string when no SU pattern is found; caller defaults to grp0. """ From ee5addd787a9a625bbc62955336a32501d42d3bb Mon Sep 17 00:00:00 2001 From: Sujit Jadhav Date: Wed, 29 Apr 2026 10:26:08 +0530 Subject: [PATCH 13/15] fix(OMN01D-2230): correct GROUP_NAME and PARENT_SERVICE_TAG in PXE mapping Issue 1 - GROUP_NAME: - Add fallback: try extracting SU pattern from OME group name when BMC hostname has no SU pattern (covers hierarchical OME groups like SU1_slurm_node, SU2_compute, etc.) - grp0 remains the correct default for single-cluster environments Issue 2 - PARENT_SERVICE_TAG: - Define CHILD_ROLES_OF_CONTROL_PLANE set (service_kube_node_x86_64) - Only assign PARENT_SERVICE_TAG to rows whose FUNCTIONAL_GROUP_NAME is a child role of the control plane within the same GROUP_NAME - Control plane nodes, slurm nodes, login nodes, etc. 
no longer get an incorrect PARENT_SERVICE_TAG --- common/library/modules/generate_pxe_mapping.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/common/library/modules/generate_pxe_mapping.py b/common/library/modules/generate_pxe_mapping.py index d47579505e..171e2f6595 100644 --- a/common/library/modules/generate_pxe_mapping.py +++ b/common/library/modules/generate_pxe_mapping.py @@ -103,6 +103,12 @@ DEFAULT_FUNCTIONAL_GROUP = "slurm_node_aarch64" SERVICE_CONTROL_PLANE_GROUP = "service_kube_control_plane_x86_64" +# Roles that have a parent-child relationship with the control plane. +# Only these roles should receive PARENT_SERVICE_TAG. +CHILD_ROLES_OF_CONTROL_PLANE = { + "service_kube_node_x86_64", +} + def extract_su_from_hostname(bmc_hostname): """ @@ -229,8 +235,11 @@ def main(): server_group = server.get('group_name', '').strip() resolved_functional_group = server_group if server_group else functional_group - # Derive GROUP_NAME from SU extracted from BMC hostname + # Derive GROUP_NAME: try SU from BMC hostname first, + # then from OME group name, then fall back to module default (grp0) su_name = extract_su_from_hostname(bmc_hostname) + if not su_name: + su_name = extract_su_from_hostname(server_group) resolved_group_name = su_name if su_name else group_name row = { @@ -256,8 +265,11 @@ def main(): if su and su not in su_control_plane_map: su_control_plane_map[su] = row["SERVICE_TAG"] - # Assign PARENT_SERVICE_TAG from control plane node of the same SU + # Assign PARENT_SERVICE_TAG only to child roles of the control plane + # within the same GROUP_NAME for row in rows: + if row["FUNCTIONAL_GROUP_NAME"] not in CHILD_ROLES_OF_CONTROL_PLANE: + continue su = row["GROUP_NAME"] if su in su_control_plane_map: row["PARENT_SERVICE_TAG"] = su_control_plane_map[su] From 26a0c4d86db8a4495af3bfddd58b49b4b98bfa46 Mon Sep 17 00:00:00 2001 From: Sujit Jadhav Date: Wed, 29 Apr 2026 10:30:23 +0530 Subject: [PATCH 14/15] fix(OMN01D-2231): 
detect and fail on duplicate OME static group assignments - build_device_group_map() now tracks all group memberships per device and returns a conflicts dict for devices in multiple static groups - main() fails with an actionable error listing each conflicting device and its groups, instead of silently using the first-seen group - Prevents incorrect FUNCTIONAL_GROUP_NAME override in PXE mapping --- .../library/modules/ome_server_inventory.py | 41 +++++++++++++++++-- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/common/library/modules/ome_server_inventory.py b/common/library/modules/ome_server_inventory.py index 329f2a1513..a8a0efabb1 100644 --- a/common/library/modules/ome_server_inventory.py +++ b/common/library/modules/ome_server_inventory.py @@ -174,8 +174,15 @@ def build_device_group_map(self): all groups whose ParentId matches that container's Id. Fallback: skip well-known OME system/container group names and use any group that has at least one device. + + Returns (device_group_map, conflicts, debug): + device_group_map: dict mapping device_id -> first group_name + conflicts: dict mapping device_id -> list of all group_names (only for + devices found in more than one static group) + debug: diagnostic info dict """ device_group_map = {} + device_all_groups = {} all_groups_url = f"{self.base_url}/api/GroupService/Groups" all_groups = self.get_paginated(all_groups_url) @@ -221,9 +228,19 @@ def build_device_group_map(self): group_devices = self.get_paginated(devices_url) for gd in group_devices: dev_id = gd.get("Id") - if dev_id and dev_id not in device_group_map: + if not dev_id: + continue + device_all_groups.setdefault(dev_id, []).append(group_name) + if dev_id not in device_group_map: device_group_map[dev_id] = group_name + # Detect devices present in multiple static groups + conflicts = { + dev_id: groups + for dev_id, groups in device_all_groups.items() + if len(groups) > 1 + } + empty_groups = [g.get("Name") for g in target_groups if 
g.get("Name") not in [device_group_map.get(d) for d in device_group_map]] @@ -234,8 +251,9 @@ def build_device_group_map(self): "target_group_names": [g.get("Name") for g in target_groups], "device_ids_mapped": list(device_group_map.keys()), "empty_groups": empty_groups, + "conflicting_device_count": len(conflicts), } - return device_group_map, debug + return device_group_map, conflicts, debug def extract_server_info(client, device, device_group_map=None): @@ -367,7 +385,7 @@ def main(): )) devices = client.get_all_devices(device_type) - device_group_map, group_debug = client.build_device_group_map() + device_group_map, conflicts, group_debug = client.build_device_group_map() if not group_debug["static_container_found"]: module.warn("OME: 'Static Groups' container not found under Custom Groups. " @@ -380,6 +398,23 @@ def main(): module.warn(f"OME: Static group '{grp}' exists but has no devices assigned. " f"Devices in this group will fall back to the default functional group.") + # Fail if any device belongs to multiple static groups + if conflicts: + # Build a human-readable summary keyed by service tag + svc_tag_map = {d.get("Id"): d.get("Identifier") or d.get("DeviceServiceTag", str(d.get("Id"))) + for d in devices} + conflict_lines = [] + for dev_id, groups in conflicts.items(): + tag = svc_tag_map.get(dev_id, str(dev_id)) + conflict_lines.append(f" Device {tag}: member of groups [{', '.join(groups)}]") + module.fail_json(msg=( + "Conflicting OME static group assignments detected. " + "Each server must belong to exactly one static group. " + "The following devices are assigned to multiple groups:\n" + + "\n".join(conflict_lines) + + "\nPlease fix the group assignments in OME and rerun discovery." 
+ )) + server_info_list = [] for device in devices: info = extract_server_info(client, device, device_group_map) From c879c86f012fc2333ead4f00fa4a1de7dec5b406 Mon Sep 17 00:00:00 2001 From: Sujit Jadhav Date: Wed, 29 Apr 2026 10:34:17 +0530 Subject: [PATCH 15/15] fix(OMN01D-2232): validate OME group names against supported functional groups - Define SUPPORTED_FUNCTIONAL_GROUPS set matching Omnia's known roles - Skip servers whose OME static group is not in the supported set - Emit a warning per skipped device listing the unsupported group name and the full set of supported groups - Unsupported groups (e.g. 'abc') no longer appear in the PXE mapping --- .../library/modules/generate_pxe_mapping.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/common/library/modules/generate_pxe_mapping.py b/common/library/modules/generate_pxe_mapping.py index 171e2f6595..3ed372d48a 100644 --- a/common/library/modules/generate_pxe_mapping.py +++ b/common/library/modules/generate_pxe_mapping.py @@ -103,6 +103,24 @@ DEFAULT_FUNCTIONAL_GROUP = "slurm_node_aarch64" SERVICE_CONTROL_PLANE_GROUP = "service_kube_control_plane_x86_64" +# Omnia-supported functional group names. +# Only servers whose OME static group matches one of these will be +# included in the PXE mapping file. +SUPPORTED_FUNCTIONAL_GROUPS = { + "service_kube_control_plane_first_x86_64", + "service_kube_control_plane_x86_64", + "service_kube_node_x86_64", + "login_node_x86_64", + "login_node_aarch64", + "login_compiler_node_x86_64", + "login_compiler_node_aarch64", + "slurm_control_node_x86_64", + "slurm_node_x86_64", + "slurm_node_aarch64", + "os_x86_64", + "os_aarch64", +} + # Roles that have a parent-child relationship with the control plane. # Only these roles should receive PARENT_SERVICE_TAG. 
CHILD_ROLES_OF_CONTROL_PLANE = { @@ -233,6 +251,17 @@ def main(): # Use group_name from OME if available, else fall back to module param default server_group = server.get('group_name', '').strip() + + # Skip servers whose OME group is not a supported Omnia functional group + if server_group and server_group not in SUPPORTED_FUNCTIONAL_GROUPS: + svc_tag = server.get('service_tag', 'unknown') + module.warn( + f"Skipping device {svc_tag}: OME static group '{server_group}' " + f"is not a supported Omnia functional group. " + f"Supported groups: {', '.join(sorted(SUPPORTED_FUNCTIONAL_GROUPS))}" + ) + continue + resolved_functional_group = server_group if server_group else functional_group # Derive GROUP_NAME: try SU from BMC hostname first,