diff --git a/build_stream/orchestrator/common/result_poller.py b/build_stream/orchestrator/common/result_poller.py index 6d35738773..6f40a91965 100644 --- a/build_stream/orchestrator/common/result_poller.py +++ b/build_stream/orchestrator/common/result_poller.py @@ -362,6 +362,11 @@ def _on_result_received(self, result: PlaybookResult) -> None: # S12: On restart failure, still persist node_results.json if result.stage_name == "restart": self._on_restart_completed(result) + self._on_restart_failure(result) + + # On deploy failure, mark ImageGroup FAILED + if result.stage_name == "deploy": + self._on_deploy_failure(result) # On validate failure, mark ImageGroup FAILED if result.stage_name == "validate": @@ -968,3 +973,49 @@ def _on_deploy_failure(self, result: PlaybookResult) -> None: job_id=str(result.job_id), exc_info=True, ) + + def _on_restart_failure(self, result: PlaybookResult) -> None: + """Transition ImageGroup from RESTARTING to FAILED on restart failure.""" + if self._image_group_repo is None: + log_secure_info( + "warning", + f"ImageGroup repo not available; skipping restart failure " + f"update for job={result.job_id}", + job_id=str(result.job_id), + ) + return + + try: + image_group = self._image_group_repo.find_by_job_id( + JobId(str(result.job_id)) + ) + if image_group is None: + log_secure_info( + "error", + f"Restart failure callback: No ImageGroup found for job={result.job_id}.", + job_id=str(result.job_id), + ) + return + + self._image_group_repo.update_status( + image_group_id=image_group.id, + new_status=ImageGroupStatus.FAILED, + ) + + if hasattr(self._image_group_repo, 'session'): + self._image_group_repo.session.commit() + + log_secure_info( + "warning", + f"Restart FAILED for job={result.job_id}. 
" + f"ImageGroup '{image_group.id}' -> FAILED.", + job_id=str(result.job_id), + ) + except Exception as exc: # pylint: disable=broad-except + log_secure_info( + "error", + "Failed to update ImageGroup status on restart " + f"failure for job={result.job_id}: {exc}", + job_id=str(result.job_id), + exc_info=True, + ) diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py index da20edea12..af3c1ffab9 100644 --- a/common/library/module_utils/local_repo/software_utils.py +++ b/common/library/module_utils/local_repo/software_utils.py @@ -38,6 +38,7 @@ CSV_COLUMNS, SOFTWARE_CONFIG_SUBDIR, DEFAULT_STATUS_FILENAME, + STATUS_CSV_HEADER, RPM_LABEL_TEMPLATE, RHEL_OS_URL, SOFTWARES_KEY, @@ -853,6 +854,16 @@ def check_csv_existence(path): def read_status_csv(csv_path): """Reads the status.csv file and returns a list of row dictionaries.""" + # Ensure file has valid header before reading + if os.path.exists(csv_path) and os.path.getsize(csv_path) > 0: + with open(csv_path, 'r', encoding='utf-8') as file: + lines = file.readlines() + if lines and lines[0].strip() != STATUS_CSV_HEADER.strip(): + # Header missing or invalid - prepend header to existing data + with open(csv_path, 'w', encoding='utf-8') as wfile: + wfile.write(STATUS_CSV_HEADER) + wfile.writelines(lines) + with open(csv_path, mode='r', newline='', encoding='utf-8') as file: reader = csv.DictReader(file) return [row for row in reader] diff --git a/common/library/modules/parallel_tasks.py b/common/library/modules/parallel_tasks.py index 20268b10fa..99cc28652a 100644 --- a/common/library/modules/parallel_tasks.py +++ b/common/library/modules/parallel_tasks.py @@ -160,9 +160,19 @@ def determine_function( # Construct the status file path using DEFAULT_STATUS_FILENAME. 
status_file = os.path.join(csv_file_path, DEFAULT_STATUS_FILENAME) + + # Ensure file exists with valid header if not os.path.exists(status_file) or os.stat(status_file).st_size == 0: with open(status_file, 'w', encoding="utf-8") as file: file.write(STATUS_CSV_HEADER) + else: + with open(status_file, 'r', encoding="utf-8") as file: + lines = file.readlines() + if lines and lines[0].strip() != STATUS_CSV_HEADER.strip(): + # Header missing or invalid - prepend header to existing data + with open(status_file, 'w', encoding="utf-8") as wfile: + wfile.write(STATUS_CSV_HEADER) + wfile.writelines(lines) task_type = task.get("type") diff --git a/examples/powerscale_reference_files/secret.yaml b/examples/powerscale_reference_files/CSI_driver/secret.yaml similarity index 100% rename from examples/powerscale_reference_files/secret.yaml rename to examples/powerscale_reference_files/CSI_driver/secret.yaml diff --git a/examples/powerscale_reference_files/values.yaml b/examples/powerscale_reference_files/CSI_driver/values.yaml similarity index 97% rename from examples/powerscale_reference_files/values.yaml rename to examples/powerscale_reference_files/CSI_driver/values.yaml index 2b612e02ea..14826ff22e 100644 --- a/examples/powerscale_reference_files/values.yaml +++ b/examples/powerscale_reference_files/CSI_driver/values.yaml @@ -2,35 +2,35 @@ ######################## # version: version of this values file # Note: Do not change this value -version: "v2.16.0" +version: "v2.17.0" images: # "driver" defines the container image, used for the driver container. 
driver: - image: quay.io/dell/container-storage-modules/csi-isilon:v2.16.0 + image: quay.io/dell/container-storage-modules/csi-isilon:v2.17.0 # CSI sidecars attacher: - image: registry.k8s.io/sig-storage/csi-attacher:v4.10.0 + image: registry.k8s.io/sig-storage/csi-attacher:v4.11.0 provisioner: - image: registry.k8s.io/sig-storage/csi-provisioner:v6.1.0 + image: registry.k8s.io/sig-storage/csi-provisioner:v6.2.0 snapshotter: - image: registry.k8s.io/sig-storage/csi-snapshotter:v8.4.0 + image: registry.k8s.io/sig-storage/csi-snapshotter:v8.5.0 resizer: - image: registry.k8s.io/sig-storage/csi-resizer:v2.0.0 + image: registry.k8s.io/sig-storage/csi-resizer:v2.1.0 registrar: - image: registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.15.0 + image: registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.16.0 healthmonitor: - image: registry.k8s.io/sig-storage/csi-external-health-monitor-controller:v0.16.0 + image: registry.k8s.io/sig-storage/csi-external-health-monitor-controller:v0.17.0 # CSM sidecars replication: - image: quay.io/dell/container-storage-modules/dell-csi-replicator:v1.14.0 + image: quay.io/dell/container-storage-modules/dell-csi-replicator:v1.15.0 podmon: - image: quay.io/dell/container-storage-modules/podmon:v1.15.0 + image: quay.io/dell/container-storage-modules/podmon:v1.16.0 authorization: - image: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.4.0 + image: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0 metadataretriever: - image: quay.io/dell/container-storage-modules/csi-metadata-retriever:v1.13.0 + image: quay.io/dell/container-storage-modules/csi-metadata-retriever:v1.14.0 # CSI driver log level # Allowed values: "error", "warn"/"warning", "info", "debug" @@ -119,7 +119,7 @@ controller: # the Kubernetes release. 
# Allowed values: n, where n > 0 # Default value: None - controllerCount: 2 + controllerCount: 1 # volumeNamePrefix: Prefix of PersistentVolume names created # Allowed values: string @@ -184,7 +184,7 @@ controller: # true: enable volume expansion feature(install resizer sidecar) # false: disable volume snapshot feature(do not install resizer sidecar) # Default value: None - enabled: true + enabled: false healthMonitor: # enabled: Enable/Disable health monitor of CSI volumes- volume status, volume condition @@ -192,7 +192,7 @@ controller: # true: enable checking of health condition of CSI volumes # false: disable checking of health condition of CSI volumes # Default value: None - enabled: false + enabled: true # interval: Interval of monitoring volume health condition # Allowed values: Number followed by unit of time (s,m,h) @@ -301,7 +301,7 @@ node: # true: enable checking of health condition of CSI volumes # false: disable checking of health condition of CSI volumes # Default value: None - enabled: false + enabled: true ## PLATFORM ATTRIBUTES ###################### diff --git a/examples/powerscale_reference_files/powerscale_metrics/values.yaml b/examples/powerscale_reference_files/powerscale_metrics/values.yaml new file mode 100644 index 0000000000..a89148cd79 --- /dev/null +++ b/examples/powerscale_reference_files/powerscale_metrics/values.yaml @@ -0,0 +1,221 @@ +karaviMetricsPowerflex: + image: quay.io/dell/container-storage-modules/csm-metrics-powerflex:v1.15.0 + enabled: false + collectorAddr: otel-collector:55680 + # comma separated list of provisioner names (ex: csi-vxflexos.dellemc.com) + provisionerNames: csi-vxflexos.dellemc.com + # set sdcMetricsEnabled to "false" to disable collection of SDC metrics + sdcMetricsEnabled: "true" + # set polling frequency to the PowerFlex array to get metrics data + sdcPollFrequencySeconds: 10 + volumePollFrequencySeconds: 10 + # set volumeMetricsEnabled to "false" to disable collection of Volume metrics + 
volumeMetricsEnabled: "true" + # set storageClassPoolMetricsEnabled to "false" to disable collection of storage class/pool metrics + storageClassPoolMetricsEnabled: "true" + # set the polling frequency to configure the interval which storage class/pool metrics are gathered + storageClassPoolPollFrequencySeconds: 10 + # set topologyMetricsEnabled to "false" to disable collection of topology metrics + topologyMetricsEnabled: "true" + # set polling frequency to get topology metrics + topologyMetricsPollFrequencySeconds: 30 + # set the default max concurrent queries to PowerFlex + concurrentPowerflexQueries: 10 + # set the default endpoint for PowerFlex service + endpoint: karavi-metrics-powerflex + service: + type: ClusterIP + logLevel: INFO + logFormat: text + authorization: + enabled: false + # sidecarProxy.image: the container image used for the csm-authorization-sidecar. + # Default value: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0 + sidecarProxy: + image: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0 + # proxyHost: hostname of the csm-authorization server + # Default value: None + proxyHost: + # skipCertificateValidation: certificate validation of the csm-authorization server + # Allowed Values: + # "true" - TLS certificate verification will be skipped + # "false" - TLS certificate will be verified + # Default value: "true" + skipCertificateValidation: true + +karaviMetricsPowerstore: + image: quay.io/dell/container-storage-modules/csm-metrics-powerstore:v1.15.0 + enabled: false + collectorAddr: otel-collector:55680 + # comma separated list of provisioner names (ex: csi-powerstore.dellemc.com) + provisionerNames: csi-powerstore.dellemc.com + # set polling frequency to the PowerStore array to get metrics data + volumePollFrequencySeconds: 20 + spacePollFrequencySeconds: 300 + arrayPollFrequencySeconds: 300 + filesystemPollFrequencySeconds: 20 + # apiTimeout: Defines the timeout for PowerStore API calls in 
seconds + # Allowed values: Number followed by unit (s,m,h) + # Examples: 60s, 5m, 1h + # Default value: 120s + apiTimeout: "120s" + # set volumeMetricsEnabled to "false" to disable collection of Volume metrics + volumeMetricsEnabled: "true" + # set the default max concurrent queries to PowerStore + concurrentPowerstoreQueries: 10 + # set topologyMetricsEnabled to "false" to disable collection of topology metrics + topologyMetricsEnabled: "true" + # set polling frequency to get topology metrics + topologyMetricsPollFrequencySeconds: 30 + # set the default endpoint for PowerStore service + endpoint: karavi-metrics-powerstore + service: + type: ClusterIP + logLevel: INFO + logFormat: text + zipkin: + uri: "" + serviceName: metrics-powerstore + probability: 0.0 + authorization: + enabled: false + # sidecarProxy.image: the container image used for the csm-authorization-sidecar. + # Default value: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0 + sidecarProxy: + image: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0 + # proxyHost: hostname of the csm-authorization server + # Default value: None + proxyHost: + # skipCertificateValidation: certificate validation of the csm-authorization server + # Allowed Values: + # "true" - TLS certificate verification will be skipped + # "false" - TLS certificate will be verified + # Default value: "true" + skipCertificateValidation: true + +karaviMetricsPowerscale: + image: quay.io/dell/container-storage-modules/csm-metrics-powerscale:v1.12.0 + enabled: true + collectorAddr: otel-collector:55680 + # comma separated list of provisioner names (ex: csi-isilon.dellemc.com) + provisionerNames: csi-isilon.dellemc.com + # set capacityMetricsEnabled to "false" to disable collection of capacity metrics + capacityMetricsEnabled: "true" + # set performanceMetricsEnabled to "false" to disable collection of performance metrics + performanceMetricsEnabled: "true" + # set topologyMetricsEnabled to 
"false" to disable collection of topology metrics + topologyMetricsEnabled: "true" + # set polling frequency to get cluster capacity metrics data + clusterCapacityPollFrequencySeconds: 30 + # set polling frequency to get cluster performance data + clusterPerformancePollFrequencySeconds: 20 + # set polling frequency to get quota capacity metrics data + quotaCapacityPollFrequencySeconds: 30 + # set polling frequency to get topology metrics + topologyMetricsPollFrequencySeconds: 30 + # set the default max concurrent queries to PowerScale + concurrentPowerscaleQueries: 10 + # set the default endpoint for PowerScale service + endpoint: karavi-metrics-powerscale + service: + type: ClusterIP + logLevel: INFO + logFormat: text + # isiClientOptions to access Powerscale OneFS API server + isiClientOptions: + # set isiSkipCertificateValidation to true/false to skip/verify OneFS API server's certificates + # default isiSkipCertificateValidation: true to skip OneFS API server's certificates + isiSkipCertificateValidation: true + # set isiAuthType to 0/1 to enables session-based/basic Authentication + # default isiAuthType: 0 to use session-based Authentication + isiAuthType: 1 + # set isiLogVerbose to 0/1/2 decide High/Medium/Low content of the OneFS REST API message should be logged in debug level logs + # default isiLogVerbose: 0 to log full content of the HTTP request and response + isiLogVerbose: 0 + authorization: + enabled: false + # sidecarProxy.image: the container image used for the csm-authorization-sidecar. 
+ # Default value: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0 + sidecarProxy: + image: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0 + # proxyHost: hostname of the csm-authorization server + # Default value: None + proxyHost: + # skipCertificateValidation: certificate validation of the csm-authorization server + # Allowed Values: + # "true" - TLS certificate verification will be skipped + # "false" - TLS certificate will be verified + # Default value: "true" + skipCertificateValidation: true + +karaviMetricsPowermax: + image: quay.io/dell/container-storage-modules/csm-metrics-powermax:v1.10.0 + enabled: false + collectorAddr: otel-collector:55680 + # comma separated list of provisioner names (ex: csi-powermax.dellemc.com) + provisionerNames: csi-powermax.dellemc.com + # set capacityMetricsEnabled to "false" to disable collection of capacity metrics + capacityMetricsEnabled: "true" + # set performanceMetricsEnabled to "false" to disable collection of performance metrics + performanceMetricsEnabled: "true" + # set polling frequency to get capacity metrics data for volume, storagegroup, srp and array + capacityPollFrequencySeconds: 3600 + # set polling frequency to get performance metrics data for volume, storagegroup + performancePollFrequencySeconds: 300 + # set the default max concurrent queries to PowerMax + concurrentPowermaxQueries: 10 + # set topologyMetricsEnabled to "false" to disable collection of topology metrics + topologyMetricsEnabled: "true" + # set polling frequency to get topology metrics + topologyMetricsPollFrequencySeconds: 300 + # set the default endpoint for PowerMax service + endpoint: karavi-metrics-powermax + # useSecret + # Defines if a Secret should be used to provide Unisphere for PowerMax endpoints + # and login credentials instead of the deprecated powermax-reverseproxy-config ConfigMap. 
+ # If set to true, the contents of the secret specified by defaultCredentialsSecret + # will be used, in the new format, to specify Unisphere for PowerMax endpoints, array IDs, + # and login credentials. If set to false, the deprecated ConfigMap will be automatically + # created and used. + # Default value: false + useSecret: false + # defaultCredentialsSecret + # The name of the Kubernetes Secret containing the details of the PowerMax arrays, + # their Unisphere endpoints and their login credentials if useSecret is set to true. + # Default value: "" + defaultCredentialsSecret: "" + service: + type: ClusterIP + logLevel: INFO + logFormat: text + authorization: + enabled: false + # sidecarProxy.image: the container image used for the csm-authorization-sidecar. + # Default value: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0 + sidecarProxy: + image: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0 + # proxyHost: hostname of the csm-authorization server + # Default value: None + proxyHost: + # skipCertificateValidation: certificate validation of the csm-authorization server + # Allowed Values: + # "true" - TLS certificate verification will be skipped + # "false" - TLS certificate will be verified + # Default value: "true" + skipCertificateValidation: true + +otelCollector: + image: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector:0.150.1 + service: + type: ClusterIP + nginxProxy: + image: nginxinc/nginx-unprivileged:1.29 +# Karavi-observability requires cert-manager. If cert-manager is already present in cluster, set enabled to false not to install it. 
+cert-manager: + enabled: true + startupapicheck: + enabled: false + serviceAccount: + create: false +# Optionally, uncomment and specify the name of the pre-created namespace to install the module in it +# namespace: \ No newline at end of file diff --git a/provision/roles/configure_ochami/tasks/validate_additional_cloud_init.yml b/provision/roles/configure_ochami/tasks/validate_additional_cloud_init.yml index 65747e39a4..50b9545c53 100644 --- a/provision/roles/configure_ochami/tasks/validate_additional_cloud_init.yml +++ b/provision/roles/configure_ochami/tasks/validate_additional_cloud_init.yml @@ -33,6 +33,12 @@ additional_cloud_init_fg_names: [] when: additional_cloud_init_file_path == '' +- name: Create cloud-init directory + ansible.builtin.file: + path: "{{ cloud_init_dir }}" + state: directory + mode: "{{ hostvars['localhost']['dir_permissions_755'] }}" + - name: Load additional cloud-init config when: additional_cloud_init_file_path != '' block: diff --git a/provision/roles/telemetry/tasks/apply_telemetry_on_upgrade.yml b/provision/roles/telemetry/tasks/apply_telemetry_on_upgrade.yml deleted file mode 100644 index 0cdb4bd2cb..0000000000 --- a/provision/roles/telemetry/tasks/apply_telemetry_on_upgrade.yml +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - -- name: Apply telemetry configurations for upgrade - when: - - kube_vip is defined - - kube_vip | length > 0 - - idrac_telemetry_support | default(false) | bool - block: - - name: Check if telemetry deployment file exists - ansible.builtin.stat: - path: "{{ idrac_telemetry_statefulset_path }}" - register: telemetry_stat - - - name: Get current iDRAC telemetry StatefulSet configuration - kubernetes.core.k8s_info: - api_version: apps/v1 - kind: StatefulSet - name: idrac-telemetry - namespace: "{{ telemetry_namespace }}" - register: current_idrac_statefulset - failed_when: false - when: - - telemetry_stat.stat.exists | default(false) - - - name: Set replica count as fact - ansible.builtin.set_fact: - preserved_replica_count: "{{ current_idrac_statefulset.resources[0].spec.replicas | default(1) }}" - when: - - current_idrac_statefulset.resources is defined and current_idrac_statefulset.resources | length > 0 - - - name: Show current replica count - ansible.builtin.debug: - msg: "Current replica count: {{ preserved_replica_count }}" - verbosity: 2 - when: - - preserved_replica_count is defined - - - name: Read iDRAC telemetry StatefulSet YAML file - ansible.builtin.slurp: - src: "{{ idrac_telemetry_statefulset_path }}" - register: idrac_statefulset_yaml - - - name: Update StatefulSet definition with preserved replica count - ansible.builtin.set_fact: - updated_statefulset_definition: "{{ idrac_statefulset_yaml.content | b64decode | regex_replace('---\\n', '') | from_yaml | combine({'spec': {'replicas': preserved_replica_count | int}}, recursive=true) }}" # noqa: yaml[line-length] - when: - - telemetry_stat.stat.exists | default(false) - - preserved_replica_count is defined - - - name: Apply iDRAC telemetry StatefulSet with preserved replica count - kubernetes.core.k8s: - state: present - definition: "{{ updated_statefulset_definition }}" - register: kubectl_apply_result - when: - - updated_statefulset_definition is defined - - telemetry_stat.stat.exists | 
default(false) - - - name: Display kubectl apply result - ansible.builtin.debug: - msg: "{{ kubectl_apply_result }}" - when: - - kubectl_apply_result is defined - - - name: Wait for idrac telemetry receiver to be ready - kubernetes.core.k8s_info: - api_version: v1 - kind: Pod - namespace: "{{ telemetry_namespace }}" - label_selectors: - - "app=idrac-telemetry-receiver" - wait: true - wait_condition: - type: Ready - status: "True" - wait_timeout: 120 - delegate_to: "{{ kube_vip }}" - register: idrac_telemetry_receiver_ready - failed_when: false - when: - - idrac_telemetry_support | default(false) | bool - - - name: Display idrac telemetry receiver ready status - ansible.builtin.debug: - msg: "{{ idrac_telemetry_receiver_ready }}" - when: - - idrac_telemetry_support | default(false) | bool - - idrac_telemetry_receiver_ready is defined - -- name: Apply LDMS configurations for upgrade - when: - - kube_vip is defined - - kube_vip | length > 0 - - ldms_support | default(false) | bool - block: - - name: Check if LDMS aggregator is running on service k8s cluster - kubernetes.core.k8s_info: - api_version: apps/v1 - kind: StatefulSet - name: nersc-ldms-aggr - namespace: "{{ telemetry_namespace }}" - delegate_to: "{{ kube_vip }}" - register: ldms_statefulset_info - failed_when: false - - - name: Set LDMS running state - ansible.builtin.set_fact: - ldms_running: "{{ ldms_statefulset_info.resources is defined and ldms_statefulset_info.resources | length > 0 }}" - - - name: Check if LDMS store daemon is running on service k8s cluster - kubernetes.core.k8s_info: - api_version: v1 - kind: Pod - namespace: "{{ telemetry_namespace }}" - label_selectors: - - "app=nersc-ldms-store" - delegate_to: "{{ kube_vip }}" - register: ldms_store_pod_info - failed_when: false - when: - - ldms_running | default(false) | bool - - - name: Set LDMS store daemon running state - ansible.builtin.set_fact: - ldms_store_running: "{{ ldms_store_pod_info.resources is defined and 
ldms_store_pod_info.resources | length > 0 }}" - when: - - ldms_running | default(false) | bool - - - name: Restart LDMS store daemon pod - kubernetes.core.k8s: - state: absent - api_version: v1 - kind: Pod - name: "{{ ldms_store_pod_info.resources[0].metadata.name }}" - namespace: "{{ telemetry_namespace }}" - delegate_to: "{{ kube_vip }}" - failed_when: false - when: - - ldms_store_running | default(false) | bool - - - name: Wait for LDMS store daemon pod to be ready after restart - kubernetes.core.k8s_info: - api_version: v1 - kind: Pod - namespace: "{{ telemetry_namespace }}" - label_selectors: - - "app=nersc-ldms-store" - wait: true - wait_condition: - type: Ready - status: "True" - wait_timeout: 120 - delegate_to: "{{ kube_vip }}" - register: ldms_store_pod_ready - failed_when: false - when: - - ldms_store_running | default(false) | bool - - - name: Display LDMS store daemon restart status - ansible.builtin.debug: - msg: > - {{ ldms_store_pod_ready_msg - if (ldms_store_pod_ready.resources | default([]) | length > 0) - else ldms_store_pod_not_ready_msg }} - when: - - ldms_store_running | default(false) | bool - - - name: Check if decomp.json exists - ansible.builtin.stat: - path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/scripts/decomp.json" - register: decomp_json_stat - - - name: Copy decompose.json if it doesn't exist - ansible.builtin.copy: - src: files/scripts/decomp.json - dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/scripts/decomp.json" - mode: "{{ hostvars['localhost']['file_permissions_644'] }}" - when: not decomp_json_stat.stat.exists - - - name: Restart LDMS aggregator StatefulSet - kubernetes.core.k8s: - state: present - definition: - apiVersion: apps/v1 - kind: StatefulSet - metadata: - name: nersc-ldms-aggr - namespace: "{{ telemetry_namespace }}" - spec: - template: - metadata: - annotations: - kubectl.kubernetes.io/restartedAt: "{{ ansible_date_time.iso8601 
}}" - delegate_to: "{{ kube_vip }}" - failed_when: false - when: - - ldms_running | default(false) | bool - - ldms_conf_file.stat.exists | default(false) - - ldms_bin_file.stat.exists | default(false) - - - name: Wait for LDMS aggregator pod to be ready after restart - kubernetes.core.k8s_info: - api_version: v1 - kind: Pod - namespace: "{{ telemetry_namespace }}" - label_selectors: - - "app=nersc-ldms-aggr" - wait: true - wait_condition: - type: Ready - status: "True" - wait_timeout: 120 - delegate_to: "{{ kube_vip }}" - register: ldms_pod_ready - failed_when: false - when: - - ldms_running | default(false) | bool - - ldms_conf_file.stat.exists | default(false) - - ldms_bin_file.stat.exists | default(false) - - - name: Display LDMS aggregator restart status - ansible.builtin.debug: - msg: "{{ ldms_pod_ready_msg if (ldms_pod_ready.resources | default([]) | length > 0) else ldms_pod_not_ready_msg }}" - when: - - ldms_running | default(false) | bool - - ldms_conf_file.stat.exists | default(false) - - ldms_bin_file.stat.exists | default(false) diff --git a/provision/roles/telemetry/tasks/derive_sink_support_flags.yml b/provision/roles/telemetry/tasks/derive_sink_support_flags.yml index 3e59602e44..7f2767d20a 100644 --- a/provision/roles/telemetry/tasks/derive_sink_support_flags.yml +++ b/provision/roles/telemetry/tasks/derive_sink_support_flags.yml @@ -68,34 +68,52 @@ additional_remote_write_endpoints: "{{ telemetry_config.powerscale_configurations.additional_remote_write_endpoints | default([]) }}" when: telemetry_config.powerscale_configurations is defined -- name: Check if any source targets victoria_metrics +- name: Check if any enabled source targets victoria_metrics ansible.builtin.set_fact: victoria_metrics_support: true cacheable: true when: >- - 'victoria_metrics' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([])) or - 'victoria_metrics' in (telemetry_config.telemetry_sources.powerscale.collection_targets | default([])) or - 
'victoria_metrics' in (telemetry_config.telemetry_sources.ufm.collection_targets | default([])) or - 'victoria_metrics' in (telemetry_config.telemetry_sources.vast.collection_targets | default([])) + ((telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) and + 'victoria_metrics' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([]))) or + (((telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool)) and + 'victoria_metrics' in (telemetry_config.telemetry_sources.powerscale.collection_targets | default([]))) or + (((telemetry_config.telemetry_sources.ufm.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ufm.logs_enabled | default(false) | bool)) and + 'victoria_metrics' in (telemetry_config.telemetry_sources.ufm.collection_targets | default([]))) or + (((telemetry_config.telemetry_sources.vast.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.vast.logs_enabled | default(false) | bool)) and + 'victoria_metrics' in (telemetry_config.telemetry_sources.vast.collection_targets | default([]))) -- name: Check if any source targets victoria_logs +- name: Check if any enabled source targets victoria_logs ansible.builtin.set_fact: victoria_logs_support: true cacheable: true when: >- - 'victoria_logs' in (telemetry_config.telemetry_sources.powerscale.collection_targets | default([])) or - 'victoria_logs' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([])) or - 'victoria_logs' in (telemetry_config.telemetry_sources.ufm.collection_targets | default([])) or - 'victoria_logs' in (telemetry_config.telemetry_sources.vast.collection_targets | default([])) + (((telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) 
| bool)) and + 'victoria_logs' in (telemetry_config.telemetry_sources.powerscale.collection_targets | default([]))) or + ((telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) and + 'victoria_logs' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([]))) or + (((telemetry_config.telemetry_sources.ufm.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ufm.logs_enabled | default(false) | bool)) and + 'victoria_logs' in (telemetry_config.telemetry_sources.ufm.collection_targets | default([]))) or + (((telemetry_config.telemetry_sources.vast.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.vast.logs_enabled | default(false) | bool)) and + 'victoria_logs' in (telemetry_config.telemetry_sources.vast.collection_targets | default([]))) -- name: Check if any source targets Kafka +- name: Check if any enabled source targets Kafka ansible.builtin.set_fact: kafka_support: true cacheable: true when: >- - 'kafka' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([])) or - 'kafka' in (telemetry_config.telemetry_sources.ldms.collection_targets | default([])) or - 'kafka' in (telemetry_config.telemetry_sources.ome.collection_targets | default([])) + ((telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) and + 'kafka' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([]))) or + ((telemetry_config.telemetry_sources.ldms.metrics_enabled | default(false) | bool) and + 'kafka' in (telemetry_config.telemetry_sources.ldms.collection_targets | default([]))) or + (((telemetry_config.telemetry_sources.ome.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ome.logs_enabled | default(false) | bool)) and + 'kafka' in (telemetry_config.telemetry_sources.ome.collection_targets | default([]))) # ============================================================================= # 
VECTOR BRIDGE LOGIC - Determine sink requirements based on Vector bridges @@ -142,17 +160,17 @@ - name: Set global variable for telemetry_enabled ansible.builtin.set_fact: telemetry_enabled: true - when: > - idrac_telemetry_support or - powerscale_metrics_enabled or - powerscale_log_enabled or - victoria_metrics_support or - victoria_logs_support or - ldms_support or - kafka_support or - ufm_telemetry_support or - ufm_log_enabled or - vast_telemetry_support or - vast_log_enabled or - ome_metrics_enabled or - ome_logs_enabled + when: >- + (telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ldms.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ufm.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ufm.logs_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.vast.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.vast.logs_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ome.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ome.logs_enabled | default(false) | bool) or + (telemetry_config.telemetry_bridges.vector_ldms.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_bridges.vector_ome.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_bridges.vector_ome.logs_enabled | default(false) | bool) diff --git a/provision/roles/telemetry/tasks/main.yml b/provision/roles/telemetry/tasks/main.yml index c513480a37..8a7e9f6ab2 100644 --- a/provision/roles/telemetry/tasks/main.yml +++ b/provision/roles/telemetry/tasks/main.yml @@ -27,8 +27,28 @@ - name: Derive sink support flags from collection_targets 
ansible.builtin.include_tasks: derive_sink_support_flags.yml +- name: Set pulp server facts for cloud-init templates + when: + - hostvars['localhost']['service_k8s_support'] | default(false) | bool + block: + - name: Run pulp status command on omnia_core container + ansible.builtin.command: /usr/local/bin/pulp status + delegate_to: localhost + changed_when: false + register: pulp_status_output + + - name: Set pulp content origin value + ansible.builtin.set_fact: + pulp_content_origin: "{{ (pulp_status_output.stdout | from_json).content_settings.content_origin }}" + + - name: Set pulp_server_ip fact + ansible.builtin.set_fact: + pulp_server_ip: "{{ pulp_content_origin | urlsplit('hostname') }}" + - name: Configure service_k8s telemetry services - when: hostvars['localhost']['service_k8s_support'] | default(false) | bool + when: + - hostvars['localhost']['service_k8s_support'] | default(false) | bool + - telemetry_enabled | default(false) | bool block: - name: Read telemetry packages from software config ansible.builtin.include_tasks: read_software_config.yml @@ -148,8 +168,3 @@ - telemetry_enabled | default(false) | bool tags: - telemetry_deployment - - # - name: Apply telemetry configurations on upgrade - # ansible.builtin.include_tasks: apply_telemetry_on_upgrade.yml - # when: - # - hostvars['localhost']['upgrade_enabled'] | default(false) | bool diff --git a/provision/roles/telemetry/tasks/read_software_config.yml b/provision/roles/telemetry/tasks/read_software_config.yml index a50607e4ed..36300d0a52 100644 --- a/provision/roles/telemetry/tasks/read_software_config.yml +++ b/provision/roles/telemetry/tasks/read_software_config.yml @@ -13,20 +13,6 @@ # limitations under the License. 
--- -- name: Run pulp status command on omnia_core container - ansible.builtin.command: /usr/local/bin/pulp status - delegate_to: localhost - changed_when: false - register: pulp_status_output - -- name: Set pulp content origin value - ansible.builtin.set_fact: - pulp_content_origin: "{{ (pulp_status_output.stdout | from_json).content_settings.content_origin }}" - -- name: Set fact for pulp protocol - ansible.builtin.set_fact: - pulp_server_ip: "{{ pulp_content_origin | urlsplit('hostname') }}" - - name: Get cluster_os_type from software_config.json ansible.builtin.set_fact: cluster_os_type: "{{ software_config['cluster_os_type'] }}" diff --git a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml index d725cf067a..addac543ab 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml @@ -117,11 +117,7 @@ - name: Display LoadBalancer IP injection status ansible.builtin.debug: - msg: >- - {{ victoria_lb_ips_preserved - if (preserved_vminsert_ip | default('') | length > 0) - or (preserved_vmselect_ip | default('') | length > 0) - else victoria_lb_ips_not_preserved }} + msg: "{{ victoria_lb_ip_injection_status }}" # ── Apply main CR (VMCluster only — 2.2 cluster mode only) ── - name: Apply VMCluster CR (cluster mode only) with retry @@ -135,6 +131,125 @@ delegate_to: "{{ kube_vip }}" connection: ssh +# ── Wait for VMCluster LoadBalancer IPs and reclaim if reassigned ── +# The operator creates vminsert/vmselect services asynchronously after the CR is applied. +# We MUST wait for these services to get their LoadBalancer IPs BEFORE Phase 3 +# (telemetry.sh) runs, because telemetry.sh also creates VictoriaLogs services via +# kubectl apply -k. If VL services are created before VM services exist, MetalLB +# assigns the freed IPs to VL services, leaving VM services in <pending> state.
+# +# If the preserved IPs got assigned to wrong services, we reclaim them: +# 1. Find services holding the preserved IPs that are NOT vminsert/vmselect +# 2. Delete those conflicting services to free the IPs +# 3. Wait for vminsert/vmselect to reclaim the preserved IPs + +- name: Initial wait for vminsert LoadBalancer IP + ansible.builtin.shell: | + kubectl -n {{ telemetry_namespace }} get svc vminsert-{{ new_vmcluster_name }} \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo "" + register: vminsert_lb_ip + until: vminsert_lb_ip.stdout | trim | length > 0 + retries: "{{ lb_ip_wait_retries }}" + delay: "{{ lb_ip_wait_delay }}" + changed_when: false + failed_when: false + delegate_to: "{{ kube_vip }}" + connection: ssh + +- name: Initial wait for vmselect LoadBalancer IP + ansible.builtin.shell: | + kubectl -n {{ telemetry_namespace }} get svc vmselect-{{ new_vmcluster_name }} \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo "" + register: vmselect_lb_ip + until: vmselect_lb_ip.stdout | trim | length > 0 + retries: "{{ lb_ip_wait_retries }}" + delay: "{{ lb_ip_wait_delay }}" + changed_when: false + failed_when: false + delegate_to: "{{ kube_vip }}" + connection: ssh + +# ── Reclaim reassigned IPs if VMCluster services are still pending ── +- name: Reclaim preserved IPs from conflicting services + when: + - preserved_vminsert_ip | default('') | length > 0 or preserved_vmselect_ip | default('') | length > 0 + - vminsert_lb_ip.stdout | trim | length == 0 or vmselect_lb_ip.stdout | trim | length == 0 + block: + - name: Stage IP conflict detection script + ansible.builtin.template: + src: find_ip_conflict_svcs.sh.j2 + dest: "{{ ip_conflict_script_path }}" + mode: "{{ executable_mode }}" + delegate_to: "{{ kube_vip }}" + connection: ssh + + - name: Find services holding preserved IPs that are not VMCluster services + ansible.builtin.command: "{{ ip_conflict_script_path }}" + register: ip_conflict_svcs + changed_when: 
false + failed_when: false + delegate_to: "{{ kube_vip }}" + connection: ssh + + - name: Remove IP conflict detection script + ansible.builtin.file: + path: "{{ ip_conflict_script_path }}" + state: absent + delegate_to: "{{ kube_vip }}" + connection: ssh + + - name: Display services holding preserved IPs + ansible.builtin.debug: + msg: "{{ victoria_lb_ip_conflict_svcs_found }}" + when: ip_conflict_svcs.stdout_lines | default([]) | select() | list | length > 0 + + - name: Delete conflicting services holding preserved IPs + ansible.builtin.command: + cmd: kubectl -n {{ telemetry_namespace }} delete svc {{ item }} --timeout=30s + loop: "{{ ip_conflict_svcs.stdout_lines | default([]) | select() | list }}" + changed_when: true + failed_when: false + when: ip_conflict_svcs.stdout_lines | default([]) | select() | list | length > 0 + delegate_to: "{{ kube_vip }}" + connection: ssh + + - name: Wait for vminsert to reclaim preserved IP + ansible.builtin.shell: | + kubectl -n {{ telemetry_namespace }} get svc vminsert-{{ new_vmcluster_name }} \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo "" + register: vminsert_lb_ip + until: vminsert_lb_ip.stdout | trim | length > 0 + retries: "{{ lb_ip_wait_retries }}" + delay: "{{ lb_ip_wait_delay }}" + changed_when: false + failed_when: false + delegate_to: "{{ kube_vip }}" + connection: ssh + + - name: Wait for vmselect to reclaim preserved IP + ansible.builtin.shell: | + kubectl -n {{ telemetry_namespace }} get svc vmselect-{{ new_vmcluster_name }} \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo "" + register: vmselect_lb_ip + until: vmselect_lb_ip.stdout | trim | length > 0 + retries: "{{ lb_ip_wait_retries }}" + delay: "{{ lb_ip_wait_delay }}" + changed_when: false + failed_when: false + delegate_to: "{{ kube_vip }}" + connection: ssh + +- name: Display confirmed LoadBalancer IPs + ansible.builtin.debug: + msg: "{{ victoria_lb_ip_confirmed }}" + +- name: Warn if LoadBalancer IPs 
still not assigned after reclaim + ansible.builtin.debug: + msg: "{{ victoria_lb_ip_reclaim_failed }}" + when: >- + (vminsert_lb_ip is defined and vminsert_lb_ip.stdout is defined and vminsert_lb_ip.stdout | trim | length == 0) or + (vmselect_lb_ip is defined and vmselect_lb_ip.stdout is defined and vmselect_lb_ip.stdout | trim | length == 0) + # ── Apply scrape and agent CRs ── - name: Check for VMScrape manifest ansible.builtin.stat: diff --git a/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml b/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml index 23648e2ed6..b290bc639a 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml @@ -252,6 +252,8 @@ # ── Cleanup old pre-operator services and deployments ── # The operator creates new services with different names (e.g. vminsert-victoria-cluster), # so the old standalone services become stale and waste LoadBalancer IPs. + # Old services MUST be deleted BEFORE applying VMCluster CR so MetalLB can + # assign the same IPs to the new operator-managed services via loadBalancerIP. - name: Find old pre-operator services ansible.builtin.shell: | set -o pipefail @@ -285,6 +287,5 @@ - name: Display old resource cleanup summary ansible.builtin.debug: - msg: - - "Old services deleted: {{ old_services.stdout_lines | default([]) | select() | list }}" - - "Old vmagent deployment cleanup attempted: {{ old_vmagent_deployment }}" + msg: "{{ victoria_old_svc_cleanup_summary }}" + verbosity: 2 diff --git a/upgrade/roles/upgrade_telemetry/templates/find_ip_conflict_svcs.sh.j2 b/upgrade/roles/upgrade_telemetry/templates/find_ip_conflict_svcs.sh.j2 new file mode 100644 index 0000000000..c21217afad --- /dev/null +++ b/upgrade/roles/upgrade_telemetry/templates/find_ip_conflict_svcs.sh.j2 @@ -0,0 +1,45 @@ +#!/bin/bash +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Find services in the telemetry namespace that are holding LoadBalancer IPs +# which should belong to VMCluster services (vminsert/vmselect). +# This can happen when MetalLB reassigns freed IPs to other services +# before the VMCluster services are created by the operator. +# +# Usage: bash find_ip_conflict_svcs.sh +# Output: One service name per line (services holding conflicting IPs) + +set -o pipefail + +PRESERVED_IPS="{{ preserved_vminsert_ip | default('') }} {{ preserved_vmselect_ip | default('') }}" +VMCLUSTER_SVCS="vminsert-{{ new_vmcluster_name }} vmselect-{{ new_vmcluster_name }}" + +kubectl -n {{ telemetry_namespace }} get svc -o json 2>/dev/null | \ + python3 -c " +import json, sys +data = json.load(sys.stdin) +preserved = set('${PRESERVED_IPS}'.split()) +vmcluster = set('${VMCLUSTER_SVCS}'.split()) +for svc in data.get('items', []): + name = svc['metadata']['name'] + if name in vmcluster: + continue + ingress = svc.get('status', {}).get('loadBalancer', {}).get('ingress', []) + for ing in ingress: + ip = ing.get('ip', '') + if ip in preserved: + print(name) + break +" || true diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml b/upgrade/roles/upgrade_telemetry/vars/main.yml index a869e5a52a..5d51a1a057 100644 --- a/upgrade/roles/upgrade_telemetry/vars/main.yml +++ b/upgrade/roles/upgrade_telemetry/vars/main.yml @@ -45,6 +45,11 @@ pod_wait_delay: 15 idrac_rollout_retries: 3 idrac_rollout_delay: 30 +# 
LoadBalancer IP wait configuration +lb_ip_wait_retries: 30 +lb_ip_wait_delay: 5 +ip_conflict_script_path: /tmp/find_ip_conflict_svcs.sh + # Victoria operator configuration # victoria_operator_pkg is loaded dynamically from service_k8s JSON in include_required_input.yml victoria_operator_release_name: victoria-metrics-operator @@ -105,7 +110,34 @@ victoria_lb_ips_preserved: >- LoadBalancer IPs injected into VMCluster manifest - vminsert: {{ preserved_vminsert_ip | default('N/A') }}, vmselect: {{ preserved_vmselect_ip | default('N/A') }} +victoria_lb_ip_injection_status: >- + {{ victoria_lb_ips_preserved + if (preserved_vminsert_ip | default('') | length > 0) + or (preserved_vmselect_ip | default('') | length > 0) + else victoria_lb_ips_not_preserved }} victoria_lb_ips_not_preserved: "No old LoadBalancer IPs found to preserve (fresh deploy or already operator-managed)" +victoria_lb_ip_confirmed: >- + VMCluster LoadBalancer IPs confirmed - + vminsert-{{ new_vmcluster_name }}: {{ vminsert_lb_ip.stdout | default('PENDING') | trim }}, + vmselect-{{ new_vmcluster_name }}: {{ vmselect_lb_ip.stdout | default('PENDING') | trim }} +victoria_lb_ip_reclaim_needed: >- + VMCluster services still pending after initial wait. + Checking if preserved IPs were assigned to wrong services... +victoria_lb_ip_conflict_svcs_found: >- + Services holding preserved IPs (will be deleted and re-created by telemetry.sh): + {{ ip_conflict_svcs.stdout_lines | default([]) | select() | list }} +victoria_lb_ip_reclaim_success: >- + Successfully reclaimed preserved IPs for VMCluster services - + vminsert-{{ new_vmcluster_name }}: {{ vminsert_lb_ip.stdout | default('PENDING') | trim }}, + vmselect-{{ new_vmcluster_name }}: {{ vmselect_lb_ip.stdout | default('PENDING') | trim }} +victoria_lb_ip_reclaim_failed: >- + WARNING: VMCluster services still do not have LoadBalancer IPs after reclaim attempt. 
+ vminsert: {{ vminsert_lb_ip.stdout | default('NONE') | trim }}, + vmselect: {{ vmselect_lb_ip.stdout | default('NONE') | trim }}. + Please use new assigned IPs. +victoria_old_svc_cleanup_summary: >- + Old services deleted: {{ old_services.stdout_lines | default([]) | select() | list }}. + Old vmagent deployment cleanup attempted: {{ old_vmagent_deployment }} victoria_pods_not_ready: "Telemetry upgrade FAILED: Some pods are not ready. {{ pods_not_ready.stdout | int }} pod(s) not in Running state." victoria_pods_ready_after_wait: "All telemetry pods are ready after waiting" telemetry_upgrade_success: "Telemetry upgrade COMPLETED: All telemetry pods are running and ready."