From eb025fd3619384d9182710ef58324392947ae74e Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Tue, 2 Jun 2026 15:36:44 +0530
Subject: [PATCH 01/33] upgrade defect fixes and fix for CrashLoopBackOff on pod
 restart

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 .../idrac_telemetry_statefulset.yaml.j2       | 33 +++++++++++---
 .../telemetry/kafka/kafka.kafka.yaml.j2       |  9 ++++
 upgrade/playbooks/upgrade_telemetry.yml       | 18 ++++++++
 .../upgrade_k8s/tasks/load_version_vars.yml   |  5 ---
 .../tasks/include_required_input.yml          | 21 ++++++++-
 .../roles/upgrade_telemetry/tasks/main.yml    | 44 +++++++++++++++++++
 .../patch_idrac_termination_grace_period.yml  |  2 +-
 .../tasks/upgrade_operator.yml                | 16 ++++++-
 8 files changed, 134 insertions(+), 14 deletions(-)

diff --git a/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 b/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2
index b0c3dd8b3c..a71ebd0aa0 100644
--- a/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2
+++ b/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2
@@ -72,7 +72,7 @@ spec:
         - ip: "127.0.0.1"
           hostnames:
             - "mysqldb"
-      terminationGracePeriodSeconds: 10
+      terminationGracePeriodSeconds: 60
       tolerations:
       - effect: NoExecute
         key: node.kubernetes.io/not-ready
@@ -83,16 +83,27 @@ spec:
         operator: Exists
         tolerationSeconds: 5
       initContainers:
-        # Clean up stale MySQL lock files from previous ungraceful shutdowns
+        # Clean up stale MySQL lock/InnoDB artifacts only after ungraceful shutdown
         - name: cleanup-mysql-locks
           image: {{ mysql_image }}
           command:
             - /bin/sh
             - -c
             - |
-              echo "Checking for stale MySQL lock files..."
-              rm -f /var/lib/mysql/*.sock /var/lib/mysql/*.pid 2>/dev/null || true
-              echo "Lock file cleanup complete"
+              DATADIR="/var/lib/mysql"
+              # Only run cleanup if datadir has existing MySQL data (not a fresh install)
+              if [ ! -f "${DATADIR}/mysql.ibd" ]; then
+                echo "Fresh install detected — skipping cleanup."
+                exit 0
+              fi
+              # Detect unclean shutdown: pid/sock files should not exist when no mysqld is running
+              if [ -f "${DATADIR}/mysqld.pid" ] || ls ${DATADIR}/*.sock 1>/dev/null 2>&1 || ls ${DATADIR}/*.lck 1>/dev/null 2>&1; then
+                echo "Stale lock artifacts detected — previous shutdown was unclean."
+                rm -f ${DATADIR}/*.sock ${DATADIR}/*.pid ${DATADIR}/*.lck 2>/dev/null || true
+                echo "Stale artifacts removed."
+              else
+                echo "No stale artifacts — previous shutdown was clean."
+              fi
           volumeMounts:
             - name: mysqldb-pvc
               mountPath: /var/lib/mysql/
@@ -103,10 +114,20 @@ spec:
           volumeMounts:
             - name: mysqldb-pvc
               mountPath: /var/lib/mysql/
+          args:
+            - --innodb-use-native-aio=0
+            - --innodb-flush-log-at-trx-commit=1
+            - --innodb-flush-method=fsync
           lifecycle:
             preStop:
               exec:
-                command: ["/bin/sh", "-c", "mysqladmin shutdown -uroot -p${MYSQL_ROOT_PASSWORD} 2>/dev/null || true"]
+                command:
+                  - /bin/sh
+                  - -c
+                  - |
+                    mysqladmin shutdown -uroot -p"${MYSQL_ROOT_PASSWORD}" --wait=45 2>/dev/null || true
+                    while mysqladmin ping -uroot -p"${MYSQL_ROOT_PASSWORD}" 2>/dev/null; do sleep 1; done
+                    rm -f /var/lib/mysql/conf.d/recovery.cnf 2>/dev/null || true
           env:
             - name: MYSQL_DATABASE
               value: {{ mysqldb_name }}
diff --git a/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2
index 929af037c4..afd162b963 100644
--- a/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2
+++ b/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2
@@ -9,6 +9,9 @@ spec:
   replicas: 3
   roles:
     - controller
+  template:
+    pod:
+      terminationGracePeriodSeconds: 120
   storage:
     type: jbod
     volumes:
@@ -30,6 +33,9 @@ spec:
   replicas: 3
   roles:
     - broker
+  template:
+    pod:
+      terminationGracePeriodSeconds: 120
   storage:
     type: jbod
     volumes:
@@ -83,6 +89,9 @@ spec:
       log.segment.bytes: {{ kafka_log_segment_bytes }}
       log.retention.bytes: {{ kafka_log_retention_bytes }}
       log.retention.check.interval.ms: 300000
+      controlled.shutdown.enable: true
+      controlled.shutdown.max.retries: 3
+      controlled.shutdown.retry.backoff.ms: 5000
       # Enable topic auto-creation for external clients
       auto.create.topics.enable: true
       num.partitions: 3
diff --git a/upgrade/playbooks/upgrade_telemetry.yml b/upgrade/playbooks/upgrade_telemetry.yml
index 5fcb99f410..730eeb1c00 100644
--- a/upgrade/playbooks/upgrade_telemetry.yml
+++ b/upgrade/playbooks/upgrade_telemetry.yml
@@ -35,6 +35,24 @@
       when:
         - manifest.component_status[component_name] | default('pending') == 'completed'
 
+    - name: "Mark as skipped — service_k8s not configured (Slurm-only deployment)"
+      ansible.builtin.copy:
+        content: >-
+          {{ manifest | combine({
+               'component_status': manifest.component_status | combine({
+                 component_name: 'skipped'
+               })
+             }) | to_nice_yaml }}
+        dest: "{{ manifest_path }}"
+        mode: '0644'
+      when:
+        - not (hostvars['localhost']['k8s_upgrade_enabled'] | default(false) | bool)
+
+    - name: "Skip — service_k8s not configured (Slurm-only deployment)"
+      ansible.builtin.meta: end_play
+      when:
+        - not (hostvars['localhost']['k8s_upgrade_enabled'] | default(false) | bool)
+
     - name: "Mark as skipped — BuildStream terminal gate active (C-24)"
       ansible.builtin.copy:
         content: >-
diff --git a/upgrade/roles/upgrade_k8s/tasks/load_version_vars.yml b/upgrade/roles/upgrade_k8s/tasks/load_version_vars.yml
index d930690682..33b51ae6b6 100644
--- a/upgrade/roles/upgrade_k8s/tasks/load_version_vars.yml
+++ b/upgrade/roles/upgrade_k8s/tasks/load_version_vars.yml
@@ -126,8 +126,3 @@
          | selectattr('type', 'equalto', 'tarball')
          | selectattr('package', 'search', 'helm')
          | map(attribute='package') | join }}
-
-# ── Set OIM host ───────────────────────────────────────────────────
-- name: Set oim_host to NFS server IP
-  ansible.builtin.set_fact:
-    oim_host: "{{ k8s_nfs_server_ip }}"
diff --git a/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml b/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml
index b90b2c69fb..ac44ac6fe0 100644
--- a/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml
+++ b/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml
@@ -60,6 +60,7 @@
   when:
     - omnia_config is defined
     - omnia_config.service_k8s_cluster is defined
+    - omnia_config.service_k8s_cluster | length > 0
   tags: always
 
 - name: Set k8s_client_mount_path
@@ -70,13 +71,21 @@
           | first).mount_point }}
   when:
     - storage_config is defined
+    - storage_config.mounts is defined
     - k8s_nfs_storage_name is defined
+    - storage_config.mounts | selectattr('name', 'equalto', k8s_nfs_storage_name) | list | length > 0
   tags: always
 # ── Load high_availability_config.yml ──
+- name: Check if high_availability_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ input_project_dir }}/high_availability_config.yml"
+  register: ha_config_stat
+
 - name: Read high_availability_config.yml for kube_vip
   ansible.builtin.include_vars:
     file: "{{ input_project_dir }}/high_availability_config.yml"
     name: ha_config
+  when: ha_config_stat.stat.exists
 
 - name: Debug high_availability_config.yml content
   ansible.builtin.debug:
@@ -90,6 +99,7 @@
     kube_vip: "{{ ha_config.service_k8s_cluster_ha[0].virtual_ip_address | default('') }}"
     cacheable: true
   when:
+    - ha_config is defined
     - ha_config.service_k8s_cluster_ha is defined
     - ha_config.service_k8s_cluster_ha | length > 0
 
@@ -148,6 +158,7 @@
   when:
     - software_config is defined
     - software_config.softwares is defined
+    - software_config.softwares | selectattr('name', 'equalto', 'service_k8s') | list | length > 0
   tags: always
 
 - name: Set os_version from software_config.json
@@ -164,6 +175,7 @@
   when:
     - software_config is defined
     - software_config.softwares is defined
+    - software_config.softwares | selectattr('name', 'equalto', 'service_k8s') | list | length > 0
   tags: always
 
 # ── Load service_k8s JSON for victoria operator package name ──
@@ -172,12 +184,19 @@
     src: "{{ input_project_dir }}/config/{{ architecture }}/rhel/{{ os_version }}/service_k8s_v{{ k8s_version }}.json"
   register: service_k8s_slurp
   failed_when: false
+  when:
+    - architecture is defined
+    - os_version is defined
+    - k8s_version is defined
   tags: always
 
 - name: Parse service_k8s JSON
   ansible.builtin.set_fact:
     service_k8s_config: "{{ service_k8s_slurp.content | b64decode | from_yaml }}"
-  when: service_k8s_slurp is not failed
+  when:
+    - service_k8s_slurp is defined
+    - service_k8s_slurp is not failed
+    - service_k8s_slurp is not skipped
   tags: always
 
 - name: Extract victoria operator package name from service_k8s JSON
diff --git a/upgrade/roles/upgrade_telemetry/tasks/main.yml b/upgrade/roles/upgrade_telemetry/tasks/main.yml
index ee5fd1d282..cccfbc8bf8 100644
--- a/upgrade/roles/upgrade_telemetry/tasks/main.yml
+++ b/upgrade/roles/upgrade_telemetry/tasks/main.yml
@@ -51,12 +51,56 @@
     msg: "{{ victoria_upgrade_skipped }}"
   when: not (victoria_upgrade_needed | default(false) | bool)
 
+# ── Phase 2.5: Clean stale Kafka lock files before redeploy ──
+- name: Clean stale Kafka lock files from PVCs (prevents CrashLoopBackOff)
+  ansible.builtin.shell: |
+    set -o pipefail
+    for pvc_name in $(kubectl get pvc -n {{ telemetry_namespace }} -l strimzi.io/cluster=kafka \
+        -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
+      pod_name="kafka-lock-cleanup-${pvc_name}"
+      kubectl delete pod "${pod_name}" -n {{ telemetry_namespace }} --ignore-not-found 2>/dev/null || true
+      kubectl run "${pod_name}" --rm --attach --restart=Never -n {{ telemetry_namespace }} \
+        --image=busybox:latest \
+        --overrides="{
+          \"spec\": {
+            \"containers\": [{
+              \"name\": \"cleanup\",
+              \"image\": \"busybox:latest\",
+              \"command\": [\"sh\", \"-c\", \"find /data -name .lock -delete && echo cleaned ${pvc_name}\"],
+              \"volumeMounts\": [{\"name\": \"data\", \"mountPath\": \"/data\"}]
+            }],
+            \"volumes\": [{\"name\": \"data\", \"persistentVolumeClaim\": {\"claimName\": \"${pvc_name}\"}}],
+            \"restartPolicy\": \"Never\"
+          }
+        }" 2>/dev/null || true
+    done
+  delegate_to: "{{ kube_vip }}"
+  connection: ssh
+  changed_when: true
+  failed_when: false
+  timeout: 300
+  when:
+    - kube_vip is defined
+    - kube_vip | length > 0
+
 # ── Phase 3: Execute telemetry.sh to redeploy telemetry stack ──
 - name: Phase 3 - Execute telemetry.sh to redeploy telemetry stack
   ansible.builtin.include_tasks: execute_telemetry_sh.yml
+  when:
+    - k8s_client_mount_path is defined
+    - kube_vip is defined
+    - kube_vip | length > 0
+
+- name: Skip telemetry.sh (k8s not configured)
+  ansible.builtin.debug:
+    msg: "Skipping telemetry.sh execution — service_k8s not configured (Slurm-only deployment)."
+  when: k8s_client_mount_path is not defined or kube_vip is not defined
 
 # ── Phase 4: Verify all telemetry pods and set upgrade status ──
 - name: Phase 4 - Verify all telemetry pods and set upgrade status
+  when:
+    - kube_vip is defined
+    - kube_vip | length > 0
   block:
     - name: Get all telemetry pods status
       ansible.builtin.shell:
diff --git a/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml b/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml
index 6116afb37f..e3e6333c73 100644
--- a/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml
+++ b/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml
@@ -53,7 +53,7 @@
     cmd: >
       kubectl patch statefulset idrac-telemetry -n {{ telemetry_namespace }}
       --type=strategic
-      -p '{"spec":{"template":{"spec":{"terminationGracePeriodSeconds":120}}}}'
+      -p '{"spec":{"template":{"spec":{"terminationGracePeriodSeconds":60}}}}'
   delegate_to: "{{ kube_vip }}"
   connection: ssh
   register: idrac_patch_result
diff --git a/upgrade/roles/upgrade_telemetry/tasks/upgrade_operator.yml b/upgrade/roles/upgrade_telemetry/tasks/upgrade_operator.yml
index 4fa40ba520..40cd32336a 100644
--- a/upgrade/roles/upgrade_telemetry/tasks/upgrade_operator.yml
+++ b/upgrade/roles/upgrade_telemetry/tasks/upgrade_operator.yml
@@ -16,15 +16,29 @@
 # Install / upgrade VictoriaMetrics operator via Helm
 # ============================================================================
 
+- name: Remove finalizers from VictoriaMetrics CRDs (prevents delete hang)
+  ansible.builtin.shell: |
+    set -o pipefail
+    for crd in $(kubectl get crd 2>/dev/null | grep victoriametrics | awk '{print $1}'); do
+      kubectl patch crd "$crd" --type=merge -p '{"metadata":{"finalizers":[]}}' 2>/dev/null || true
+    done
+  changed_when: false
+  failed_when: false
+  delegate_to: "{{ kube_vip }}"
+  connection: ssh
+
 - name: Delete existing VictoriaMetrics CRDs (to fix Helm ownership issues)
   ansible.builtin.shell: |
     set -o pipefail
-    kubectl get crd | grep victoriametrics | awk '{print $1}' | xargs kubectl delete crd 2>/dev/null || true
+    for crd in $(kubectl get crd 2>/dev/null | grep victoriametrics | awk '{print $1}'); do
+      timeout 30 kubectl delete crd "$crd" --timeout=30s 2>/dev/null || true
+    done
   register: crd_delete_result
   changed_when: true
   failed_when: false
   delegate_to: "{{ kube_vip }}"
   connection: ssh
+  timeout: 120
 
 - name: Install VictoriaMetrics operator from tarball
   ansible.builtin.command:

From 1525e9e6ba2c50d264247fe3718617d9b76360f5 Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Tue, 2 Jun 2026 17:47:11 +0530
Subject: [PATCH 02/33] remove stale services and deployments for victoria

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 .../telemetry/kafka/kafka.kafka.yaml.j2       |  9 -----
 .../tasks/migrate_statefulset.yml             | 40 +++++++++++++++++++
 2 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2
index afd162b963..929af037c4 100644
--- a/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2
+++ b/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2
@@ -9,9 +9,6 @@ spec:
   replicas: 3
   roles:
     - controller
-  template:
-    pod:
-      terminationGracePeriodSeconds: 120
   storage:
     type: jbod
     volumes:
@@ -33,9 +30,6 @@ spec:
   replicas: 3
   roles:
     - broker
-  template:
-    pod:
-      terminationGracePeriodSeconds: 120
   storage:
     type: jbod
     volumes:
@@ -89,9 +83,6 @@ spec:
       log.segment.bytes: {{ kafka_log_segment_bytes }}
       log.retention.bytes: {{ kafka_log_retention_bytes }}
       log.retention.check.interval.ms: 300000
-      controlled.shutdown.enable: true
-      controlled.shutdown.max.retries: 3
-      controlled.shutdown.retry.backoff.ms: 5000
       # Enable topic auto-creation for external clients
       auto.create.topics.enable: true
       num.partitions: 3
diff --git a/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml b/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml
index 847ad36af4..e99d7bf80c 100644
--- a/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml
+++ b/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml
@@ -168,3 +168,43 @@
       when: orphaned_pods.stdout_lines | default([]) | length > 0
       delegate_to: "{{ kube_vip }}"
       connection: ssh
+
+    # ── Cleanup old pre-operator services and deployments ──
+    # The operator creates new services with different names (e.g. vminsert-victoria-cluster),
+    # so the old standalone services become stale and waste LoadBalancer IPs.
+    - name: Find old pre-operator services
+      ansible.builtin.shell: |
+        set -o pipefail
+        kubectl -n {{ telemetry_namespace }} get svc --no-headers 2>/dev/null \
+          | awk '{print $1}' \
+          | grep -xE 'vminsert|vmselect|vmstorage|vmagent' || true
+      register: old_services
+      changed_when: false
+      failed_when: false
+      delegate_to: "{{ kube_vip }}"
+      connection: ssh
+
+    - name: Delete old pre-operator services
+      ansible.builtin.command:
+        cmd: kubectl -n {{ telemetry_namespace }} delete svc {{ item }} --timeout=30s
+      loop: "{{ old_services.stdout_lines | default([]) | select() | list }}"
+      changed_when: true
+      failed_when: false
+      when: old_services.stdout_lines | default([]) | select() | list | length > 0
+      delegate_to: "{{ kube_vip }}"
+      connection: ssh
+
+    - name: Delete old vmagent deployment (replaced by operator-managed VMAgent)
+      ansible.builtin.shell: |
+        kubectl -n {{ telemetry_namespace }} get deployment {{ old_vmagent_deployment }} --no-headers 2>/dev/null && \
+        kubectl -n {{ telemetry_namespace }} delete deployment {{ old_vmagent_deployment }} --timeout=60s || true
+      changed_when: true
+      failed_when: false
+      delegate_to: "{{ kube_vip }}"
+      connection: ssh
+
+    - name: Display old resource cleanup summary
+      ansible.builtin.debug:
+        msg:
+          - "Old services deleted: {{ old_services.stdout_lines | default([]) | select() | list }}"
+          - "Old vmagent deployment cleanup attempted: {{ old_vmagent_deployment }}"

From e4ea12d91650cfb805e6bb9fdc97641d170e17f4 Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Tue, 2 Jun 2026 17:54:28 +0530
Subject: [PATCH 03/33] revert changes as they are taken care of in another PR

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 .../idrac_telemetry_statefulset.yaml.j2       | 31 +++---------------
 upgrade/playbooks/upgrade_telemetry.yml       | 18 -----------
 .../roles/upgrade_telemetry/tasks/main.yml    | 32 -------------------
 .../patch_idrac_termination_grace_period.yml  |  2 +-
 4 files changed, 6 insertions(+), 77 deletions(-)

diff --git a/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 b/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2
index a71ebd0aa0..c3d7b00aee 100644
--- a/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2
+++ b/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2
@@ -83,27 +83,16 @@ spec:
         operator: Exists
         tolerationSeconds: 5
       initContainers:
-        # Clean up stale MySQL lock/InnoDB artifacts only after ungraceful shutdown
+        # Clean up stale MySQL lock files from previous ungraceful shutdowns
         - name: cleanup-mysql-locks
           image: {{ mysql_image }}
           command:
             - /bin/sh
             - -c
             - |
-              DATADIR="/var/lib/mysql"
-              # Only run cleanup if datadir has existing MySQL data (not a fresh install)
-              if [ ! -f "${DATADIR}/mysql.ibd" ]; then
-                echo "Fresh install detected — skipping cleanup."
-                exit 0
-              fi
-              # Detect unclean shutdown: pid/sock files should not exist when no mysqld is running
-              if [ -f "${DATADIR}/mysqld.pid" ] || ls ${DATADIR}/*.sock 1>/dev/null 2>&1 || ls ${DATADIR}/*.lck 1>/dev/null 2>&1; then
-                echo "Stale lock artifacts detected — previous shutdown was unclean."
-                rm -f ${DATADIR}/*.sock ${DATADIR}/*.pid ${DATADIR}/*.lck 2>/dev/null || true
-                echo "Stale artifacts removed."
-              else
-                echo "No stale artifacts — previous shutdown was clean."
-              fi
+              echo "Checking for stale MySQL lock files..."
+              rm -f /var/lib/mysql/*.sock /var/lib/mysql/*.pid 2>/dev/null || true
+              echo "Lock file cleanup complete"
           volumeMounts:
             - name: mysqldb-pvc
               mountPath: /var/lib/mysql/
@@ -114,20 +103,10 @@ spec:
           volumeMounts:
             - name: mysqldb-pvc
               mountPath: /var/lib/mysql/
-          args:
-            - --innodb-use-native-aio=0
-            - --innodb-flush-log-at-trx-commit=1
-            - --innodb-flush-method=fsync
           lifecycle:
             preStop:
               exec:
-                command:
-                  - /bin/sh
-                  - -c
-                  - |
-                    mysqladmin shutdown -uroot -p"${MYSQL_ROOT_PASSWORD}" --wait=45 2>/dev/null || true
-                    while mysqladmin ping -uroot -p"${MYSQL_ROOT_PASSWORD}" 2>/dev/null; do sleep 1; done
-                    rm -f /var/lib/mysql/conf.d/recovery.cnf 2>/dev/null || true
+                command: ["/bin/sh", "-c", "mysqladmin shutdown -uroot -p${MYSQL_ROOT_PASSWORD} 2>/dev/null || true"]
           env:
             - name: MYSQL_DATABASE
               value: {{ mysqldb_name }}
diff --git a/upgrade/playbooks/upgrade_telemetry.yml b/upgrade/playbooks/upgrade_telemetry.yml
index 730eeb1c00..5fcb99f410 100644
--- a/upgrade/playbooks/upgrade_telemetry.yml
+++ b/upgrade/playbooks/upgrade_telemetry.yml
@@ -35,24 +35,6 @@
       when:
         - manifest.component_status[component_name] | default('pending') == 'completed'
 
-    - name: "Mark as skipped — service_k8s not configured (Slurm-only deployment)"
-      ansible.builtin.copy:
-        content: >-
-          {{ manifest | combine({
-               'component_status': manifest.component_status | combine({
-                 component_name: 'skipped'
-               })
-             }) | to_nice_yaml }}
-        dest: "{{ manifest_path }}"
-        mode: '0644'
-      when:
-        - not (hostvars['localhost']['k8s_upgrade_enabled'] | default(false) | bool)
-
-    - name: "Skip — service_k8s not configured (Slurm-only deployment)"
-      ansible.builtin.meta: end_play
-      when:
-        - not (hostvars['localhost']['k8s_upgrade_enabled'] | default(false) | bool)
-
     - name: "Mark as skipped — BuildStream terminal gate active (C-24)"
       ansible.builtin.copy:
         content: >-
diff --git a/upgrade/roles/upgrade_telemetry/tasks/main.yml b/upgrade/roles/upgrade_telemetry/tasks/main.yml
index cccfbc8bf8..68c087306c 100644
--- a/upgrade/roles/upgrade_telemetry/tasks/main.yml
+++ b/upgrade/roles/upgrade_telemetry/tasks/main.yml
@@ -51,38 +51,6 @@
     msg: "{{ victoria_upgrade_skipped }}"
   when: not (victoria_upgrade_needed | default(false) | bool)
 
-# ── Phase 2.5: Clean stale Kafka lock files before redeploy ──
-- name: Clean stale Kafka lock files from PVCs (prevents CrashLoopBackOff)
-  ansible.builtin.shell: |
-    set -o pipefail
-    for pvc_name in $(kubectl get pvc -n {{ telemetry_namespace }} -l strimzi.io/cluster=kafka \
-        -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
-      pod_name="kafka-lock-cleanup-${pvc_name}"
-      kubectl delete pod "${pod_name}" -n {{ telemetry_namespace }} --ignore-not-found 2>/dev/null || true
-      kubectl run "${pod_name}" --rm --attach --restart=Never -n {{ telemetry_namespace }} \
-        --image=busybox:latest \
-        --overrides="{
-          \"spec\": {
-            \"containers\": [{
-              \"name\": \"cleanup\",
-              \"image\": \"busybox:latest\",
-              \"command\": [\"sh\", \"-c\", \"find /data -name .lock -delete && echo cleaned ${pvc_name}\"],
-              \"volumeMounts\": [{\"name\": \"data\", \"mountPath\": \"/data\"}]
-            }],
-            \"volumes\": [{\"name\": \"data\", \"persistentVolumeClaim\": {\"claimName\": \"${pvc_name}\"}}],
-            \"restartPolicy\": \"Never\"
-          }
-        }" 2>/dev/null || true
-    done
-  delegate_to: "{{ kube_vip }}"
-  connection: ssh
-  changed_when: true
-  failed_when: false
-  timeout: 300
-  when:
-    - kube_vip is defined
-    - kube_vip | length > 0
-
 # ── Phase 3: Execute telemetry.sh to redeploy telemetry stack ──
 - name: Phase 3 - Execute telemetry.sh to redeploy telemetry stack
   ansible.builtin.include_tasks: execute_telemetry_sh.yml
diff --git a/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml b/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml
index e3e6333c73..ae8dd63dde 100644
--- a/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml
+++ b/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml
@@ -48,7 +48,7 @@
     msg: "idrac-telemetry current replica count: {{ idrac_replica_count.stdout }}"
   when: idrac_sts_check.rc == 0
 
-- name: Patch terminationGracePeriodSeconds to 120s for graceful MySQL shutdown
+- name: Patch terminationGracePeriodSeconds for graceful MySQL shutdown
   ansible.builtin.command:
     cmd: >
       kubectl patch statefulset idrac-telemetry -n {{ telemetry_namespace }}

From e5be450cd7127573f1a9aed1137f2447c9a20eb6 Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Wed, 3 Jun 2026 07:05:40 +0530
Subject: [PATCH 04/33] revert idrac terminationgraceperiod

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 .../idrac_telemetry/idrac_telemetry_statefulset.yaml.j2         | 2 +-
 .../roles/upgrade_telemetry/tasks/include_required_input.yml    | 1 +
 .../tasks/patch_idrac_termination_grace_period.yml              | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 b/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2
index c3d7b00aee..7d56e91d56 100644
--- a/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2
+++ b/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2
@@ -72,7 +72,7 @@ spec:
         - ip: "127.0.0.1"
           hostnames:
             - "mysqldb"
-      terminationGracePeriodSeconds: 60
+      terminationGracePeriodSeconds: 120
       tolerations:
       - effect: NoExecute
         key: node.kubernetes.io/not-ready
diff --git a/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml b/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml
index ac44ac6fe0..d4127cfff9 100644
--- a/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml
+++ b/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml
@@ -75,6 +75,7 @@
     - k8s_nfs_storage_name is defined
     - storage_config.mounts | selectattr('name', 'equalto', k8s_nfs_storage_name) | list | length > 0
   tags: always
+  
 # ── Load high_availability_config.yml ──
 - name: Check if high_availability_config.yml exists
   ansible.builtin.stat:
diff --git a/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml b/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml
index ae8dd63dde..76755b45b9 100644
--- a/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml
+++ b/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml
@@ -53,7 +53,7 @@
     cmd: >
       kubectl patch statefulset idrac-telemetry -n {{ telemetry_namespace }}
       --type=strategic
-      -p '{"spec":{"template":{"spec":{"terminationGracePeriodSeconds":60}}}}'
+      -p '{"spec":{"template":{"spec":{"terminationGracePeriodSeconds":120}}}}'
   delegate_to: "{{ kube_vip }}"
   connection: ssh
   register: idrac_patch_result

From f93d2a13357fb520196049bdbb95302d94e4f0ca Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Wed, 3 Jun 2026 09:11:14 +0530
Subject: [PATCH 05/33] ansible lint fixes

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 .../roles/upgrade_telemetry/tasks/include_required_input.yml    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml b/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml
index a934f8c4af..9655f55130 100644
--- a/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml
+++ b/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml
@@ -75,7 +75,7 @@
     - k8s_nfs_storage_name is defined
     - storage_config.mounts | selectattr('name', 'equalto', k8s_nfs_storage_name) | list | length > 0
   tags: always
-  
+
 # ── Load high_availability_config.yml ──
 - name: Check if high_availability_config.yml exists
   ansible.builtin.stat:

From a659c5744d05dfc81ef5badeabf554f9f28a185b Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Wed, 3 Jun 2026 11:27:49 +0530
Subject: [PATCH 06/33] rescue block for upgrade telemetry

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 upgrade/playbooks/upgrade_telemetry.yml | 78 ++++++++++++-------------
 1 file changed, 38 insertions(+), 40 deletions(-)

diff --git a/upgrade/playbooks/upgrade_telemetry.yml b/upgrade/playbooks/upgrade_telemetry.yml
index fd21e71f78..83bd219883 100644
--- a/upgrade/playbooks/upgrade_telemetry.yml
+++ b/upgrade/playbooks/upgrade_telemetry.yml
@@ -133,43 +133,41 @@
     #   - VAST exporter (if enabled)
     #   - VictoriaLogs (if enabled)
     #   - UFM exporter (if enabled)
-    - name: Execute telemetry upgrade
-      block:
-        - name: Invoke upgrade_telemetry role
-          ansible.builtin.include_role:
-            name: ../roles/upgrade_telemetry
-
-        - name: Mark telemetry upgrade as completed
-          ansible.builtin.copy:
-            content: >-
-              {{ manifest | combine({
-                   'component_status': manifest.component_status | combine({
-                     component_name: 'completed'
-                   })
-                 }) | to_nice_yaml }}
-            dest: "{{ manifest_path }}"
-            mode: '0644'
-
-        - name: "Display upgrade status completed — {{ component_name }}"
-          ansible.builtin.debug:
-            msg: "[UPGRADE] Component '{{ component_name }}' — status changed to: completed"
-
-      rescue:
-        - name: Mark telemetry upgrade as failed
-          ansible.builtin.copy:
-            content: >-
-              {{ manifest | combine({
-                   'component_status': manifest.component_status | combine({
-                     component_name: 'failed'
-                   })
-                 }) | to_nice_yaml }}
-            dest: "{{ manifest_path }}"
-            mode: '0644'
-
-        - name: "Display upgrade status failed — {{ component_name }}"
-          ansible.builtin.debug:
-            msg: "[UPGRADE] Component '{{ component_name }}' — status changed to: failed"
-
-        - name: Fail the play
-          ansible.builtin.fail:
-            msg: "Telemetry upgrade failed. Status marked as 'failed' in manifest."
+    - name: Invoke upgrade_telemetry role
+      ansible.builtin.include_role:
+        name: ../roles/upgrade_telemetry
+
+    - name: Mark telemetry upgrade as completed
+      ansible.builtin.copy:
+        content: >-
+          {{ manifest | combine({
+               'component_status': manifest.component_status | combine({
+                 component_name: 'completed'
+               })
+             }) | to_nice_yaml }}
+        dest: "{{ manifest_path }}"
+        mode: '0644'
+
+    - name: "Display upgrade status completed — {{ component_name }}"
+      ansible.builtin.debug:
+        msg: "[UPGRADE] Component '{{ component_name }}' — status changed to: completed"
+
+  rescue:
+    - name: Mark telemetry upgrade as failed
+      ansible.builtin.copy:
+        content: >-
+          {{ manifest | combine({
+               'component_status': manifest.component_status | combine({
+                 component_name: 'failed'
+               })
+             }) | to_nice_yaml }}
+        dest: "{{ manifest_path }}"
+        mode: '0644'
+
+    - name: "Display upgrade status failed — {{ component_name }}"
+      ansible.builtin.debug:
+        msg: "[UPGRADE] Component '{{ component_name }}' — status changed to: failed"
+
+    - name: Fail the play
+      ansible.builtin.fail:
+        msg: "Telemetry upgrade failed. Status marked as 'failed' in manifest."

From 0e8c5e5eabc13f4390896a0e14b7cffc5ec9beae Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Wed, 3 Jun 2026 14:59:12 +0530
Subject: [PATCH 07/33] revert upgrade telemetry

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 upgrade/playbooks/upgrade_telemetry.yml | 78 +++++++++++++------------
 1 file changed, 40 insertions(+), 38 deletions(-)

diff --git a/upgrade/playbooks/upgrade_telemetry.yml b/upgrade/playbooks/upgrade_telemetry.yml
index 83bd219883..398e579564 100644
--- a/upgrade/playbooks/upgrade_telemetry.yml
+++ b/upgrade/playbooks/upgrade_telemetry.yml
@@ -133,41 +133,43 @@
     #   - VAST exporter (if enabled)
     #   - VictoriaLogs (if enabled)
     #   - UFM exporter (if enabled)
-    - name: Invoke upgrade_telemetry role
-      ansible.builtin.include_role:
-        name: ../roles/upgrade_telemetry
-
-    - name: Mark telemetry upgrade as completed
-      ansible.builtin.copy:
-        content: >-
-          {{ manifest | combine({
-               'component_status': manifest.component_status | combine({
-                 component_name: 'completed'
-               })
-             }) | to_nice_yaml }}
-        dest: "{{ manifest_path }}"
-        mode: '0644'
-
-    - name: "Display upgrade status completed — {{ component_name }}"
-      ansible.builtin.debug:
-        msg: "[UPGRADE] Component '{{ component_name }}' — status changed to: completed"
-
-  rescue:
-    - name: Mark telemetry upgrade as failed
-      ansible.builtin.copy:
-        content: >-
-          {{ manifest | combine({
-               'component_status': manifest.component_status | combine({
-                 component_name: 'failed'
-               })
-             }) | to_nice_yaml }}
-        dest: "{{ manifest_path }}"
-        mode: '0644'
-
-    - name: "Display upgrade status failed — {{ component_name }}"
-      ansible.builtin.debug:
-        msg: "[UPGRADE] Component '{{ component_name }}' — status changed to: failed"
-
-    - name: Fail the play
-      ansible.builtin.fail:
-        msg: "Telemetry upgrade failed. Status marked as 'failed' in manifest."
+    - name: Execute telemetry upgrade
+      block:
+        - name: Invoke upgrade_telemetry role
+          ansible.builtin.include_role:
+            name: ../roles/upgrade_telemetry
+
+        - name: Mark telemetry upgrade as completed
+          ansible.builtin.copy:
+            content: >-
+              {{ manifest | combine({
+                   'component_status': manifest.component_status | combine({
+                     component_name: 'completed'
+                   })
+                 }) | to_nice_yaml }}
+            dest: "{{ manifest_path }}"
+            mode: '0644'
+
+        - name: "Display upgrade status completed — {{ component_name }}"
+          ansible.builtin.debug:
+            msg: "[UPGRADE] Component '{{ component_name }}' — status changed to: completed"
+
+      rescue:
+        - name: Mark telemetry upgrade as failed
+          ansible.builtin.copy:
+            content: >-
+              {{ manifest | combine({
+                   'component_status': manifest.component_status | combine({
+                     component_name: 'failed'
+                   })
+                 }) | to_nice_yaml }}
+            dest: "{{ manifest_path }}"
+            mode: '0644'
+
+        - name: "Display upgrade status failed — {{ component_name }}"
+          ansible.builtin.debug:
+            msg: "[UPGRADE] Component '{{ component_name }}' — status changed to: failed"
+
+        - name: Fail the play
+          ansible.builtin.fail:
+            msg: "Telemetry upgrade failed. Status marked as 'failed' in manifest."
\ No newline at end of file

From 67e8bee1d30f06b7c48eb39f649f54c2ffeec0f4 Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Mon, 8 Jun 2026 13:13:30 +0530
Subject: [PATCH 08/33] default size of idrac telemetry containers

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 input/telemetry_storage_config.yml      | 12 ++++++------
 provision/roles/telemetry/vars/main.yml | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/input/telemetry_storage_config.yml b/input/telemetry_storage_config.yml
index d44a7ec68f..6805baf6b4 100644
--- a/input/telemetry_storage_config.yml
+++ b/input/telemetry_storage_config.yml
@@ -168,10 +168,10 @@ idrac_telemetry_storage:
     resources:
       requests:
         cpu: "100m"
-        memory: "256Mi"
+        memory: "512Mi"
       limits:
         cpu: "500m"
-        memory: "512Mi"
+        memory: "1.5Gi"
   receiver:
     resources:
       requests:
@@ -184,18 +184,18 @@ idrac_telemetry_storage:
     resources:
       requests:
         cpu: "50m"
-        memory: "64Mi"
+        memory: "128Mi"
       limits:
         cpu: "200m"
-        memory: "256Mi"
+        memory: "512Mi"
   victoria_pump:
     resources:
       requests:
         cpu: "50m"
-        memory: "64Mi"
+        memory: "128Mi"
       limits:
         cpu: "200m"
-        memory: "256Mi"
+        memory: "512Mi"
 
 # Kafka Storage resources
 kafka_storage:
diff --git a/provision/roles/telemetry/vars/main.yml b/provision/roles/telemetry/vars/main.yml
index 55f0ac6534..4f1f29f9fa 100644
--- a/provision/roles/telemetry/vars/main.yml
+++ b/provision/roles/telemetry/vars/main.yml
@@ -82,10 +82,10 @@ idrac_telemetry_resources:
   activemq:
     requests:
       cpu: "{{ telemetry_storage_config.idrac_telemetry_storage.activemq.resources.requests.cpu | default('100m') }}"
-      memory: "{{ telemetry_storage_config.idrac_telemetry_storage.activemq.resources.requests.memory | default('256Mi') }}"
+      memory: "{{ telemetry_storage_config.idrac_telemetry_storage.activemq.resources.requests.memory | default('512Mi') }}"
     limits:
       cpu: "{{ telemetry_storage_config.idrac_telemetry_storage.activemq.resources.limits.cpu | default('500m') }}"
-      memory: "{{ telemetry_storage_config.idrac_telemetry_storage.activemq.resources.limits.memory | default('512Mi') }}"
+      memory: "{{ telemetry_storage_config.idrac_telemetry_storage.activemq.resources.limits.memory | default('1.5Gi') }}"
   receiver:
     requests:
       cpu: "{{ telemetry_storage_config.idrac_telemetry_storage.receiver.resources.requests.cpu | default('100m') }}"
@@ -96,17 +96,17 @@ idrac_telemetry_resources:
   kafka_pump:
     requests:
       cpu: "{{ telemetry_storage_config.idrac_telemetry_storage.kafka_pump.resources.requests.cpu | default('50m') }}"
-      memory: "{{ telemetry_storage_config.idrac_telemetry_storage.kafka_pump.resources.requests.memory | default('64Mi') }}"
+      memory: "{{ telemetry_storage_config.idrac_telemetry_storage.kafka_pump.resources.requests.memory | default('128Mi') }}"
     limits:
       cpu: "{{ telemetry_storage_config.idrac_telemetry_storage.kafka_pump.resources.limits.cpu | default('200m') }}"
-      memory: "{{ telemetry_storage_config.idrac_telemetry_storage.kafka_pump.resources.limits.memory | default('256Mi') }}"
+      memory: "{{ telemetry_storage_config.idrac_telemetry_storage.kafka_pump.resources.limits.memory | default('512Mi') }}"
   victoria_pump:
     requests:
       cpu: "{{ telemetry_storage_config.idrac_telemetry_storage.victoria_pump.resources.requests.cpu | default('50m') }}"
-      memory: "{{ telemetry_storage_config.idrac_telemetry_storage.victoria_pump.resources.requests.memory | default('64Mi') }}"
+      memory: "{{ telemetry_storage_config.idrac_telemetry_storage.victoria_pump.resources.requests.memory | default('128Mi') }}"
     limits:
       cpu: "{{ telemetry_storage_config.idrac_telemetry_storage.victoria_pump.resources.limits.cpu | default('200m') }}"
-      memory: "{{ telemetry_storage_config.idrac_telemetry_storage.victoria_pump.resources.limits.memory | default('256Mi') }}"
+      memory: "{{ telemetry_storage_config.idrac_telemetry_storage.victoria_pump.resources.limits.memory | default('512Mi') }}"
 
 # Usage: kafka_deployment.yml
 kafka:

From eb2d898ee0b4cb54dd2dff1763abc6365f522764 Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Mon, 8 Jun 2026 13:23:13 +0530
Subject: [PATCH 09/33] add new line

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 upgrade/playbooks/upgrade_telemetry.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/upgrade/playbooks/upgrade_telemetry.yml b/upgrade/playbooks/upgrade_telemetry.yml
index ccf54d15fc..46fa8af052 100644
--- a/upgrade/playbooks/upgrade_telemetry.yml
+++ b/upgrade/playbooks/upgrade_telemetry.yml
@@ -172,4 +172,4 @@
 
         - name: Fail the play
           ansible.builtin.fail:
-            msg: "Telemetry upgrade failed. Status marked as 'failed' in manifest."
\ No newline at end of file
+            msg: "Telemetry upgrade failed. Status marked as 'failed' in manifest."

From aec78412b578c55c5919491695ac5e7843a299fb Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Mon, 8 Jun 2026 14:26:36 +0530
Subject: [PATCH 10/33] input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 .../x86_64/rhel/10.0/service_k8s_v1.35.1.json     | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json
index 966a94d7b9..654208ec54 100644
--- a/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json
+++ b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json
@@ -29,9 +29,10 @@
       { "package": "docker.io/dellhpcomniaaisolution/victoriapump", "type": "image", "tag": "1.3" },
       { "package": "cryptography==45.0.7", "type": "pip_module" },
       { "package": "omsdk==1.2.518", "type": "pip_module" },
-      { "package": "cffi==1.17.1", "type": "pip_module" },
-      { "package": "prometheus_client==0.20.0", "type": "pip_module" },
+      { "package": "cffi==2.0.0", "type": "pip_module" },
+      { "package": "prometheus_client==0.25.0", "type": "pip_module" },
       { "package": "kubernetes==33.1.0", "type": "pip_module" },
+      { "package": "pyyaml==6.0.3", "type": "pip_module" },
       { "package": "quay.io/strimzi/operator", "tag": "0.48.0", "type": "image" },
       { "package": "quay.io/strimzi/kafka", "tag": "0.48.0-kafka-4.1.0", "type": "image" },
       { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.1", "type": "image" },
@@ -75,11 +76,12 @@
       { "package": "docker.io/calico/node", "tag": "v3.31.4", "type": "image" },
       { "package": "quay.io/metallb/speaker", "tag": "v0.15.3", "type": "image" },
       { "package": "kubectl-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"},
-      { "package": "prettytable==3.14.0", "type": "pip_module" },
+      { "package": "prettytable==3.17.0", "type": "pip_module" },
       { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" },
       { "package": "git", "type": "rpm", "repo_name": "appstream"},
       { "package": "kubernetes==33.1.0", "type": "pip_module" },
-      { "package": "PyMySQL==1.1.2", "type": "pip_module" }
+      { "package": "pyyaml==6.0.3", "type": "pip_module" },
+      { "package": "PyMySQL==1.2.0", "type": "pip_module" }
 
     ]
   },
@@ -103,11 +105,12 @@
       { "package": "helm-v3.20.1-amd64", "type": "tarball", "url": "https://get.helm.sh/helm-v3.20.1-linux-amd64.tar.gz" },
       { "package": "nfs-subdir-external-provisioner-4.0.18", "type": "tarball", "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" },
       { "package": "kubectl-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"},
-      { "package": "prettytable==3.14.0", "type": "pip_module" },
+      { "package": "prettytable==3.17.0", "type": "pip_module" },
       { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" },
       { "package": "git", "type": "rpm", "repo_name": "appstream"},
       { "package": "kubernetes==33.1.0", "type": "pip_module" },
-      { "package": "PyMySQL==1.1.2", "type": "pip_module" }
+      { "package": "pyyaml==6.0.3", "type": "pip_module" },
+      { "package": "PyMySQL==1.2.0", "type": "pip_module" }
     ]
   },
 

From bcc5132783a276180a0865faddb3a30880093abe Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Mon, 8 Jun 2026 15:28:20 +0530
Subject: [PATCH 11/33] update values in upgrade path

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 .../templates/telemetry_storage_config.j2            | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/upgrade/roles/import_input_parameters/templates/telemetry_storage_config.j2 b/upgrade/roles/import_input_parameters/templates/telemetry_storage_config.j2
index d44a7ec68f..6805baf6b4 100644
--- a/upgrade/roles/import_input_parameters/templates/telemetry_storage_config.j2
+++ b/upgrade/roles/import_input_parameters/templates/telemetry_storage_config.j2
@@ -168,10 +168,10 @@ idrac_telemetry_storage:
     resources:
       requests:
         cpu: "100m"
-        memory: "256Mi"
+        memory: "512Mi"
       limits:
         cpu: "500m"
-        memory: "512Mi"
+        memory: "1.5Gi"
   receiver:
     resources:
       requests:
@@ -184,18 +184,18 @@ idrac_telemetry_storage:
     resources:
       requests:
         cpu: "50m"
-        memory: "64Mi"
+        memory: "128Mi"
       limits:
         cpu: "200m"
-        memory: "256Mi"
+        memory: "512Mi"
   victoria_pump:
     resources:
       requests:
         cpu: "50m"
-        memory: "64Mi"
+        memory: "128Mi"
       limits:
         cpu: "200m"
-        memory: "256Mi"
+        memory: "512Mi"
 
 # Kafka Storage resources
 kafka_storage:

From 776990dc044acf72955f38c56ea15c5b1af216e9 Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Mon, 8 Jun 2026 16:12:38 +0530
Subject: [PATCH 12/33] updating values in integer instead decimal

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 input/telemetry_storage_config.yml                              | 2 +-
 .../templates/telemetry_storage_config.j2                       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/input/telemetry_storage_config.yml b/input/telemetry_storage_config.yml
index 6805baf6b4..c80dbdde65 100644
--- a/input/telemetry_storage_config.yml
+++ b/input/telemetry_storage_config.yml
@@ -171,7 +171,7 @@ idrac_telemetry_storage:
         memory: "512Mi"
       limits:
         cpu: "500m"
-        memory: "1.5Gi"
+        memory: "1536Mi"
   receiver:
     resources:
       requests:
diff --git a/upgrade/roles/import_input_parameters/templates/telemetry_storage_config.j2 b/upgrade/roles/import_input_parameters/templates/telemetry_storage_config.j2
index 6805baf6b4..c80dbdde65 100644
--- a/upgrade/roles/import_input_parameters/templates/telemetry_storage_config.j2
+++ b/upgrade/roles/import_input_parameters/templates/telemetry_storage_config.j2
@@ -171,7 +171,7 @@ idrac_telemetry_storage:
         memory: "512Mi"
       limits:
         cpu: "500m"
-        memory: "1.5Gi"
+        memory: "1536Mi"
   receiver:
     resources:
       requests:

From 0f328dc5222bfe27fd761bc22ef67b9487f2ba81 Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Mon, 8 Jun 2026 18:46:17 +0530
Subject: [PATCH 13/33] revert service k8s json file

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 .../x86_64/rhel/10.0/service_k8s_v1.35.1.json   | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json
index 654208ec54..1fc9bd65ef 100644
--- a/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json
+++ b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json
@@ -29,10 +29,9 @@
       { "package": "docker.io/dellhpcomniaaisolution/victoriapump", "type": "image", "tag": "1.3" },
       { "package": "cryptography==45.0.7", "type": "pip_module" },
       { "package": "omsdk==1.2.518", "type": "pip_module" },
-      { "package": "cffi==2.0.0", "type": "pip_module" },
-      { "package": "prometheus_client==0.25.0", "type": "pip_module" },
+      { "package": "cffi==1.17.1", "type": "pip_module" },
+      { "package": "prometheus_client==0.20.0", "type": "pip_module" },
       { "package": "kubernetes==33.1.0", "type": "pip_module" },
-      { "package": "pyyaml==6.0.3", "type": "pip_module" },
       { "package": "quay.io/strimzi/operator", "tag": "0.48.0", "type": "image" },
       { "package": "quay.io/strimzi/kafka", "tag": "0.48.0-kafka-4.1.0", "type": "image" },
       { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.1", "type": "image" },
@@ -76,12 +75,11 @@
       { "package": "docker.io/calico/node", "tag": "v3.31.4", "type": "image" },
       { "package": "quay.io/metallb/speaker", "tag": "v0.15.3", "type": "image" },
       { "package": "kubectl-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"},
-      { "package": "prettytable==3.17.0", "type": "pip_module" },
+      { "package": "prettytable==3.14.0", "type": "pip_module" },
       { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" },
       { "package": "git", "type": "rpm", "repo_name": "appstream"},
       { "package": "kubernetes==33.1.0", "type": "pip_module" },
-      { "package": "pyyaml==6.0.3", "type": "pip_module" },
-      { "package": "PyMySQL==1.2.0", "type": "pip_module" }
+      { "package": "PyMySQL==1.1.2", "type": "pip_module" }
 
     ]
   },
@@ -105,12 +103,11 @@
       { "package": "helm-v3.20.1-amd64", "type": "tarball", "url": "https://get.helm.sh/helm-v3.20.1-linux-amd64.tar.gz" },
       { "package": "nfs-subdir-external-provisioner-4.0.18", "type": "tarball", "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" },
       { "package": "kubectl-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"},
-      { "package": "prettytable==3.17.0", "type": "pip_module" },
+      { "package": "prettytable==3.14.0", "type": "pip_module" },
       { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" },
       { "package": "git", "type": "rpm", "repo_name": "appstream"},
       { "package": "kubernetes==33.1.0", "type": "pip_module" },
-      { "package": "pyyaml==6.0.3", "type": "pip_module" },
-      { "package": "PyMySQL==1.2.0", "type": "pip_module" }
+      { "package": "PyMySQL==1.1.2", "type": "pip_module" }
     ]
   },
 
@@ -121,4 +118,4 @@
       { "package": "quay.io/metallb/controller", "tag": "v0.15.3", "type": "image" } 
     ]
   }
-}
+}
\ No newline at end of file

From a9700876622155965280f043d50ceaa0ab4e3db3 Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Fri, 12 Jun 2026 07:34:11 +0530
Subject: [PATCH 14/33] powerscale telemetry upgrade and preserve loadbalancer
 IP for Victoria

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 .../tasks/apply_victoria_crs.yml              | 73 +++++++++++++++++
 .../tasks/migrate_statefulset.yml             | 80 +++++++++++++++++++
 upgrade/roles/upgrade_telemetry/vars/main.yml |  2 +
 3 files changed, 155 insertions(+)

diff --git a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml
index 488c39b72c..d725cf067a 100644
--- a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml
+++ b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml
@@ -50,6 +50,79 @@
   delegate_to: "{{ kube_vip }}"
   connection: ssh
 
+# ── Inject preserved LoadBalancer IPs into VMCluster manifest before apply ──
+# When migrating from 2.1 StatefulSet to operator, old services are deleted
+# and the operator creates new ones. To preserve IPs, we inject loadBalancerIP
+# directly into the VMCluster CR's serviceSpec BEFORE applying, so the operator
+# creates services with the correct IPs from the start (no race condition).
+- name: Create LoadBalancer IP injection script
+  ansible.builtin.copy:
+    dest: /tmp/inject_vm_lb_ips.py  # helper script; exits 0 when it rewrote the manifest, 2 when nothing changed
+    mode: "0755"
+    content: |
+      #!/usr/bin/env python3
+      import yaml
+      import sys
+      manifest_path = sys.argv[1]
+      vmselect_ip = sys.argv[2] if len(sys.argv) > 2 and sys.argv[2] else ""
+      vminsert_ip = sys.argv[3] if len(sys.argv) > 3 and sys.argv[3] else ""
+      with open(manifest_path) as f:
+          doc = yaml.safe_load(f)
+      spec = doc.get("spec", {})
+      changed = False
+      if vmselect_ip and "vmselect" in spec:
+          svc = spec["vmselect"].setdefault("serviceSpec", {}).setdefault("spec", {})
+          if svc.get("loadBalancerIP") != vmselect_ip:
+              svc["loadBalancerIP"] = vmselect_ip
+              changed = True
+      if vminsert_ip and "vminsert" in spec:
+          svc = spec["vminsert"].setdefault("serviceSpec", {}).setdefault("spec", {})
+          if svc.get("loadBalancerIP") != vminsert_ip:
+              svc["loadBalancerIP"] = vminsert_ip
+              changed = True
+      if changed:
+          with open(manifest_path, "w") as f:
+              yaml.dump(doc, f, default_flow_style=False, sort_keys=False)
+          print("Injected vmselect=" + vmselect_ip + " vminsert=" + vminsert_ip)
+      else:
+          print("IPs already present - no change needed")
+      sys.exit(0 if changed else 2)
+  delegate_to: "{{ kube_vip }}"  # NOTE(review): the script imports yaml - confirm PyYAML is installed on this host
+  connection: ssh
+  when:  # written only when at least one preserved IP is non-empty
+    - preserved_vmselect_ip | default('') | length > 0 or preserved_vminsert_ip | default('') | length > 0
+
+- name: Inject preserved LoadBalancer IPs into VMCluster manifest
+  ansible.builtin.command:
+    cmd: >-
+      python3 /tmp/inject_vm_lb_ips.py
+      "{{ telemetry_deploy_dir }}/deployments/victoria-operator-vmcluster.yaml"
+      "{{ preserved_vmselect_ip | default('') }}"
+      "{{ preserved_vminsert_ip | default('') }}"
+  register: ip_inject_result
+  changed_when: ip_inject_result.rc == 0  # the script exits 0 only after rewriting the manifest
+  failed_when: ip_inject_result.rc not in [0, 2]  # rc 2 is the script's "no change needed" exit, not an error
+  delegate_to: "{{ kube_vip }}"
+  connection: ssh
+  when:  # same guard as the script-creation task, so the script exists whenever this runs
+    - preserved_vmselect_ip | default('') | length > 0 or preserved_vminsert_ip | default('') | length > 0
+
+- name: Clean up LoadBalancer IP injection script
+  ansible.builtin.file:
+    path: /tmp/inject_vm_lb_ips.py
+    state: absent
+  delegate_to: "{{ kube_vip }}"
+  connection: ssh
+  changed_when: false
+
+- name: Display LoadBalancer IP injection status
+  ansible.builtin.debug:
+    msg: >-
+      {{ victoria_lb_ips_preserved
+         if (preserved_vminsert_ip | default('') | length > 0)
+            or (preserved_vmselect_ip | default('') | length > 0)
+         else victoria_lb_ips_not_preserved }}
+
 # ── Apply main CR (VMCluster only — 2.2 cluster mode only) ──
 - name: Apply VMCluster CR (cluster mode only) with retry
   ansible.builtin.command:
diff --git a/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml b/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml
index e99d7bf80c..4d0efd1081 100644
--- a/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml
+++ b/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml
@@ -53,6 +53,49 @@
       delegate_to: "{{ kube_vip }}"
       connection: ssh
 
+    # ── Flush vmstorage data before shutdown ──
+    # Create snapshots on each vmstorage pod to force pending data/indexdb flush.
+    # This prevents corrupted parts.json from in-flight merges during shutdown.
+    - name: Get old vmstorage pod names
+      ansible.builtin.shell: |
+        set -o pipefail
+        kubectl -n {{ telemetry_namespace }} get pods -l {{ old_vm_pod_label }} --no-headers 2>/dev/null \
+          | grep -i "storage\|vmstorage" | awk '{print $1}'
+      register: old_vmstorage_pods  # stdout_lines feeds the snapshot loop in the next task
+      changed_when: false  # NOTE(review): pipefail is used without an explicit executable: - confirm the target's default shell supports it
+      failed_when: false  # a kubectl/grep failure is ignored too; the snapshot loop then has nothing to iterate
+      delegate_to: "{{ kube_vip }}"
+      connection: ssh
+
+    - name: Force snapshot on each vmstorage pod (flush pending writes)
+      ansible.builtin.shell: |
+        kubectl -n {{ telemetry_namespace }} exec {{ item }} -- \
+          wget -q -O- --no-check-certificate "https://localhost:8482/snapshot/create" 2>/dev/null || \
+        kubectl -n {{ telemetry_namespace }} exec {{ item }} -- \
+          wget -q -O- "http://localhost:8482/snapshot/create" 2>/dev/null || true
+      loop: "{{ old_vmstorage_pods.stdout_lines | default([]) }}"  # one exec per pod name found by the previous task
+      changed_when: false
+      failed_when: false  # best-effort: HTTPS is tried first, then HTTP, and the chain ends in "|| true"
+      when: old_vmstorage_pods.stdout_lines | default([]) | length > 0
+      delegate_to: "{{ kube_vip }}"
+      connection: ssh
+
+    - name: Wait for background merges to settle after writes stopped
+      ansible.builtin.pause:
+        seconds: 30
+        prompt: "Waiting 30s for vmstorage background merges to settle..."
+
+    # ── Ensure sufficient graceful shutdown period ──
+    # Old StatefulSet may have default 30s which is too short for indexdb flush
+    - name: Patch old StatefulSet terminationGracePeriodSeconds to 120s
+      ansible.builtin.shell: |
+        kubectl -n {{ telemetry_namespace }} patch statefulset {{ actual_old_statefulset }} \
+          -p '{"spec":{"template":{"spec":{"terminationGracePeriodSeconds":120}}}}'
+      changed_when: true  # NOTE(review): this patches the pod template - confirm running pods pick up 120s before the scale-down below
+      failed_when: false  # a failed patch is ignored and the migration continues with the old grace period
+      delegate_to: "{{ kube_vip }}"
+      connection: ssh
+
     # ── Graceful shutdown of old StatefulSet ──
     - name: Scale down old StatefulSet
       ansible.builtin.command:
@@ -84,6 +127,11 @@
       delegate_to: "{{ kube_vip }}"
       connection: ssh
 
+    - name: Wait for storage cache flush after pod termination
+      ansible.builtin.pause:
+        seconds: 15
+        prompt: "Waiting 15s for storage cache flush..."
+
     # ── PVC relabeling (data preservation via PV rebind) ──
     - name: Get all old PVCs from StatefulSet (using specific StatefulSet label)
       ansible.builtin.command:
@@ -169,6 +217,38 @@
       delegate_to: "{{ kube_vip }}"
       connection: ssh
 
+    # ── Capture LoadBalancer IPs before deletion ──
+    # Preserve existing IPs to prevent MetalLB from assigning new ones
+    # Only applicable for statefulset_to_operator migration path
+    - name: Get vminsert LoadBalancer IP
+      ansible.builtin.shell: |
+        kubectl -n {{ telemetry_namespace }} get svc vminsert -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo ""
+      register: old_vminsert_ip
+      changed_when: false
+      failed_when: false
+      delegate_to: "{{ kube_vip }}"
+      connection: ssh
+
+    - name: Get vmselect LoadBalancer IP
+      ansible.builtin.shell: |
+        kubectl -n {{ telemetry_namespace }} get svc vmselect -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo ""
+      register: old_vmselect_ip
+      changed_when: false
+      failed_when: false
+      delegate_to: "{{ kube_vip }}"
+      connection: ssh
+
+    - name: Set LoadBalancer IP facts for preservation
+      ansible.builtin.set_fact:  # default('') keeps this safe when a lookup (failed_when: false) returned no stdout
+        preserved_vminsert_ip: "{{ old_vminsert_ip.stdout | default('') | trim }}"
+        preserved_vmselect_ip: "{{ old_vmselect_ip.stdout | default('') | trim }}"
+
+    - name: Display preserved LoadBalancer IPs
+      ansible.builtin.debug:
+        msg:
+          - "Preserving vminsert IP: {{ preserved_vminsert_ip if preserved_vminsert_ip else 'None' }}"
+          - "Preserving vmselect IP: {{ preserved_vmselect_ip if preserved_vmselect_ip else 'None' }}"
+
     # ── Cleanup old pre-operator services and deployments ──
     # The operator creates new services with different names (e.g. vminsert-victoria-cluster),
     # so the old standalone services become stale and waste LoadBalancer IPs.
diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml b/upgrade/roles/upgrade_telemetry/vars/main.yml
index 8326120799..62bb33b0c2 100644
--- a/upgrade/roles/upgrade_telemetry/vars/main.yml
+++ b/upgrade/roles/upgrade_telemetry/vars/main.yml
@@ -102,6 +102,8 @@ victoria_unhealthy_pods_warning: >-
 victoria_pods_deleted: "Deleted {{ victoria_unhealthy_pods | length }} unhealthy pod(s). Upgrade will re-create them."
 victoria_backup_completed: "Victoria backup completed: {{ telemetry_backup_dir }}"
 victoria_crs_applied: "VictoriaMetrics CRs applied (mode: {{ victoria_deploy_mode }}"
+victoria_lb_ips_preserved: "LoadBalancer IPs injected into VMCluster manifest - vminsert: {{ preserved_vminsert_ip | default('N/A') }}, vmselect: {{ preserved_vmselect_ip | default('N/A') }}"
+victoria_lb_ips_not_preserved: "No old LoadBalancer IPs found to preserve (fresh deploy or already operator-managed)"
 victoria_pods_not_ready: "Telemetry upgrade FAILED: Some pods are not ready. {{ pods_not_ready.stdout | int }} pod(s) not in Running state."
 victoria_pods_ready_after_wait: "All telemetry pods are ready after waiting"
 telemetry_upgrade_success: "Telemetry upgrade COMPLETED: All telemetry pods are running and ready."

From a43bdda7afc364aca060fa518aa085f1c5ee64c4 Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Fri, 12 Jun 2026 07:36:42 +0530
Subject: [PATCH 15/33] powerscale telemetry version upgrade

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 .../x86_64/rhel/10.0/service_k8s_v1.35.1.json       | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json
index 1fc9bd65ef..08de51d880 100644
--- a/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json
+++ b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json
@@ -1,3 +1,4 @@
+
 {
   "service_k8s": {
     "cluster": [
@@ -35,11 +36,11 @@
       { "package": "quay.io/strimzi/operator", "tag": "0.48.0", "type": "image" },
       { "package": "quay.io/strimzi/kafka", "tag": "0.48.0-kafka-4.1.0", "type": "image" },
       { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.1", "type": "image" },
-      { "package": "quay.io/dell/container-storage-modules/csm-metrics-powerscale", "tag": "v1.11.0", "type": "image" },
-      { "package": "ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector", "tag": "0.143.1", "type": "image" },
+      { "package": "quay.io/dell/container-storage-modules/csm-metrics-powerscale", "tag": "v1.12.0", "type": "image" },
+      { "package": "ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector", "tag": "0.150.1", "type": "image" },
       { "package": "docker.io/nginxinc/nginx-unprivileged", "tag": "1.29", "type": "image" },
-      { "package": "karavi-observability", "type": "git", "url": "https://github.com/dell/karavi-observability.git", "version": "v1.12.0" },
-      { "package": "helm-charts", "type": "git", "url": "https://github.com/dell/helm-charts.git", "version": "container-storage-modules-1.9.2" },
+      { "package": "karavi-observability", "type": "git", "url": "https://github.com/dell/karavi-observability.git", "version": "v1.15.0" },
+      { "package": "helm-charts", "type": "git", "url": "https://github.com/dell/helm-charts.git", "version": "container-storage-modules-1.10.0" },
       { "package": "quay.io/jetstack/cert-manager-controller", "tag": "v1.10.0", "type": "image" },
       { "package": "quay.io/jetstack/cert-manager-cainjector", "tag": "v1.10.0", "type": "image" },
       { "package": "quay.io/jetstack/cert-manager-webhook", "tag": "v1.10.0", "type": "image" },
@@ -99,7 +100,7 @@
       { "package": "docker.io/calico/node", "tag": "v3.31.4", "type": "image" },
       { "package": "quay.io/metallb/speaker", "tag": "v0.15.3", "type": "image" },
       { "package": "calico-v3.31.4","type": "manifest", "url": "https://raw.githubusercontent.com/projectcalico/calico/v3.31.4/manifests/calico.yaml" },
-      { "package": "metallb-native-v0.15.3", "type": "manifest", "url": "https://raw.githubusercontent.com/metallb/metallb/v0.15.3/config/manifests/metallb-native.yaml" },     
+      { "package": "metallb-native-v0.15.3", "type": "manifest", "url": "https://raw.githubusercontent.com/metallb/metallb/v0.15.3/config/manifests/metallb-native.yaml" },
       { "package": "helm-v3.20.1-amd64", "type": "tarball", "url": "https://get.helm.sh/helm-v3.20.1-linux-amd64.tar.gz" },
       { "package": "nfs-subdir-external-provisioner-4.0.18", "type": "tarball", "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" },
       { "package": "kubectl-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"},
@@ -115,7 +116,7 @@
     "cluster": [
       { "package": "registry.k8s.io/sig-storage/nfs-subdir-external-provisioner", "tag": "v4.0.2", "type": "image" },
       { "package": "quay.io/metallb/speaker", "tag": "v0.15.3", "type": "image" },
-      { "package": "quay.io/metallb/controller", "tag": "v0.15.3", "type": "image" } 
+      { "package": "quay.io/metallb/controller", "tag": "v0.15.3", "type": "image" }
     ]
   }
 }
\ No newline at end of file

From 2f1a6baee821c33af1eb02044ad5f98d4914d76b Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Fri, 12 Jun 2026 11:42:59 +0530
Subject: [PATCH 16/33] ansible lint fixes

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 upgrade/roles/upgrade_telemetry/vars/main.yml | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml b/upgrade/roles/upgrade_telemetry/vars/main.yml
index 62bb33b0c2..ed68b0e89b 100644
--- a/upgrade/roles/upgrade_telemetry/vars/main.yml
+++ b/upgrade/roles/upgrade_telemetry/vars/main.yml
@@ -102,7 +102,10 @@ victoria_unhealthy_pods_warning: >-
 victoria_pods_deleted: "Deleted {{ victoria_unhealthy_pods | length }} unhealthy pod(s). Upgrade will re-create them."
 victoria_backup_completed: "Victoria backup completed: {{ telemetry_backup_dir }}"
 victoria_crs_applied: "VictoriaMetrics CRs applied (mode: {{ victoria_deploy_mode }}"
-victoria_lb_ips_preserved: "LoadBalancer IPs injected into VMCluster manifest - vminsert: {{ preserved_vminsert_ip | default('N/A') }}, vmselect: {{ preserved_vmselect_ip | default('N/A') }}"
+victoria_lb_ips_preserved: >-
+  LoadBalancer IPs injected into VMCluster manifest -
+  vminsert: {{ preserved_vminsert_ip | default('N/A') }},
+  vmselect: {{ preserved_vmselect_ip | default('N/A') }}
 victoria_lb_ips_not_preserved: "No old LoadBalancer IPs found to preserve (fresh deploy or already operator-managed)"
 victoria_pods_not_ready: "Telemetry upgrade FAILED: Some pods are not ready. {{ pods_not_ready.stdout | int }} pod(s) not in Running state."
 victoria_pods_ready_after_wait: "All telemetry pods are ready after waiting"
@@ -145,3 +148,12 @@ idrac_patch_msg: >-
   MySQL will have enough time to flush on NFS during pod restart.
 idrac_skip_patch_msg: "idrac-telemetry StatefulSet not found (first deploy). Skipping patch."
 idrac_replica_restore_msg: "idrac-telemetry scaled back to {{ idrac_replica_count.stdout }} replicas"
+
+# Kafka patch messages
+kafka_broker_patch_msg: >-
+  kafka-broker patched: terminationGracePeriodSeconds=300s.
+  Kafka brokers will have sufficient time for graceful shutdown during rolling restarts.
+kafka_controller_patch_msg: >-
+  kafka-controller patched: terminationGracePeriodSeconds=300s.
+  Kafka controllers will have sufficient time for graceful shutdown during rolling restarts.
+kafka_skip_patch_msg: "Kafka StatefulSets not found (first deploy). Skipping patch."

From b5e756e39f6289c79d6e43c483a971b8bfb93a41 Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Fri, 12 Jun 2026 15:10:57 +0530
Subject: [PATCH 17/33] update software_config with updated csi driver version

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 .../scripts/transform_software_config.py                        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/upgrade/roles/import_input_parameters/scripts/transform_software_config.py b/upgrade/roles/import_input_parameters/scripts/transform_software_config.py
index 9a314e7614..03ae8c47c6 100644
--- a/upgrade/roles/import_input_parameters/scripts/transform_software_config.py
+++ b/upgrade/roles/import_input_parameters/scripts/transform_software_config.py
@@ -23,7 +23,7 @@
 # These are the target versions for software entries that should be updated
 TARGET_VERSIONS = {
     "service_k8s": "1.35.1",
-    "csi_driver_powerscale": "v2.16.0"
+    "csi_driver_powerscale": "v2.17.0"
 }
 
 with open(backup_file, 'r', encoding='utf-8') as f:

From 79fb95d4f6d9335f0516e733f3144b22f001b81a Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Sat, 13 Jun 2026 13:31:17 +0530
Subject: [PATCH 18/33] upgrade powerscale values.yml

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 .../scripts/merge_powerscale_values.py        | 110 +++++++++++-------
 upgrade/roles/upgrade_telemetry/vars/main.yml |   2 +-
 2 files changed, 69 insertions(+), 43 deletions(-)

diff --git a/upgrade/roles/import_input_parameters/scripts/merge_powerscale_values.py b/upgrade/roles/import_input_parameters/scripts/merge_powerscale_values.py
index 0f236c2027..eefd833b4a 100755
--- a/upgrade/roles/import_input_parameters/scripts/merge_powerscale_values.py
+++ b/upgrade/roles/import_input_parameters/scripts/merge_powerscale_values.py
@@ -22,24 +22,24 @@
 import yaml
 
 
-def merge_values(v21_file_path, v216_file_path, output_file_path):
+def merge_values(vold_file_path, vnew_file_path, output_file_path):
     """
-    Merge v2.1 PowerScale values into v2.16 template.
+    Merge old PowerScale values into new template.
 
     Args:
-        v21_file_path: Path to v2.1 values.yaml (source settings)
-        v216_file_path: Path to v2.16 values.yaml (target structure)
+        vold_file_path: Path to old values.yaml (source settings)
+        vnew_file_path: Path to new values.yaml (target structure)
         output_file_path: Path to write merged values.yaml
     """
-    # Load v2.1 values (source of user settings)
-    with open(v21_file_path, 'r', encoding='utf-8') as file_handle:
-        v21_values = yaml.safe_load(file_handle)
+    # Load old values (source of user settings)
+    with open(vold_file_path, 'r', encoding='utf-8') as file_handle:
+        vold_values = yaml.safe_load(file_handle)
 
-    # Load v2.16 values (target structure with new defaults)
-    with open(v216_file_path, 'r', encoding='utf-8') as file_handle:
-        v216_values = yaml.safe_load(file_handle)
+    # Load new values (target structure with new defaults)
+    with open(vnew_file_path, 'r', encoding='utf-8') as file_handle:
+        vnew_values = yaml.safe_load(file_handle)
 
-    # Parameters to preserve from v2.1
+    # Parameters to preserve from old version
     preserve_params = [
         'isiPath',
         'isiAccessZone',
@@ -49,12 +49,12 @@ def merge_values(v21_file_path, v216_file_path, output_file_path):
 
     # Preserve top-level parameters
     for param in preserve_params:
-        if param in v21_values:
-            v216_values[param] = v21_values[param]
-            print(f"Preserved {param}: {v21_values[param]}",
+        if param in vold_values:
+            vnew_values[param] = vold_values[param]
+            print(f"Preserved {param}: {vold_values[param]}",
                   file=sys.stderr)
 
-    # Preserve feature flags if enabled in v2.1
+    # Preserve feature flags if enabled in old version
     feature_flags = [
         'storageCapacity',
         'podmon',
@@ -64,62 +64,88 @@ def merge_values(v21_file_path, v216_file_path, output_file_path):
     ]
 
     for feature in feature_flags:
-        if feature in v21_values and isinstance(v21_values[feature], dict):
-            if 'enabled' in v21_values[feature]:
-                if feature not in v216_values:
-                    v216_values[feature] = {}
-                v216_values[feature]['enabled'] = \
-                    v21_values[feature]['enabled']
+        if feature in vold_values and isinstance(vold_values[feature], dict):
+            if 'enabled' in vold_values[feature]:
+                if feature not in vnew_values:
+                    vnew_values[feature] = {}
+                vnew_values[feature]['enabled'] = \
+                    vold_values[feature]['enabled']
                 print(f"Preserved {feature}.enabled: "
-                      f"{v21_values[feature]['enabled']}",
+                      f"{vold_values[feature]['enabled']}",
                       file=sys.stderr)
 
+    # Preserve healthMonitor with both enabled and interval
+    if 'healthMonitor' in vold_values and isinstance(vold_values['healthMonitor'], dict):
+        if 'healthMonitor' not in vnew_values:
+            vnew_values['healthMonitor'] = {}
+        for param in ['enabled', 'interval']:
+            if param in vold_values['healthMonitor']:
+                vnew_values['healthMonitor'][param] = vold_values['healthMonitor'][param]
+                print(f"Preserved healthMonitor.{param}: {vold_values['healthMonitor'][param]}", file=sys.stderr)
+
     # Preserve controller settings
-    if 'controller' in v21_values and \
-       isinstance(v21_values['controller'], dict):
-        if 'controller' not in v216_values:
-            v216_values['controller'] = {}
+    if 'controller' in vold_values and \
+       isinstance(vold_values['controller'], dict):
+        if 'controller' not in vnew_values:
+            vnew_values['controller'] = {}
 
         controller_params = ['nodeSelector', 'tolerations', 'controllerCount']
         for param in controller_params:
-            if param in v21_values['controller']:
-                v216_values['controller'][param] = \
-                    v21_values['controller'][param]
-                print(f"Preserved controller.{param}: {v21_values['controller'][param]}", file=sys.stderr)
+            if param in vold_values['controller']:
+                vnew_values['controller'][param] = \
+                    vold_values['controller'][param]
+                print(f"Preserved controller.{param}: {vold_values['controller'][param]}", file=sys.stderr)
+
+        # Preserve controller-level healthMonitor
+        if 'healthMonitor' in vold_values['controller'] and isinstance(vold_values['controller']['healthMonitor'], dict):
+            if 'healthMonitor' not in vnew_values['controller']:
+                vnew_values['controller']['healthMonitor'] = {}
+            for param in ['enabled', 'interval']:
+                if param in vold_values['controller']['healthMonitor']:
+                    vnew_values['controller']['healthMonitor'][param] = vold_values['controller']['healthMonitor'][param]
+                    print(f"Preserved controller.healthMonitor.{param}: {vold_values['controller']['healthMonitor'][param]}", file=sys.stderr)
 
     # Preserve node settings
-    if 'node' in v21_values and isinstance(v21_values['node'], dict):
-        if 'node' not in v216_values:
-            v216_values['node'] = {}
+    if 'node' in vold_values and isinstance(vold_values['node'], dict):
+        if 'node' not in vnew_values:
+            vnew_values['node'] = {}
 
         node_params = ['nodeSelector', 'tolerations']
         for param in node_params:
-            if param in v21_values['node']:
-                v216_values['node'][param] = v21_values['node'][param]
+            if param in vold_values['node']:
+                vnew_values['node'][param] = vold_values['node'][param]
                 print(f"Preserved node.{param}", file=sys.stderr)
 
+        # Preserve node-level healthMonitor
+        if 'healthMonitor' in vold_values['node'] and isinstance(vold_values['node']['healthMonitor'], dict):
+            if 'healthMonitor' not in vnew_values['node']:
+                vnew_values['node']['healthMonitor'] = {}
+            if 'enabled' in vold_values['node']['healthMonitor']:
+                vnew_values['node']['healthMonitor']['enabled'] = vold_values['node']['healthMonitor']['enabled']
+                print(f"Preserved node.healthMonitor.enabled: {vold_values['node']['healthMonitor']['enabled']}", file=sys.stderr)
+
     # Write merged values to output file
     with open(output_file_path, 'w', encoding='utf-8') as file_handle:
-        yaml.dump(v216_values, file_handle,
+        yaml.dump(vnew_values, file_handle,
                   default_flow_style=False, sort_keys=False)
 
-    print("Successfully merged v2.1 settings into v2.16 values.yaml",
+    print("Successfully merged old settings into new values.yaml",
           file=sys.stderr)
     print(f"Output written to: {output_file_path}", file=sys.stderr)
 
 
 if __name__ == '__main__':
     if len(sys.argv) != 4:
-        print("Usage: merge_powerscale_values.py <v21_values.yaml> "
-              "<v216_values.yaml> <output.yaml>", file=sys.stderr)
+        print("Usage: merge_powerscale_values.py <old_values.yaml> "
+              "<new_values.yaml> <output.yaml>", file=sys.stderr)
         sys.exit(1)
 
-    v21_input = sys.argv[1]
-    v216_input = sys.argv[2]
+    vold_input = sys.argv[1]
+    vnew_input = sys.argv[2]
     output_path = sys.argv[3]
 
     try:
-        merge_values(v21_input, v216_input, output_path)
+        merge_values(vold_input, vnew_input, output_path)
     except (IOError, yaml.YAMLError) as error:
         print(f"ERROR: Failed to merge PowerScale values.yaml: {error}",
               file=sys.stderr)
diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml b/upgrade/roles/upgrade_telemetry/vars/main.yml
index ed68b0e89b..d55c6d6180 100644
--- a/upgrade/roles/upgrade_telemetry/vars/main.yml
+++ b/upgrade/roles/upgrade_telemetry/vars/main.yml
@@ -101,7 +101,7 @@ victoria_unhealthy_pods_warning: >-
   proceeds. The upgrade will re-create them with the new version.
 victoria_pods_deleted: "Deleted {{ victoria_unhealthy_pods | length }} unhealthy pod(s). Upgrade will re-create them."
 victoria_backup_completed: "Victoria backup completed: {{ telemetry_backup_dir }}"
-victoria_crs_applied: "VictoriaMetrics CRs applied (mode: {{ victoria_deploy_mode }}"
+victoria_crs_applied: "VictoriaMetrics CRs applied (mode: {{ victoria_deploy_mode }})"
 victoria_lb_ips_preserved: >-
   LoadBalancer IPs injected into VMCluster manifest -
   vminsert: {{ preserved_vminsert_ip | default('N/A') }},

From bbef8ae3d76a51b9a2bead7d06209f04727fa7f6 Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Sat, 13 Jun 2026 13:33:56 +0530
Subject: [PATCH 19/33] revert kafka patch variables

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 upgrade/roles/upgrade_telemetry/vars/main.yml | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml b/upgrade/roles/upgrade_telemetry/vars/main.yml
index d55c6d6180..04ddfaa86d 100644
--- a/upgrade/roles/upgrade_telemetry/vars/main.yml
+++ b/upgrade/roles/upgrade_telemetry/vars/main.yml
@@ -148,12 +148,3 @@ idrac_patch_msg: >-
   MySQL will have enough time to flush on NFS during pod restart.
 idrac_skip_patch_msg: "idrac-telemetry StatefulSet not found (first deploy). Skipping patch."
 idrac_replica_restore_msg: "idrac-telemetry scaled back to {{ idrac_replica_count.stdout }} replicas"
-
-# Kafka patch messages
-kafka_broker_patch_msg: >-
-  kafka-broker patched: terminationGracePeriodSeconds=300s.
-  Kafka brokers will have sufficient time for graceful shutdown during rolling restarts.
-kafka_controller_patch_msg: >-
-  kafka-controller patched: terminationGracePeriodSeconds=300s.
-  Kafka controllers will have sufficient time for graceful shutdown during rolling restarts.
-kafka_skip_patch_msg: "Kafka StatefulSets not found (first deploy). Skipping patch."

From 668db752cb12df0601b3fbbf2e67e9585da96167 Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Sat, 13 Jun 2026 14:13:53 +0530
Subject: [PATCH 20/33] update delegation as mount_on_oim can be false also

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 .../tasks/backup_telemetry.yml                | 22 ++++++++-----------
 .../tasks/backup_victoria.yml                 |  2 +-
 upgrade/roles/upgrade_telemetry/vars/main.yml |  3 +--
 3 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/upgrade/roles/upgrade_telemetry/tasks/backup_telemetry.yml b/upgrade/roles/upgrade_telemetry/tasks/backup_telemetry.yml
index ed0ff59f83..fcc2185d4e 100644
--- a/upgrade/roles/upgrade_telemetry/tasks/backup_telemetry.yml
+++ b/upgrade/roles/upgrade_telemetry/tasks/backup_telemetry.yml
@@ -27,38 +27,34 @@
   when:
     - k8s_client_mount_path is defined
     - k8s_client_mount_path | length > 0
+    - kube_vip is defined
+    - kube_vip | length > 0
   block:
-    - name: Set telemetry backup directory
-      ansible.builtin.set_fact:
-        tel_backup_dir: "{{ k8s_client_mount_path }}/upgrade/telemetry/omnia_{{ manifest.source_version | default('unknown') }}"
-      delegate_to: oim
-      connection: ssh
-
     - name: Create telemetry backup directory
       ansible.builtin.file:
-        path: "{{ tel_backup_dir }}"
+        path: "{{ telemetry_backup_dir }}"
         state: directory
         mode: '0755'
-      delegate_to: oim
+      delegate_to: "{{ kube_vip }}"
       connection: ssh
 
     - name: Backup telemetry folder (pre-provision)
       ansible.builtin.copy:
         src: "{{ k8s_client_mount_path }}/telemetry"
-        dest: "{{ tel_backup_dir }}/telemetry"
+        dest: "{{ telemetry_backup_dir }}/telemetry"
         remote_src: true
         mode: preserve
-      delegate_to: oim
+      delegate_to: "{{ kube_vip }}"
       connection: ssh
       failed_when: false
 
     - name: Backup idrac_telemetry folder (pre-provision)
       ansible.builtin.copy:
         src: "{{ k8s_client_mount_path }}/idrac_telemetry"
-        dest: "{{ tel_backup_dir }}/idrac_telemetry"
+        dest: "{{ telemetry_backup_dir }}/idrac_telemetry"
         remote_src: true
         mode: preserve
-      delegate_to: oim
+      delegate_to: "{{ kube_vip }}"
       connection: ssh
       failed_when: false
 
@@ -74,7 +70,7 @@
     - name: Backup telemetry.sh from control plane
       ansible.builtin.copy:
         src: /root/telemetry.sh
-        dest: "{{ tel_backup_dir }}/telemetry.sh"
+        dest: "{{ telemetry_backup_dir }}/telemetry.sh"
         mode: "{{ executable_mode }}"
         remote_src: true
       delegate_to: "{{ kube_vip }}"
diff --git a/upgrade/roles/upgrade_telemetry/tasks/backup_victoria.yml b/upgrade/roles/upgrade_telemetry/tasks/backup_victoria.yml
index 9c6a487b78..abfdd4d107 100644
--- a/upgrade/roles/upgrade_telemetry/tasks/backup_victoria.yml
+++ b/upgrade/roles/upgrade_telemetry/tasks/backup_victoria.yml
@@ -21,7 +21,7 @@
     path: "{{ telemetry_backup_dir }}"
     state: directory
     mode: "0755"
-  delegate_to: "{{ oim_host }}"
+  delegate_to: "{{ kube_vip }}"
   connection: ssh
 
 # ── Backup namespace-level resources ──
diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml b/upgrade/roles/upgrade_telemetry/vars/main.yml
index 04ddfaa86d..c39efbcc3b 100644
--- a/upgrade/roles/upgrade_telemetry/vars/main.yml
+++ b/upgrade/roles/upgrade_telemetry/vars/main.yml
@@ -26,8 +26,7 @@ oim_host: oim
 executable_mode: "0755"
 
 # Upgrade directory paths (on k8s NFS share, resolved at runtime)
-telemetry_upgrade_dir: "{{ k8s_client_mount_path }}/upgrade/telemetry"
-telemetry_backup_dir: "{{ telemetry_upgrade_dir }}/omnia_{{ manifest.source_version | default('unknown') }}"
+telemetry_backup_dir: "{{ k8s_client_mount_path }}/upgrade/backup/telemetry/omnia_{{ manifest.source_version | default('unknown') }}"
 # PV backup location (cluster-wide backup pre-provision)
 telemetry_pv_backup_file: "{{ telemetry_backup_dir }}/all_pvs.yaml"
 

From bc89696b7ccf4ff31af0a2ec46f150b4a9076b57 Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Sat, 13 Jun 2026 14:39:33 +0530
Subject: [PATCH 21/33] update vars

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 .../roles/upgrade_telemetry/tasks/migrate_statefulset.yml   | 4 ++--
 upgrade/roles/upgrade_telemetry/vars/main.yml               | 6 ++++++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml b/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml
index 4d0efd1081..23648e2ed6 100644
--- a/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml
+++ b/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml
@@ -83,7 +83,7 @@
     - name: Wait for background merges to settle after writes stopped
       ansible.builtin.pause:
         seconds: 30
-        prompt: "Waiting 30s for vmstorage background merges to settle..."
+        prompt: "{{ vmstorage_merge_wait_msg }}"
 
     # ── Ensure sufficient graceful shutdown period ──
     # Old StatefulSet may have default 30s which is too short for indexdb flush
@@ -130,7 +130,7 @@
     - name: Wait for storage cache flush after pod termination
       ansible.builtin.pause:
         seconds: 15
-        prompt: "Waiting 15s for storage cache flush..."
+        prompt: "{{ storage_cache_flush_msg }}"
 
     # ── PVC relabeling (data preservation via PV rebind) ──
     - name: Get all old PVCs from StatefulSet (using specific StatefulSet label)
diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml b/upgrade/roles/upgrade_telemetry/vars/main.yml
index c39efbcc3b..a869e5a52a 100644
--- a/upgrade/roles/upgrade_telemetry/vars/main.yml
+++ b/upgrade/roles/upgrade_telemetry/vars/main.yml
@@ -142,6 +142,12 @@ mysql_crash_error_msg: |
     1. Check pod logs: kubectl logs -n telemetry <pod-name> -c mysqldb
     2. Check PVC status: kubectl get pvc -n telemetry | grep idrac
     3. Contact support if issue persists.
+
+# ============================================================================
+# PAUSE MESSAGES
+# ============================================================================
+vmstorage_merge_wait_msg: "Waiting 30s for vmstorage background merges to settle..."
+storage_cache_flush_msg: "Waiting 15s for storage cache flush..."
 idrac_patch_msg: >-
   idrac-telemetry patched: terminationGracePeriodSeconds=120s.
   MySQL will have enough time to flush on NFS during pod restart.

From 6abf3201b069ff067134ef96555240e70253d5f0 Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Wed, 17 Jun 2026 14:32:19 +0530
Subject: [PATCH 22/33] example files for powerscale

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 .../CSI_driver/secret.yaml                    |  90 ++++
 .../CSI_driver/values.yaml                    | 437 ++++++++++++++++++
 .../powerscale_metrics/values.yaml            | 221 +++++++++
 3 files changed, 748 insertions(+)
 create mode 100644 examples/powerscale_reference_files/CSI_driver/secret.yaml
 create mode 100644 examples/powerscale_reference_files/CSI_driver/values.yaml
 create mode 100644 examples/powerscale_reference_files/powerscale_metrics/values.yaml

diff --git a/examples/powerscale_reference_files/CSI_driver/secret.yaml b/examples/powerscale_reference_files/CSI_driver/secret.yaml
new file mode 100644
index 0000000000..75888d6023
--- /dev/null
+++ b/examples/powerscale_reference_files/CSI_driver/secret.yaml
@@ -0,0 +1,90 @@
+# Copyright © 2020-2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+isilonClusters:
+  # logical name of PowerScale Cluster
+  - clusterName: "cluster1"
+
+    # username for connecting to PowerScale OneFS API server
+    # if authorization is enabled, username will be ignored
+    # Default value: None
+    username: "user"
+
+    # password for connecting to PowerScale OneFS API server
+    # if authorization is enabled, password will be ignored
+    password: "password"
+
+    # HTTPS endpoint of the PowerScale OneFS API server
+    # if authorization is enabled, the endpoint should be the localhost address of the csm-authorization-sidecar
+    # Default value: None
+    # Examples: "1.2.3.4", "https://1.2.3.4", "https://abc.myonefs.com"
+    endpoint: "1.2.3.4"
+
+    # endpointPort: Specify the HTTPS port number of the PowerScale OneFS API server
+    # Formerly this attribute was named as "isiPort"
+    # If authorization is enabled, endpointPort must match the port specified in the endpoint parameter of the karavi-authorization-config secret
+    # Allowed value: valid port number
+    # Default value: 8080
+    # endpointPort: 8080
+
+    # Is this a default cluster (would be used by storage classes without ClusterName parameter)
+    # Allowed values:
+    #   true: mark this cluster config as default
+    #   false: mark this cluster config as not default
+    # Default value: false
+    isDefault: true
+
+    # Specify whether the PowerScale OneFS API server's certificate chain and host name should be verified.
+    # Allowed values:
+    #   true: skip OneFS API server's certificate verification
+    #   false: verify OneFS API server's certificates
+    # Default value: default value specified in values.yaml
+    # skipCertificateValidation: true
+
+    # The base path for the volumes to be created on PowerScale cluster
+    # This will be used if a storage class does not have the IsiPath parameter specified.
+    # Ensure that this path exists on PowerScale cluster.
+    # Allowed values: unix absolute path
+    # Default value: default value specified in values.yaml
+    # Examples: "/ifs/data/csi", "/ifs/engineering"
+    # isiPath: "/ifs/data/csi"
+
+    # The permissions for isi volume directory path
+    # This will be used if a storage class does not have the IsiVolumePathPermissions parameter specified.
+    # Allowed values: valid octal mode number
+    # Default value: "0777"
+    # Examples: "0777", "777", "0755"
+    # isiVolumePathPermissions: "0777"
+
+    # ignoreUnresolvableHosts: Ignore unresolvable hosts on the OneFS
+    # When set to true, OneFS allows a new host to be added to the existing export list even though some of the
+    # existing hosts in the same export are unresolvable or no longer exist.
+    # Allowed values:
+    #   true: ignore existing unresolvable hosts and append new host to the existing export
+    #   false: exhibits OneFS default behavior i.e. if any of existing hosts are unresolvable while adding new one it fails
+    # Default value: false
+    # ignoreUnresolvableHosts: false
+
+    # Unique ID of the certificate used to encrypt the replication policy
+    # This will be used if replication encryption is enabled; leave empty if you use unencrypted replication
+    # Allowed values: string, unique id of the certificate
+    # Default value: ""
+    # Examples: "dd9c736cc17e6dd5f7d85fe13528cfc20f3b4b0af4f26595d22328c8d1f461af"
+    # replicationCertificateID: ""
+
+    # To add more PowerScale systems, uncomment the following lines and provide the required values
+    # - clusterName: "cluster2"
+    #   username: "user"
+    #   password: "password"
+    #   endpoint: "1.2.3.4"
+    #   endpointPort: "8080"
diff --git a/examples/powerscale_reference_files/CSI_driver/values.yaml b/examples/powerscale_reference_files/CSI_driver/values.yaml
new file mode 100644
index 0000000000..14826ff22e
--- /dev/null
+++ b/examples/powerscale_reference_files/CSI_driver/values.yaml
@@ -0,0 +1,437 @@
+## K8S/DRIVER ATTRIBUTES
+########################
+# version: version of this values file
+# Note: Do not change this value
+version: "v2.17.0"
+
+images:
+  # "driver" defines the container image, used for the driver container.
+  driver:
+    image: quay.io/dell/container-storage-modules/csi-isilon:v2.17.0
+  # CSI sidecars
+  attacher:
+    image: registry.k8s.io/sig-storage/csi-attacher:v4.11.0
+  provisioner:
+    image: registry.k8s.io/sig-storage/csi-provisioner:v6.2.0
+  snapshotter:
+    image: registry.k8s.io/sig-storage/csi-snapshotter:v8.5.0
+  resizer:
+    image: registry.k8s.io/sig-storage/csi-resizer:v2.1.0
+  registrar:
+    image: registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.16.0
+  healthmonitor:
+    image: registry.k8s.io/sig-storage/csi-external-health-monitor-controller:v0.17.0
+
+  # CSM sidecars
+  replication:
+    image: quay.io/dell/container-storage-modules/dell-csi-replicator:v1.15.0
+  podmon:
+    image: quay.io/dell/container-storage-modules/podmon:v1.16.0
+  authorization:
+    image: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0
+  metadataretriever:
+    image: quay.io/dell/container-storage-modules/csi-metadata-retriever:v1.14.0
+
+# CSI driver log level
+# Allowed values: "error", "warn"/"warning", "info", "debug"
+# Default value: "info"
+logLevel: "info"
+
+# certSecretCount: Represents number of certificate secrets, which user is going to create for
+# ssl authentication. (isilon-cert-0..isilon-cert-n)
+# Allowed values: n, where n > 0
+# Default value: None
+certSecretCount: 1
+
+# allowedNetworks: Custom networks for PowerScale export
+#   Specify list of networks which can be used for NFS I/O traffic; CIDR format should be used.
+# Allowed values: list of one or more networks
+# Default value: None
+# Examples: [192.168.1.0/24, 192.168.100.0/22]
+allowedNetworks: []
+
+# maxIsilonVolumesPerNode: Specify default value for maximum number of volumes that controller can publish to the node.
+# If value is zero CO SHALL decide how many volumes of this type can be published by the controller to the node.
+# This limit is applicable to all the nodes in the cluster for which node label 'max-isilon-volumes-per-node' is not set.
+# Allowed values: n, where n >= 0
+# Default value: 0
+maxIsilonVolumesPerNode: 0
+
+# imagePullPolicy: Policy to determine if the image should be pulled prior to starting the container.
+# Allowed values:
+#  Always: Always pull the image.
+#  IfNotPresent: Only pull the image if it does not already exist on the node.
+#  Never: Never pull the image.
+# Default value: None
+imagePullPolicy: IfNotPresent
+
+# verbose: Indicates what content of the OneFS REST API message should be logged in debug level logs
+# Allowed Values:
+#   0: log full content of the HTTP request and response
+#   1: log without the HTTP response body
+#   2: log only 1st line of the HTTP request and response
+# Default value: 0
+verbose: 1
+
+# Specify kubelet config dir path.
+# Ensure that the config.yaml file is present at this path.
+# Default value: /var/lib/kubelet
+kubeletConfigDir: /var/lib/kubelet
+
+# enableCustomTopology: Specify if custom topology label <provisionerName>.dellemc.com/<powerscalefqdnorip>:<provisionerName>
+# has to be used for making connection to backend PowerScale Array.
+# If enableCustomTopology is set to true, then do not specify allowedTopologies in storage class.
+# Allowed values:
+#   true : enable custom topology
+#   false: disable custom topology
+# Default value: false
+enableCustomTopology: false
+
+# fsGroupPolicy: Defines if the underlying volume supports changing ownership and permission of the volume before being mounted.
+# Allowed values:
+#   ReadWriteOnceWithFSType: supports volume ownership and permissions change only if the fsType is defined
+#   and the volume's accessModes contains ReadWriteOnce.
+#   File: kubernetes may use fsGroup to change permissions and ownership of the volume
+#   to match user requested fsGroup in the pod's security policy regardless of fstype or access mode.
+#   None: volumes will be mounted with no modifications.
+# Default value: ReadWriteOnceWithFSType
+fsGroupPolicy: ReadWriteOnceWithFSType
+
+# podmonAPIPort: Defines the port to be used within the kubernetes cluster
+# Allowed values:
+#   Any valid and free port.
+# Default value: 8083
+podmonAPIPort: 8083
+
+# maxPathLen: this parameter is used for setting the maximum Path length for the given volume.
+# Default value: 192
+# Examples: 192, 256
+maxPathLen: 192
+
+# azReconcileInterval: Interval to monitor and reconcile network interface labels on nodes.
+# Allowed values: Number followed by unit of time (s,m,h)
+# Default value: 1h
+azReconcileInterval: 1h
+
+# controller: configure controller pod specific parameters
+controller:
+  # controllerCount: defines the number of csi-powerscale controller pods to deploy to
+  # the Kubernetes release.
+  # Allowed values: n, where n > 0
+  # Default value: None
+  controllerCount: 1
+
+  # volumeNamePrefix: Prefix of PersistentVolume names created
+  # Allowed values: string
+  # Default value: csivol
+  # Examples: "k8s", "app1"
+  volumeNamePrefix: csivol
+
+  # leaderElection: configure leader election parameters
+  leaderElection:
+    # Duration, that non-leader candidates will wait to force acquire leadership
+    # Allowed values: Duration, in seconds. Must be greater than leaderElectionRenewDeadline
+    # Default value: 15s
+    leaderElectionLeaseDuration: 15s
+
+    # Duration, that the acting leader will retry refreshing leadership before giving up
+    # Allowed values: Duration, in seconds. Must be greater than leaderElectionRetryPeriod
+    # Default value: 10s
+    leaderElectionRenewDeadline: 10s
+
+    # Duration, the LeaderElector clients should wait between tries of actions.
+    # Allowed values: Duration, in seconds
+    # Default value: 5s
+    leaderElectionRetryPeriod: 5s
+
+  # replication: allows to configure replication
+  # Replication CRDs must be installed before installing driver
+  replication:
+    # enabled: Enable/Disable replication feature
+    # Allowed values:
+    #   true: enable replication feature(install dell-csi-replicator sidecar)
+    #   false: disable replication feature(do not install dell-csi-replicator sidecar)
+    # Default value: false
+    enabled: false
+
+    # replicationContextPrefix: prefix to use for naming of resources created by replication feature
+    # Allowed values: string
+    # Default value: powerscale
+    replicationContextPrefix: "powerscale"
+
+    # replicationPrefix: prefix to prepend to storage classes parameters
+    # Allowed values: string
+    # Default value: replication.storage.dell.com
+    replicationPrefix: "replication.storage.dell.com"
+
+  snapshot:
+    # enabled: Enable/Disable volume snapshot feature
+    # Allowed values:
+    #   true: enable volume snapshot feature(install snapshotter sidecar)
+    #   false: disable volume snapshot feature(do not install snapshotter sidecar)
+    # Default value: None
+    enabled: true
+
+    # snapNamePrefix: Prefix to apply to the names of created snapshots
+    # Allowed values: string
+    # Default value: csi-snap
+    # Examples: "snap", "snapshot"
+    snapNamePrefix: csi-snap
+
+  resizer:
+    # enabled: Enable/Disable volume expansion feature
+    # Allowed values:
+    #   true: enable volume expansion feature(install resizer sidecar)
+    #   false: disable volume expansion feature(do not install resizer sidecar)
+    # Default value: None
+    enabled: false
+
+  healthMonitor:
+    # enabled: Enable/Disable health monitor of CSI volumes- volume status, volume condition
+    # Allowed values:
+    #   true: enable checking of health condition of CSI volumes
+    #   false: disable checking of health condition of CSI volumes
+    # Default value: None
+    enabled: true
+
+    # interval: Interval of monitoring volume health condition
+    # Allowed values: Number followed by unit of time (s,m,h)
+    # Default value: 60s
+    interval: 60s
+
+  # nodeSelector: Define node selection constraints for pods of controller deployment.
+  # For the pod to be eligible to run on a node, the node must have each
+  # of the indicated key-value pairs as labels.
+  # Leave as blank to consider all nodes
+  # Allowed values: map of key-value pairs
+  # Default value: None
+  nodeSelector:
+  # Uncomment if nodes you wish to use have the node-role.kubernetes.io/master taint
+  #  node-role.kubernetes.io/master: ""
+  # Uncomment if nodes you wish to use have the node-role.kubernetes.io/control-plane taint
+  #  node-role.kubernetes.io/control-plane: ""
+
+  # tolerations: Define tolerations for the controller deployment, if required.
+  # Default value: None
+  # Uncomment if nodes you wish to use have the node-role.kubernetes.io/master taint
+  tolerations:
+  # - key: "node-role.kubernetes.io/master"
+  #   operator: "Exists"
+  #   effect: "NoSchedule"
+  # Uncomment if nodes you wish to use have the node-role.kubernetes.io/control-plane taint
+  # tolerations:
+  # - key: "node-role.kubernetes.io/control-plane"
+  #   operator: "Exists"
+  #   effect: "NoSchedule"
+
+# node: configure node pod specific parameters
+node:
+  # nodeSelector: Define node selection constraints for pods of node daemonset
+  # For the pod to be eligible to run on a node, the node must have each
+  # of the indicated key-value pairs as labels.
+  # Leave as blank to consider all nodes
+  # Allowed values: map of key-value pairs
+  # Default value: None
+  nodeSelector:
+  # Uncomment if nodes you wish to use have the node-role.kubernetes.io/master taint
+  #  node-role.kubernetes.io/master: ""
+  # Uncomment if nodes you wish to use have the node-role.kubernetes.io/control-plane taint
+  #  node-role.kubernetes.io/control-plane: ""
+
+  # tolerations: Define tolerations for the node daemonset, if required.
+  # Default value: None
+  # Uncomment if nodes you wish to use have the node-role.kubernetes.io/master taint
+  tolerations:
+  #  - key: "node.kubernetes.io/memory-pressure"
+  #    operator: "Exists"
+  #    effect: "NoExecute"
+  #  - key: "node.kubernetes.io/disk-pressure"
+  #    operator: "Exists"
+  #    effect: "NoExecute"
+  #  - key: "node.kubernetes.io/network-unavailable"
+  #    operator: "Exists"
+  #    effect: "NoExecute"
+  # - key: "node-role.kubernetes.io/master"
+  #   operator: "Exists"
+  #   effect: "NoSchedule"
+  # Uncomment if nodes you wish to use have the node-role.kubernetes.io/control-plane taint
+  # tolerations:
+  # - key: "node-role.kubernetes.io/control-plane"
+  #   operator: "Exists"
+  #   effect: "NoSchedule"
+
+  # Uncomment if CSM for Resiliency and CSI Driver pods monitor are enabled
+  # tolerations:
+  #  - key: "offline.vxflexos.storage.dell.com"
+  #    operator: "Exists"
+  #    effect: "NoSchedule"
+  #  - key: "vxflexos.podmon.storage.dell.com"
+  #    operator: "Exists"
+  #    effect: "NoSchedule"
+  #  - key: "offline.unity.storage.dell.com"
+  #    operator: "Exists"
+  #    effect: "NoSchedule"
+  #  - key: "unity.podmon.storage.dell.com"
+  #    operator: "Exists"
+  #    effect: "NoSchedule"
+  #  - key: "offline.isilon.storage.dell.com"
+  #    operator: "Exists"
+  #    effect: "NoSchedule"
+  #  - key: "isilon.podmon.storage.dell.com"
+  #    operator: "Exists"
+  #    effect: "NoSchedule"
+
+  # dnsPolicy: Determines the DNS Policy of the Node service.
+  # Allowed values:
+  #   Default: The Pod inherits the name resolution configuration from the node that the pods run on.
+  #   ClusterFirst: Any DNS query that does not match the configured cluster domain suffix, such as "www.kubernetes.io",
+  #   is forwarded to the upstream nameserver inherited from the node.
+  #   ClusterFirstWithHostNet:  For Pods running with hostNetwork, you should explicitly set this DNS policy.
+  #   None: It allows a Pod to ignore DNS settings from the Kubernetes environment.
+  #   All DNS settings are supposed to be provided using the dnsConfig field in the Pod Spec.
+  # Default value: ClusterFirst
+  # ClusterFirstWithHostNet is the recommended DNS policy.
+  # Prior to v1.5 of the driver, the default DNS policy was ClusterFirst.
+  # In certain scenarios, users might need to change the default dnsPolicy.
+  dnsPolicy: ClusterFirstWithHostNet
+
+  healthMonitor:
+    # enabled: Enable/Disable health monitor of CSI volumes- volume usage, volume condition
+    # Allowed values:
+    #   true: enable checking of health condition of CSI volumes
+    #   false: disable checking of health condition of CSI volumes
+    # Default value: None
+    enabled: true
+
+## PLATFORM ATTRIBUTES
+######################
+# endpointPort: Specify the HTTPs port number of the PowerScale OneFS API server
+# Formerly this attribute was named as "isiPort"
+# This value acts as a default value for endpointPort, if not specified for a cluster config in secret
+# If authorization is enabled, endpointPort must match the port specified in the endpointPort parameter of the isilon-creds secret
+# Allowed value: valid port number
+# Default value: 8080
+endpointPort: 8080
+
+# skipCertificateValidation: Specify whether the PowerScale OneFS API server's certificate chain and host name should be verified.
+# Formerly this attribute was named as "isiInsecure"
+# This value acts as a default value for skipCertificateValidation, if not specified for a cluster config in secret
+# Allowed values:
+#   true: skip OneFS API server's certificate verification
+#   false: verify OneFS API server's certificates
+# Default value: false
+skipCertificateValidation: true
+
+# isiAuthType: Indicates whether the authentication will be session-based or basic.
+# Allowed values:
+#   0: enables basic Authentication
+#   1: enables session-based Authentication
+# Default value: 0
+isiAuthType: 0
+
+# isiAccessZone: The name of the access zone a volume can be created in.
+# If the storage class does not specify the AccessZone parameter, the value of isiAccessZone is used instead.
+# Default value: System
+# Examples: System, zone1
+isiAccessZone: System
+
+# enableQuota: Indicates whether the provisioner should attempt to set (later unset) quota
+# on a newly provisioned volume.
+# This requires SmartQuotas to be enabled on PowerScale cluster.
+# Allowed values:
+#   true: set quota for volume
+#   false: do not set quota for volume
+enableQuota: true
+
+# isiPath: The base path for the volumes to be created on PowerScale cluster.
+# This value acts as a default value for isiPath, if not specified for a cluster config in secret
+# Ensure that this path exists on PowerScale cluster.
+# Allowed values: unix absolute path
+# Default value: /ifs
+# Examples: /ifs/data/csi, /ifs/engineering
+isiPath: /ifs/data/csi
+
+# isiVolumePathPermissions: The permissions for isi volume directory path
+# This value acts as a default value for isiVolumePathPermissions, if not specified for a cluster config in secret
+# Allowed values: valid octal mode number
+# Default value: "0777"
+# Examples: "0777", "777", "0755"
+isiVolumePathPermissions: "0777"
+
+# ignoreUnresolvableHosts: Ignore unresolvable hosts on the OneFS
+# When set to true, OneFS allows a new host to be added to an existing export list even if some of the existing hosts in the
+# same export are unresolvable or no longer exist.
+# Allowed values:
+#   true: ignore existing unresolvable hosts and append new host to the existing export
+#   false: exhibits OneFS default behavior i.e. if any of existing hosts are unresolvable while adding new one it fails
+# Default value: false
+ignoreUnresolvableHosts: false
+
+# noProbeOnStart: Indicates whether the controller/node should probe all the PowerScale clusters during driver initialization
+# When set to true, the driver will not set node labels, please manually add
+# the label <provisionerName>.dellemc.com/<powerscalefqdnorip>:<provisionerName> on the nodes for each of the clusters reachable from the node.
+# Allowed values:
+#   true : do not probe all PowerScale clusters during driver initialization
+#   false: probe all PowerScale clusters during driver initialization
+# Default value: false
+noProbeOnStart: false
+
+# autoProbe: automatically probe the PowerScale cluster if not done already during CSI calls.
+# Allowed values:
+#   true : enable auto probe.
+#   false: disable auto probe.
+# Default value: false
+autoProbe: true
+
+authorization:
+  enabled: false
+  # proxyHost: hostname of the csm-authorization server
+  # Default value: None
+  proxyHost:
+  # skipCertificateValidation: certificate validation of the csm-authorization server
+  # Allowed Values:
+  #   "true" - TLS certificate verification will be skipped
+  #   "false" - TLS certificate will be verified
+  # Default value: "true"
+  skipCertificateValidation: true
+
+# Storage Capacity Tracking
+# Note: Capacity tracking is supported in kubernetes v1.24 and above, this feature will be automatically disabled in older versions.
+storageCapacity:
+  # enabled : Enable/Disable storage capacity tracking
+  # Allowed values:
+  #   true: enable storage capacity tracking
+  #   false: disable storage capacity tracking
+  # Default value: true
+  enabled: true
+  # pollInterval : Configure how often external-provisioner polls the driver to detect changed capacity
+  # Allowed values: 1m,2m,3m,...,10m,...,60m etc
+  # Default value: 5m
+  pollInterval: 5m
+
+# Enable this feature only after contacting support for additional information
+podmon:
+  enabled: false
+  controller:
+    args:
+      - "--csisock=unix:/var/run/csi/csi.sock"
+      - "--labelvalue=csi-isilon"
+      - "--arrayConnectivityPollRate=60"
+      - "--driverPath=csi-isilon.dellemc.com"
+      - "--mode=controller"
+      - "--skipArrayConnectionValidation=false"
+      - "--driver-config-params=/csi-isilon-config-params/driver-config-params.yaml"
+      - "--driverPodLabelValue=dell-storage"
+      - "--ignoreVolumelessPods=false"
+
+  node:
+    args:
+      - "--csisock=unix:/var/lib/kubelet/plugins/csi-isilon/csi_sock"
+      - "--labelvalue=csi-isilon"
+      - "--arrayConnectivityPollRate=60"
+      - "--driverPath=csi-isilon.dellemc.com"
+      - "--mode=node"
+      - "--leaderelection=false"
+      - "--driver-config-params=/csi-isilon-config-params/driver-config-params.yaml"
+      - "--driverPodLabelValue=dell-storage"
+      - "--ignoreVolumelessPods=false"
diff --git a/examples/powerscale_reference_files/powerscale_metrics/values.yaml b/examples/powerscale_reference_files/powerscale_metrics/values.yaml
new file mode 100644
index 0000000000..a89148cd79
--- /dev/null
+++ b/examples/powerscale_reference_files/powerscale_metrics/values.yaml
@@ -0,0 +1,221 @@
+karaviMetricsPowerflex:
+  image: quay.io/dell/container-storage-modules/csm-metrics-powerflex:v1.15.0
+  enabled: false
+  collectorAddr: otel-collector:55680
+  # comma separated list of provisioner names (ex: csi-vxflexos.dellemc.com)
+  provisionerNames: csi-vxflexos.dellemc.com
+  # set sdcMetricsEnabled to "false" to disable collection of SDC metrics
+  sdcMetricsEnabled: "true"
+  # set polling frequency to the PowerFlex array to get metrics data
+  sdcPollFrequencySeconds: 10
+  volumePollFrequencySeconds: 10
+  # set volumeMetricsEnabled to "false" to disable collection of Volume metrics
+  volumeMetricsEnabled: "true"
+  # set storageClassPoolMetricsEnabled to "false" to disable collection of storage class/pool metrics
+  storageClassPoolMetricsEnabled: "true"
+  # set the polling frequency to configure the interval which storage class/pool metrics are gathered
+  storageClassPoolPollFrequencySeconds: 10
+  # set topologyMetricsEnabled to "false" to disable collection of topology metrics
+  topologyMetricsEnabled: "true"
+  # set polling frequency to get topology metrics
+  topologyMetricsPollFrequencySeconds: 30
+  # set the default max concurrent queries to PowerFlex
+  concurrentPowerflexQueries: 10
+  # set the default endpoint for PowerFlex service
+  endpoint: karavi-metrics-powerflex
+  service:
+    type: ClusterIP
+  logLevel: INFO
+  logFormat: text
+  authorization:
+    enabled: false
+    # sidecarProxy.image: the container image used for the csm-authorization-sidecar.
+    # Default value: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0
+    sidecarProxy:
+      image: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0
+    # proxyHost: hostname of the csm-authorization server
+    # Default value: None
+    proxyHost:
+    # skipCertificateValidation: certificate validation of the csm-authorization server
+    # Allowed Values:
+    #   "true" - TLS certificate verification will be skipped
+    #   "false" - TLS certificate will be verified
+    # Default value: "true"
+    skipCertificateValidation: true
+
+karaviMetricsPowerstore:
+  image: quay.io/dell/container-storage-modules/csm-metrics-powerstore:v1.15.0
+  enabled: false
+  collectorAddr: otel-collector:55680
+  # comma separated list of provisioner names (ex: csi-powerstore.dellemc.com)
+  provisionerNames: csi-powerstore.dellemc.com
+  # set polling frequency to the PowerStore array to get metrics data
+  volumePollFrequencySeconds: 20
+  spacePollFrequencySeconds: 300
+  arrayPollFrequencySeconds: 300
+  filesystemPollFrequencySeconds: 20
+  # apiTimeout: Defines the timeout for PowerStore API calls in seconds
+  # Allowed values: Number followed by unit (s,m,h)
+  # Examples: 60s, 5m, 1h
+  # Default value: 120s
+  apiTimeout: "120s"
+  # set volumeMetricsEnabled to "false" to disable collection of Volume metrics
+  volumeMetricsEnabled: "true"
+  # set the default max concurrent queries to PowerStore
+  concurrentPowerstoreQueries: 10
+  # set topologyMetricsEnabled to "false" to disable collection of topology metrics
+  topologyMetricsEnabled: "true"
+  # set polling frequency to get topology metrics
+  topologyMetricsPollFrequencySeconds: 30
+  # set the default endpoint for PowerStore service
+  endpoint: karavi-metrics-powerstore
+  service:
+    type: ClusterIP
+  logLevel: INFO
+  logFormat: text
+  zipkin:
+    uri: ""
+    serviceName: metrics-powerstore
+    probability: 0.0
+  authorization:
+    enabled: false
+    # sidecarProxy.image: the container image used for the csm-authorization-sidecar.
+    # Default value: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0
+    sidecarProxy:
+      image: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0
+    # proxyHost: hostname of the csm-authorization server
+    # Default value: None
+    proxyHost:
+    # skipCertificateValidation: certificate validation of the csm-authorization server
+    # Allowed Values:
+    #   "true" - TLS certificate verification will be skipped
+    #   "false" - TLS certificate will be verified
+    # Default value: "true"
+    skipCertificateValidation: true
+
+karaviMetricsPowerscale:
+  image: quay.io/dell/container-storage-modules/csm-metrics-powerscale:v1.12.0
+  enabled: true
+  collectorAddr: otel-collector:55680
+  # comma separated list of provisioner names (ex: csi-isilon.dellemc.com)
+  provisionerNames: csi-isilon.dellemc.com
+  # set capacityMetricsEnabled to "false" to disable collection of capacity metrics
+  capacityMetricsEnabled: "true"
+  # set performanceMetricsEnabled to "false" to disable collection of performance metrics
+  performanceMetricsEnabled: "true"
+  # set topologyMetricsEnabled to "false" to disable collection of topology metrics
+  topologyMetricsEnabled: "true"
+  # set polling frequency to get cluster capacity metrics data
+  clusterCapacityPollFrequencySeconds: 30
+  # set polling frequency to get cluster performance data
+  clusterPerformancePollFrequencySeconds: 20
+  # set polling frequency to get quota capacity metrics data
+  quotaCapacityPollFrequencySeconds: 30
+  # set polling frequency to get topology metrics
+  topologyMetricsPollFrequencySeconds: 30
+  # set the default max concurrent queries to PowerScale
+  concurrentPowerscaleQueries: 10
+  # set the default endpoint for PowerScale service
+  endpoint: karavi-metrics-powerscale
+  service:
+    type: ClusterIP
+  logLevel: INFO
+  logFormat: text
+  # isiClientOptions to access Powerscale OneFS API server
+  isiClientOptions:
+    # set isiSkipCertificateValidation to true/false to skip/verify OneFS API server's certificates
+    # default isiSkipCertificateValidation: true to skip verification of the OneFS API server's certificates
+    isiSkipCertificateValidation: true
+    # set isiAuthType to 0/1 to enable basic/session-based Authentication
+    # default isiAuthType: 1 to use session-based Authentication
+    isiAuthType: 1
+    # set isiLogVerbose to 0/1/2 to decide whether High/Medium/Low content of the OneFS REST API message should be logged in debug level logs
+    # default isiLogVerbose: 0 to log full content of the HTTP request and response
+    isiLogVerbose: 0
+  authorization:
+    enabled: false
+    # sidecarProxy.image: the container image used for the csm-authorization-sidecar.
+    # Default value: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0
+    sidecarProxy:
+      image: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0
+    # proxyHost: hostname of the csm-authorization server
+    # Default value: None
+    proxyHost:
+    # skipCertificateValidation: certificate validation of the csm-authorization server
+    # Allowed Values:
+    #   "true" - TLS certificate verification will be skipped
+    #   "false" - TLS certificate will be verified
+    # Default value: "true"
+    skipCertificateValidation: true
+
+karaviMetricsPowermax:
+  image: quay.io/dell/container-storage-modules/csm-metrics-powermax:v1.10.0
+  enabled: false
+  collectorAddr: otel-collector:55680
+  # comma separated list of provisioner names (ex: csi-powermax.dellemc.com)
+  provisionerNames: csi-powermax.dellemc.com
+  # set capacityMetricsEnabled to "false" to disable collection of capacity metrics
+  capacityMetricsEnabled: "true"
+  # set performanceMetricsEnabled to "false" to disable collection of performance metrics
+  performanceMetricsEnabled: "true"
+  # set polling frequency to get capacity metrics data for volume, storagegroup, srp and array
+  capacityPollFrequencySeconds: 3600
+  # set polling frequency to get performance metrics data for volume, storagegroup
+  performancePollFrequencySeconds: 300
+  # set the default max concurrent queries to PowerMax
+  concurrentPowermaxQueries: 10
+  # set topologyMetricsEnabled to "false" to disable collection of topology metrics
+  topologyMetricsEnabled: "true"
+  # set polling frequency to get topology metrics
+  topologyMetricsPollFrequencySeconds: 300
+  # set the default endpoint for PowerMax service
+  endpoint: karavi-metrics-powermax
+  # useSecret
+  # Defines if a Secret should be used to provide Unisphere for PowerMax endpoints
+  # and login credentials instead of the deprecated powermax-reverseproxy-config ConfigMap.
+  # If set to true, the contents of the secret specified by defaultCredentialsSecret
+  # will be used, in the new format, to specify Unisphere for PowerMax endpoints, array IDs,
+  # and login credentials. If set to false, the deprecated ConfigMap will be automatically
+  # created and used.
+  # Default value: false
+  useSecret: false
+  # defaultCredentialsSecret
+  # The name of the Kubernetes Secret containing the details of the PowerMax arrays,
+  # their Unisphere endpoints and their login credentials if useSecret is set to true.
+  # Default value: ""
+  defaultCredentialsSecret: ""
+  service:
+    type: ClusterIP
+  logLevel: INFO
+  logFormat: text
+  authorization:
+    enabled: false
+    # sidecarProxy.image: the container image used for the csm-authorization-sidecar.
+    # Default value: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0
+    sidecarProxy:
+      image: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0
+    # proxyHost: hostname of the csm-authorization server
+    # Default value: None
+    proxyHost:
+    # skipCertificateValidation: certificate validation of the csm-authorization server
+    # Allowed Values:
+    #   "true" - TLS certificate verification will be skipped
+    #   "false" - TLS certificate will be verified
+    # Default value: "true"
+    skipCertificateValidation: true
+
+otelCollector:
+  image: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector:0.150.1
+  service:
+    type: ClusterIP
+  nginxProxy:
+    image: nginxinc/nginx-unprivileged:1.29
+# Karavi-observability requires cert-manager. If cert-manager is already present in cluster, set enabled to false not to install it.
+cert-manager:
+  enabled: true
+  startupapicheck:
+    enabled: false
+    serviceAccount:
+      create: false
+# Optionally, uncomment and specify the name of the pre-created namespace to install the module in it
+# namespace:
\ No newline at end of file

From f6dfbd062266ea50d0f42f2bf15c396b7bb558ce Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Wed, 17 Jun 2026 14:58:52 +0530
Subject: [PATCH 23/33] remove old files

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 .../powerscale_reference_files/secret.yaml    |  90 ----
 .../powerscale_reference_files/values.yaml    | 437 ------------------
 2 files changed, 527 deletions(-)
 delete mode 100644 examples/powerscale_reference_files/secret.yaml
 delete mode 100644 examples/powerscale_reference_files/values.yaml

diff --git a/examples/powerscale_reference_files/secret.yaml b/examples/powerscale_reference_files/secret.yaml
deleted file mode 100644
index 75888d6023..0000000000
--- a/examples/powerscale_reference_files/secret.yaml
+++ /dev/null
@@ -1,90 +0,0 @@
-# Copyright © 2020-2025 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#      http://www.apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-isilonClusters:
-  # logical name of PowerScale Cluster
-  - clusterName: "cluster1"
-
-    # username for connecting to PowerScale OneFS API server
-    # if authorization is enabled, username will be ignored
-    # Default value: None
-    username: "user"
-
-    # password for connecting to PowerScale OneFS API server
-    # if authorization is enabled, password will be ignored
-    password: "password"
-
-    # HTTPS endpoint of the PowerScale OneFS API server
-    # if authorization is enabled, the endpont should be the localhost address of the csm-authorization-sidecar
-    # Default value: None
-    # Examples: "1.2.3.4", "https://1.2.3.4", "https://abc.myonefs.com"
-    endpoint: "1.2.3.4"
-
-    # endpointPort: Specify the HTTPs port number of the PowerScale OneFS API server
-    # Formerly this attribute was named as "isiPort"
-    # If authorization is enabled, endpointPort must match the port specified in the endpoint parameter of the karavi-authorization-config secret
-    # Allowed value: valid port number
-    # Default value: 8080
-    # endpointPort: 8080
-
-    # Is this a default cluster (would be used by storage classes without ClusterName parameter)
-    # Allowed values:
-    #   true: mark this cluster config as default
-    #   false: mark this cluster config as not default
-    # Default value: false
-    isDefault: true
-
-    # Specify whether the PowerScale OneFS API server's certificate chain and host name should be verified.
-    # Allowed values:
-    #   true: skip OneFS API server's certificate verification
-    #   false: verify OneFS API server's certificates
-    # Default value: default value specified in values.yaml
-    # skipCertificateValidation: true
-
-    # The base path for the volumes to be created on PowerScale cluster
-    # This will be used if a storage class does not have the IsiPath parameter specified.
-    # Ensure that this path exists on PowerScale cluster.
-    # Allowed values: unix absolute path
-    # Default value: default value specified in values.yaml
-    # Examples: "/ifs/data/csi", "/ifs/engineering"
-    # isiPath: "/ifs/data/csi"
-
-    # The permissions for isi volume directory path
-    # This will be used if a storage class does not have the IsiVolumePathPermissions parameter specified.
-    # Allowed values: valid octal mode number
-    # Default value: "0777"
-    # Examples: "0777", "777", "0755"
-    # isiVolumePathPermissions: "0777"
-
-    # ignoreUnresolvableHosts: Ignore unresolvable hosts on the OneFS
-    # When set to true, OneFS allows new host to add to existing export list though any of the existing hosts from the
-    # same exports are unresolvable/doesn't exist anymore.
-    # Allowed values:
-    #   true: ignore existing unresolvable hosts and append new host to the existing export
-    #   false: exhibits OneFS default behavior i.e. if any of existing hosts are unresolvable while adding new one it fails
-    # Default value: false
-    # ignoreUnresolvableHosts: false
-
-    # Unique ID if the certificate is used to encrypt replication policy
-    # This will be used if a replication encrypted is enabled, leave empty in case you use unecrypted replication
-    # Allowed values: string, unique id of the certificate
-    # Default value: ""
-    # Examples: "dd9c736cc17e6dd5f7d85fe13528cfc20f3b4b0af4f26595d22328c8d1f461af"
-    # replicationCertificateID: ""
-
-    # To add more PowerScale systems, uncomment the following lines and provide the required values
-    # - clusterName: "cluster2"
-    #   username: "user"
-    #   password: "password"
-    #   endpoint: "1.2.3.4"
-    #   endpointPort: "8080"
diff --git a/examples/powerscale_reference_files/values.yaml b/examples/powerscale_reference_files/values.yaml
deleted file mode 100644
index 2b612e02ea..0000000000
--- a/examples/powerscale_reference_files/values.yaml
+++ /dev/null
@@ -1,437 +0,0 @@
-## K8S/DRIVER ATTRIBUTES
-########################
-# version: version of this values file
-# Note: Do not change this value
-version: "v2.16.0"
-
-images:
-  # "driver" defines the container image, used for the driver container.
-  driver:
-    image: quay.io/dell/container-storage-modules/csi-isilon:v2.16.0
-  # CSI sidecars
-  attacher:
-    image: registry.k8s.io/sig-storage/csi-attacher:v4.10.0
-  provisioner:
-    image: registry.k8s.io/sig-storage/csi-provisioner:v6.1.0
-  snapshotter:
-    image: registry.k8s.io/sig-storage/csi-snapshotter:v8.4.0
-  resizer:
-    image: registry.k8s.io/sig-storage/csi-resizer:v2.0.0
-  registrar:
-    image: registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.15.0
-  healthmonitor:
-    image: registry.k8s.io/sig-storage/csi-external-health-monitor-controller:v0.16.0
-
-  # CSM sidecars
-  replication:
-    image: quay.io/dell/container-storage-modules/dell-csi-replicator:v1.14.0
-  podmon:
-    image: quay.io/dell/container-storage-modules/podmon:v1.15.0
-  authorization:
-    image: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.4.0
-  metadataretriever:
-    image: quay.io/dell/container-storage-modules/csi-metadata-retriever:v1.13.0
-
-# CSI driver log level
-# Allowed values: "error", "warn"/"warning", "info", "debug"
-# Default value: "info"
-logLevel: "info"
-
-# certSecretCount: Represents number of certificate secrets, which user is going to create for
-# ssl authentication. (isilon-cert-0..isilon-cert-n)
-# Allowed values: n, where n > 0
-# Default value: None
-certSecretCount: 1
-
-# allowedNetworks: Custom networks for PowerScale export
-#   Specify list of networks which can be used for NFS I/O traffic; CIDR format should be used.
-# Allowed values: list of one or more networks
-# Default value: None
-# Examples: [192.168.1.0/24, 192.168.100.0/22]
-allowedNetworks: []
-
-# maxIsilonVolumesPerNode: Specify default value for maximum number of volumes that controller can publish to the node.
-# If value is zero CO SHALL decide how many volumes of this type can be published by the controller to the node.
-# This limit is applicable to all the nodes in the cluster for which node label 'max-isilon-volumes-per-node' is not set.
-# Allowed values: n, where n >= 0
-# Default value: 0
-maxIsilonVolumesPerNode: 0
-
-# imagePullPolicy: Policy to determine if the image should be pulled prior to starting the container.
-# Allowed values:
-#  Always: Always pull the image.
-#  IfNotPresent: Only pull the image if it does not already exist on the node.
-#  Never: Never pull the image.
-# Default value: None
-imagePullPolicy: IfNotPresent
-
-# verbose: Indicates what content of the OneFS REST API message should be logged in debug level logs
-# Allowed Values:
-#   0: log full content of the HTTP request and response
-#   1: log without the HTTP response body
-#   2: log only 1st line of the HTTP request and response
-# Default value: 0
-verbose: 1
-
-# Specify kubelet config dir path.
-# Ensure that the config.yaml file is present at this path.
-# Default value: /var/lib/kubelet
-kubeletConfigDir: /var/lib/kubelet
-
-# enableCustomTopology: Specify if custom topology label <provisionerName>.dellemc.com/<powerscalefqdnorip>:<provisionerName>
-# has to be used for making connection to backend PowerScale Array.
-# If enableCustomTopology is set to true, then do not specify allowedTopologies in storage class.
-# Allowed values:
-#   true : enable custom topology
-#   false: disable custom topology
-# Default value: false
-enableCustomTopology: false
-
-# fsGroupPolicy: Defines if the underlying volume supports changing ownership and permission of the volume before being mounted.
-# Allowed values:
-#   ReadWriteOnceWithFSType: supports volume ownership and permissions change only if the fsType is defined
-#   and the volume's accessModes contains ReadWriteOnce.
-#   File: kubernetes may use fsGroup to change permissions and ownership of the volume
-#   to match user requested fsGroup in the pod's security policy regardless of fstype or access mode.
-#   None: volumes will be mounted with no modifications.
-# Default value: ReadWriteOnceWithFSType
-fsGroupPolicy: ReadWriteOnceWithFSType
-
-# podmonAPIPort: Defines the port to be used within the kubernetes cluster
-# Allowed values:
-#   Any valid and free port.
-# Default value: 8083
-podmonAPIPort: 8083
-
-# maxPathLen: this parameter is used for setting the maximum Path length for the given volume.
-# Default value: 192
-# Examples: 192, 256
-maxPathLen: 192
-
-# azReconcileInterval: Interval to monitor and reconcile network interface labels on nodes.
-# Allowed values: Number followed by unit of time (s,m,h)
-# Default value: 1h
-azReconcileInterval: 1h
-
-# controller: configure controller pod specific parameters
-controller:
-  # controllerCount: defines the number of csi-powerscale controller pods to deploy to
-  # the Kubernetes release.
-  # Allowed values: n, where n > 0
-  # Default value: None
-  controllerCount: 2
-
-  # volumeNamePrefix: Prefix of PersistentVolume names created
-  # Allowed values: string
-  # Default value: csivol
-  # Examples: "k8s", "app1"
-  volumeNamePrefix: csivol
-
-  # leaderElection: configure leader election parameters
-  leaderElection:
-    # Duration, that non-leader candidates will wait to force acquire leadership
-    # Allowed values: Duration, in seconds. Must be greater than leaderElectionRenewDeadline
-    # Default value: 15s
-    leaderElectionLeaseDuration: 15s
-
-    # Duration, that the acting leader will retry refreshing leadership before giving up
-    # Allowed values: Duration, in seconds. Must be greater than leaderElectionRetryPeriod
-    # Default value: 10s
-    leaderElectionRenewDeadline: 10s
-
-    # Duration, the LeaderElector clients should wait between tries of actions.
-    # Allowed values: Duration, in seconds
-    # Default value: 5s
-    leaderElectionRetryPeriod: 5s
-
-  # replication: allows to configure replication
-  # Replication CRDs must be installed before installing driver
-  replication:
-    # enabled: Enable/Disable replication feature
-    # Allowed values:
-    #   true: enable replication feature(install dell-csi-replicator sidecar)
-    #   false: disable replication feature(do not install dell-csi-replicator sidecar)
-    # Default value: false
-    enabled: false
-
-    # replicationContextPrefix: prefix to use for naming of resources created by replication feature
-    # Allowed values: string
-    # Default value: powerscale
-    replicationContextPrefix: "powerscale"
-
-    # replicationPrefix: prefix to prepend to storage classes parameters
-    # Allowed values: string
-    # Default value: replication.storage.dell.com
-    replicationPrefix: "replication.storage.dell.com"
-
-  snapshot:
-    # enabled: Enable/Disable volume snapshot feature
-    # Allowed values:
-    #   true: enable volume snapshot feature(install snapshotter sidecar)
-    #   false: disable volume snapshot feature(do not install snapshotter sidecar)
-    # Default value: None
-    enabled: true
-
-    # snapNamePrefix: Prefix to apply to the names of a created snapshots
-    # Allowed values: string
-    # Default value: csi-snap
-    # Examples: "snap", "snapshot"
-    snapNamePrefix: csi-snap
-
-  resizer:
-    # enabled: Enable/Disable volume expansion feature
-    # Allowed values:
-    #   true: enable volume expansion feature(install resizer sidecar)
-    #   false: disable volume snapshot feature(do not install resizer sidecar)
-    # Default value: None
-    enabled: true
-
-  healthMonitor:
-    # enabled: Enable/Disable health monitor of CSI volumes- volume status, volume condition
-    # Allowed values:
-    #   true: enable checking of health condition of CSI volumes
-    #   false: disable checking of health condition of CSI volumes
-    # Default value: None
-    enabled: false
-
-    # interval: Interval of monitoring volume health condition
-    # Allowed values: Number followed by unit of time (s,m,h)
-    # Default value: 60s
-    interval: 60s
-
-  # nodeSelector: Define node selection constraints for pods of controller deployment.
-  # For the pod to be eligible to run on a node, the node must have each
-  # of the indicated key-value pairs as labels.
-  # Leave as blank to consider all nodes
-  # Allowed values: map of key-value pairs
-  # Default value: None
-  nodeSelector:
-  # Uncomment if nodes you wish to use have the node-role.kubernetes.io/master taint
-  #  node-role.kubernetes.io/master: ""
-  # Uncomment if nodes you wish to use have the node-role.kubernetes.io/control-plane taint
-  #  node-role.kubernetes.io/control-plane: ""
-
-  # tolerations: Define tolerations for the controller deployment, if required.
-  # Default value: None
-  # Uncomment if nodes you wish to use have the node-role.kubernetes.io/master taint
-  tolerations:
-  # - key: "node-role.kubernetes.io/master"
-  #   operator: "Exists"
-  #   effect: "NoSchedule"
-  # Uncomment if nodes you wish to use have the node-role.kubernetes.io/control-plane taint
-  # tolerations:
-  # - key: "node-role.kubernetes.io/control-plane"
-  #   operator: "Exists"
-  #   effect: "NoSchedule"
-
-# node: configure node pod specific parameters
-node:
-  # nodeSelector: Define node selection constraints for pods of node daemonset
-  # For the pod to be eligible to run on a node, the node must have each
-  # of the indicated key-value pairs as labels.
-  # Leave as blank to consider all nodes
-  # Allowed values: map of key-value pairs
-  # Default value: None
-  nodeSelector:
-  # Uncomment if nodes you wish to use have the node-role.kubernetes.io/master taint
-  #  node-role.kubernetes.io/master: ""
-  # Uncomment if nodes you wish to use have the node-role.kubernetes.io/control-plane taint
-  #  node-role.kubernetes.io/control-plane: ""
-
-  # tolerations: Define tolerations for the node daemonset, if required.
-  # Default value: None
-  # Uncomment if nodes you wish to use have the node-role.kubernetes.io/master taint
-  tolerations:
-  #  - key: "node.kubernetes.io/memory-pressure"
-  #    operator: "Exists"
-  #    effect: "NoExecute"
-  #  - key: "node.kubernetes.io/disk-pressure"
-  #    operator: "Exists"
-  #    effect: "NoExecute"
-  #  - key: "node.kubernetes.io/network-unavailable"
-  #    operator: "Exists"
-  #    effect: "NoExecute"
-  # - key: "node-role.kubernetes.io/master"
-  #   operator: "Exists"
-  #   effect: "NoSchedule"
-  # Uncomment if nodes you wish to use have the node-role.kubernetes.io/control-plane taint
-  # tolerations:
-  # - key: "node-role.kubernetes.io/control-plane"
-  #   operator: "Exists"
-  #   effect: "NoSchedule"
-
-  # Uncomment if CSM for Resiliency and CSI Driver pods monitor are enabled
-  # tolerations:
-  #  - key: "offline.vxflexos.storage.dell.com"
-  #    operator: "Exists"
-  #    effect: "NoSchedule"
-  #  - key: "vxflexos.podmon.storage.dell.com"
-  #    operator: "Exists"
-  #    effect: "NoSchedule"
-  #  - key: "offline.unity.storage.dell.com"
-  #    operator: "Exists"
-  #    effect: "NoSchedule"
-  #  - key: "unity.podmon.storage.dell.com"
-  #    operator: "Exists"
-  #    effect: "NoSchedule"
-  #  - key: "offline.isilon.storage.dell.com"
-  #    operator: "Exists"
-  #    effect: "NoSchedule"
-  #  - key: "isilon.podmon.storage.dell.com"
-  #    operator: "Exists"
-  #    effect: "NoSchedule"
-
-  # dnsPolicy: Determines the DNS Policy of the Node service.
-  # Allowed values:
-  #   Default: The Pod inherits the name resolution configuration from the node that the pods run on.
-  #   ClusterFirst: Any DNS query that does not match the configured cluster domain suffix, such as "www.kubernetes.io",
-  #   is forwarded to the upstream nameserver inherited from the node.
-  #   ClusterFirstWithHostNet:  For Pods running with hostNetwork, you should explicitly set this DNS policy.
-  #   None: It allows a Pod to ignore DNS settings from the Kubernetes environment.
-  #   All DNS settings are supposed to be provided using the dnsConfig field in the Pod Spec.
-  # Default value: ClusterFirst
-  # ClusterFirstWithHostNet is the recommended DNS policy.
-  # Prior to v1.5 of the driver, the default DNS policy was ClusterFirst.
-  # In certain scenarios, users might need to change the default dnsPolicy.
-  dnsPolicy: ClusterFirstWithHostNet
-
-  healthMonitor:
-    # enabled: Enable/Disable health monitor of CSI volumes- volume usage, volume condition
-    # Allowed values:
-    #   true: enable checking of health condition of CSI volumes
-    #   false: disable checking of health condition of CSI volumes
-    # Default value: None
-    enabled: false
-
-## PLATFORM ATTRIBUTES
-######################
-# endpointPort: Specify the HTTPs port number of the PowerScale OneFS API server
-# Formerly this attribute was named as "isiPort"
-# This value acts as a default value for endpointPort, if not specified for a cluster config in secret
-# If authorization is enabled, endpointPort must match the port specified in the endpointPort parameter of the isilon-creds secret
-# Allowed value: valid port number
-# Default value: 8080
-endpointPort: 8080
-
-# skipCertificateValidation: Specify whether the PowerScale OneFS API server's certificate chain and host name should be verified.
-# Formerly this attribute was named as "isiInsecure"
-# This value acts as a default value for skipCertificateValidation, if not specified for a cluster config in secret
-# Allowed values:
-#   true: skip OneFS API server's certificate verification
-#   false: verify OneFS API server's certificates
-# Default value: false
-skipCertificateValidation: true
-
-# isiAuthType: Indicates whether the authentication will be session-based or basic.
-# Allowed values:
-#   0: enables basic Authentication
-#   1: enables session-based Authentication
-# Default value: 0
-isiAuthType: 0
-
-# isiAccessZone: The name of the access zone a volume can be created in.
-# If storageclass is missing with AccessZone parameter, then value of isiAccessZone is used for the same.
-# Default value: System
-# Examples: System, zone1
-isiAccessZone: System
-
-# enableQuota: Indicates whether the provisioner should attempt to set (later unset) quota
-# on a newly provisioned volume.
-# This requires SmartQuotas to be enabled on PowerScale cluster.
-# Allowed values:
-#   true: set quota for volume
-#   false: do not set quota for volume
-enableQuota: true
-
-# isiPath: The base path for the volumes to be created on PowerScale cluster.
-# This value acts as a default value for isiPath, if not specified for a cluster config in secret
-# Ensure that this path exists on PowerScale cluster.
-# Allowed values: unix absolute path
-# Default value: /ifs
-# Examples: /ifs/data/csi, /ifs/engineering
-isiPath: /ifs/data/csi
-
-# isiVolumePathPermissions: The permissions for isi volume directory path
-# This value acts as a default value for isiVolumePathPermissions, if not specified for a cluster config in secret
-# Allowed values: valid octal mode number
-# Default value: "0777"
-# Examples: "0777", "777", "0755"
-isiVolumePathPermissions: "0777"
-
-# ignoreUnresolvableHosts: Ignore unresolvable hosts on the OneFS
-# When set to true, OneFS allows new host to add to existing export list though any of the existing hosts from the
-# same exports are unresolvable/doesn't exist anymore.
-# Allowed values:
-#   true: ignore existing unresolvable hosts and append new host to the existing export
-#   false: exhibits OneFS default behavior i.e. if any of existing hosts are unresolvable while adding new one it fails
-# Default value: false
-ignoreUnresolvableHosts: false
-
-# noProbeOnStart: Indicates whether the controller/node should probe all the PowerScale clusters during driver initialization
-# When set to true, the driver will not set node labels, please manually add
-# the label <provisionerName>.dellemc.com/<powerscalefqdnorip>:<provisionerName> on the nodes for each of the clusters reachable from the node.
-# Allowed values:
-#   true : do not probe all PowerScale clusters during driver initialization
-#   false: probe all PowerScale clusters during driver initialization
-# Default value: false
-noProbeOnStart: false
-
-# autoProbe: automatically probe the PowerScale cluster if not done already during CSI calls.
-# Allowed values:
-#   true : enable auto probe.
-#   false: disable auto probe.
-# Default value: false
-autoProbe: true
-
-authorization:
-  enabled: false
-  # proxyHost: hostname of the csm-authorization server
-  # Default value: None
-  proxyHost:
-  # skipCertificateValidation: certificate validation of the csm-authorization server
-  # Allowed Values:
-  #   "true" - TLS certificate verification will be skipped
-  #   "false" - TLS certificate will be verified
-  # Default value: "true"
-  skipCertificateValidation: true
-
-# Storage Capacity Tracking
-# Note: Capacity tracking is supported in kubernetes v1.24 and above, this feature will be automatically disabled in older versions.
-storageCapacity:
-  # enabled : Enable/Disable storage capacity tracking
-  # Allowed values:
-  #   true: enable storage capacity tracking
-  #   false: disable storage capacity tracking
-  # Default value: true
-  enabled: true
-  # pollInterval : Configure how often external-provisioner polls the driver to detect changed capacity
-  # Allowed values: 1m,2m,3m,...,10m,...,60m etc
-  # Default value: 5m
-  pollInterval: 5m
-
-# Enable this feature only after contact support for additional information
-podmon:
-  enabled: false
-  controller:
-    args:
-      - "--csisock=unix:/var/run/csi/csi.sock"
-      - "--labelvalue=csi-isilon"
-      - "--arrayConnectivityPollRate=60"
-      - "--driverPath=csi-isilon.dellemc.com"
-      - "--mode=controller"
-      - "--skipArrayConnectionValidation=false"
-      - "--driver-config-params=/csi-isilon-config-params/driver-config-params.yaml"
-      - "--driverPodLabelValue=dell-storage"
-      - "--ignoreVolumelessPods=false"
-
-  node:
-    args:
-      - "--csisock=unix:/var/lib/kubelet/plugins/csi-isilon/csi_sock"
-      - "--labelvalue=csi-isilon"
-      - "--arrayConnectivityPollRate=60"
-      - "--driverPath=csi-isilon.dellemc.com"
-      - "--mode=node"
-      - "--leaderelection=false"
-      - "--driver-config-params=/csi-isilon-config-params/driver-config-params.yaml"
-      - "--driverPodLabelValue=dell-storage"
-      - "--ignoreVolumelessPods=false"

From e25cab028c312308357613dd7e2bb87ba83f1f4e Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Thu, 18 Jun 2026 08:39:03 +0530
Subject: [PATCH 24/33] Signed-off-by: priti-parate
 <140157516+priti-parate@users.noreply.github.com> Fix for victoria
 loadbalancer IP preservation

---
 .../tasks/apply_victoria_crs.yml              | 144 +++++++++++++++++-
 .../tasks/migrate_statefulset.yml             |   7 +-
 upgrade/roles/upgrade_telemetry/vars/main.yml |  27 ++++
 3 files changed, 170 insertions(+), 8 deletions(-)

diff --git a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml
index d725cf067a..f557a7d0e7 100644
--- a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml
+++ b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml
@@ -117,11 +117,7 @@
 
 - name: Display LoadBalancer IP injection status
   ansible.builtin.debug:
-    msg: >-
-      {{ victoria_lb_ips_preserved
-         if (preserved_vminsert_ip | default('') | length > 0)
-            or (preserved_vmselect_ip | default('') | length > 0)
-         else victoria_lb_ips_not_preserved }}
+    msg: "{{ victoria_lb_ip_injection_status }}"
 
 # ── Apply main CR (VMCluster only — 2.2 cluster mode only) ──
 - name: Apply VMCluster CR (cluster mode only) with retry
@@ -135,6 +131,144 @@
   delegate_to: "{{ kube_vip }}"
   connection: ssh
 
+# ── Wait for VMCluster LoadBalancer IPs and reclaim if stolen ──
+# The operator creates vminsert/vmselect services asynchronously after the CR is applied.
+# We MUST wait for these services to get their LoadBalancer IPs BEFORE Phase 3
+# (telemetry.sh) runs, because telemetry.sh also creates VictoriaLogs services via
+# kubectl apply -k. If VL services are created before VM services exist, MetalLB
+# assigns the freed IPs to VL services, leaving VM services in <pending> state.
+#
+# If the preserved IPs got assigned to wrong services, we reclaim them:
+#   1. Find services holding the preserved IPs that are NOT vminsert/vmselect
+#   2. Delete those services to free the IPs
+#   3. Wait for vminsert/vmselect to claim the preserved IPs
+#
+# The reclaim polls register into vminsert_lb_ip_reclaim/vmselect_lb_ip_reclaim rather
+# than reusing vminsert_lb_ip/vmselect_lb_ip: Ansible registers a result even for a
+# skipped task, and that result has no stdout, which would break the block's own
+# `when` condition and the status messages that read vminsert_lb_ip.stdout.
+
+- name: Initial wait for vminsert LoadBalancer IP
+  ansible.builtin.shell: |
+    kubectl -n {{ telemetry_namespace }} get svc vminsert-{{ new_vmcluster_name }} \
+      -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo ""
+  register: vminsert_lb_ip
+  until: vminsert_lb_ip.stdout | trim | length > 0
+  retries: 24
+  delay: 5
+  changed_when: false
+  failed_when: false
+  delegate_to: "{{ kube_vip }}"
+  connection: ssh
+
+- name: Initial wait for vmselect LoadBalancer IP
+  ansible.builtin.shell: |
+    kubectl -n {{ telemetry_namespace }} get svc vmselect-{{ new_vmcluster_name }} \
+      -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo ""
+  register: vmselect_lb_ip
+  until: vmselect_lb_ip.stdout | trim | length > 0
+  retries: 24
+  delay: 5
+  changed_when: false
+  failed_when: false
+  delegate_to: "{{ kube_vip }}"
+  connection: ssh
+
+# ── Reclaim stolen IPs if VMCluster services are still pending ──
+- name: Reclaim preserved IPs from wrong services
+  when:
+    - preserved_vminsert_ip | default('') | length > 0 or preserved_vmselect_ip | default('') | length > 0
+    - vminsert_lb_ip.stdout | trim | length == 0 or vmselect_lb_ip.stdout | trim | length == 0
+  block:
+    - name: Find services holding preserved IPs that are not VMCluster services
+      ansible.builtin.shell: |
+        set -o pipefail
+        PRESERVED_IPS="{{ preserved_vminsert_ip | default('') }} {{ preserved_vmselect_ip | default('') }}"
+        VMCLUSTER_SVCS="vminsert-{{ new_vmcluster_name }} vmselect-{{ new_vmcluster_name }}"
+        kubectl -n {{ telemetry_namespace }} get svc -o json 2>/dev/null | \
+          python3 -c "
+        import json, sys
+        data = json.load(sys.stdin)
+        preserved = set('${PRESERVED_IPS}'.split())
+        vmcluster = set('${VMCLUSTER_SVCS}'.split())
+        for svc in data.get('items', []):
+            name = svc['metadata']['name']
+            if name in vmcluster:
+                continue
+            ingress = svc.get('status', {}).get('loadBalancer', {}).get('ingress', [])
+            for ing in ingress:
+                ip = ing.get('ip', '')
+                if ip in preserved:
+                    print(name)
+                    break
+        " || true
+      register: ip_thieves
+      changed_when: false
+      failed_when: false
+      delegate_to: "{{ kube_vip }}"
+      connection: ssh
+
+    - name: Display services holding preserved IPs
+      ansible.builtin.debug:
+        msg: "{{ victoria_lb_ip_thieves_found }}"
+      when: ip_thieves.stdout_lines | default([]) | select() | list | length > 0
+
+    - name: Delete services that stole preserved IPs
+      ansible.builtin.command:
+        cmd: kubectl -n {{ telemetry_namespace }} delete svc {{ item }} --timeout=30s
+      loop: "{{ ip_thieves.stdout_lines | default([]) | select() | list }}"
+      changed_when: true
+      failed_when: false
+      when: ip_thieves.stdout_lines | default([]) | select() | list | length > 0
+      delegate_to: "{{ kube_vip }}"
+      connection: ssh
+
+    - name: Wait for vminsert to reclaim preserved IP
+      ansible.builtin.shell: |
+        kubectl -n {{ telemetry_namespace }} get svc vminsert-{{ new_vmcluster_name }} \
+          -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo ""
+      register: vminsert_lb_ip_reclaim
+      until: vminsert_lb_ip_reclaim.stdout | trim | length > 0
+      retries: 30
+      delay: 5
+      changed_when: false
+      failed_when: false
+      delegate_to: "{{ kube_vip }}"
+      connection: ssh
+
+    - name: Wait for vmselect to reclaim preserved IP
+      ansible.builtin.shell: |
+        kubectl -n {{ telemetry_namespace }} get svc vmselect-{{ new_vmcluster_name }} \
+          -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo ""
+      register: vmselect_lb_ip_reclaim
+      until: vmselect_lb_ip_reclaim.stdout | trim | length > 0
+      retries: 30
+      delay: 5
+      changed_when: false
+      failed_when: false
+      delegate_to: "{{ kube_vip }}"
+      connection: ssh
+
+# Merge the lookups: use the post-reclaim result when the block ran, otherwise keep
+# the initial result, so later tasks can always read .stdout from these two facts.
+- name: Record final VMCluster LoadBalancer IP lookup results
+  ansible.builtin.set_fact:
+    vminsert_lb_ip:
+      stdout: "{{ vminsert_lb_ip_reclaim.stdout | default(vminsert_lb_ip.stdout | default('')) }}"
+    vmselect_lb_ip:
+      stdout: "{{ vmselect_lb_ip_reclaim.stdout | default(vmselect_lb_ip.stdout | default('')) }}"
+
+- name: Display confirmed LoadBalancer IPs
+  ansible.builtin.debug:
+    msg: "{{ victoria_lb_ip_confirmed }}"
+
+- name: Warn if LoadBalancer IPs still not assigned after reclaim
+  ansible.builtin.debug:
+    msg: "{{ victoria_lb_ip_reclaim_failed }}"
+  when: >-
+    (vminsert_lb_ip is defined and vminsert_lb_ip.stdout is defined and vminsert_lb_ip.stdout | trim | length == 0) or
+    (vmselect_lb_ip is defined and vmselect_lb_ip.stdout is defined and vmselect_lb_ip.stdout | trim | length == 0)
+
 # ── Apply scrape and agent CRs ──
 - name: Check for VMScrape manifest
   ansible.builtin.stat:
diff --git a/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml b/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml
index 23648e2ed6..b290bc639a 100644
--- a/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml
+++ b/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml
@@ -252,6 +252,8 @@
     # ── Cleanup old pre-operator services and deployments ──
     # The operator creates new services with different names (e.g. vminsert-victoria-cluster),
     # so the old standalone services become stale and waste LoadBalancer IPs.
+    # Old services MUST be deleted BEFORE applying VMCluster CR so MetalLB can
+    # assign the same IPs to the new operator-managed services via loadBalancerIP.
     - name: Find old pre-operator services
       ansible.builtin.shell: |
         set -o pipefail
@@ -285,6 +287,5 @@
 
     - name: Display old resource cleanup summary
       ansible.builtin.debug:
-        msg:
-          - "Old services deleted: {{ old_services.stdout_lines | default([]) | select() | list }}"
-          - "Old vmagent deployment cleanup attempted: {{ old_vmagent_deployment }}"
+        msg: "{{ victoria_old_svc_cleanup_summary }}"
+        verbosity: 2
diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml b/upgrade/roles/upgrade_telemetry/vars/main.yml
index a869e5a52a..457c6f5bd2 100644
--- a/upgrade/roles/upgrade_telemetry/vars/main.yml
+++ b/upgrade/roles/upgrade_telemetry/vars/main.yml
@@ -105,7 +105,34 @@ victoria_lb_ips_preserved: >-
   LoadBalancer IPs injected into VMCluster manifest -
   vminsert: {{ preserved_vminsert_ip | default('N/A') }},
   vmselect: {{ preserved_vmselect_ip | default('N/A') }}
+victoria_lb_ip_injection_status: >-
+  {{ victoria_lb_ips_preserved
+     if (preserved_vminsert_ip | default('') | length > 0)
+        or (preserved_vmselect_ip | default('') | length > 0)
+     else victoria_lb_ips_not_preserved }}
 victoria_lb_ips_not_preserved: "No old LoadBalancer IPs found to preserve (fresh deploy or already operator-managed)"
+victoria_lb_ip_confirmed: >-
+  VMCluster LoadBalancer IPs confirmed -
+  vminsert-{{ new_vmcluster_name }}: {{ vminsert_lb_ip.stdout | default('PENDING') | trim }},
+  vmselect-{{ new_vmcluster_name }}: {{ vmselect_lb_ip.stdout | default('PENDING') | trim }}
+victoria_lb_ip_reclaim_needed: >-
+  VMCluster services still pending after initial wait.
+  Checking if preserved IPs were assigned to wrong services...
+victoria_lb_ip_thieves_found: >-
+  Services holding preserved IPs (will be deleted and re-created by telemetry.sh):
+  {{ ip_thieves.stdout_lines | default([]) | select() | list }}
+victoria_lb_ip_reclaim_success: >-
+  Successfully reclaimed preserved IPs for VMCluster services -
+  vminsert-{{ new_vmcluster_name }}: {{ vminsert_lb_ip.stdout | default('PENDING') | trim }},
+  vmselect-{{ new_vmcluster_name }}: {{ vmselect_lb_ip.stdout | default('PENDING') | trim }}
+victoria_lb_ip_reclaim_failed: >-
+  WARNING: VMCluster services still do not have LoadBalancer IPs after reclaim attempt.
+  vminsert: {{ vminsert_lb_ip.stdout | default('NONE') | trim }},
+  vmselect: {{ vmselect_lb_ip.stdout | default('NONE') | trim }}.
+  Please use new assigned IPs.
+victoria_old_svc_cleanup_summary: >-
+  Old services deleted: {{ old_services.stdout_lines | default([]) | select() | list }}.
+  Old vmagent deployment cleanup attempted: {{ old_vmagent_deployment }}
 victoria_pods_not_ready: "Telemetry upgrade FAILED: Some pods are not ready. {{ pods_not_ready.stdout | int }} pod(s) not in Running state."
 victoria_pods_ready_after_wait: "All telemetry pods are ready after waiting"
 telemetry_upgrade_success: "Telemetry upgrade COMPLETED: All telemetry pods are running and ready."

From 1bd65fa73fc99a45ef91e233d1711acefcf59a6e Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Thu, 18 Jun 2026 08:57:39 +0530
Subject: [PATCH 25/33] address review comments

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 .../tasks/apply_victoria_crs.yml              | 60 +++++++------------
 upgrade/roles/upgrade_telemetry/vars/main.yml |  8 ++-
 2 files changed, 26 insertions(+), 42 deletions(-)

diff --git a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml
index f557a7d0e7..2a1c195ddb 100644
--- a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml
+++ b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml
@@ -131,7 +131,7 @@
   delegate_to: "{{ kube_vip }}"
   connection: ssh
 
-# ── Wait for VMCluster LoadBalancer IPs and reclaim if stolen ──
+# ── Wait for VMCluster LoadBalancer IPs and reclaim if reassigned ──
 # The operator creates vminsert/vmselect services asynchronously after the CR is applied.
 # We MUST wait for these services to get their LoadBalancer IPs BEFORE Phase 3
 # (telemetry.sh) runs, because telemetry.sh also creates VictoriaLogs services via
@@ -140,8 +140,8 @@
 #
 # If the preserved IPs got assigned to wrong services, we reclaim them:
 #   1. Find services holding the preserved IPs that are NOT vminsert/vmselect
-#   2. Delete those services to free the IPs
-#   3. Wait for vminsert/vmselect to claim the preserved IPs
+#   2. Delete those conflicting services to free the IPs
+#   3. Wait for vminsert/vmselect to reclaim the preserved IPs
 
 - name: Initial wait for vminsert LoadBalancer IP
   ansible.builtin.shell: |
@@ -149,8 +149,8 @@
       -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo ""
   register: vminsert_lb_ip
   until: vminsert_lb_ip.stdout | trim | length > 0
-  retries: 24
-  delay: 5
+  retries: "{{ lb_ip_wait_retries }}"
+  delay: "{{ lb_ip_wait_delay }}"
   changed_when: false
   failed_when: false
   delegate_to: "{{ kube_vip }}"
@@ -162,42 +162,22 @@
       -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo ""
   register: vmselect_lb_ip
   until: vmselect_lb_ip.stdout | trim | length > 0
-  retries: 24
-  delay: 5
+  retries: "{{ lb_ip_wait_retries }}"
+  delay: "{{ lb_ip_wait_delay }}"
   changed_when: false
   failed_when: false
   delegate_to: "{{ kube_vip }}"
   connection: ssh
 
-# ── Reclaim stolen IPs if VMCluster services are still pending ──
-- name: Reclaim preserved IPs from wrong services
+# ── Reclaim reassigned IPs if VMCluster services are still pending ──
+- name: Reclaim preserved IPs from conflicting services
   when:
     - preserved_vminsert_ip | default('') | length > 0 or preserved_vmselect_ip | default('') | length > 0
     - vminsert_lb_ip.stdout | trim | length == 0 or vmselect_lb_ip.stdout | trim | length == 0
   block:
     - name: Find services holding preserved IPs that are not VMCluster services
-      ansible.builtin.shell: |
-        set -o pipefail
-        PRESERVED_IPS="{{ preserved_vminsert_ip | default('') }} {{ preserved_vmselect_ip | default('') }}"
-        VMCLUSTER_SVCS="vminsert-{{ new_vmcluster_name }} vmselect-{{ new_vmcluster_name }}"
-        kubectl -n {{ telemetry_namespace }} get svc -o json 2>/dev/null | \
-          python3 -c "
-        import json, sys
-        data = json.load(sys.stdin)
-        preserved = set('${PRESERVED_IPS}'.split())
-        vmcluster = set('${VMCLUSTER_SVCS}'.split())
-        for svc in data.get('items', []):
-            name = svc['metadata']['name']
-            if name in vmcluster:
-                continue
-            ingress = svc.get('status', {}).get('loadBalancer', {}).get('ingress', [])
-            for ing in ingress:
-                ip = ing.get('ip', '')
-                if ip in preserved:
-                    print(name)
-                    break
-        " || true
-      register: ip_thieves
+      ansible.builtin.shell: "{{ lookup('template', 'find_ip_conflict_svcs.sh.j2') }}"
+      register: ip_conflict_svcs
       changed_when: false
       failed_when: false
       delegate_to: "{{ kube_vip }}"
@@ -205,16 +185,16 @@
 
     - name: Display services holding preserved IPs
       ansible.builtin.debug:
-        msg: "{{ victoria_lb_ip_thieves_found }}"
-      when: ip_thieves.stdout_lines | default([]) | select() | list | length > 0
+        msg: "{{ victoria_lb_ip_conflict_svcs_found }}"
+      when: ip_conflict_svcs.stdout_lines | default([]) | select() | list | length > 0
 
-    - name: Delete services that stole preserved IPs
+    - name: Delete conflicting services holding preserved IPs
       ansible.builtin.command:
         cmd: kubectl -n {{ telemetry_namespace }} delete svc {{ item }} --timeout=30s
-      loop: "{{ ip_thieves.stdout_lines | default([]) | select() | list }}"
+      loop: "{{ ip_conflict_svcs.stdout_lines | default([]) | select() | list }}"
       changed_when: true
       failed_when: false
-      when: ip_thieves.stdout_lines | default([]) | select() | list | length > 0
+      when: ip_conflict_svcs.stdout_lines | default([]) | select() | list | length > 0
       delegate_to: "{{ kube_vip }}"
       connection: ssh
 
@@ -224,8 +204,8 @@
           -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo ""
       register: vminsert_lb_ip
       until: vminsert_lb_ip.stdout | trim | length > 0
-      retries: 30
-      delay: 5
+      retries: "{{ lb_ip_wait_retries }}"
+      delay: "{{ lb_ip_wait_delay }}"
       changed_when: false
       failed_when: false
       delegate_to: "{{ kube_vip }}"
@@ -237,8 +217,8 @@
           -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo ""
       register: vmselect_lb_ip
       until: vmselect_lb_ip.stdout | trim | length > 0
-      retries: 30
-      delay: 5
+      retries: "{{ lb_ip_wait_retries }}"
+      delay: "{{ lb_ip_wait_delay }}"
       changed_when: false
       failed_when: false
       delegate_to: "{{ kube_vip }}"
diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml b/upgrade/roles/upgrade_telemetry/vars/main.yml
index 457c6f5bd2..ce5129396c 100644
--- a/upgrade/roles/upgrade_telemetry/vars/main.yml
+++ b/upgrade/roles/upgrade_telemetry/vars/main.yml
@@ -45,6 +45,10 @@ pod_wait_delay: 15
 idrac_rollout_retries: 3
 idrac_rollout_delay: 30
 
+# LoadBalancer IP wait configuration
+lb_ip_wait_retries: 30
+lb_ip_wait_delay: 5
+
 # Victoria operator configuration
 # victoria_operator_pkg is loaded dynamically from service_k8s JSON in include_required_input.yml
 victoria_operator_release_name: victoria-metrics-operator
@@ -118,9 +122,9 @@ victoria_lb_ip_confirmed: >-
 victoria_lb_ip_reclaim_needed: >-
   VMCluster services still pending after initial wait.
   Checking if preserved IPs were assigned to wrong services...
-victoria_lb_ip_thieves_found: >-
+victoria_lb_ip_conflict_svcs_found: >-
   Services holding preserved IPs (will be deleted and re-created by telemetry.sh):
-  {{ ip_thieves.stdout_lines | default([]) | select() | list }}
+  {{ ip_conflict_svcs.stdout_lines | default([]) | select() | list }}
 victoria_lb_ip_reclaim_success: >-
   Successfully reclaimed preserved IPs for VMCluster services -
   vminsert-{{ new_vmcluster_name }}: {{ vminsert_lb_ip.stdout | default('PENDING') | trim }},

From 2221e37516821dcd69177f36ab17196f9b3152bd Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Thu, 18 Jun 2026 09:01:43 +0530
Subject: [PATCH 26/33] adding shell script

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 .../templates/find_ip_conflict_svcs.sh.j2     | 45 +++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 upgrade/roles/upgrade_telemetry/templates/find_ip_conflict_svcs.sh.j2

diff --git a/upgrade/roles/upgrade_telemetry/templates/find_ip_conflict_svcs.sh.j2 b/upgrade/roles/upgrade_telemetry/templates/find_ip_conflict_svcs.sh.j2
new file mode 100644
index 0000000000..c21217afad
--- /dev/null
+++ b/upgrade/roles/upgrade_telemetry/templates/find_ip_conflict_svcs.sh.j2
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Find services in the telemetry namespace that are holding LoadBalancer IPs
+# which should belong to VMCluster services (vminsert/vmselect).
+# This can happen when MetalLB reassigns freed IPs to other services
+# before the VMCluster services are created by the operator.
+#
+# Usage: render this Jinja2 template with Ansible, then run the result with bash
+# Output: One service name per line (services holding conflicting IPs)
+
+set -o pipefail
+
+PRESERVED_IPS="{{ preserved_vminsert_ip | default('') }} {{ preserved_vmselect_ip | default('') }}"
+VMCLUSTER_SVCS="vminsert-{{ new_vmcluster_name }} vmselect-{{ new_vmcluster_name }}"
+
+kubectl -n {{ telemetry_namespace }} get svc -o json 2>/dev/null | \
+  python3 -c "
+import json, sys
+data = json.load(sys.stdin)
+preserved = set('${PRESERVED_IPS}'.split())
+vmcluster = set('${VMCLUSTER_SVCS}'.split())
+for svc in data.get('items', []):
+    name = svc['metadata']['name']
+    if name in vmcluster:
+        continue
+    ingress = svc.get('status', {}).get('loadBalancer', {}).get('ingress', [])
+    for ing in ingress:
+        ip = ing.get('ip', '')
+        if ip in preserved:
+            print(name)
+            break
+" || true

From c40786454077700eb7e12b289572f1f94c5c72d1 Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Thu, 18 Jun 2026 09:23:49 +0530
Subject: [PATCH 27/33] address ansible lint issues

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
ansible lint fixes
---
 .../tasks/apply_victoria_crs.yml                | 17 ++++++++++++++++-
 upgrade/roles/upgrade_telemetry/vars/main.yml   |  1 +
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml
index 2a1c195ddb..addac543ab 100644
--- a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml
+++ b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml
@@ -175,14 +175,29 @@
     - preserved_vminsert_ip | default('') | length > 0 or preserved_vmselect_ip | default('') | length > 0
     - vminsert_lb_ip.stdout | trim | length == 0 or vmselect_lb_ip.stdout | trim | length == 0
   block:
+    - name: Stage IP conflict detection script
+      ansible.builtin.template:
+        src: find_ip_conflict_svcs.sh.j2
+        dest: "{{ ip_conflict_script_path }}"
+        mode: "{{ executable_mode }}"
+      delegate_to: "{{ kube_vip }}"
+      connection: ssh
+
     - name: Find services holding preserved IPs that are not VMCluster services
-      ansible.builtin.shell: "{{ lookup('template', 'find_ip_conflict_svcs.sh.j2') }}"
+      ansible.builtin.command: "{{ ip_conflict_script_path }}"
       register: ip_conflict_svcs
       changed_when: false
       failed_when: false
       delegate_to: "{{ kube_vip }}"
       connection: ssh
 
+    - name: Remove IP conflict detection script
+      ansible.builtin.file:
+        path: "{{ ip_conflict_script_path }}"
+        state: absent
+      delegate_to: "{{ kube_vip }}"
+      connection: ssh
+
     - name: Display services holding preserved IPs
       ansible.builtin.debug:
         msg: "{{ victoria_lb_ip_conflict_svcs_found }}"
diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml b/upgrade/roles/upgrade_telemetry/vars/main.yml
index ce5129396c..5d51a1a057 100644
--- a/upgrade/roles/upgrade_telemetry/vars/main.yml
+++ b/upgrade/roles/upgrade_telemetry/vars/main.yml
@@ -48,6 +48,7 @@ idrac_rollout_delay: 30
 # LoadBalancer IP wait configuration
 lb_ip_wait_retries: 30
 lb_ip_wait_delay: 5
+ip_conflict_script_path: /tmp/find_ip_conflict_svcs.sh
 
 # Victoria operator configuration
 # victoria_operator_pkg is loaded dynamically from service_k8s JSON in include_required_input.yml

From 3f47ba9ff51d8101750248252641460635ba88f4 Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Thu, 18 Jun 2026 23:16:52 +0530
Subject: [PATCH 28/33] update until condition

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 .../tasks/apply_victoria_crs.yml              | 61 +++++++------------
 .../templates/inject_vm_lb_ips.py.j2          | 27 ++++++++
 upgrade/roles/upgrade_telemetry/vars/main.yml |  1 +
 3 files changed, 49 insertions(+), 40 deletions(-)
 create mode 100644 upgrade/roles/upgrade_telemetry/templates/inject_vm_lb_ips.py.j2

diff --git a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml
index addac543ab..1e8b7aa3f0 100644
--- a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml
+++ b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml
@@ -55,38 +55,11 @@
 # and the operator creates new ones. To preserve IPs, we inject loadBalancerIP
 # directly into the VMCluster CR's serviceSpec BEFORE applying, so the operator
 # creates services with the correct IPs from the start (no race condition).
-- name: Create LoadBalancer IP injection script
-  ansible.builtin.copy:
-    dest: /tmp/inject_vm_lb_ips.py
-    mode: "0755"
-    content: |
-      #!/usr/bin/env python3
-      import yaml
-      import sys
-      manifest_path = sys.argv[1]
-      vmselect_ip = sys.argv[2] if len(sys.argv) > 2 and sys.argv[2] else ""
-      vminsert_ip = sys.argv[3] if len(sys.argv) > 3 and sys.argv[3] else ""
-      with open(manifest_path) as f:
-          doc = yaml.safe_load(f)
-      spec = doc.get("spec", {})
-      changed = False
-      if vmselect_ip and "vmselect" in spec:
-          svc = spec["vmselect"].setdefault("serviceSpec", {}).setdefault("spec", {})
-          if svc.get("loadBalancerIP") != vmselect_ip:
-              svc["loadBalancerIP"] = vmselect_ip
-              changed = True
-      if vminsert_ip and "vminsert" in spec:
-          svc = spec["vminsert"].setdefault("serviceSpec", {}).setdefault("spec", {})
-          if svc.get("loadBalancerIP") != vminsert_ip:
-              svc["loadBalancerIP"] = vminsert_ip
-              changed = True
-      if changed:
-          with open(manifest_path, "w") as f:
-              yaml.dump(doc, f, default_flow_style=False, sort_keys=False)
-          print("Injected vmselect=" + vmselect_ip + " vminsert=" + vminsert_ip)
-      else:
-          print("IPs already present - no change needed")
-      sys.exit(0 if changed else 2)
+- name: Stage LoadBalancer IP injection script
+  ansible.builtin.template:
+    src: inject_vm_lb_ips.py.j2
+    dest: "{{ ip_inject_script_path }}"
+    mode: "{{ executable_mode }}"
   delegate_to: "{{ kube_vip }}"
   connection: ssh
   when:
@@ -95,7 +68,7 @@
 - name: Inject preserved LoadBalancer IPs into VMCluster manifest
   ansible.builtin.command:
     cmd: >-
-      python3 /tmp/inject_vm_lb_ips.py
+      python3 {{ ip_inject_script_path }}
       "{{ telemetry_deploy_dir }}/deployments/victoria-operator-vmcluster.yaml"
       "{{ preserved_vmselect_ip | default('') }}"
       "{{ preserved_vminsert_ip | default('') }}"
@@ -109,7 +82,7 @@
 
 - name: Clean up LoadBalancer IP injection script
   ansible.builtin.file:
-    path: /tmp/inject_vm_lb_ips.py
+    path: "{{ ip_inject_script_path }}"
     state: absent
   delegate_to: "{{ kube_vip }}"
   connection: ssh
@@ -148,7 +121,9 @@
     kubectl -n {{ telemetry_namespace }} get svc vminsert-{{ new_vmcluster_name }} \
       -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo ""
   register: vminsert_lb_ip
-  until: vminsert_lb_ip.stdout | trim | length > 0
+  until: >
+    (vminsert_lb_ip is defined) and
+    ((vminsert_lb_ip.stdout | default('') | trim | length) > 0)
   retries: "{{ lb_ip_wait_retries }}"
   delay: "{{ lb_ip_wait_delay }}"
   changed_when: false
@@ -161,7 +136,9 @@
     kubectl -n {{ telemetry_namespace }} get svc vmselect-{{ new_vmcluster_name }} \
       -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo ""
   register: vmselect_lb_ip
-  until: vmselect_lb_ip.stdout | trim | length > 0
+  until: >
+    (vmselect_lb_ip is defined) and
+    ((vmselect_lb_ip.stdout | default('') | trim | length) > 0)
   retries: "{{ lb_ip_wait_retries }}"
   delay: "{{ lb_ip_wait_delay }}"
   changed_when: false
@@ -218,7 +195,9 @@
         kubectl -n {{ telemetry_namespace }} get svc vminsert-{{ new_vmcluster_name }} \
           -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo ""
       register: vminsert_lb_ip
-      until: vminsert_lb_ip.stdout | trim | length > 0
+      until: >
+        (vminsert_lb_ip is defined) and
+        ((vminsert_lb_ip.stdout | default('') | trim | length) > 0)
       retries: "{{ lb_ip_wait_retries }}"
       delay: "{{ lb_ip_wait_delay }}"
       changed_when: false
@@ -231,7 +210,9 @@
         kubectl -n {{ telemetry_namespace }} get svc vmselect-{{ new_vmcluster_name }} \
           -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo ""
       register: vmselect_lb_ip
-      until: vmselect_lb_ip.stdout | trim | length > 0
+      until: >
+        (vmselect_lb_ip is defined) and
+        ((vmselect_lb_ip.stdout | default('') | trim | length) > 0)
       retries: "{{ lb_ip_wait_retries }}"
       delay: "{{ lb_ip_wait_delay }}"
       changed_when: false
@@ -247,8 +228,8 @@
   ansible.builtin.debug:
     msg: "{{ victoria_lb_ip_reclaim_failed }}"
   when: >-
-    (vminsert_lb_ip is defined and vminsert_lb_ip.stdout is defined and vminsert_lb_ip.stdout | trim | length == 0) or
-    (vmselect_lb_ip is defined and vmselect_lb_ip.stdout is defined and vmselect_lb_ip.stdout | trim | length == 0)
+    (vminsert_lb_ip is defined and vminsert_lb_ip.stdout | default('') | trim | length == 0) or
+    (vmselect_lb_ip is defined and vmselect_lb_ip.stdout | default('') | trim | length == 0)
 
 # ── Apply scrape and agent CRs ──
 - name: Check for VMScrape manifest
diff --git a/upgrade/roles/upgrade_telemetry/templates/inject_vm_lb_ips.py.j2 b/upgrade/roles/upgrade_telemetry/templates/inject_vm_lb_ips.py.j2
new file mode 100644
index 0000000000..a6299d49d7
--- /dev/null
+++ b/upgrade/roles/upgrade_telemetry/templates/inject_vm_lb_ips.py.j2
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+import yaml
+import sys
+manifest_path = sys.argv[1]
+vmselect_ip = sys.argv[2] if len(sys.argv) > 2 and sys.argv[2] else ""
+vminsert_ip = sys.argv[3] if len(sys.argv) > 3 and sys.argv[3] else ""
+with open(manifest_path) as f:
+    doc = yaml.safe_load(f)
+spec = doc.get("spec", {})
+changed = False
+if vmselect_ip and "vmselect" in spec:
+    svc = spec["vmselect"].setdefault("serviceSpec", {}).setdefault("spec", {})
+    if svc.get("loadBalancerIP") != vmselect_ip:
+        svc["loadBalancerIP"] = vmselect_ip
+        changed = True
+if vminsert_ip and "vminsert" in spec:
+    svc = spec["vminsert"].setdefault("serviceSpec", {}).setdefault("spec", {})
+    if svc.get("loadBalancerIP") != vminsert_ip:
+        svc["loadBalancerIP"] = vminsert_ip
+        changed = True
+if changed:
+    with open(manifest_path, "w") as f:
+        yaml.dump(doc, f, default_flow_style=False, sort_keys=False)
+    print("Injected vmselect=" + vmselect_ip + " vminsert=" + vminsert_ip)
+else:
+    print("IPs already present - no change needed")
+sys.exit(0 if changed else 2)
diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml b/upgrade/roles/upgrade_telemetry/vars/main.yml
index 5d51a1a057..b726a4be10 100644
--- a/upgrade/roles/upgrade_telemetry/vars/main.yml
+++ b/upgrade/roles/upgrade_telemetry/vars/main.yml
@@ -49,6 +49,7 @@ idrac_rollout_delay: 30
 lb_ip_wait_retries: 30
 lb_ip_wait_delay: 5
 ip_conflict_script_path: /tmp/find_ip_conflict_svcs.sh
+ip_inject_script_path: /tmp/inject_vm_lb_ips.py
 
 # Victoria operator configuration
 # victoria_operator_pkg is loaded dynamically from service_k8s JSON in include_required_input.yml

From 247a99525f060acc659b1cdae996127df330c3fd Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Fri, 19 Jun 2026 00:25:29 +0530
Subject: [PATCH 29/33] fixed stdout check

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 .../templates/inject_vm_lb_ips.py.j2          | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/upgrade/roles/upgrade_telemetry/templates/inject_vm_lb_ips.py.j2 b/upgrade/roles/upgrade_telemetry/templates/inject_vm_lb_ips.py.j2
index a6299d49d7..21e390ae29 100644
--- a/upgrade/roles/upgrade_telemetry/templates/inject_vm_lb_ips.py.j2
+++ b/upgrade/roles/upgrade_telemetry/templates/inject_vm_lb_ips.py.j2
@@ -1,4 +1,27 @@
 #!/usr/bin/env python3
+
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Inject preserved LoadBalancer IPs into a VMCluster manifest so that the
+# operator-created vmselect/vminsert services request the same addresses.
+# Sets spec.vmselect / spec.vminsert -> serviceSpec.spec.loadBalancerIP and
+# rewrites the manifest in place only when a value actually changes.
+#
+# Usage: python3 inject_vm_lb_ips.py <manifest_path> [vmselect_ip] [vminsert_ip]
+# Exit code: 0 if the manifest was updated, 2 if no change was needed
+
 import yaml
 import sys
 manifest_path = sys.argv[1]

From 9eabf15f3d8be241eae359dbfc326f193d8a7444 Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Fri, 19 Jun 2026 00:48:01 +0530
Subject: [PATCH 30/33] merge

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
merge
---
 .../orchestrator/common/result_poller.py      |  51 ++++
 .../module_utils/local_repo/software_utils.py |  11 +
 common/library/modules/parallel_tasks.py      |  10 +
 input/telemetry_config.yml                    |   2 +-
 input_validation/validate_config.yml          |   9 +-
 local_repo/local_repo.yml                     |   4 +
 .../tasks/validate_additional_cloud_init.yml  |   6 +
 .../tasks/apply_telemetry_on_upgrade.yml      | 240 ------------------
 .../tasks/derive_sink_support_flags.yml       |  74 ++++--
 provision/roles/telemetry/tasks/main.yml      |  40 +--
 .../telemetry/tasks/read_software_config.yml  |  21 +-
 11 files changed, 165 insertions(+), 303 deletions(-)
 delete mode 100644 provision/roles/telemetry/tasks/apply_telemetry_on_upgrade.yml

diff --git a/build_stream/orchestrator/common/result_poller.py b/build_stream/orchestrator/common/result_poller.py
index 6d35738773..6f40a91965 100644
--- a/build_stream/orchestrator/common/result_poller.py
+++ b/build_stream/orchestrator/common/result_poller.py
@@ -362,6 +362,11 @@ def _on_result_received(self, result: PlaybookResult) -> None:
                 # S12: On restart failure, still persist node_results.json
                 if result.stage_name == "restart":
                     self._on_restart_completed(result)
+                    self._on_restart_failure(result)
+
+                # On deploy failure, mark ImageGroup FAILED
+                if result.stage_name == "deploy":
+                    self._on_deploy_failure(result)
 
                 # On validate failure, mark ImageGroup FAILED
                 if result.stage_name == "validate":
@@ -968,3 +973,49 @@ def _on_deploy_failure(self, result: PlaybookResult) -> None:
                 job_id=str(result.job_id),
                 exc_info=True,
             )
+
+    def _on_restart_failure(self, result: PlaybookResult) -> None:
+        """Transition ImageGroup from RESTARTING to FAILED on restart failure."""
+        if self._image_group_repo is None:
+            log_secure_info(
+                "warning",
+                f"ImageGroup repo not available; skipping restart failure "
+                f"update for job={result.job_id}",
+                job_id=str(result.job_id),
+            )
+            return
+
+        try:
+            image_group = self._image_group_repo.find_by_job_id(
+                JobId(str(result.job_id))
+            )
+            if image_group is None:
+                log_secure_info(
+                    "error",
+                    f"Restart failure callback: No ImageGroup found for job={result.job_id}.",
+                    job_id=str(result.job_id),
+                )
+                return
+
+            self._image_group_repo.update_status(
+                image_group_id=image_group.id,
+                new_status=ImageGroupStatus.FAILED,
+            )
+
+            if hasattr(self._image_group_repo, 'session'):
+                self._image_group_repo.session.commit()
+
+            log_secure_info(
+                "warning",
+                f"Restart FAILED for job={result.job_id}. "
+                f"ImageGroup '{image_group.id}' -> FAILED.",
+                job_id=str(result.job_id),
+            )
+        except Exception as exc:  # pylint: disable=broad-except
+            log_secure_info(
+                "error",
+                "Failed to update ImageGroup status on restart "
+                f"failure for job={result.job_id}: {exc}",
+                job_id=str(result.job_id),
+                exc_info=True,
+            )
diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py
index da20edea12..af3c1ffab9 100644
--- a/common/library/module_utils/local_repo/software_utils.py
+++ b/common/library/module_utils/local_repo/software_utils.py
@@ -38,6 +38,7 @@
     CSV_COLUMNS,
     SOFTWARE_CONFIG_SUBDIR,
     DEFAULT_STATUS_FILENAME,
+    STATUS_CSV_HEADER,
     RPM_LABEL_TEMPLATE,
     RHEL_OS_URL,
     SOFTWARES_KEY,
@@ -853,6 +854,16 @@ def check_csv_existence(path):
 
 def read_status_csv(csv_path):
     """Reads the status.csv file and returns a list of row dictionaries."""
+    # Ensure file has valid header before reading
+    if os.path.exists(csv_path) and os.path.getsize(csv_path) > 0:
+        with open(csv_path, 'r', encoding='utf-8') as file:
+            lines = file.readlines()
+            if lines and lines[0].strip() != STATUS_CSV_HEADER.strip():
+                # Header missing or invalid - prepend header to existing data
+                with open(csv_path, 'w', encoding='utf-8') as wfile:
+                    wfile.write(STATUS_CSV_HEADER)
+                    wfile.writelines(lines)
+
     with open(csv_path, mode='r', newline='', encoding='utf-8') as file:
         reader = csv.DictReader(file)
         return [row for row in reader]
diff --git a/common/library/modules/parallel_tasks.py b/common/library/modules/parallel_tasks.py
index 20268b10fa..99cc28652a 100644
--- a/common/library/modules/parallel_tasks.py
+++ b/common/library/modules/parallel_tasks.py
@@ -160,9 +160,19 @@ def determine_function(
 
         # Construct the status file path using DEFAULT_STATUS_FILENAME.
         status_file = os.path.join(csv_file_path, DEFAULT_STATUS_FILENAME)
+
+        # Ensure file exists with valid header
         if not os.path.exists(status_file) or os.stat(status_file).st_size == 0:
             with open(status_file, 'w', encoding="utf-8") as file:
                 file.write(STATUS_CSV_HEADER)
+        else:
+            with open(status_file, 'r', encoding="utf-8") as file:
+                lines = file.readlines()
+                if lines and lines[0].strip() != STATUS_CSV_HEADER.strip():
+                    # Header missing or invalid - prepend header to existing data
+                    with open(status_file, 'w', encoding="utf-8") as wfile:
+                        wfile.write(STATUS_CSV_HEADER)
+                        wfile.writelines(lines)
 
 
         task_type = task.get("type")
diff --git a/input/telemetry_config.yml b/input/telemetry_config.yml
index bfc2980c0e..765b227786 100644
--- a/input/telemetry_config.yml
+++ b/input/telemetry_config.yml
@@ -424,7 +424,7 @@ powerscale_configurations:
 
   # Path to the CSM Observability (Karavi Observability) values.yaml file
   # Required when powerscale_configurations.powerscale_telemetry_support: true
-  # Reference: https://raw.githubusercontent.com/dell/helm-charts/refs/heads/release-v1.16.3/charts/karavi-observability/values.yaml
+  # Reference: https://raw.githubusercontent.com/dell/helm-charts/refs/heads/release-v1.17.1/charts/karavi-observability/values.yaml
   csm_observability_values_file_path: ""
 
 # --------------------------------------------------------------------------
diff --git a/input_validation/validate_config.yml b/input_validation/validate_config.yml
index dc9dfa3913..f3d5469f8a 100644
--- a/input_validation/validate_config.yml
+++ b/input_validation/validate_config.yml
@@ -50,6 +50,11 @@
   tags:
     - always
   tasks:
+    - name: Enable subscription check when validate_config.yml is run directly
+      ansible.builtin.set_fact:
+        run_subscription_check: true
+      when: run_subscription_check is not defined and omnia_run_tags is not defined
+
     - name: Run subscription validation tasks
       when: "'local_repo' in (omnia_run_tags | default(ansible_run_tags | default([]) | list)) or 'all' in (ansible_run_tags | default([]) | list)"
       block:
@@ -94,7 +99,7 @@
       ansible.builtin.include_role:
         name: validate_subscription
         tasks_from: check_rhel_subscription.yml
-      when: "'local_repo' in (hostvars['localhost']['omnia_run_tags'] | default([]))"
+      when: "hostvars['localhost']['run_subscription_check'] | default(false) | bool"
 
 - name: Configure RHEL repository URLs
   hosts: localhost
@@ -107,7 +112,7 @@
       ansible.builtin.include_role:
         name: validate_subscription
         tasks_from: configure_rhel_os_urls.yml
-      when: "'local_repo' in (omnia_run_tags | default([]))"
+      when: "run_subscription_check | default(false) | bool"
 
 - name: Validate omnia input config
   hosts: localhost
diff --git a/local_repo/local_repo.yml b/local_repo/local_repo.yml
index d4bb1d488d..e6fea817a0 100644
--- a/local_repo/local_repo.yml
+++ b/local_repo/local_repo.yml
@@ -29,6 +29,10 @@
         omnia_run_tags: "{{ (ansible_run_tags | default([]) | list + ['local_repo']) | unique }}"
         cacheable: true
 
+    - name: Enable subscription check for local_repo
+      ansible.builtin.set_fact:
+        run_subscription_check: true
+
     - name: Include metadata vars
       ansible.builtin.include_vars: "/opt/omnia/.data/oim_metadata.yml"
       register: include_metadata
diff --git a/provision/roles/configure_ochami/tasks/validate_additional_cloud_init.yml b/provision/roles/configure_ochami/tasks/validate_additional_cloud_init.yml
index 65747e39a4..50b9545c53 100644
--- a/provision/roles/configure_ochami/tasks/validate_additional_cloud_init.yml
+++ b/provision/roles/configure_ochami/tasks/validate_additional_cloud_init.yml
@@ -33,6 +33,12 @@
     additional_cloud_init_fg_names: []
   when: additional_cloud_init_file_path == ''
 
+- name: Create cloud-init directory
+  ansible.builtin.file:
+    path: "{{ cloud_init_dir }}"
+    state: directory
+    mode: "{{ hostvars['localhost']['dir_permissions_755'] }}"
+
 - name: Load additional cloud-init config
   when: additional_cloud_init_file_path != ''
   block:
diff --git a/provision/roles/telemetry/tasks/apply_telemetry_on_upgrade.yml b/provision/roles/telemetry/tasks/apply_telemetry_on_upgrade.yml
deleted file mode 100644
index 0cdb4bd2cb..0000000000
--- a/provision/roles/telemetry/tasks/apply_telemetry_on_upgrade.yml
+++ /dev/null
@@ -1,240 +0,0 @@
-# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-
-- name: Apply telemetry configurations for upgrade
-  when:
-    - kube_vip is defined
-    - kube_vip | length > 0
-    - idrac_telemetry_support | default(false) | bool
-  block:
-    - name: Check if telemetry deployment file exists
-      ansible.builtin.stat:
-        path: "{{ idrac_telemetry_statefulset_path }}"
-      register: telemetry_stat
-
-    - name: Get current iDRAC telemetry StatefulSet configuration
-      kubernetes.core.k8s_info:
-        api_version: apps/v1
-        kind: StatefulSet
-        name: idrac-telemetry
-        namespace: "{{ telemetry_namespace }}"
-      register: current_idrac_statefulset
-      failed_when: false
-      when:
-        - telemetry_stat.stat.exists | default(false)
-
-    - name: Set replica count as fact
-      ansible.builtin.set_fact:
-        preserved_replica_count: "{{ current_idrac_statefulset.resources[0].spec.replicas | default(1) }}"
-      when:
-        - current_idrac_statefulset.resources is defined and current_idrac_statefulset.resources | length > 0
-
-    - name: Show current replica count
-      ansible.builtin.debug:
-        msg: "Current replica count: {{ preserved_replica_count }}"
-        verbosity: 2
-      when:
-        - preserved_replica_count is defined
-
-    - name: Read iDRAC telemetry StatefulSet YAML file
-      ansible.builtin.slurp:
-        src: "{{ idrac_telemetry_statefulset_path }}"
-      register: idrac_statefulset_yaml
-
-    - name: Update StatefulSet definition with preserved replica count
-      ansible.builtin.set_fact:
-        updated_statefulset_definition: "{{ idrac_statefulset_yaml.content | b64decode | regex_replace('---\\n', '') | from_yaml | combine({'spec': {'replicas': preserved_replica_count | int}}, recursive=true) }}"  # noqa: yaml[line-length]
-      when:
-        - telemetry_stat.stat.exists | default(false)
-        - preserved_replica_count is defined
-
-    - name: Apply iDRAC telemetry StatefulSet with preserved replica count
-      kubernetes.core.k8s:
-        state: present
-        definition: "{{ updated_statefulset_definition }}"
-      register: kubectl_apply_result
-      when:
-        - updated_statefulset_definition is defined
-        - telemetry_stat.stat.exists | default(false)
-
-    - name: Display kubectl apply result
-      ansible.builtin.debug:
-        msg: "{{ kubectl_apply_result }}"
-      when:
-        - kubectl_apply_result is defined
-
-    - name: Wait for idrac telemetry receiver to be ready
-      kubernetes.core.k8s_info:
-        api_version: v1
-        kind: Pod
-        namespace: "{{ telemetry_namespace }}"
-        label_selectors:
-          - "app=idrac-telemetry-receiver"
-        wait: true
-        wait_condition:
-          type: Ready
-          status: "True"
-        wait_timeout: 120
-      delegate_to: "{{ kube_vip }}"
-      register: idrac_telemetry_receiver_ready
-      failed_when: false
-      when:
-        - idrac_telemetry_support | default(false) | bool
-
-    - name: Display idrac telemetry receiver ready status
-      ansible.builtin.debug:
-        msg: "{{ idrac_telemetry_receiver_ready }}"
-      when:
-        - idrac_telemetry_support | default(false) | bool
-        - idrac_telemetry_receiver_ready is defined
-
-- name: Apply LDMS configurations for upgrade
-  when:
-    - kube_vip is defined
-    - kube_vip | length > 0
-    - ldms_support | default(false) | bool
-  block:
-    - name: Check if LDMS aggregator is running on service k8s cluster
-      kubernetes.core.k8s_info:
-        api_version: apps/v1
-        kind: StatefulSet
-        name: nersc-ldms-aggr
-        namespace: "{{ telemetry_namespace }}"
-      delegate_to: "{{ kube_vip }}"
-      register: ldms_statefulset_info
-      failed_when: false
-
-    - name: Set LDMS running state
-      ansible.builtin.set_fact:
-        ldms_running: "{{ ldms_statefulset_info.resources is defined and ldms_statefulset_info.resources | length > 0 }}"
-
-    - name: Check if LDMS store daemon is running on service k8s cluster
-      kubernetes.core.k8s_info:
-        api_version: v1
-        kind: Pod
-        namespace: "{{ telemetry_namespace }}"
-        label_selectors:
-          - "app=nersc-ldms-store"
-      delegate_to: "{{ kube_vip }}"
-      register: ldms_store_pod_info
-      failed_when: false
-      when:
-        - ldms_running | default(false) | bool
-
-    - name: Set LDMS store daemon running state
-      ansible.builtin.set_fact:
-        ldms_store_running: "{{ ldms_store_pod_info.resources is defined and ldms_store_pod_info.resources | length > 0 }}"
-      when:
-        - ldms_running | default(false) | bool
-
-    - name: Restart LDMS store daemon pod
-      kubernetes.core.k8s:
-        state: absent
-        api_version: v1
-        kind: Pod
-        name: "{{ ldms_store_pod_info.resources[0].metadata.name }}"
-        namespace: "{{ telemetry_namespace }}"
-      delegate_to: "{{ kube_vip }}"
-      failed_when: false
-      when:
-        - ldms_store_running | default(false) | bool
-
-    - name: Wait for LDMS store daemon pod to be ready after restart
-      kubernetes.core.k8s_info:
-        api_version: v1
-        kind: Pod
-        namespace: "{{ telemetry_namespace }}"
-        label_selectors:
-          - "app=nersc-ldms-store"
-        wait: true
-        wait_condition:
-          type: Ready
-          status: "True"
-        wait_timeout: 120
-      delegate_to: "{{ kube_vip }}"
-      register: ldms_store_pod_ready
-      failed_when: false
-      when:
-        - ldms_store_running | default(false) | bool
-
-    - name: Display LDMS store daemon restart status
-      ansible.builtin.debug:
-        msg: >
-          {{ ldms_store_pod_ready_msg
-          if (ldms_store_pod_ready.resources | default([]) | length > 0)
-          else ldms_store_pod_not_ready_msg }}
-      when:
-        - ldms_store_running | default(false) | bool
-
-    - name: Check if decomp.json exists
-      ansible.builtin.stat:
-        path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/scripts/decomp.json"
-      register: decomp_json_stat
-
-    - name: Copy decompose.json if it doesn't exist
-      ansible.builtin.copy:
-        src: files/scripts/decomp.json
-        dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/scripts/decomp.json"
-        mode: "{{ hostvars['localhost']['file_permissions_644'] }}"
-      when: not decomp_json_stat.stat.exists
-
-    - name: Restart LDMS aggregator StatefulSet
-      kubernetes.core.k8s:
-        state: present
-        definition:
-          apiVersion: apps/v1
-          kind: StatefulSet
-          metadata:
-            name: nersc-ldms-aggr
-            namespace: "{{ telemetry_namespace }}"
-          spec:
-            template:
-              metadata:
-                annotations:
-                  kubectl.kubernetes.io/restartedAt: "{{ ansible_date_time.iso8601 }}"
-      delegate_to: "{{ kube_vip }}"
-      failed_when: false
-      when:
-        - ldms_running | default(false) | bool
-        - ldms_conf_file.stat.exists | default(false)
-        - ldms_bin_file.stat.exists | default(false)
-
-    - name: Wait for LDMS aggregator pod to be ready after restart
-      kubernetes.core.k8s_info:
-        api_version: v1
-        kind: Pod
-        namespace: "{{ telemetry_namespace }}"
-        label_selectors:
-          - "app=nersc-ldms-aggr"
-        wait: true
-        wait_condition:
-          type: Ready
-          status: "True"
-        wait_timeout: 120
-      delegate_to: "{{ kube_vip }}"
-      register: ldms_pod_ready
-      failed_when: false
-      when:
-        - ldms_running | default(false) | bool
-        - ldms_conf_file.stat.exists | default(false)
-        - ldms_bin_file.stat.exists | default(false)
-
-    - name: Display LDMS aggregator restart status
-      ansible.builtin.debug:
-        msg: "{{ ldms_pod_ready_msg if (ldms_pod_ready.resources | default([]) | length > 0) else ldms_pod_not_ready_msg }}"
-      when:
-        - ldms_running | default(false) | bool
-        - ldms_conf_file.stat.exists | default(false)
-        - ldms_bin_file.stat.exists | default(false)
diff --git a/provision/roles/telemetry/tasks/derive_sink_support_flags.yml b/provision/roles/telemetry/tasks/derive_sink_support_flags.yml
index 3e59602e44..7f2767d20a 100644
--- a/provision/roles/telemetry/tasks/derive_sink_support_flags.yml
+++ b/provision/roles/telemetry/tasks/derive_sink_support_flags.yml
@@ -68,34 +68,52 @@
       additional_remote_write_endpoints: "{{ telemetry_config.powerscale_configurations.additional_remote_write_endpoints | default([]) }}"
   when: telemetry_config.powerscale_configurations is defined
 
-- name: Check if any source targets victoria_metrics
+- name: Check if any enabled source targets victoria_metrics
   ansible.builtin.set_fact:
     victoria_metrics_support: true
     cacheable: true
   when: >-
-    'victoria_metrics' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([])) or
-    'victoria_metrics' in (telemetry_config.telemetry_sources.powerscale.collection_targets | default([])) or
-    'victoria_metrics' in (telemetry_config.telemetry_sources.ufm.collection_targets | default([])) or
-    'victoria_metrics' in (telemetry_config.telemetry_sources.vast.collection_targets | default([]))
+    ((telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) and
+     'victoria_metrics' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([]))) or
+    (((telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or
+      (telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool)) and
+     'victoria_metrics' in (telemetry_config.telemetry_sources.powerscale.collection_targets | default([]))) or
+    (((telemetry_config.telemetry_sources.ufm.metrics_enabled | default(false) | bool) or
+      (telemetry_config.telemetry_sources.ufm.logs_enabled | default(false) | bool)) and
+     'victoria_metrics' in (telemetry_config.telemetry_sources.ufm.collection_targets | default([]))) or
+    (((telemetry_config.telemetry_sources.vast.metrics_enabled | default(false) | bool) or
+      (telemetry_config.telemetry_sources.vast.logs_enabled | default(false) | bool)) and
+     'victoria_metrics' in (telemetry_config.telemetry_sources.vast.collection_targets | default([])))
 
-- name: Check if any source targets victoria_logs
+- name: Check if any enabled source targets victoria_logs
   ansible.builtin.set_fact:
     victoria_logs_support: true
     cacheable: true
   when: >-
-    'victoria_logs' in (telemetry_config.telemetry_sources.powerscale.collection_targets | default([])) or
-    'victoria_logs' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([])) or
-    'victoria_logs' in (telemetry_config.telemetry_sources.ufm.collection_targets | default([])) or
-    'victoria_logs' in (telemetry_config.telemetry_sources.vast.collection_targets | default([]))
+    (((telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or
+      (telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool)) and
+     'victoria_logs' in (telemetry_config.telemetry_sources.powerscale.collection_targets | default([]))) or
+    ((telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) and
+     'victoria_logs' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([]))) or
+    (((telemetry_config.telemetry_sources.ufm.metrics_enabled | default(false) | bool) or
+      (telemetry_config.telemetry_sources.ufm.logs_enabled | default(false) | bool)) and
+     'victoria_logs' in (telemetry_config.telemetry_sources.ufm.collection_targets | default([]))) or
+    (((telemetry_config.telemetry_sources.vast.metrics_enabled | default(false) | bool) or
+      (telemetry_config.telemetry_sources.vast.logs_enabled | default(false) | bool)) and
+     'victoria_logs' in (telemetry_config.telemetry_sources.vast.collection_targets | default([])))
 
-- name: Check if any source targets Kafka
+- name: Check if any enabled source targets Kafka
   ansible.builtin.set_fact:
     kafka_support: true
     cacheable: true
   when: >-
-    'kafka' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([])) or
-    'kafka' in (telemetry_config.telemetry_sources.ldms.collection_targets | default([])) or
-    'kafka' in (telemetry_config.telemetry_sources.ome.collection_targets | default([]))
+    ((telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) and
+     'kafka' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([]))) or
+    ((telemetry_config.telemetry_sources.ldms.metrics_enabled | default(false) | bool) and
+     'kafka' in (telemetry_config.telemetry_sources.ldms.collection_targets | default([]))) or
+    (((telemetry_config.telemetry_sources.ome.metrics_enabled | default(false) | bool) or
+      (telemetry_config.telemetry_sources.ome.logs_enabled | default(false) | bool)) and
+     'kafka' in (telemetry_config.telemetry_sources.ome.collection_targets | default([])))
 
 # =============================================================================
 # VECTOR BRIDGE LOGIC - Determine sink requirements based on Vector bridges
@@ -142,17 +160,17 @@
 - name: Set global variable for telemetry_enabled
   ansible.builtin.set_fact:
     telemetry_enabled: true
-  when: >
-    idrac_telemetry_support or
-    powerscale_metrics_enabled or
-    powerscale_log_enabled or
-    victoria_metrics_support or
-    victoria_logs_support or
-    ldms_support or
-    kafka_support or
-    ufm_telemetry_support or
-    ufm_log_enabled or
-    vast_telemetry_support or
-    vast_log_enabled or
-    ome_metrics_enabled or
-    ome_logs_enabled
+  when: >-
+    (telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) or
+    (telemetry_config.telemetry_sources.ldms.metrics_enabled | default(false) | bool) or
+    (telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or
+    (telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool) or
+    (telemetry_config.telemetry_sources.ufm.metrics_enabled | default(false) | bool) or
+    (telemetry_config.telemetry_sources.ufm.logs_enabled | default(false) | bool) or
+    (telemetry_config.telemetry_sources.vast.metrics_enabled | default(false) | bool) or
+    (telemetry_config.telemetry_sources.vast.logs_enabled | default(false) | bool) or
+    (telemetry_config.telemetry_sources.ome.metrics_enabled | default(false) | bool) or
+    (telemetry_config.telemetry_sources.ome.logs_enabled | default(false) | bool) or
+    (telemetry_config.telemetry_bridges.vector_ldms.metrics_enabled | default(false) | bool) or
+    (telemetry_config.telemetry_bridges.vector_ome.metrics_enabled | default(false) | bool) or
+    (telemetry_config.telemetry_bridges.vector_ome.logs_enabled | default(false) | bool)
diff --git a/provision/roles/telemetry/tasks/main.yml b/provision/roles/telemetry/tasks/main.yml
index c513480a37..5bdd8cc86b 100644
--- a/provision/roles/telemetry/tasks/main.yml
+++ b/provision/roles/telemetry/tasks/main.yml
@@ -27,8 +27,28 @@
 - name: Derive sink support flags from collection_targets
   ansible.builtin.include_tasks: derive_sink_support_flags.yml
 
+- name: Set pulp server facts for cloud-init templates
+  when:
+    - hostvars['localhost']['service_k8s_support'] | default(false) | bool
+  block:
+    - name: Run pulp status command on omnia_core container
+      ansible.builtin.command: /usr/local/bin/pulp status
+      delegate_to: localhost
+      changed_when: false
+      register: pulp_status_output
+
+    - name: Set pulp content origin value
+      ansible.builtin.set_fact:
+        pulp_content_origin: "{{ (pulp_status_output.stdout | from_json).content_settings.content_origin }}"
+
+    - name: Set pulp_server_ip fact
+      ansible.builtin.set_fact:
+        pulp_server_ip: "{{ pulp_content_origin | urlsplit('hostname') }}"
+
 - name: Configure service_k8s telemetry services
-  when: hostvars['localhost']['service_k8s_support'] | default(false) | bool
+  when:
+    - hostvars['localhost']['service_k8s_support'] | default(false) | bool
+    - telemetry_enabled | default(false) | bool
   block:
     - name: Read telemetry packages from software config
       ansible.builtin.include_tasks: read_software_config.yml
@@ -45,18 +65,7 @@
 
     - name: Configure of k8s telemetry service prerequisites
       when:
-        - >-
-          (telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) or
-          (telemetry_config.telemetry_sources.ldms.metrics_enabled | default(false) | bool) or
-          (telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or
-          (telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool) or
-          (telemetry_config.telemetry_sources.ufm.metrics_enabled | default(false) | bool) or
-          (telemetry_config.telemetry_sources.ufm.logs_enabled | default(false) | bool) or
-          (telemetry_config.telemetry_sources.vast.metrics_enabled | default(false) | bool) or
-          (telemetry_config.telemetry_sources.vast.logs_enabled | default(false) | bool) or
-          (telemetry_config.telemetry_sources.ome.metrics_enabled | default(false) | bool) or
-          (telemetry_config.telemetry_sources.ome.logs_enabled | default(false) | bool) or
-          ldms_support | default(false) | bool
+        - telemetry_enabled | default(false) | bool
       block:
         - name: Set NFS info fact
           ansible.builtin.set_fact:
@@ -148,8 +157,3 @@
         - telemetry_enabled | default(false) | bool
       tags:
         - telemetry_deployment
-
-    # - name: Apply telemetry configurations on upgrade
-    #   ansible.builtin.include_tasks: apply_telemetry_on_upgrade.yml
-    #   when:
-    #     - hostvars['localhost']['upgrade_enabled'] | default(false) | bool
diff --git a/provision/roles/telemetry/tasks/read_software_config.yml b/provision/roles/telemetry/tasks/read_software_config.yml
index a50607e4ed..e49bd45587 100644
--- a/provision/roles/telemetry/tasks/read_software_config.yml
+++ b/provision/roles/telemetry/tasks/read_software_config.yml
@@ -13,20 +13,6 @@
 #  limitations under the License.
 ---
 
-- name: Run pulp status command on omnia_core container
-  ansible.builtin.command: /usr/local/bin/pulp status
-  delegate_to: localhost
-  changed_when: false
-  register: pulp_status_output
-
-- name: Set pulp content origin value
-  ansible.builtin.set_fact:
-    pulp_content_origin: "{{ (pulp_status_output.stdout | from_json).content_settings.content_origin }}"
-
-- name: Set fact for pulp protocol
-  ansible.builtin.set_fact:
-    pulp_server_ip: "{{ pulp_content_origin | urlsplit('hostname') }}"
-
 - name: Get cluster_os_type from software_config.json
   ansible.builtin.set_fact:
     cluster_os_type: "{{ software_config['cluster_os_type'] }}"
@@ -51,3 +37,10 @@
         | map(attribute='package')
         | list)
         | unique }}
+
+- name: Extract individual pip module versions from service_k8s.json
+  ansible.builtin.set_fact:
+    kubernetes_pip_version: "{{ (telemetry_packages['service_k8s']['cluster'] | selectattr('type', 'equalto', 'pip_module') | selectattr('package', 'search', '^kubernetes==') | map(attribute='package') | first).split('==')[1] }}" # noqa: yaml[line-length]
+    prometheus_client_pip_version: "{{ (telemetry_packages['service_k8s']['cluster'] | selectattr('type', 'equalto', 'pip_module') | selectattr('package', 'search', '^prometheus_client==') | map(attribute='package') | first).split('==')[1] }}" # noqa: yaml[line-length]
+    pyyaml_pip_version: "{{ (telemetry_packages['service_k8s']['cluster'] | selectattr('type', 'equalto', 'pip_module') | selectattr('package', 'search', '^pyyaml==') | map(attribute='package') | first).split('==')[1] }}" # noqa: yaml[line-length]
+    cffi_pip_version: "{{ (telemetry_packages['service_k8s']['cluster'] | selectattr('type', 'equalto', 'pip_module') | selectattr('package', 'search', '^cffi==') | map(attribute='package') | first).split('==')[1] }}" # noqa: yaml[line-length]

From a8ac01502e07a2bb8a7bfabfec3a2b643b614ed7 Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Fri, 19 Jun 2026 00:55:21 +0530
Subject: [PATCH 31/33] resolve merge conflict

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 provision/roles/telemetry/tasks/main.yml          | 15 +++++++++++++--
 .../telemetry/tasks/read_software_config.yml      |  9 +--------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/provision/roles/telemetry/tasks/main.yml b/provision/roles/telemetry/tasks/main.yml
index 5bdd8cc86b..4ec8aea05c 100644
--- a/provision/roles/telemetry/tasks/main.yml
+++ b/provision/roles/telemetry/tasks/main.yml
@@ -65,7 +65,18 @@
 
     - name: Configure of k8s telemetry service prerequisites
       when:
-        - telemetry_enabled | default(false) | bool
+        - >-
+          (telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) or
+          (telemetry_config.telemetry_sources.ldms.metrics_enabled | default(false) | bool) or
+          (telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or
+          (telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool) or
+          (telemetry_config.telemetry_sources.ufm.metrics_enabled | default(false) | bool) or
+          (telemetry_config.telemetry_sources.ufm.logs_enabled | default(false) | bool) or
+          (telemetry_config.telemetry_sources.vast.metrics_enabled | default(false) | bool) or
+          (telemetry_config.telemetry_sources.vast.logs_enabled | default(false) | bool) or
+          (telemetry_config.telemetry_sources.ome.metrics_enabled | default(false) | bool) or
+          (telemetry_config.telemetry_sources.ome.logs_enabled | default(false) | bool) or
+          ldms_support | default(false) | bool
       block:
         - name: Set NFS info fact
           ansible.builtin.set_fact:
@@ -156,4 +167,4 @@
       when:
         - telemetry_enabled | default(false) | bool
       tags:
-        - telemetry_deployment
+        - telemetry_deployment
\ No newline at end of file
diff --git a/provision/roles/telemetry/tasks/read_software_config.yml b/provision/roles/telemetry/tasks/read_software_config.yml
index e49bd45587..6963618579 100644
--- a/provision/roles/telemetry/tasks/read_software_config.yml
+++ b/provision/roles/telemetry/tasks/read_software_config.yml
@@ -36,11 +36,4 @@
         | selectattr('type', 'equalto', 'pip_module')
         | map(attribute='package')
         | list)
-        | unique }}
-
-- name: Extract individual pip module versions from service_k8s.json
-  ansible.builtin.set_fact:
-    kubernetes_pip_version: "{{ (telemetry_packages['service_k8s']['cluster'] | selectattr('type', 'equalto', 'pip_module') | selectattr('package', 'search', '^kubernetes==') | map(attribute='package') | first).split('==')[1] }}" # noqa: yaml[line-length]
-    prometheus_client_pip_version: "{{ (telemetry_packages['service_k8s']['cluster'] | selectattr('type', 'equalto', 'pip_module') | selectattr('package', 'search', '^prometheus_client==') | map(attribute='package') | first).split('==')[1] }}" # noqa: yaml[line-length]
-    pyyaml_pip_version: "{{ (telemetry_packages['service_k8s']['cluster'] | selectattr('type', 'equalto', 'pip_module') | selectattr('package', 'search', '^pyyaml==') | map(attribute='package') | first).split('==')[1] }}" # noqa: yaml[line-length]
-    cffi_pip_version: "{{ (telemetry_packages['service_k8s']['cluster'] | selectattr('type', 'equalto', 'pip_module') | selectattr('package', 'search', '^cffi==') | map(attribute='package') | first).split('==')[1] }}" # noqa: yaml[line-length]
+        | unique }}
\ No newline at end of file

From 8f3c13d8c3ba4ecdea960bbfa384716a33abb085 Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Fri, 19 Jun 2026 00:58:11 +0530
Subject: [PATCH 32/33] resolve merge conflict

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 provision/roles/telemetry/tasks/main.yml                 | 2 +-
 provision/roles/telemetry/tasks/read_software_config.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/provision/roles/telemetry/tasks/main.yml b/provision/roles/telemetry/tasks/main.yml
index 4ec8aea05c..8a7e9f6ab2 100644
--- a/provision/roles/telemetry/tasks/main.yml
+++ b/provision/roles/telemetry/tasks/main.yml
@@ -167,4 +167,4 @@
       when:
         - telemetry_enabled | default(false) | bool
       tags:
-        - telemetry_deployment
\ No newline at end of file
+        - telemetry_deployment
diff --git a/provision/roles/telemetry/tasks/read_software_config.yml b/provision/roles/telemetry/tasks/read_software_config.yml
index 6963618579..36300d0a52 100644
--- a/provision/roles/telemetry/tasks/read_software_config.yml
+++ b/provision/roles/telemetry/tasks/read_software_config.yml
@@ -36,4 +36,4 @@
         | selectattr('type', 'equalto', 'pip_module')
         | map(attribute='package')
         | list)
-        | unique }}
\ No newline at end of file
+        | unique }}

From f3b05adeb93def9f3c87d6bd7d86868cc7522c19 Mon Sep 17 00:00:00 2001
From: priti-parate <140157516+priti-parate@users.noreply.github.com>
Date: Mon, 22 Jun 2026 15:41:55 +0530
Subject: [PATCH 33/33] input file name in message

Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com>
---
 .../common_utils/en_us_validation_msg.py      | 48 +++++++++----------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py
index dbaa2acc94..bf63fecbd1 100644
--- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py
+++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py
@@ -367,13 +367,13 @@ def switch_snmp3_username_fail_msg(min_username_length, max_length):
 # PowerScale telemetry validation messages
 POWERSCALE_VICTORIA_REQUIRED_MSG = (
     "PowerScale telemetry requires VictoriaMetrics to be deployed. "
-    "When telemetry_sources.powerscale.metrics_enabled is true, "
+    "When telemetry_sources.powerscale.metrics_enabled is true in telemetry_config.yml, "
     "'victoria_metrics' must be included in collection_targets "
     "(e.g., 'victoria_metrics' or 'victoria_metrics,victoria_logs')."
 )
 POWERSCALE_VICTORIA_LOGS_REQUIRED_MSG = (
     "PowerScale logs collection requires VictoriaLogs to be deployed. "
-    "When telemetry_sources.powerscale.logs_enabled is true, "
+    "When telemetry_sources.powerscale.logs_enabled is true in telemetry_config.yml, "
     "'victoria_logs' must be included in collection_targets "
     "(e.g., 'victoria_metrics,victoria_logs')."
 )
@@ -386,15 +386,15 @@ def switch_snmp3_username_fail_msg(min_username_length, max_length):
     "PowerScale telemetry requires a service cluster."
 )
 POWERSCALE_CONFIGURATIONS_MISSING_MSG = (
-    "powerscale_configurations section is required when "
+    "powerscale_configurations section is required in telemetry_config.yml when "
     "telemetry_sources.powerscale.metrics_enabled is true. "
     "It must contain csm_observability_values_file_path."
 )
 POWERSCALE_OTEL_STORAGE_SIZE_INVALID_MSG = (
-    "must be a non-empty string in format 'XGi' (e.g., '5Gi')"
+    "must be a non-empty string in format 'XGi' (e.g., '5Gi') in telemetry_config.yml"
 )
 POWERSCALE_CSM_VALUES_PATH_REQUIRED_MSG = (
-    "csm_observability_values_file_path is required when "
+    "csm_observability_values_file_path is required in telemetry_config.yml when "
     "telemetry_sources.powerscale.metrics_enabled is true. "
     "Please provide the path to the CSM Observability values.yaml file."
 )
@@ -402,34 +402,34 @@ def powerscale_csm_values_not_found_msg(path):
     """Returns error message when CSM Observability values.yaml file is not found."""
     return (
         f"CSM Observability values.yaml file not found at '{path}'. "
-        "Please verify the file path is correct."
+        "Please verify the file path is correct in telemetry_config.yml (csm_observability_values_file_path)."
     )
 POWERSCALE_CSM_VALUES_INVALID_YAML_MSG = (
-    "CSM Observability values.yaml must contain a valid YAML dictionary."
+    "CSM Observability values.yaml (path specified in telemetry_config.yml) must contain a valid YAML dictionary."
 )
 def powerscale_csm_values_parse_error_msg(error):
     """Returns error message when CSM Observability values.yaml fails to parse."""
     return f"Failed to parse CSM Observability values.yaml: {error}"
 POWERSCALE_CSM_VALUES_MISSING_KARAVI_SECTION_MSG = (
-    "CSM Observability values.yaml is missing 'karaviMetricsPowerscale' section."
+    "CSM Observability values.yaml (path specified in telemetry_config.yml) is missing 'karaviMetricsPowerscale' section."
 )
 POWERSCALE_CSM_METRICS_IMAGE_MISSING_MSG = (
-    "CSM Metrics PowerScale image is required in CSM Observability values.yaml."
+    "CSM Metrics PowerScale image is required in CSM Observability values.yaml (path specified in telemetry_config.yml)."
 )
 POWERSCALE_OTEL_COLLECTOR_IMAGE_MISSING_MSG = (
-    "OTEL Collector image is required in CSM Observability values.yaml."
+    "OTEL Collector image is required in CSM Observability values.yaml (path specified in telemetry_config.yml)."
 )
 ADDITIONAL_METRIC_ENDPOINTS_URL_EMPTY_MSG = (
-    "Each additional_metric_remote_write_endpoint must have a non-empty 'url' field."
+    "Each additional_metric_remote_write_endpoint in telemetry_config.yml must have a non-empty 'url' field."
 )
 ADDITIONAL_METRIC_ENDPOINTS_URL_INVALID_MSG = (
-    "URL must start with 'http://' or 'https://'."
+    "URL in telemetry_config.yml must start with 'http://' or 'https://'."
 )
 ADDITIONAL_LOG_ENDPOINTS_URL_EMPTY_MSG = (
-    "Each additional_log_write_endpoint must have a non-empty 'url' field."
+    "Each additional_log_write_endpoint in telemetry_config.yml must have a non-empty 'url' field."
 )
 ADDITIONAL_LOG_ENDPOINTS_URL_INVALID_MSG = (
-    "URL must start with 'http://' or 'https://'."
+    "URL in telemetry_config.yml must start with 'http://' or 'https://'."
 )
 def powerscale_image_version_mismatch_msg(image_name, values_image, service_k8s_image):
     """Returns error message when CSM values.yaml image version doesn't match service_k8s (versioned)."""
@@ -457,13 +457,13 @@ def powerscale_image_version_mismatch_msg(image_name, values_image, service_k8s_
     "PowerScale telemetry requires a service cluster."
 )
 POWERSCALE_CONFIGURATIONS_MISSING_MSG = (
-    "powerscale_configurations section is required and must contain powerscale_telemetry_support."
+    "powerscale_configurations section is required in telemetry_config.yml and must contain powerscale_telemetry_support."
 )
 POWERSCALE_OTEL_STORAGE_SIZE_INVALID_MSG = (
-    "must be a non-empty string in format 'XGi' (e.g., '5Gi')"
+    "must be a non-empty string in format 'XGi' (e.g., '5Gi') in telemetry_config.yml"
 )
 POWERSCALE_CSM_VALUES_PATH_REQUIRED_MSG = (
-    "csm_observability_values_file_path is required when powerscale_configurations.powerscale_telemetry_support is true. "
+    "csm_observability_values_file_path is required in telemetry_config.yml when powerscale_configurations.powerscale_telemetry_support is true. "
     "Please provide the path to the CSM Observability values.yaml file."
 )
 POWERSCALE_AUTH_PROXY_HOST_MISSING_MSG = (
@@ -475,28 +475,28 @@ def powerscale_csm_values_not_found_msg(path):
     """Returns error message when CSM Observability values.yaml file is not found."""
     return (
         f"CSM Observability values.yaml file not found at '{path}'. "
-        "Please verify the file path is correct."
+        "Please verify the file path is correct in telemetry_config.yml (csm_observability_values_file_path)."
     )
 POWERSCALE_CSM_VALUES_INVALID_YAML_MSG = (
-    "CSM Observability values.yaml must contain a valid YAML dictionary."
+    "CSM Observability values.yaml (path specified in telemetry_config.yml) must contain a valid YAML dictionary."
 )
 def powerscale_csm_values_parse_error_msg(error):
     """Returns error message when CSM Observability values.yaml fails to parse."""
     return f"Failed to parse CSM Observability values.yaml: {error}"
 POWERSCALE_CSM_VALUES_MISSING_KARAVI_SECTION_MSG = (
-    "CSM Observability values.yaml is missing 'karaviMetricsPowerscale' section."
+    "CSM Observability values.yaml (path specified in telemetry_config.yml) is missing 'karaviMetricsPowerscale' section."
 )
 POWERSCALE_CSM_METRICS_IMAGE_MISSING_MSG = (
-    "CSM Metrics PowerScale image is required in CSM Observability values.yaml."
+    "CSM Metrics PowerScale image is required in CSM Observability values.yaml (path specified in telemetry_config.yml)."
 )
 POWERSCALE_OTEL_COLLECTOR_IMAGE_MISSING_MSG = (
-    "OTEL Collector image is required in CSM Observability values.yaml."
+    "OTEL Collector image is required in CSM Observability values.yaml (path specified in telemetry_config.yml)."
 )
 POWERSCALE_ADDITIONAL_ENDPOINTS_URL_EMPTY_MSG = (
-    "Each additional_remote_write_endpoint must have a non-empty 'url' field."
+    "Each additional_remote_write_endpoint in telemetry_config.yml must have a non-empty 'url' field."
 )
 POWERSCALE_ADDITIONAL_ENDPOINTS_URL_INVALID_MSG = (
-    "URL must start with 'http://' or 'https://'."
+    "URL in telemetry_config.yml must start with 'http://' or 'https://'."
 )
 def powerscale_image_version_mismatch_msg(image_name, values_image, service_k8s_image):
     """Returns error message when CSM values.yaml image version doesn't match service_k8s.json."""