From eb025fd3619384d9182710ef58324392947ae74e Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Tue, 2 Jun 2026 15:36:44 +0530 Subject: [PATCH 01/33] upgrade defects fixes and fix for crashloopback on pod restart Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- .../idrac_telemetry_statefulset.yaml.j2 | 33 +++++++++++--- .../telemetry/kafka/kafka.kafka.yaml.j2 | 9 ++++ upgrade/playbooks/upgrade_telemetry.yml | 18 ++++++++ .../upgrade_k8s/tasks/load_version_vars.yml | 5 --- .../tasks/include_required_input.yml | 21 ++++++++- .../roles/upgrade_telemetry/tasks/main.yml | 44 +++++++++++++++++++ .../patch_idrac_termination_grace_period.yml | 2 +- .../tasks/upgrade_operator.yml | 16 ++++++- 8 files changed, 134 insertions(+), 14 deletions(-) diff --git a/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 b/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 index b0c3dd8b3c..a71ebd0aa0 100644 --- a/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 @@ -72,7 +72,7 @@ spec: - ip: "127.0.0.1" hostnames: - "mysqldb" - terminationGracePeriodSeconds: 10 + terminationGracePeriodSeconds: 60 tolerations: - effect: NoExecute key: node.kubernetes.io/not-ready @@ -83,16 +83,27 @@ spec: operator: Exists tolerationSeconds: 5 initContainers: - # Clean up stale MySQL lock files from previous ungraceful shutdowns + # Clean up stale MySQL lock/InnoDB artifacts only after ungraceful shutdown - name: cleanup-mysql-locks image: {{ mysql_image }} command: - /bin/sh - -c - | - echo "Checking for stale MySQL lock files..." 
- rm -f /var/lib/mysql/*.sock /var/lib/mysql/*.pid 2>/dev/null || true - echo "Lock file cleanup complete" + DATADIR="/var/lib/mysql" + # Only run cleanup if datadir has existing MySQL data (not a fresh install) + if [ ! -f "${DATADIR}/mysql.ibd" ]; then + echo "Fresh install detected — skipping cleanup." + exit 0 + fi + # Detect unclean shutdown: pid/sock files should not exist when no mysqld is running + if [ -f "${DATADIR}/mysqld.pid" ] || ls ${DATADIR}/*.sock 1>/dev/null 2>&1 || ls ${DATADIR}/*.lck 1>/dev/null 2>&1; then + echo "Stale lock artifacts detected — previous shutdown was unclean." + rm -f ${DATADIR}/*.sock ${DATADIR}/*.pid ${DATADIR}/*.lck 2>/dev/null || true + echo "Stale artifacts removed." + else + echo "No stale artifacts — previous shutdown was clean." + fi volumeMounts: - name: mysqldb-pvc mountPath: /var/lib/mysql/ @@ -103,10 +114,20 @@ spec: volumeMounts: - name: mysqldb-pvc mountPath: /var/lib/mysql/ + args: + - --innodb-use-native-aio=0 + - --innodb-flush-log-at-trx-commit=1 + - --innodb-flush-method=fsync lifecycle: preStop: exec: - command: ["/bin/sh", "-c", "mysqladmin shutdown -uroot -p${MYSQL_ROOT_PASSWORD} 2>/dev/null || true"] + command: + - /bin/sh + - -c + - | + mysqladmin shutdown -uroot -p"${MYSQL_ROOT_PASSWORD}" --wait=45 2>/dev/null || true + while mysqladmin ping -uroot -p"${MYSQL_ROOT_PASSWORD}" 2>/dev/null; do sleep 1; done + rm -f /var/lib/mysql/conf.d/recovery.cnf 2>/dev/null || true env: - name: MYSQL_DATABASE value: {{ mysqldb_name }} diff --git a/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2 index 929af037c4..afd162b963 100644 --- a/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2 @@ -9,6 +9,9 @@ spec: replicas: 3 roles: - controller + template: + pod: + terminationGracePeriodSeconds: 120 storage: type: jbod volumes: @@ 
-30,6 +33,9 @@ spec: replicas: 3 roles: - broker + template: + pod: + terminationGracePeriodSeconds: 120 storage: type: jbod volumes: @@ -83,6 +89,9 @@ spec: log.segment.bytes: {{ kafka_log_segment_bytes }} log.retention.bytes: {{ kafka_log_retention_bytes }} log.retention.check.interval.ms: 300000 + controlled.shutdown.enable: true + controlled.shutdown.max.retries: 3 + controlled.shutdown.retry.backoff.ms: 5000 # Enable topic auto-creation for external clients auto.create.topics.enable: true num.partitions: 3 diff --git a/upgrade/playbooks/upgrade_telemetry.yml b/upgrade/playbooks/upgrade_telemetry.yml index 5fcb99f410..730eeb1c00 100644 --- a/upgrade/playbooks/upgrade_telemetry.yml +++ b/upgrade/playbooks/upgrade_telemetry.yml @@ -35,6 +35,24 @@ when: - manifest.component_status[component_name] | default('pending') == 'completed' + - name: "Mark as skipped — service_k8s not configured (Slurm-only deployment)" + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'skipped' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' + when: + - not (hostvars['localhost']['k8s_upgrade_enabled'] | default(false) | bool) + + - name: "Skip — service_k8s not configured (Slurm-only deployment)" + ansible.builtin.meta: end_play + when: + - not (hostvars['localhost']['k8s_upgrade_enabled'] | default(false) | bool) + - name: "Mark as skipped — BuildStream terminal gate active (C-24)" ansible.builtin.copy: content: >- diff --git a/upgrade/roles/upgrade_k8s/tasks/load_version_vars.yml b/upgrade/roles/upgrade_k8s/tasks/load_version_vars.yml index d930690682..33b51ae6b6 100644 --- a/upgrade/roles/upgrade_k8s/tasks/load_version_vars.yml +++ b/upgrade/roles/upgrade_k8s/tasks/load_version_vars.yml @@ -126,8 +126,3 @@ | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'search', 'helm') | map(attribute='package') | join }} - -# ── Set OIM host 
─────────────────────────────────────────────────── -- name: Set oim_host to NFS server IP - ansible.builtin.set_fact: - oim_host: "{{ k8s_nfs_server_ip }}" diff --git a/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml b/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml index b90b2c69fb..ac44ac6fe0 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml @@ -60,6 +60,7 @@ when: - omnia_config is defined - omnia_config.service_k8s_cluster is defined + - omnia_config.service_k8s_cluster | length > 0 tags: always - name: Set k8s_client_mount_path @@ -70,13 +71,21 @@ | first).mount_point }} when: - storage_config is defined + - storage_config.mounts is defined - k8s_nfs_storage_name is defined + - storage_config.mounts | selectattr('name', 'equalto', k8s_nfs_storage_name) | list | length > 0 tags: always # ── Load high_availability_config.yml ── +- name: Check if high_availability_config.yml exists + ansible.builtin.stat: + path: "{{ input_project_dir }}/high_availability_config.yml" + register: ha_config_stat + - name: Read high_availability_config.yml for kube_vip ansible.builtin.include_vars: file: "{{ input_project_dir }}/high_availability_config.yml" name: ha_config + when: ha_config_stat.stat.exists - name: Debug high_availability_config.yml content ansible.builtin.debug: @@ -90,6 +99,7 @@ kube_vip: "{{ ha_config.service_k8s_cluster_ha[0].virtual_ip_address | default('') }}" cacheable: true when: + - ha_config is defined - ha_config.service_k8s_cluster_ha is defined - ha_config.service_k8s_cluster_ha | length > 0 @@ -148,6 +158,7 @@ when: - software_config is defined - software_config.softwares is defined + - software_config.softwares | selectattr('name', 'equalto', 'service_k8s') | list | length > 0 tags: always - name: Set os_version from software_config.json @@ -164,6 +175,7 @@ when: - software_config is defined - software_config.softwares 
is defined + - software_config.softwares | selectattr('name', 'equalto', 'service_k8s') | list | length > 0 tags: always # ── Load service_k8s JSON for victoria operator package name ── @@ -172,12 +184,19 @@ src: "{{ input_project_dir }}/config/{{ architecture }}/rhel/{{ os_version }}/service_k8s_v{{ k8s_version }}.json" register: service_k8s_slurp failed_when: false + when: + - architecture is defined + - os_version is defined + - k8s_version is defined tags: always - name: Parse service_k8s JSON ansible.builtin.set_fact: service_k8s_config: "{{ service_k8s_slurp.content | b64decode | from_yaml }}" - when: service_k8s_slurp is not failed + when: + - service_k8s_slurp is defined + - service_k8s_slurp is not failed + - service_k8s_slurp is not skipped tags: always - name: Extract victoria operator package name from service_k8s JSON diff --git a/upgrade/roles/upgrade_telemetry/tasks/main.yml b/upgrade/roles/upgrade_telemetry/tasks/main.yml index ee5fd1d282..cccfbc8bf8 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/main.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/main.yml @@ -51,12 +51,56 @@ msg: "{{ victoria_upgrade_skipped }}" when: not (victoria_upgrade_needed | default(false) | bool) +# ── Phase 2.5: Clean stale Kafka lock files before redeploy ── +- name: Clean stale Kafka lock files from PVCs (prevents CrashLoopBackOff) + ansible.builtin.shell: | + set -o pipefail + for pvc_name in $(kubectl get pvc -n {{ telemetry_namespace }} -l strimzi.io/cluster=kafka \ + -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do + pod_name="kafka-lock-cleanup-${pvc_name}" + kubectl delete pod "${pod_name}" -n {{ telemetry_namespace }} --ignore-not-found 2>/dev/null || true + kubectl run "${pod_name}" --rm --attach --restart=Never -n {{ telemetry_namespace }} \ + --image=busybox:latest \ + --overrides="{ + \"spec\": { + \"containers\": [{ + \"name\": \"cleanup\", + \"image\": \"busybox:latest\", + \"command\": [\"sh\", \"-c\", \"find /data -name .lock -delete && echo 
cleaned ${pvc_name}\"], + \"volumeMounts\": [{\"name\": \"data\", \"mountPath\": \"/data\"}] + }], + \"volumes\": [{\"name\": \"data\", \"persistentVolumeClaim\": {\"claimName\": \"${pvc_name}\"}}], + \"restartPolicy\": \"Never\" + } + }" 2>/dev/null || true + done + delegate_to: "{{ kube_vip }}" + connection: ssh + changed_when: true + failed_when: false + timeout: 300 + when: + - kube_vip is defined + - kube_vip | length > 0 + # ── Phase 3: Execute telemetry.sh to redeploy telemetry stack ── - name: Phase 3 - Execute telemetry.sh to redeploy telemetry stack ansible.builtin.include_tasks: execute_telemetry_sh.yml + when: + - k8s_client_mount_path is defined + - kube_vip is defined + - kube_vip | length > 0 + +- name: Skip telemetry.sh (k8s not configured) + ansible.builtin.debug: + msg: "Skipping telemetry.sh execution — service_k8s not configured (Slurm-only deployment)." + when: k8s_client_mount_path is not defined or kube_vip is not defined # ── Phase 4: Verify all telemetry pods and set upgrade status ── - name: Phase 4 - Verify all telemetry pods and set upgrade status + when: + - kube_vip is defined + - kube_vip | length > 0 block: - name: Get all telemetry pods status ansible.builtin.shell: diff --git a/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml b/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml index 6116afb37f..e3e6333c73 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml @@ -53,7 +53,7 @@ cmd: > kubectl patch statefulset idrac-telemetry -n {{ telemetry_namespace }} --type=strategic - -p '{"spec":{"template":{"spec":{"terminationGracePeriodSeconds":120}}}}' + -p '{"spec":{"template":{"spec":{"terminationGracePeriodSeconds":60}}}}' delegate_to: "{{ kube_vip }}" connection: ssh register: idrac_patch_result diff --git 
a/upgrade/roles/upgrade_telemetry/tasks/upgrade_operator.yml b/upgrade/roles/upgrade_telemetry/tasks/upgrade_operator.yml index 4fa40ba520..40cd32336a 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/upgrade_operator.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/upgrade_operator.yml @@ -16,15 +16,29 @@ # Install / upgrade VictoriaMetrics operator via Helm # ============================================================================ +- name: Remove finalizers from VictoriaMetrics CRDs (prevents delete hang) + ansible.builtin.shell: | + set -o pipefail + for crd in $(kubectl get crd 2>/dev/null | grep victoriametrics | awk '{print $1}'); do + kubectl patch crd "$crd" --type=merge -p '{"metadata":{"finalizers":[]}}' 2>/dev/null || true + done + changed_when: false + failed_when: false + delegate_to: "{{ kube_vip }}" + connection: ssh + - name: Delete existing VictoriaMetrics CRDs (to fix Helm ownership issues) ansible.builtin.shell: | set -o pipefail - kubectl get crd | grep victoriametrics | awk '{print $1}' | xargs kubectl delete crd 2>/dev/null || true + for crd in $(kubectl get crd 2>/dev/null | grep victoriametrics | awk '{print $1}'); do + timeout 30 kubectl delete crd "$crd" --timeout=30s 2>/dev/null || true + done register: crd_delete_result changed_when: true failed_when: false delegate_to: "{{ kube_vip }}" connection: ssh + timeout: 120 - name: Install VictoriaMetrics operator from tarball ansible.builtin.command: From 1525e9e6ba2c50d264247fe3718617d9b76360f5 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Tue, 2 Jun 2026 17:47:11 +0530 Subject: [PATCH 02/33] remove stale services and deployments for victoria Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- .../telemetry/kafka/kafka.kafka.yaml.j2 | 9 ----- .../tasks/migrate_statefulset.yml | 40 +++++++++++++++++++ 2 files changed, 40 insertions(+), 9 deletions(-) diff --git 
a/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2 index afd162b963..929af037c4 100644 --- a/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2 @@ -9,9 +9,6 @@ spec: replicas: 3 roles: - controller - template: - pod: - terminationGracePeriodSeconds: 120 storage: type: jbod volumes: @@ -33,9 +30,6 @@ spec: replicas: 3 roles: - broker - template: - pod: - terminationGracePeriodSeconds: 120 storage: type: jbod volumes: @@ -89,9 +83,6 @@ spec: log.segment.bytes: {{ kafka_log_segment_bytes }} log.retention.bytes: {{ kafka_log_retention_bytes }} log.retention.check.interval.ms: 300000 - controlled.shutdown.enable: true - controlled.shutdown.max.retries: 3 - controlled.shutdown.retry.backoff.ms: 5000 # Enable topic auto-creation for external clients auto.create.topics.enable: true num.partitions: 3 diff --git a/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml b/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml index 847ad36af4..e99d7bf80c 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml @@ -168,3 +168,43 @@ when: orphaned_pods.stdout_lines | default([]) | length > 0 delegate_to: "{{ kube_vip }}" connection: ssh + + # ── Cleanup old pre-operator services and deployments ── + # The operator creates new services with different names (e.g. vminsert-victoria-cluster), + # so the old standalone services become stale and waste LoadBalancer IPs. 
+ - name: Find old pre-operator services + ansible.builtin.shell: | + set -o pipefail + kubectl -n {{ telemetry_namespace }} get svc --no-headers 2>/dev/null \ + | awk '{print $1}' \ + | grep -xE 'vminsert|vmselect|vmstorage|vmagent' || true + register: old_services + changed_when: false + failed_when: false + delegate_to: "{{ kube_vip }}" + connection: ssh + + - name: Delete old pre-operator services + ansible.builtin.command: + cmd: kubectl -n {{ telemetry_namespace }} delete svc {{ item }} --timeout=30s + loop: "{{ old_services.stdout_lines | default([]) | select() | list }}" + changed_when: true + failed_when: false + when: old_services.stdout_lines | default([]) | select() | list | length > 0 + delegate_to: "{{ kube_vip }}" + connection: ssh + + - name: Delete old vmagent deployment (replaced by operator-managed VMAgent) + ansible.builtin.shell: | + kubectl -n {{ telemetry_namespace }} get deployment {{ old_vmagent_deployment }} --no-headers 2>/dev/null && \ + kubectl -n {{ telemetry_namespace }} delete deployment {{ old_vmagent_deployment }} --timeout=60s || true + changed_when: true + failed_when: false + delegate_to: "{{ kube_vip }}" + connection: ssh + + - name: Display old resource cleanup summary + ansible.builtin.debug: + msg: + - "Old services deleted: {{ old_services.stdout_lines | default([]) | select() | list }}" + - "Old vmagent deployment cleanup attempted: {{ old_vmagent_deployment }}" From e4ea12d91650cfb805e6bb9fdc97641d170e17f4 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Tue, 2 Jun 2026 17:54:28 +0530 Subject: [PATCH 03/33] revert changes as it is taken care of in another PR Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- .../idrac_telemetry_statefulset.yaml.j2 | 31 +++--------------- upgrade/playbooks/upgrade_telemetry.yml | 18 ----------- .../roles/upgrade_telemetry/tasks/main.yml | 32 ------------------- .../patch_idrac_termination_grace_period.yml | 2 +- 
4 files changed, 6 insertions(+), 77 deletions(-) diff --git a/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 b/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 index a71ebd0aa0..c3d7b00aee 100644 --- a/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 @@ -83,27 +83,16 @@ spec: operator: Exists tolerationSeconds: 5 initContainers: - # Clean up stale MySQL lock/InnoDB artifacts only after ungraceful shutdown + # Clean up stale MySQL lock files from previous ungraceful shutdowns - name: cleanup-mysql-locks image: {{ mysql_image }} command: - /bin/sh - -c - | - DATADIR="/var/lib/mysql" - # Only run cleanup if datadir has existing MySQL data (not a fresh install) - if [ ! -f "${DATADIR}/mysql.ibd" ]; then - echo "Fresh install detected — skipping cleanup." - exit 0 - fi - # Detect unclean shutdown: pid/sock files should not exist when no mysqld is running - if [ -f "${DATADIR}/mysqld.pid" ] || ls ${DATADIR}/*.sock 1>/dev/null 2>&1 || ls ${DATADIR}/*.lck 1>/dev/null 2>&1; then - echo "Stale lock artifacts detected — previous shutdown was unclean." - rm -f ${DATADIR}/*.sock ${DATADIR}/*.pid ${DATADIR}/*.lck 2>/dev/null || true - echo "Stale artifacts removed." - else - echo "No stale artifacts — previous shutdown was clean." - fi + echo "Checking for stale MySQL lock files..." 
+ rm -f /var/lib/mysql/*.sock /var/lib/mysql/*.pid 2>/dev/null || true + echo "Lock file cleanup complete" volumeMounts: - name: mysqldb-pvc mountPath: /var/lib/mysql/ @@ -114,20 +103,10 @@ spec: volumeMounts: - name: mysqldb-pvc mountPath: /var/lib/mysql/ - args: - - --innodb-use-native-aio=0 - - --innodb-flush-log-at-trx-commit=1 - - --innodb-flush-method=fsync lifecycle: preStop: exec: - command: - - /bin/sh - - -c - - | - mysqladmin shutdown -uroot -p"${MYSQL_ROOT_PASSWORD}" --wait=45 2>/dev/null || true - while mysqladmin ping -uroot -p"${MYSQL_ROOT_PASSWORD}" 2>/dev/null; do sleep 1; done - rm -f /var/lib/mysql/conf.d/recovery.cnf 2>/dev/null || true + command: ["/bin/sh", "-c", "mysqladmin shutdown -uroot -p${MYSQL_ROOT_PASSWORD} 2>/dev/null || true"] env: - name: MYSQL_DATABASE value: {{ mysqldb_name }} diff --git a/upgrade/playbooks/upgrade_telemetry.yml b/upgrade/playbooks/upgrade_telemetry.yml index 730eeb1c00..5fcb99f410 100644 --- a/upgrade/playbooks/upgrade_telemetry.yml +++ b/upgrade/playbooks/upgrade_telemetry.yml @@ -35,24 +35,6 @@ when: - manifest.component_status[component_name] | default('pending') == 'completed' - - name: "Mark as skipped — service_k8s not configured (Slurm-only deployment)" - ansible.builtin.copy: - content: >- - {{ manifest | combine({ - 'component_status': manifest.component_status | combine({ - component_name: 'skipped' - }) - }) | to_nice_yaml }} - dest: "{{ manifest_path }}" - mode: '0644' - when: - - not (hostvars['localhost']['k8s_upgrade_enabled'] | default(false) | bool) - - - name: "Skip — service_k8s not configured (Slurm-only deployment)" - ansible.builtin.meta: end_play - when: - - not (hostvars['localhost']['k8s_upgrade_enabled'] | default(false) | bool) - - name: "Mark as skipped — BuildStream terminal gate active (C-24)" ansible.builtin.copy: content: >- diff --git a/upgrade/roles/upgrade_telemetry/tasks/main.yml b/upgrade/roles/upgrade_telemetry/tasks/main.yml index cccfbc8bf8..68c087306c 100644 --- 
a/upgrade/roles/upgrade_telemetry/tasks/main.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/main.yml @@ -51,38 +51,6 @@ msg: "{{ victoria_upgrade_skipped }}" when: not (victoria_upgrade_needed | default(false) | bool) -# ── Phase 2.5: Clean stale Kafka lock files before redeploy ── -- name: Clean stale Kafka lock files from PVCs (prevents CrashLoopBackOff) - ansible.builtin.shell: | - set -o pipefail - for pvc_name in $(kubectl get pvc -n {{ telemetry_namespace }} -l strimzi.io/cluster=kafka \ - -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do - pod_name="kafka-lock-cleanup-${pvc_name}" - kubectl delete pod "${pod_name}" -n {{ telemetry_namespace }} --ignore-not-found 2>/dev/null || true - kubectl run "${pod_name}" --rm --attach --restart=Never -n {{ telemetry_namespace }} \ - --image=busybox:latest \ - --overrides="{ - \"spec\": { - \"containers\": [{ - \"name\": \"cleanup\", - \"image\": \"busybox:latest\", - \"command\": [\"sh\", \"-c\", \"find /data -name .lock -delete && echo cleaned ${pvc_name}\"], - \"volumeMounts\": [{\"name\": \"data\", \"mountPath\": \"/data\"}] - }], - \"volumes\": [{\"name\": \"data\", \"persistentVolumeClaim\": {\"claimName\": \"${pvc_name}\"}}], - \"restartPolicy\": \"Never\" - } - }" 2>/dev/null || true - done - delegate_to: "{{ kube_vip }}" - connection: ssh - changed_when: true - failed_when: false - timeout: 300 - when: - - kube_vip is defined - - kube_vip | length > 0 - # ── Phase 3: Execute telemetry.sh to redeploy telemetry stack ── - name: Phase 3 - Execute telemetry.sh to redeploy telemetry stack ansible.builtin.include_tasks: execute_telemetry_sh.yml diff --git a/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml b/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml index e3e6333c73..ae8dd63dde 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml +++ 
b/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml @@ -48,7 +48,7 @@ msg: "idrac-telemetry current replica count: {{ idrac_replica_count.stdout }}" when: idrac_sts_check.rc == 0 -- name: Patch terminationGracePeriodSeconds to 120s for graceful MySQL shutdown +- name: Patch terminationGracePeriodSeconds for graceful MySQL shutdown ansible.builtin.command: cmd: > kubectl patch statefulset idrac-telemetry -n {{ telemetry_namespace }} From e5be450cd7127573f1a9aed1137f2447c9a20eb6 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Wed, 3 Jun 2026 07:05:40 +0530 Subject: [PATCH 04/33] revert idrac terminationgraceperiod Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- .../idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 | 2 +- .../roles/upgrade_telemetry/tasks/include_required_input.yml | 1 + .../tasks/patch_idrac_termination_grace_period.yml | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 b/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 index c3d7b00aee..7d56e91d56 100644 --- a/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 @@ -72,7 +72,7 @@ spec: - ip: "127.0.0.1" hostnames: - "mysqldb" - terminationGracePeriodSeconds: 60 + terminationGracePeriodSeconds: 120 tolerations: - effect: NoExecute key: node.kubernetes.io/not-ready diff --git a/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml b/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml index ac44ac6fe0..d4127cfff9 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml +++ 
b/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml @@ -75,6 +75,7 @@ - k8s_nfs_storage_name is defined - storage_config.mounts | selectattr('name', 'equalto', k8s_nfs_storage_name) | list | length > 0 tags: always + # ── Load high_availability_config.yml ── - name: Check if high_availability_config.yml exists ansible.builtin.stat: diff --git a/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml b/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml index ae8dd63dde..76755b45b9 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml @@ -53,7 +53,7 @@ cmd: > kubectl patch statefulset idrac-telemetry -n {{ telemetry_namespace }} --type=strategic - -p '{"spec":{"template":{"spec":{"terminationGracePeriodSeconds":60}}}}' + -p '{"spec":{"template":{"spec":{"terminationGracePeriodSeconds":120}}}}' delegate_to: "{{ kube_vip }}" connection: ssh register: idrac_patch_result From f93d2a13357fb520196049bdbb95302d94e4f0ca Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Wed, 3 Jun 2026 09:11:14 +0530 Subject: [PATCH 05/33] ansible lint fixes Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- .../roles/upgrade_telemetry/tasks/include_required_input.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml b/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml index a934f8c4af..9655f55130 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml @@ -75,7 +75,7 @@ - k8s_nfs_storage_name is defined - storage_config.mounts | selectattr('name', 'equalto', k8s_nfs_storage_name) | list | length > 0 tags: always - + # ── Load 
high_availability_config.yml ── - name: Check if high_availability_config.yml exists ansible.builtin.stat: From a659c5744d05dfc81ef5badeabf554f9f28a185b Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Wed, 3 Jun 2026 11:27:49 +0530 Subject: [PATCH 06/33] rescue block for upgrade telemetry Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- upgrade/playbooks/upgrade_telemetry.yml | 78 ++++++++++++------------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/upgrade/playbooks/upgrade_telemetry.yml b/upgrade/playbooks/upgrade_telemetry.yml index fd21e71f78..83bd219883 100644 --- a/upgrade/playbooks/upgrade_telemetry.yml +++ b/upgrade/playbooks/upgrade_telemetry.yml @@ -133,43 +133,41 @@ # - VAST exporter (if enabled) # - VictoriaLogs (if enabled) # - UFM exporter (if enabled) - - name: Execute telemetry upgrade - block: - - name: Invoke upgrade_telemetry role - ansible.builtin.include_role: - name: ../roles/upgrade_telemetry - - - name: Mark telemetry upgrade as completed - ansible.builtin.copy: - content: >- - {{ manifest | combine({ - 'component_status': manifest.component_status | combine({ - component_name: 'completed' - }) - }) | to_nice_yaml }} - dest: "{{ manifest_path }}" - mode: '0644' - - - name: "Display upgrade status completed — {{ component_name }}" - ansible.builtin.debug: - msg: "[UPGRADE] Component '{{ component_name }}' — status changed to: completed" - - rescue: - - name: Mark telemetry upgrade as failed - ansible.builtin.copy: - content: >- - {{ manifest | combine({ - 'component_status': manifest.component_status | combine({ - component_name: 'failed' - }) - }) | to_nice_yaml }} - dest: "{{ manifest_path }}" - mode: '0644' - - - name: "Display upgrade status failed — {{ component_name }}" - ansible.builtin.debug: - msg: "[UPGRADE] Component '{{ component_name }}' — status changed to: failed" - - - name: Fail the play - ansible.builtin.fail: - msg: 
"Telemetry upgrade failed. Status marked as 'failed' in manifest." + - name: Invoke upgrade_telemetry role + ansible.builtin.include_role: + name: ../roles/upgrade_telemetry + + - name: Mark telemetry upgrade as completed + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'completed' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' + + - name: "Display upgrade status completed — {{ component_name }}" + ansible.builtin.debug: + msg: "[UPGRADE] Component '{{ component_name }}' — status changed to: completed" + + rescue: + - name: Mark telemetry upgrade as failed + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'failed' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' + + - name: "Display upgrade status failed — {{ component_name }}" + ansible.builtin.debug: + msg: "[UPGRADE] Component '{{ component_name }}' — status changed to: failed" + + - name: Fail the play + ansible.builtin.fail: + msg: "Telemetry upgrade failed. Status marked as 'failed' in manifest." 
From 0e8c5e5eabc13f4390896a0e14b7cffc5ec9beae Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Wed, 3 Jun 2026 14:59:12 +0530 Subject: [PATCH 07/33] revert upgrade telemetry Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- upgrade/playbooks/upgrade_telemetry.yml | 78 +++++++++++++------------ 1 file changed, 40 insertions(+), 38 deletions(-) diff --git a/upgrade/playbooks/upgrade_telemetry.yml b/upgrade/playbooks/upgrade_telemetry.yml index 83bd219883..398e579564 100644 --- a/upgrade/playbooks/upgrade_telemetry.yml +++ b/upgrade/playbooks/upgrade_telemetry.yml @@ -133,41 +133,43 @@ # - VAST exporter (if enabled) # - VictoriaLogs (if enabled) # - UFM exporter (if enabled) - - name: Invoke upgrade_telemetry role - ansible.builtin.include_role: - name: ../roles/upgrade_telemetry - - - name: Mark telemetry upgrade as completed - ansible.builtin.copy: - content: >- - {{ manifest | combine({ - 'component_status': manifest.component_status | combine({ - component_name: 'completed' - }) - }) | to_nice_yaml }} - dest: "{{ manifest_path }}" - mode: '0644' - - - name: "Display upgrade status completed — {{ component_name }}" - ansible.builtin.debug: - msg: "[UPGRADE] Component '{{ component_name }}' — status changed to: completed" - - rescue: - - name: Mark telemetry upgrade as failed - ansible.builtin.copy: - content: >- - {{ manifest | combine({ - 'component_status': manifest.component_status | combine({ - component_name: 'failed' - }) - }) | to_nice_yaml }} - dest: "{{ manifest_path }}" - mode: '0644' - - - name: "Display upgrade status failed — {{ component_name }}" - ansible.builtin.debug: - msg: "[UPGRADE] Component '{{ component_name }}' — status changed to: failed" - - - name: Fail the play - ansible.builtin.fail: - msg: "Telemetry upgrade failed. Status marked as 'failed' in manifest." 
+ - name: Execute telemetry upgrade + block: + - name: Invoke upgrade_telemetry role + ansible.builtin.include_role: + name: ../roles/upgrade_telemetry + + - name: Mark telemetry upgrade as completed + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'completed' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' + + - name: "Display upgrade status completed — {{ component_name }}" + ansible.builtin.debug: + msg: "[UPGRADE] Component '{{ component_name }}' — status changed to: completed" + + rescue: + - name: Mark telemetry upgrade as failed + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'failed' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' + + - name: "Display upgrade status failed — {{ component_name }}" + ansible.builtin.debug: + msg: "[UPGRADE] Component '{{ component_name }}' — status changed to: failed" + + - name: Fail the play + ansible.builtin.fail: + msg: "Telemetry upgrade failed. Status marked as 'failed' in manifest." 
\ No newline at end of file From 67e8bee1d30f06b7c48eb39f649f54c2ffeec0f4 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Mon, 8 Jun 2026 13:13:30 +0530 Subject: [PATCH 08/33] default size of idrac telemetry containers Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- input/telemetry_storage_config.yml | 12 ++++++------ provision/roles/telemetry/vars/main.yml | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/input/telemetry_storage_config.yml b/input/telemetry_storage_config.yml index d44a7ec68f..6805baf6b4 100644 --- a/input/telemetry_storage_config.yml +++ b/input/telemetry_storage_config.yml @@ -168,10 +168,10 @@ idrac_telemetry_storage: resources: requests: cpu: "100m" - memory: "256Mi" + memory: "512Mi" limits: cpu: "500m" - memory: "512Mi" + memory: "1.5Gi" receiver: resources: requests: @@ -184,18 +184,18 @@ idrac_telemetry_storage: resources: requests: cpu: "50m" - memory: "64Mi" + memory: "128Mi" limits: cpu: "200m" - memory: "256Mi" + memory: "512Mi" victoria_pump: resources: requests: cpu: "50m" - memory: "64Mi" + memory: "128Mi" limits: cpu: "200m" - memory: "256Mi" + memory: "512Mi" # Kafka Storage resources kafka_storage: diff --git a/provision/roles/telemetry/vars/main.yml b/provision/roles/telemetry/vars/main.yml index 55f0ac6534..4f1f29f9fa 100644 --- a/provision/roles/telemetry/vars/main.yml +++ b/provision/roles/telemetry/vars/main.yml @@ -82,10 +82,10 @@ idrac_telemetry_resources: activemq: requests: cpu: "{{ telemetry_storage_config.idrac_telemetry_storage.activemq.resources.requests.cpu | default('100m') }}" - memory: "{{ telemetry_storage_config.idrac_telemetry_storage.activemq.resources.requests.memory | default('256Mi') }}" + memory: "{{ telemetry_storage_config.idrac_telemetry_storage.activemq.resources.requests.memory | default('512Mi') }}" limits: cpu: "{{ 
telemetry_storage_config.idrac_telemetry_storage.activemq.resources.limits.cpu | default('500m') }}" - memory: "{{ telemetry_storage_config.idrac_telemetry_storage.activemq.resources.limits.memory | default('512Mi') }}" + memory: "{{ telemetry_storage_config.idrac_telemetry_storage.activemq.resources.limits.memory | default('1.5Gi') }}" receiver: requests: cpu: "{{ telemetry_storage_config.idrac_telemetry_storage.receiver.resources.requests.cpu | default('100m') }}" @@ -96,17 +96,17 @@ idrac_telemetry_resources: kafka_pump: requests: cpu: "{{ telemetry_storage_config.idrac_telemetry_storage.kafka_pump.resources.requests.cpu | default('50m') }}" - memory: "{{ telemetry_storage_config.idrac_telemetry_storage.kafka_pump.resources.requests.memory | default('64Mi') }}" + memory: "{{ telemetry_storage_config.idrac_telemetry_storage.kafka_pump.resources.requests.memory | default('128Mi') }}" limits: cpu: "{{ telemetry_storage_config.idrac_telemetry_storage.kafka_pump.resources.limits.cpu | default('200m') }}" - memory: "{{ telemetry_storage_config.idrac_telemetry_storage.kafka_pump.resources.limits.memory | default('256Mi') }}" + memory: "{{ telemetry_storage_config.idrac_telemetry_storage.kafka_pump.resources.limits.memory | default('512Mi') }}" victoria_pump: requests: cpu: "{{ telemetry_storage_config.idrac_telemetry_storage.victoria_pump.resources.requests.cpu | default('50m') }}" - memory: "{{ telemetry_storage_config.idrac_telemetry_storage.victoria_pump.resources.requests.memory | default('64Mi') }}" + memory: "{{ telemetry_storage_config.idrac_telemetry_storage.victoria_pump.resources.requests.memory | default('128Mi') }}" limits: cpu: "{{ telemetry_storage_config.idrac_telemetry_storage.victoria_pump.resources.limits.cpu | default('200m') }}" - memory: "{{ telemetry_storage_config.idrac_telemetry_storage.victoria_pump.resources.limits.memory | default('256Mi') }}" + memory: "{{ telemetry_storage_config.idrac_telemetry_storage.victoria_pump.resources.limits.memory 
| default('512Mi') }}" # Usage: kafka_deployment.yml kafka: From eb2d898ee0b4cb54dd2dff1763abc6365f522764 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Mon, 8 Jun 2026 13:23:13 +0530 Subject: [PATCH 09/33] add new line Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- upgrade/playbooks/upgrade_telemetry.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upgrade/playbooks/upgrade_telemetry.yml b/upgrade/playbooks/upgrade_telemetry.yml index ccf54d15fc..46fa8af052 100644 --- a/upgrade/playbooks/upgrade_telemetry.yml +++ b/upgrade/playbooks/upgrade_telemetry.yml @@ -172,4 +172,4 @@ - name: Fail the play ansible.builtin.fail: - msg: "Telemetry upgrade failed. Status marked as 'failed' in manifest." \ No newline at end of file + msg: "Telemetry upgrade failed. Status marked as 'failed' in manifest." From aec78412b578c55c5919491695ac5e7843a299fb Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Mon, 8 Jun 2026 14:26:36 +0530 Subject: [PATCH 10/33] input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- .../x86_64/rhel/10.0/service_k8s_v1.35.1.json | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json index 966a94d7b9..654208ec54 100644 --- a/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json +++ b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json @@ -29,9 +29,10 @@ { "package": "docker.io/dellhpcomniaaisolution/victoriapump", "type": "image", "tag": "1.3" }, { "package": "cryptography==45.0.7", "type": "pip_module" }, { "package": "omsdk==1.2.518", "type": "pip_module" }, - { "package": "cffi==1.17.1", "type": "pip_module" }, - { "package": "prometheus_client==0.20.0", "type": "pip_module" }, 
+ { "package": "cffi==2.0.0", "type": "pip_module" }, + { "package": "prometheus_client==0.25.0", "type": "pip_module" }, { "package": "kubernetes==33.1.0", "type": "pip_module" }, + { "package": "pyyaml==6.0.3", "type": "pip_module" }, { "package": "quay.io/strimzi/operator", "tag": "0.48.0", "type": "image" }, { "package": "quay.io/strimzi/kafka", "tag": "0.48.0-kafka-4.1.0", "type": "image" }, { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.1", "type": "image" }, @@ -75,11 +76,12 @@ { "package": "docker.io/calico/node", "tag": "v3.31.4", "type": "image" }, { "package": "quay.io/metallb/speaker", "tag": "v0.15.3", "type": "image" }, { "package": "kubectl-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"}, - { "package": "prettytable==3.14.0", "type": "pip_module" }, + { "package": "prettytable==3.17.0", "type": "pip_module" }, { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" }, { "package": "git", "type": "rpm", "repo_name": "appstream"}, { "package": "kubernetes==33.1.0", "type": "pip_module" }, - { "package": "PyMySQL==1.1.2", "type": "pip_module" } + { "package": "pyyaml==6.0.3", "type": "pip_module" }, + { "package": "PyMySQL==1.2.0", "type": "pip_module" } ] }, @@ -103,11 +105,12 @@ { "package": "helm-v3.20.1-amd64", "type": "tarball", "url": "https://get.helm.sh/helm-v3.20.1-linux-amd64.tar.gz" }, { "package": "nfs-subdir-external-provisioner-4.0.18", "type": "tarball", "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" }, { "package": "kubectl-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"}, - { "package": "prettytable==3.14.0", "type": "pip_module" }, + { "package": "prettytable==3.17.0", "type": "pip_module" }, { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" }, { "package": "git", "type": "rpm", "repo_name": "appstream"}, { "package": "kubernetes==33.1.0", 
"type": "pip_module" }, - { "package": "PyMySQL==1.1.2", "type": "pip_module" } + { "package": "pyyaml==6.0.3", "type": "pip_module" }, + { "package": "PyMySQL==1.2.0", "type": "pip_module" } ] }, From bcc5132783a276180a0865faddb3a30880093abe Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Mon, 8 Jun 2026 15:28:20 +0530 Subject: [PATCH 11/33] update values in upgrade path Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- .../templates/telemetry_storage_config.j2 | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/upgrade/roles/import_input_parameters/templates/telemetry_storage_config.j2 b/upgrade/roles/import_input_parameters/templates/telemetry_storage_config.j2 index d44a7ec68f..6805baf6b4 100644 --- a/upgrade/roles/import_input_parameters/templates/telemetry_storage_config.j2 +++ b/upgrade/roles/import_input_parameters/templates/telemetry_storage_config.j2 @@ -168,10 +168,10 @@ idrac_telemetry_storage: resources: requests: cpu: "100m" - memory: "256Mi" + memory: "512Mi" limits: cpu: "500m" - memory: "512Mi" + memory: "1.5Gi" receiver: resources: requests: @@ -184,18 +184,18 @@ idrac_telemetry_storage: resources: requests: cpu: "50m" - memory: "64Mi" + memory: "128Mi" limits: cpu: "200m" - memory: "256Mi" + memory: "512Mi" victoria_pump: resources: requests: cpu: "50m" - memory: "64Mi" + memory: "128Mi" limits: cpu: "200m" - memory: "256Mi" + memory: "512Mi" # Kafka Storage resources kafka_storage: From 776990dc044acf72955f38c56ea15c5b1af216e9 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Mon, 8 Jun 2026 16:12:38 +0530 Subject: [PATCH 12/33] updating values in integer instead decimal Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- input/telemetry_storage_config.yml | 2 +- .../templates/telemetry_storage_config.j2 | 2 +- 2 files changed, 2 insertions(+), 2 
deletions(-) diff --git a/input/telemetry_storage_config.yml b/input/telemetry_storage_config.yml index 6805baf6b4..c80dbdde65 100644 --- a/input/telemetry_storage_config.yml +++ b/input/telemetry_storage_config.yml @@ -171,7 +171,7 @@ idrac_telemetry_storage: memory: "512Mi" limits: cpu: "500m" - memory: "1.5Gi" + memory: "1536Mi" receiver: resources: requests: diff --git a/upgrade/roles/import_input_parameters/templates/telemetry_storage_config.j2 b/upgrade/roles/import_input_parameters/templates/telemetry_storage_config.j2 index 6805baf6b4..c80dbdde65 100644 --- a/upgrade/roles/import_input_parameters/templates/telemetry_storage_config.j2 +++ b/upgrade/roles/import_input_parameters/templates/telemetry_storage_config.j2 @@ -171,7 +171,7 @@ idrac_telemetry_storage: memory: "512Mi" limits: cpu: "500m" - memory: "1.5Gi" + memory: "1536Mi" receiver: resources: requests: From 0f328dc5222bfe27fd761bc22ef67b9487f2ba81 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Mon, 8 Jun 2026 18:46:17 +0530 Subject: [PATCH 13/33] revert service k8s json file Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- .../x86_64/rhel/10.0/service_k8s_v1.35.1.json | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json index 654208ec54..1fc9bd65ef 100644 --- a/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json +++ b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json @@ -29,10 +29,9 @@ { "package": "docker.io/dellhpcomniaaisolution/victoriapump", "type": "image", "tag": "1.3" }, { "package": "cryptography==45.0.7", "type": "pip_module" }, { "package": "omsdk==1.2.518", "type": "pip_module" }, - { "package": "cffi==2.0.0", "type": "pip_module" }, - { "package": "prometheus_client==0.25.0", "type": "pip_module" }, + { "package": "cffi==1.17.1", "type": "pip_module" }, 
+ { "package": "prometheus_client==0.20.0", "type": "pip_module" }, { "package": "kubernetes==33.1.0", "type": "pip_module" }, - { "package": "pyyaml==6.0.3", "type": "pip_module" }, { "package": "quay.io/strimzi/operator", "tag": "0.48.0", "type": "image" }, { "package": "quay.io/strimzi/kafka", "tag": "0.48.0-kafka-4.1.0", "type": "image" }, { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.1", "type": "image" }, @@ -76,12 +75,11 @@ { "package": "docker.io/calico/node", "tag": "v3.31.4", "type": "image" }, { "package": "quay.io/metallb/speaker", "tag": "v0.15.3", "type": "image" }, { "package": "kubectl-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"}, - { "package": "prettytable==3.17.0", "type": "pip_module" }, + { "package": "prettytable==3.14.0", "type": "pip_module" }, { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" }, { "package": "git", "type": "rpm", "repo_name": "appstream"}, { "package": "kubernetes==33.1.0", "type": "pip_module" }, - { "package": "pyyaml==6.0.3", "type": "pip_module" }, - { "package": "PyMySQL==1.2.0", "type": "pip_module" } + { "package": "PyMySQL==1.1.2", "type": "pip_module" } ] }, @@ -105,12 +103,11 @@ { "package": "helm-v3.20.1-amd64", "type": "tarball", "url": "https://get.helm.sh/helm-v3.20.1-linux-amd64.tar.gz" }, { "package": "nfs-subdir-external-provisioner-4.0.18", "type": "tarball", "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" }, { "package": "kubectl-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"}, - { "package": "prettytable==3.17.0", "type": "pip_module" }, + { "package": "prettytable==3.14.0", "type": "pip_module" }, { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" }, { "package": "git", "type": "rpm", "repo_name": "appstream"}, { "package": "kubernetes==33.1.0", "type": "pip_module" }, - { "package": 
"pyyaml==6.0.3", "type": "pip_module" }, - { "package": "PyMySQL==1.2.0", "type": "pip_module" } + { "package": "PyMySQL==1.1.2", "type": "pip_module" } ] }, @@ -121,4 +118,4 @@ { "package": "quay.io/metallb/controller", "tag": "v0.15.3", "type": "image" } ] } -} +} \ No newline at end of file From a9700876622155965280f043d50ceaa0ab4e3db3 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Fri, 12 Jun 2026 07:34:11 +0530 Subject: [PATCH 14/33] powescale telemetry upgrade and preserve loadbalancer IP for Victoria Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- .../tasks/apply_victoria_crs.yml | 73 +++++++++++++++++ .../tasks/migrate_statefulset.yml | 80 +++++++++++++++++++ upgrade/roles/upgrade_telemetry/vars/main.yml | 2 + 3 files changed, 155 insertions(+) diff --git a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml index 488c39b72c..d725cf067a 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml @@ -50,6 +50,79 @@ delegate_to: "{{ kube_vip }}" connection: ssh +# ── Inject preserved LoadBalancer IPs into VMCluster manifest before apply ── +# When migrating from 2.1 StatefulSet to operator, old services are deleted +# and the operator creates new ones. To preserve IPs, we inject loadBalancerIP +# directly into the VMCluster CR's serviceSpec BEFORE applying, so the operator +# creates services with the correct IPs from the start (no race condition). 
+- name: Create LoadBalancer IP injection script + ansible.builtin.copy: + dest: /tmp/inject_vm_lb_ips.py + mode: "0755" + content: | + #!/usr/bin/env python3 + import yaml + import sys + manifest_path = sys.argv[1] + vmselect_ip = sys.argv[2] if len(sys.argv) > 2 and sys.argv[2] else "" + vminsert_ip = sys.argv[3] if len(sys.argv) > 3 and sys.argv[3] else "" + with open(manifest_path) as f: + doc = yaml.safe_load(f) + spec = doc.get("spec", {}) + changed = False + if vmselect_ip and "vmselect" in spec: + svc = spec["vmselect"].setdefault("serviceSpec", {}).setdefault("spec", {}) + if svc.get("loadBalancerIP") != vmselect_ip: + svc["loadBalancerIP"] = vmselect_ip + changed = True + if vminsert_ip and "vminsert" in spec: + svc = spec["vminsert"].setdefault("serviceSpec", {}).setdefault("spec", {}) + if svc.get("loadBalancerIP") != vminsert_ip: + svc["loadBalancerIP"] = vminsert_ip + changed = True + if changed: + with open(manifest_path, "w") as f: + yaml.dump(doc, f, default_flow_style=False, sort_keys=False) + print("Injected vmselect=" + vmselect_ip + " vminsert=" + vminsert_ip) + else: + print("IPs already present - no change needed") + sys.exit(0 if changed else 2) + delegate_to: "{{ kube_vip }}" + connection: ssh + when: + - preserved_vmselect_ip | default('') | length > 0 or preserved_vminsert_ip | default('') | length > 0 + +- name: Inject preserved LoadBalancer IPs into VMCluster manifest + ansible.builtin.command: + cmd: >- + python3 /tmp/inject_vm_lb_ips.py + "{{ telemetry_deploy_dir }}/deployments/victoria-operator-vmcluster.yaml" + "{{ preserved_vmselect_ip | default('') }}" + "{{ preserved_vminsert_ip | default('') }}" + register: ip_inject_result + changed_when: ip_inject_result.rc == 0 + failed_when: ip_inject_result.rc not in [0, 2] + delegate_to: "{{ kube_vip }}" + connection: ssh + when: + - preserved_vmselect_ip | default('') | length > 0 or preserved_vminsert_ip | default('') | length > 0 + +- name: Clean up LoadBalancer IP injection script + 
ansible.builtin.file: + path: /tmp/inject_vm_lb_ips.py + state: absent + delegate_to: "{{ kube_vip }}" + connection: ssh + changed_when: false + +- name: Display LoadBalancer IP injection status + ansible.builtin.debug: + msg: >- + {{ victoria_lb_ips_preserved + if (preserved_vminsert_ip | default('') | length > 0) + or (preserved_vmselect_ip | default('') | length > 0) + else victoria_lb_ips_not_preserved }} + # ── Apply main CR (VMCluster only — 2.2 cluster mode only) ── - name: Apply VMCluster CR (cluster mode only) with retry ansible.builtin.command: diff --git a/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml b/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml index e99d7bf80c..4d0efd1081 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml @@ -53,6 +53,49 @@ delegate_to: "{{ kube_vip }}" connection: ssh + # ── Flush vmstorage data before shutdown ── + # Create snapshots on each vmstorage pod to force pending data/indexdb flush. + # This prevents corrupted parts.json from in-flight merges during shutdown. 
+ - name: Get old vmstorage pod names + ansible.builtin.shell: | + set -o pipefail + kubectl -n {{ telemetry_namespace }} get pods -l {{ old_vm_pod_label }} --no-headers 2>/dev/null \ + | grep -i "storage\|vmstorage" | awk '{print $1}' + register: old_vmstorage_pods + changed_when: false + failed_when: false + delegate_to: "{{ kube_vip }}" + connection: ssh + + - name: Force snapshot on each vmstorage pod (flush pending writes) + ansible.builtin.shell: | + kubectl -n {{ telemetry_namespace }} exec {{ item }} -- \ + wget -q -O- --no-check-certificate "https://localhost:8482/snapshot/create" 2>/dev/null || \ + kubectl -n {{ telemetry_namespace }} exec {{ item }} -- \ + wget -q -O- "http://localhost:8482/snapshot/create" 2>/dev/null || true + loop: "{{ old_vmstorage_pods.stdout_lines | default([]) }}" + changed_when: false + failed_when: false + when: old_vmstorage_pods.stdout_lines | default([]) | length > 0 + delegate_to: "{{ kube_vip }}" + connection: ssh + + - name: Wait for background merges to settle after writes stopped + ansible.builtin.pause: + seconds: 30 + prompt: "Waiting 30s for vmstorage background merges to settle..." + + # ── Ensure sufficient graceful shutdown period ── + # Old StatefulSet may have default 30s which is too short for indexdb flush + - name: Patch old StatefulSet terminationGracePeriodSeconds to 120s + ansible.builtin.shell: | + kubectl -n {{ telemetry_namespace }} patch statefulset {{ actual_old_statefulset }} \ + -p '{"spec":{"template":{"spec":{"terminationGracePeriodSeconds":120}}}}' + changed_when: true + failed_when: false + delegate_to: "{{ kube_vip }}" + connection: ssh + # ── Graceful shutdown of old StatefulSet ── - name: Scale down old StatefulSet ansible.builtin.command: @@ -84,6 +127,11 @@ delegate_to: "{{ kube_vip }}" connection: ssh + - name: Wait for storage cache flush after pod termination + ansible.builtin.pause: + seconds: 15 + prompt: "Waiting 15s for storage cache flush..." 
+ # ── PVC relabeling (data preservation via PV rebind) ── - name: Get all old PVCs from StatefulSet (using specific StatefulSet label) ansible.builtin.command: @@ -169,6 +217,38 @@ delegate_to: "{{ kube_vip }}" connection: ssh + # ── Capture LoadBalancer IPs before deletion ── + # Preserve existing IPs to prevent MetalLB from assigning new ones + # Only applicable for statefulset_to_operator migration path + - name: Get vminsert LoadBalancer IP + ansible.builtin.shell: | + kubectl -n {{ telemetry_namespace }} get svc vminsert -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo "" + register: old_vminsert_ip + changed_when: false + failed_when: false + delegate_to: "{{ kube_vip }}" + connection: ssh + + - name: Get vmselect LoadBalancer IP + ansible.builtin.shell: | + kubectl -n {{ telemetry_namespace }} get svc vmselect -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo "" + register: old_vmselect_ip + changed_when: false + failed_when: false + delegate_to: "{{ kube_vip }}" + connection: ssh + + - name: Set LoadBalancer IP facts for preservation + ansible.builtin.set_fact: + preserved_vminsert_ip: "{{ old_vminsert_ip.stdout | trim }}" + preserved_vmselect_ip: "{{ old_vmselect_ip.stdout | trim }}" + + - name: Display preserved LoadBalancer IPs + ansible.builtin.debug: + msg: + - "Preserving vminsert IP: {{ preserved_vminsert_ip if preserved_vminsert_ip else 'None' }}" + - "Preserving vmselect IP: {{ preserved_vmselect_ip if preserved_vmselect_ip else 'None' }}" + # ── Cleanup old pre-operator services and deployments ── # The operator creates new services with different names (e.g. vminsert-victoria-cluster), # so the old standalone services become stale and waste LoadBalancer IPs. 
diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml b/upgrade/roles/upgrade_telemetry/vars/main.yml index 8326120799..62bb33b0c2 100644 --- a/upgrade/roles/upgrade_telemetry/vars/main.yml +++ b/upgrade/roles/upgrade_telemetry/vars/main.yml @@ -102,6 +102,8 @@ victoria_unhealthy_pods_warning: >- victoria_pods_deleted: "Deleted {{ victoria_unhealthy_pods | length }} unhealthy pod(s). Upgrade will re-create them." victoria_backup_completed: "Victoria backup completed: {{ telemetry_backup_dir }}" victoria_crs_applied: "VictoriaMetrics CRs applied (mode: {{ victoria_deploy_mode }}" +victoria_lb_ips_preserved: "LoadBalancer IPs injected into VMCluster manifest - vminsert: {{ preserved_vminsert_ip | default('N/A') }}, vmselect: {{ preserved_vmselect_ip | default('N/A') }}" +victoria_lb_ips_not_preserved: "No old LoadBalancer IPs found to preserve (fresh deploy or already operator-managed)" victoria_pods_not_ready: "Telemetry upgrade FAILED: Some pods are not ready. {{ pods_not_ready.stdout | int }} pod(s) not in Running state." victoria_pods_ready_after_wait: "All telemetry pods are ready after waiting" telemetry_upgrade_success: "Telemetry upgrade COMPLETED: All telemetry pods are running and ready." 
From a43bdda7afc364aca060fa518aa085f1c5ee64c4 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Fri, 12 Jun 2026 07:36:42 +0530 Subject: [PATCH 15/33] powerscale telemetry version upgrade Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- .../x86_64/rhel/10.0/service_k8s_v1.35.1.json | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json index 1fc9bd65ef..08de51d880 100644 --- a/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json +++ b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json @@ -1,3 +1,4 @@ + { "service_k8s": { "cluster": [ @@ -35,11 +36,11 @@ { "package": "quay.io/strimzi/operator", "tag": "0.48.0", "type": "image" }, { "package": "quay.io/strimzi/kafka", "tag": "0.48.0-kafka-4.1.0", "type": "image" }, { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.1", "type": "image" }, - { "package": "quay.io/dell/container-storage-modules/csm-metrics-powerscale", "tag": "v1.11.0", "type": "image" }, - { "package": "ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector", "tag": "0.143.1", "type": "image" }, + { "package": "quay.io/dell/container-storage-modules/csm-metrics-powerscale", "tag": "v1.12.0", "type": "image" }, + { "package": "ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector", "tag": "0.150.1", "type": "image" }, { "package": "docker.io/nginxinc/nginx-unprivileged", "tag": "1.29", "type": "image" }, - { "package": "karavi-observability", "type": "git", "url": "https://github.com/dell/karavi-observability.git", "version": "v1.12.0" }, - { "package": "helm-charts", "type": "git", "url": "https://github.com/dell/helm-charts.git", "version": "container-storage-modules-1.9.2" }, + { "package": "karavi-observability", "type": "git", "url": 
"https://github.com/dell/karavi-observability.git", "version": "v1.15.0" }, + { "package": "helm-charts", "type": "git", "url": "https://github.com/dell/helm-charts.git", "version": "container-storage-modules-1.10.0" }, { "package": "quay.io/jetstack/cert-manager-controller", "tag": "v1.10.0", "type": "image" }, { "package": "quay.io/jetstack/cert-manager-cainjector", "tag": "v1.10.0", "type": "image" }, { "package": "quay.io/jetstack/cert-manager-webhook", "tag": "v1.10.0", "type": "image" }, @@ -99,7 +100,7 @@ { "package": "docker.io/calico/node", "tag": "v3.31.4", "type": "image" }, { "package": "quay.io/metallb/speaker", "tag": "v0.15.3", "type": "image" }, { "package": "calico-v3.31.4","type": "manifest", "url": "https://raw.githubusercontent.com/projectcalico/calico/v3.31.4/manifests/calico.yaml" }, - { "package": "metallb-native-v0.15.3", "type": "manifest", "url": "https://raw.githubusercontent.com/metallb/metallb/v0.15.3/config/manifests/metallb-native.yaml" }, + { "package": "metallb-native-v0.15.3", "type": "manifest", "url": "https://raw.githubusercontent.com/metallb/metallb/v0.15.3/config/manifests/metallb-native.yaml" }, { "package": "helm-v3.20.1-amd64", "type": "tarball", "url": "https://get.helm.sh/helm-v3.20.1-linux-amd64.tar.gz" }, { "package": "nfs-subdir-external-provisioner-4.0.18", "type": "tarball", "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" }, { "package": "kubectl-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"}, @@ -115,7 +116,7 @@ "cluster": [ { "package": "registry.k8s.io/sig-storage/nfs-subdir-external-provisioner", "tag": "v4.0.2", "type": "image" }, { "package": "quay.io/metallb/speaker", "tag": "v0.15.3", "type": "image" }, - { "package": "quay.io/metallb/controller", "tag": "v0.15.3", "type": "image" } + { "package": "quay.io/metallb/controller", "tag": "v0.15.3", "type": "image" } ] } } \ No 
newline at end of file From 2f1a6baee821c33af1eb02044ad5f98d4914d76b Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Fri, 12 Jun 2026 11:42:59 +0530 Subject: [PATCH 16/33] ansible lint fixes Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- upgrade/roles/upgrade_telemetry/vars/main.yml | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml b/upgrade/roles/upgrade_telemetry/vars/main.yml index 62bb33b0c2..ed68b0e89b 100644 --- a/upgrade/roles/upgrade_telemetry/vars/main.yml +++ b/upgrade/roles/upgrade_telemetry/vars/main.yml @@ -102,7 +102,10 @@ victoria_unhealthy_pods_warning: >- victoria_pods_deleted: "Deleted {{ victoria_unhealthy_pods | length }} unhealthy pod(s). Upgrade will re-create them." victoria_backup_completed: "Victoria backup completed: {{ telemetry_backup_dir }}" victoria_crs_applied: "VictoriaMetrics CRs applied (mode: {{ victoria_deploy_mode }}" -victoria_lb_ips_preserved: "LoadBalancer IPs injected into VMCluster manifest - vminsert: {{ preserved_vminsert_ip | default('N/A') }}, vmselect: {{ preserved_vmselect_ip | default('N/A') }}" +victoria_lb_ips_preserved: >- + LoadBalancer IPs injected into VMCluster manifest - + vminsert: {{ preserved_vminsert_ip | default('N/A') }}, + vmselect: {{ preserved_vmselect_ip | default('N/A') }} victoria_lb_ips_not_preserved: "No old LoadBalancer IPs found to preserve (fresh deploy or already operator-managed)" victoria_pods_not_ready: "Telemetry upgrade FAILED: Some pods are not ready. {{ pods_not_ready.stdout | int }} pod(s) not in Running state." victoria_pods_ready_after_wait: "All telemetry pods are ready after waiting" @@ -145,3 +148,12 @@ idrac_patch_msg: >- MySQL will have enough time to flush on NFS during pod restart. idrac_skip_patch_msg: "idrac-telemetry StatefulSet not found (first deploy). Skipping patch." 
idrac_replica_restore_msg: "idrac-telemetry scaled back to {{ idrac_replica_count.stdout }} replicas" + +# Kafka patch messages +kafka_broker_patch_msg: >- + kafka-broker patched: terminationGracePeriodSeconds=300s. + Kafka brokers will have sufficient time for graceful shutdown during rolling restarts. +kafka_controller_patch_msg: >- + kafka-controller patched: terminationGracePeriodSeconds=300s. + Kafka controllers will have sufficient time for graceful shutdown during rolling restarts. +kafka_skip_patch_msg: "Kafka StatefulSets not found (first deploy). Skipping patch." From b5e756e39f6289c79d6e43c483a971b8bfb93a41 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Fri, 12 Jun 2026 15:10:57 +0530 Subject: [PATCH 17/33] update software_config with updated csi driver version Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- .../scripts/transform_software_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upgrade/roles/import_input_parameters/scripts/transform_software_config.py b/upgrade/roles/import_input_parameters/scripts/transform_software_config.py index 9a314e7614..03ae8c47c6 100644 --- a/upgrade/roles/import_input_parameters/scripts/transform_software_config.py +++ b/upgrade/roles/import_input_parameters/scripts/transform_software_config.py @@ -23,7 +23,7 @@ # These are the target versions for software entries that should be updated TARGET_VERSIONS = { "service_k8s": "1.35.1", - "csi_driver_powerscale": "v2.16.0" + "csi_driver_powerscale": "v2.17.0" } with open(backup_file, 'r', encoding='utf-8') as f: From 79fb95d4f6d9335f0516e733f3144b22f001b81a Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Sat, 13 Jun 2026 13:31:17 +0530 Subject: [PATCH 18/33] upgrade powerscale values.yml Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- .../scripts/merge_powerscale_values.py | 110 
+++++++++++------- upgrade/roles/upgrade_telemetry/vars/main.yml | 2 +- 2 files changed, 69 insertions(+), 43 deletions(-) diff --git a/upgrade/roles/import_input_parameters/scripts/merge_powerscale_values.py b/upgrade/roles/import_input_parameters/scripts/merge_powerscale_values.py index 0f236c2027..eefd833b4a 100755 --- a/upgrade/roles/import_input_parameters/scripts/merge_powerscale_values.py +++ b/upgrade/roles/import_input_parameters/scripts/merge_powerscale_values.py @@ -22,24 +22,24 @@ import yaml -def merge_values(v21_file_path, v216_file_path, output_file_path): +def merge_values(vold_file_path, vnew_file_path, output_file_path): """ - Merge v2.1 PowerScale values into v2.16 template. + Merge old PowerScale values into new template. Args: - v21_file_path: Path to v2.1 values.yaml (source settings) - v216_file_path: Path to v2.16 values.yaml (target structure) + vold_file_path: Path to old values.yaml (source settings) + vnew_file_path: Path to new values.yaml (target structure) output_file_path: Path to write merged values.yaml """ - # Load v2.1 values (source of user settings) - with open(v21_file_path, 'r', encoding='utf-8') as file_handle: - v21_values = yaml.safe_load(file_handle) + # Load old values (source of user settings) + with open(vold_file_path, 'r', encoding='utf-8') as file_handle: + vold_values = yaml.safe_load(file_handle) - # Load v2.16 values (target structure with new defaults) - with open(v216_file_path, 'r', encoding='utf-8') as file_handle: - v216_values = yaml.safe_load(file_handle) + # Load new values (target structure with new defaults) + with open(vnew_file_path, 'r', encoding='utf-8') as file_handle: + vnew_values = yaml.safe_load(file_handle) - # Parameters to preserve from v2.1 + # Parameters to preserve from old version preserve_params = [ 'isiPath', 'isiAccessZone', @@ -49,12 +49,12 @@ def merge_values(v21_file_path, v216_file_path, output_file_path): # Preserve top-level parameters for param in preserve_params: - if param in 
v21_values: - v216_values[param] = v21_values[param] - print(f"Preserved {param}: {v21_values[param]}", + if param in vold_values: + vnew_values[param] = vold_values[param] + print(f"Preserved {param}: {vold_values[param]}", file=sys.stderr) - # Preserve feature flags if enabled in v2.1 + # Preserve feature flags if enabled in old version feature_flags = [ 'storageCapacity', 'podmon', @@ -64,62 +64,88 @@ def merge_values(v21_file_path, v216_file_path, output_file_path): ] for feature in feature_flags: - if feature in v21_values and isinstance(v21_values[feature], dict): - if 'enabled' in v21_values[feature]: - if feature not in v216_values: - v216_values[feature] = {} - v216_values[feature]['enabled'] = \ - v21_values[feature]['enabled'] + if feature in vold_values and isinstance(vold_values[feature], dict): + if 'enabled' in vold_values[feature]: + if feature not in vnew_values: + vnew_values[feature] = {} + vnew_values[feature]['enabled'] = \ + vold_values[feature]['enabled'] print(f"Preserved {feature}.enabled: " - f"{v21_values[feature]['enabled']}", + f"{vold_values[feature]['enabled']}", file=sys.stderr) + # Preserve healthMonitor with both enabled and interval + if 'healthMonitor' in vold_values and isinstance(vold_values['healthMonitor'], dict): + if 'healthMonitor' not in vnew_values: + vnew_values['healthMonitor'] = {} + for param in ['enabled', 'interval']: + if param in vold_values['healthMonitor']: + vnew_values['healthMonitor'][param] = vold_values['healthMonitor'][param] + print(f"Preserved healthMonitor.{param}: {vold_values['healthMonitor'][param]}", file=sys.stderr) + # Preserve controller settings - if 'controller' in v21_values and \ - isinstance(v21_values['controller'], dict): - if 'controller' not in v216_values: - v216_values['controller'] = {} + if 'controller' in vold_values and \ + isinstance(vold_values['controller'], dict): + if 'controller' not in vnew_values: + vnew_values['controller'] = {} controller_params = ['nodeSelector', 
'tolerations', 'controllerCount'] for param in controller_params: - if param in v21_values['controller']: - v216_values['controller'][param] = \ - v21_values['controller'][param] - print(f"Preserved controller.{param}: {v21_values['controller'][param]}", file=sys.stderr) + if param in vold_values['controller']: + vnew_values['controller'][param] = \ + vold_values['controller'][param] + print(f"Preserved controller.{param}: {vold_values['controller'][param]}", file=sys.stderr) + + # Preserve controller-level healthMonitor + if 'healthMonitor' in vold_values['controller'] and isinstance(vold_values['controller']['healthMonitor'], dict): + if 'healthMonitor' not in vnew_values['controller']: + vnew_values['controller']['healthMonitor'] = {} + for param in ['enabled', 'interval']: + if param in vold_values['controller']['healthMonitor']: + vnew_values['controller']['healthMonitor'][param] = vold_values['controller']['healthMonitor'][param] + print(f"Preserved controller.healthMonitor.{param}: {vold_values['controller']['healthMonitor'][param]}", file=sys.stderr) # Preserve node settings - if 'node' in v21_values and isinstance(v21_values['node'], dict): - if 'node' not in v216_values: - v216_values['node'] = {} + if 'node' in vold_values and isinstance(vold_values['node'], dict): + if 'node' not in vnew_values: + vnew_values['node'] = {} node_params = ['nodeSelector', 'tolerations'] for param in node_params: - if param in v21_values['node']: - v216_values['node'][param] = v21_values['node'][param] + if param in vold_values['node']: + vnew_values['node'][param] = vold_values['node'][param] print(f"Preserved node.{param}", file=sys.stderr) + # Preserve node-level healthMonitor + if 'healthMonitor' in vold_values['node'] and isinstance(vold_values['node']['healthMonitor'], dict): + if 'healthMonitor' not in vnew_values['node']: + vnew_values['node']['healthMonitor'] = {} + if 'enabled' in vold_values['node']['healthMonitor']: + 
vnew_values['node']['healthMonitor']['enabled'] = vold_values['node']['healthMonitor']['enabled'] + print(f"Preserved node.healthMonitor.enabled: {vold_values['node']['healthMonitor']['enabled']}", file=sys.stderr) + # Write merged values to output file with open(output_file_path, 'w', encoding='utf-8') as file_handle: - yaml.dump(v216_values, file_handle, + yaml.dump(vnew_values, file_handle, default_flow_style=False, sort_keys=False) - print("Successfully merged v2.1 settings into v2.16 values.yaml", + print("Successfully merged old settings into new values.yaml", file=sys.stderr) print(f"Output written to: {output_file_path}", file=sys.stderr) if __name__ == '__main__': if len(sys.argv) != 4: - print("Usage: merge_powerscale_values.py " - " ", file=sys.stderr) + print("Usage: merge_powerscale_values.py " + " ", file=sys.stderr) sys.exit(1) - v21_input = sys.argv[1] - v216_input = sys.argv[2] + vold_input = sys.argv[1] + vnew_input = sys.argv[2] output_path = sys.argv[3] try: - merge_values(v21_input, v216_input, output_path) + merge_values(vold_input, vnew_input, output_path) except (IOError, yaml.YAMLError) as error: print(f"ERROR: Failed to merge PowerScale values.yaml: {error}", file=sys.stderr) diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml b/upgrade/roles/upgrade_telemetry/vars/main.yml index ed68b0e89b..d55c6d6180 100644 --- a/upgrade/roles/upgrade_telemetry/vars/main.yml +++ b/upgrade/roles/upgrade_telemetry/vars/main.yml @@ -101,7 +101,7 @@ victoria_unhealthy_pods_warning: >- proceeds. The upgrade will re-create them with the new version. victoria_pods_deleted: "Deleted {{ victoria_unhealthy_pods | length }} unhealthy pod(s). Upgrade will re-create them." 
victoria_backup_completed: "Victoria backup completed: {{ telemetry_backup_dir }}" -victoria_crs_applied: "VictoriaMetrics CRs applied (mode: {{ victoria_deploy_mode }}" +victoria_crs_applied: "VictoriaMetrics CRs applied (mode: {{ victoria_deploy_mode }})" victoria_lb_ips_preserved: >- LoadBalancer IPs injected into VMCluster manifest - vminsert: {{ preserved_vminsert_ip | default('N/A') }}, From bbef8ae3d76a51b9a2bead7d06209f04727fa7f6 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Sat, 13 Jun 2026 13:33:56 +0530 Subject: [PATCH 19/33] revert kafka patch variables Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- upgrade/roles/upgrade_telemetry/vars/main.yml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml b/upgrade/roles/upgrade_telemetry/vars/main.yml index d55c6d6180..04ddfaa86d 100644 --- a/upgrade/roles/upgrade_telemetry/vars/main.yml +++ b/upgrade/roles/upgrade_telemetry/vars/main.yml @@ -148,12 +148,3 @@ idrac_patch_msg: >- MySQL will have enough time to flush on NFS during pod restart. idrac_skip_patch_msg: "idrac-telemetry StatefulSet not found (first deploy). Skipping patch." idrac_replica_restore_msg: "idrac-telemetry scaled back to {{ idrac_replica_count.stdout }} replicas" - -# Kafka patch messages -kafka_broker_patch_msg: >- - kafka-broker patched: terminationGracePeriodSeconds=300s. - Kafka brokers will have sufficient time for graceful shutdown during rolling restarts. -kafka_controller_patch_msg: >- - kafka-controller patched: terminationGracePeriodSeconds=300s. - Kafka controllers will have sufficient time for graceful shutdown during rolling restarts. -kafka_skip_patch_msg: "Kafka StatefulSets not found (first deploy). Skipping patch." 
From 668db752cb12df0601b3fbbf2e67e9585da96167 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Sat, 13 Jun 2026 14:13:53 +0530 Subject: [PATCH 20/33] update delegation as mount_on_oim can be false also Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- .../tasks/backup_telemetry.yml | 22 ++++++++----------- .../tasks/backup_victoria.yml | 2 +- upgrade/roles/upgrade_telemetry/vars/main.yml | 3 +-- 3 files changed, 11 insertions(+), 16 deletions(-) diff --git a/upgrade/roles/upgrade_telemetry/tasks/backup_telemetry.yml b/upgrade/roles/upgrade_telemetry/tasks/backup_telemetry.yml index ed0ff59f83..fcc2185d4e 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/backup_telemetry.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/backup_telemetry.yml @@ -27,38 +27,34 @@ when: - k8s_client_mount_path is defined - k8s_client_mount_path | length > 0 + - kube_vip is defined + - kube_vip | length > 0 block: - - name: Set telemetry backup directory - ansible.builtin.set_fact: - tel_backup_dir: "{{ k8s_client_mount_path }}/upgrade/telemetry/omnia_{{ manifest.source_version | default('unknown') }}" - delegate_to: oim - connection: ssh - - name: Create telemetry backup directory ansible.builtin.file: - path: "{{ tel_backup_dir }}" + path: "{{ telemetry_backup_dir }}" state: directory mode: '0755' - delegate_to: oim + delegate_to: "{{ kube_vip }}" connection: ssh - name: Backup telemetry folder (pre-provision) ansible.builtin.copy: src: "{{ k8s_client_mount_path }}/telemetry" - dest: "{{ tel_backup_dir }}/telemetry" + dest: "{{ telemetry_backup_dir }}/telemetry" remote_src: true mode: preserve - delegate_to: oim + delegate_to: "{{ kube_vip }}" connection: ssh failed_when: false - name: Backup idrac_telemetry folder (pre-provision) ansible.builtin.copy: src: "{{ k8s_client_mount_path }}/idrac_telemetry" - dest: "{{ tel_backup_dir }}/idrac_telemetry" + dest: "{{ telemetry_backup_dir }}/idrac_telemetry" 
remote_src: true mode: preserve - delegate_to: oim + delegate_to: "{{ kube_vip }}" connection: ssh failed_when: false @@ -74,7 +70,7 @@ - name: Backup telemetry.sh from control plane ansible.builtin.copy: src: /root/telemetry.sh - dest: "{{ tel_backup_dir }}/telemetry.sh" + dest: "{{ telemetry_backup_dir }}/telemetry.sh" mode: "{{ executable_mode }}" remote_src: true delegate_to: "{{ kube_vip }}" diff --git a/upgrade/roles/upgrade_telemetry/tasks/backup_victoria.yml b/upgrade/roles/upgrade_telemetry/tasks/backup_victoria.yml index 9c6a487b78..abfdd4d107 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/backup_victoria.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/backup_victoria.yml @@ -21,7 +21,7 @@ path: "{{ telemetry_backup_dir }}" state: directory mode: "0755" - delegate_to: "{{ oim_host }}" + delegate_to: "{{ kube_vip }}" connection: ssh # ── Backup namespace-level resources ── diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml b/upgrade/roles/upgrade_telemetry/vars/main.yml index 04ddfaa86d..c39efbcc3b 100644 --- a/upgrade/roles/upgrade_telemetry/vars/main.yml +++ b/upgrade/roles/upgrade_telemetry/vars/main.yml @@ -26,8 +26,7 @@ oim_host: oim executable_mode: "0755" # Upgrade directory paths (on k8s NFS share, resolved at runtime) -telemetry_upgrade_dir: "{{ k8s_client_mount_path }}/upgrade/telemetry" -telemetry_backup_dir: "{{ telemetry_upgrade_dir }}/omnia_{{ manifest.source_version | default('unknown') }}" +telemetry_backup_dir: "{{ k8s_client_mount_path }}/upgrade/backup/telemetry/omnia_{{ manifest.source_version | default('unknown') }}" # PV backup location (cluster-wide backup pre-provision) telemetry_pv_backup_file: "{{ telemetry_backup_dir }}/all_pvs.yaml" From bc89696b7ccf4ff31af0a2ec46f150b4a9076b57 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Sat, 13 Jun 2026 14:39:33 +0530 Subject: [PATCH 21/33] update vars Signed-off-by: priti-parate 
<140157516+priti-parate@users.noreply.github.com> --- .../roles/upgrade_telemetry/tasks/migrate_statefulset.yml | 4 ++-- upgrade/roles/upgrade_telemetry/vars/main.yml | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml b/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml index 4d0efd1081..23648e2ed6 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml @@ -83,7 +83,7 @@ - name: Wait for background merges to settle after writes stopped ansible.builtin.pause: seconds: 30 - prompt: "Waiting 30s for vmstorage background merges to settle..." + prompt: "{{ vmstorage_merge_wait_msg }}" # ── Ensure sufficient graceful shutdown period ── # Old StatefulSet may have default 30s which is too short for indexdb flush @@ -130,7 +130,7 @@ - name: Wait for storage cache flush after pod termination ansible.builtin.pause: seconds: 15 - prompt: "Waiting 15s for storage cache flush..." + prompt: "{{ storage_cache_flush_msg }}" # ── PVC relabeling (data preservation via PV rebind) ── - name: Get all old PVCs from StatefulSet (using specific StatefulSet label) diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml b/upgrade/roles/upgrade_telemetry/vars/main.yml index c39efbcc3b..a869e5a52a 100644 --- a/upgrade/roles/upgrade_telemetry/vars/main.yml +++ b/upgrade/roles/upgrade_telemetry/vars/main.yml @@ -142,6 +142,12 @@ mysql_crash_error_msg: | 1. Check pod logs: kubectl logs -n telemetry -c mysqldb 2. Check PVC status: kubectl get pvc -n telemetry | grep idrac 3. Contact support if issue persists. + +# ============================================================================ +# PAUSE MESSAGES +# ============================================================================ +vmstorage_merge_wait_msg: "Waiting 30s for vmstorage background merges to settle..." 
+storage_cache_flush_msg: "Waiting 15s for storage cache flush..." idrac_patch_msg: >- idrac-telemetry patched: terminationGracePeriodSeconds=120s. MySQL will have enough time to flush on NFS during pod restart. From 6abf3201b069ff067134ef96555240e70253d5f0 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Wed, 17 Jun 2026 14:32:19 +0530 Subject: [PATCH 22/33] example files for powescale Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- .../CSI_driver/secret.yaml | 90 ++++ .../CSI_driver/values.yaml | 437 ++++++++++++++++++ .../powerscale_metrics/values.yaml | 221 +++++++++ 3 files changed, 748 insertions(+) create mode 100644 examples/powerscale_reference_files/CSI_driver/secret.yaml create mode 100644 examples/powerscale_reference_files/CSI_driver/values.yaml create mode 100644 examples/powerscale_reference_files/powerscale_metrics/values.yaml diff --git a/examples/powerscale_reference_files/CSI_driver/secret.yaml b/examples/powerscale_reference_files/CSI_driver/secret.yaml new file mode 100644 index 0000000000..75888d6023 --- /dev/null +++ b/examples/powerscale_reference_files/CSI_driver/secret.yaml @@ -0,0 +1,90 @@ +# Copyright © 2020-2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +isilonClusters: + # logical name of PowerScale Cluster + - clusterName: "cluster1" + + # username for connecting to PowerScale OneFS API server + # if authorization is enabled, username will be ignored + # Default value: None + username: "user" + + # password for connecting to PowerScale OneFS API server + # if authorization is enabled, password will be ignored + password: "password" + + # HTTPS endpoint of the PowerScale OneFS API server + # if authorization is enabled, the endpoint should be the localhost address of the csm-authorization-sidecar + # Default value: None + # Examples: "1.2.3.4", "https://1.2.3.4", "https://abc.myonefs.com" + endpoint: "1.2.3.4" + + # endpointPort: Specify the HTTPs port number of the PowerScale OneFS API server + # Formerly this attribute was named as "isiPort" + # If authorization is enabled, endpointPort must match the port specified in the endpoint parameter of the karavi-authorization-config secret + # Allowed value: valid port number + # Default value: 8080 + # endpointPort: 8080 + + # Is this a default cluster (would be used by storage classes without ClusterName parameter) + # Allowed values: + # true: mark this cluster config as default + # false: mark this cluster config as not default + # Default value: false + isDefault: true + + # Specify whether the PowerScale OneFS API server's certificate chain and host name should be verified. + # Allowed values: + # true: skip OneFS API server's certificate verification + # false: verify OneFS API server's certificates + # Default value: default value specified in values.yaml + # skipCertificateValidation: true + + # The base path for the volumes to be created on PowerScale cluster + # This will be used if a storage class does not have the IsiPath parameter specified. + # Ensure that this path exists on PowerScale cluster.
+ # Allowed values: unix absolute path + # Default value: default value specified in values.yaml + # Examples: "/ifs/data/csi", "/ifs/engineering" + # isiPath: "/ifs/data/csi" + + # The permissions for isi volume directory path + # This will be used if a storage class does not have the IsiVolumePathPermissions parameter specified. + # Allowed values: valid octal mode number + # Default value: "0777" + # Examples: "0777", "777", "0755" + # isiVolumePathPermissions: "0777" + + # ignoreUnresolvableHosts: Ignore unresolvable hosts on the OneFS + # When set to true, OneFS allows a new host to be added to the existing export list even though some of the existing hosts from the + # same export are unresolvable or no longer exist. + # Allowed values: + # true: ignore existing unresolvable hosts and append new host to the existing export + # false: exhibits OneFS default behavior i.e. if any of existing hosts are unresolvable while adding new one it fails + # Default value: false + # ignoreUnresolvableHosts: false + + # Unique ID of the certificate used to encrypt the replication policy + # This will be used if encrypted replication is enabled; leave empty if you use unencrypted replication + # Allowed values: string, unique id of the certificate + # Default value: "" + # Examples: "dd9c736cc17e6dd5f7d85fe13528cfc20f3b4b0af4f26595d22328c8d1f461af" + # replicationCertificateID: "" + + # To add more PowerScale systems, uncomment the following lines and provide the required values + # - clusterName: "cluster2" + # username: "user" + # password: "password" + # endpoint: "1.2.3.4" + # endpointPort: "8080" diff --git a/examples/powerscale_reference_files/CSI_driver/values.yaml b/examples/powerscale_reference_files/CSI_driver/values.yaml new file mode 100644 index 0000000000..14826ff22e --- /dev/null +++ b/examples/powerscale_reference_files/CSI_driver/values.yaml @@ -0,0 +1,437 @@ +## K8S/DRIVER ATTRIBUTES +######################## +# version: version of this values file +# Note: Do not
change this value +version: "v2.17.0" + +images: + # "driver" defines the container image, used for the driver container. + driver: + image: quay.io/dell/container-storage-modules/csi-isilon:v2.17.0 + # CSI sidecars + attacher: + image: registry.k8s.io/sig-storage/csi-attacher:v4.11.0 + provisioner: + image: registry.k8s.io/sig-storage/csi-provisioner:v6.2.0 + snapshotter: + image: registry.k8s.io/sig-storage/csi-snapshotter:v8.5.0 + resizer: + image: registry.k8s.io/sig-storage/csi-resizer:v2.1.0 + registrar: + image: registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.16.0 + healthmonitor: + image: registry.k8s.io/sig-storage/csi-external-health-monitor-controller:v0.17.0 + + # CSM sidecars + replication: + image: quay.io/dell/container-storage-modules/dell-csi-replicator:v1.15.0 + podmon: + image: quay.io/dell/container-storage-modules/podmon:v1.16.0 + authorization: + image: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0 + metadataretriever: + image: quay.io/dell/container-storage-modules/csi-metadata-retriever:v1.14.0 + +# CSI driver log level +# Allowed values: "error", "warn"/"warning", "info", "debug" +# Default value: "info" +logLevel: "info" + +# certSecretCount: Represents number of certificate secrets, which user is going to create for +# ssl authentication. (isilon-cert-0..isilon-cert-n) +# Allowed values: n, where n > 0 +# Default value: None +certSecretCount: 1 + +# allowedNetworks: Custom networks for PowerScale export +# Specify list of networks which can be used for NFS I/O traffic; CIDR format should be used. +# Allowed values: list of one or more networks +# Default value: None +# Examples: [192.168.1.0/24, 192.168.100.0/22] +allowedNetworks: [] + +# maxIsilonVolumesPerNode: Specify default value for maximum number of volumes that controller can publish to the node. +# If value is zero CO SHALL decide how many volumes of this type can be published by the controller to the node. 
+# This limit is applicable to all the nodes in the cluster for which node label 'max-isilon-volumes-per-node' is not set. +# Allowed values: n, where n >= 0 +# Default value: 0 +maxIsilonVolumesPerNode: 0 + +# imagePullPolicy: Policy to determine if the image should be pulled prior to starting the container. +# Allowed values: +# Always: Always pull the image. +# IfNotPresent: Only pull the image if it does not already exist on the node. +# Never: Never pull the image. +# Default value: None +imagePullPolicy: IfNotPresent + +# verbose: Indicates what content of the OneFS REST API message should be logged in debug level logs +# Allowed Values: +# 0: log full content of the HTTP request and response +# 1: log without the HTTP response body +# 2: log only 1st line of the HTTP request and response +# Default value: 0 +verbose: 1 + +# Specify kubelet config dir path. +# Ensure that the config.yaml file is present at this path. +# Default value: /var/lib/kubelet +kubeletConfigDir: /var/lib/kubelet + +# enableCustomTopology: Specify if custom topology label .dellemc.com/: +# has to be used for making connection to backend PowerScale Array. +# If enableCustomTopology is set to true, then do not specify allowedTopologies in storage class. +# Allowed values: +# true : enable custom topology +# false: disable custom topology +# Default value: false +enableCustomTopology: false + +# fsGroupPolicy: Defines if the underlying volume supports changing ownership and permission of the volume before being mounted. +# Allowed values: +# ReadWriteOnceWithFSType: supports volume ownership and permissions change only if the fsType is defined +# and the volume's accessModes contains ReadWriteOnce. +# File: kubernetes may use fsGroup to change permissions and ownership of the volume +# to match user requested fsGroup in the pod's security policy regardless of fstype or access mode. +# None: volumes will be mounted with no modifications. 
+# Default value: ReadWriteOnceWithFSType +fsGroupPolicy: ReadWriteOnceWithFSType + +# podmonAPIPort: Defines the port to be used within the kubernetes cluster +# Allowed values: +# Any valid and free port. +# Default value: 8083 +podmonAPIPort: 8083 + +# maxPathLen: this parameter is used for setting the maximum Path length for the given volume. +# Default value: 192 +# Examples: 192, 256 +maxPathLen: 192 + +# azReconcileInterval: Interval to monitor and reconcile network interface labels on nodes. +# Allowed values: Number followed by unit of time (s,m,h) +# Default value: 1h +azReconcileInterval: 1h + +# controller: configure controller pod specific parameters +controller: + # controllerCount: defines the number of csi-powerscale controller pods to deploy to + # the Kubernetes release. + # Allowed values: n, where n > 0 + # Default value: None + controllerCount: 1 + + # volumeNamePrefix: Prefix of PersistentVolume names created + # Allowed values: string + # Default value: csivol + # Examples: "k8s", "app1" + volumeNamePrefix: csivol + + # leaderElection: configure leader election parameters + leaderElection: + # Duration, that non-leader candidates will wait to force acquire leadership + # Allowed values: Duration, in seconds. Must be greater than leaderElectionRenewDeadline + # Default value: 15s + leaderElectionLeaseDuration: 15s + + # Duration, that the acting leader will retry refreshing leadership before giving up + # Allowed values: Duration, in seconds. Must be greater than leaderElectionRetryPeriod + # Default value: 10s + leaderElectionRenewDeadline: 10s + + # Duration, the LeaderElector clients should wait between tries of actions. 
+ # Allowed values: Duration, in seconds + # Default value: 5s + leaderElectionRetryPeriod: 5s + + # replication: allows to configure replication + # Replication CRDs must be installed before installing driver + replication: + # enabled: Enable/Disable replication feature + # Allowed values: + # true: enable replication feature(install dell-csi-replicator sidecar) + # false: disable replication feature(do not install dell-csi-replicator sidecar) + # Default value: false + enabled: false + + # replicationContextPrefix: prefix to use for naming of resources created by replication feature + # Allowed values: string + # Default value: powerscale + replicationContextPrefix: "powerscale" + + # replicationPrefix: prefix to prepend to storage classes parameters + # Allowed values: string + # Default value: replication.storage.dell.com + replicationPrefix: "replication.storage.dell.com" + + snapshot: + # enabled: Enable/Disable volume snapshot feature + # Allowed values: + # true: enable volume snapshot feature(install snapshotter sidecar) + # false: disable volume snapshot feature(do not install snapshotter sidecar) + # Default value: None + enabled: true + + # snapNamePrefix: Prefix to apply to the names of created snapshots + # Allowed values: string + # Default value: csi-snap + # Examples: "snap", "snapshot" + snapNamePrefix: csi-snap + + resizer: + # enabled: Enable/Disable volume expansion feature + # Allowed values: + # true: enable volume expansion feature(install resizer sidecar) + # false: disable volume expansion feature(do not install resizer sidecar) + # Default value: None + enabled: false + + healthMonitor: + # enabled: Enable/Disable health monitor of CSI volumes- volume status, volume condition + # Allowed values: + # true: enable checking of health condition of CSI volumes + # false: disable checking of health condition of CSI volumes + # Default value: None + enabled: true + + # interval: Interval of monitoring volume health condition + # Allowed
values: Number followed by unit of time (s,m,h) + # Default value: 60s + interval: 60s + + # nodeSelector: Define node selection constraints for pods of controller deployment. + # For the pod to be eligible to run on a node, the node must have each + # of the indicated key-value pairs as labels. + # Leave as blank to consider all nodes + # Allowed values: map of key-value pairs + # Default value: None + nodeSelector: + # Uncomment if nodes you wish to use have the node-role.kubernetes.io/master taint + # node-role.kubernetes.io/master: "" + # Uncomment if nodes you wish to use have the node-role.kubernetes.io/control-plane taint + # node-role.kubernetes.io/control-plane: "" + + # tolerations: Define tolerations for the controller deployment, if required. + # Default value: None + # Uncomment if nodes you wish to use have the node-role.kubernetes.io/master taint + tolerations: + # - key: "node-role.kubernetes.io/master" + # operator: "Exists" + # effect: "NoSchedule" + # Uncomment if nodes you wish to use have the node-role.kubernetes.io/control-plane taint + # tolerations: + # - key: "node-role.kubernetes.io/control-plane" + # operator: "Exists" + # effect: "NoSchedule" + +# node: configure node pod specific parameters +node: + # nodeSelector: Define node selection constraints for pods of node daemonset + # For the pod to be eligible to run on a node, the node must have each + # of the indicated key-value pairs as labels. + # Leave as blank to consider all nodes + # Allowed values: map of key-value pairs + # Default value: None + nodeSelector: + # Uncomment if nodes you wish to use have the node-role.kubernetes.io/master taint + # node-role.kubernetes.io/master: "" + # Uncomment if nodes you wish to use have the node-role.kubernetes.io/control-plane taint + # node-role.kubernetes.io/control-plane: "" + + # tolerations: Define tolerations for the node daemonset, if required. 
+ # Default value: None + # Uncomment if nodes you wish to use have the node-role.kubernetes.io/master taint + tolerations: + # - key: "node.kubernetes.io/memory-pressure" + # operator: "Exists" + # effect: "NoExecute" + # - key: "node.kubernetes.io/disk-pressure" + # operator: "Exists" + # effect: "NoExecute" + # - key: "node.kubernetes.io/network-unavailable" + # operator: "Exists" + # effect: "NoExecute" + # - key: "node-role.kubernetes.io/master" + # operator: "Exists" + # effect: "NoSchedule" + # Uncomment if nodes you wish to use have the node-role.kubernetes.io/control-plane taint + # tolerations: + # - key: "node-role.kubernetes.io/control-plane" + # operator: "Exists" + # effect: "NoSchedule" + + # Uncomment if CSM for Resiliency and CSI Driver pods monitor are enabled + # tolerations: + # - key: "offline.vxflexos.storage.dell.com" + # operator: "Exists" + # effect: "NoSchedule" + # - key: "vxflexos.podmon.storage.dell.com" + # operator: "Exists" + # effect: "NoSchedule" + # - key: "offline.unity.storage.dell.com" + # operator: "Exists" + # effect: "NoSchedule" + # - key: "unity.podmon.storage.dell.com" + # operator: "Exists" + # effect: "NoSchedule" + # - key: "offline.isilon.storage.dell.com" + # operator: "Exists" + # effect: "NoSchedule" + # - key: "isilon.podmon.storage.dell.com" + # operator: "Exists" + # effect: "NoSchedule" + + # dnsPolicy: Determines the DNS Policy of the Node service. + # Allowed values: + # Default: The Pod inherits the name resolution configuration from the node that the pods run on. + # ClusterFirst: Any DNS query that does not match the configured cluster domain suffix, such as "www.kubernetes.io", + # is forwarded to the upstream nameserver inherited from the node. + # ClusterFirstWithHostNet: For Pods running with hostNetwork, you should explicitly set this DNS policy. + # None: It allows a Pod to ignore DNS settings from the Kubernetes environment. 
+ # All DNS settings are supposed to be provided using the dnsConfig field in the Pod Spec. + # Default value: ClusterFirst + # ClusterFirstWithHostNet is the recommended DNS policy. + # Prior to v1.5 of the driver, the default DNS policy was ClusterFirst. + # In certain scenarios, users might need to change the default dnsPolicy. + dnsPolicy: ClusterFirstWithHostNet + + healthMonitor: + # enabled: Enable/Disable health monitor of CSI volumes- volume usage, volume condition + # Allowed values: + # true: enable checking of health condition of CSI volumes + # false: disable checking of health condition of CSI volumes + # Default value: None + enabled: true + +## PLATFORM ATTRIBUTES +###################### +# endpointPort: Specify the HTTPs port number of the PowerScale OneFS API server +# Formerly this attribute was named as "isiPort" +# This value acts as a default value for endpointPort, if not specified for a cluster config in secret +# If authorization is enabled, endpointPort must match the port specified in the endpointPort parameter of the isilon-creds secret +# Allowed value: valid port number +# Default value: 8080 +endpointPort: 8080 + +# skipCertificateValidation: Specify whether the PowerScale OneFS API server's certificate chain and host name should be verified. +# Formerly this attribute was named as "isiInsecure" +# This value acts as a default value for skipCertificateValidation, if not specified for a cluster config in secret +# Allowed values: +# true: skip OneFS API server's certificate verification +# false: verify OneFS API server's certificates +# Default value: false +skipCertificateValidation: true + +# isiAuthType: Indicates whether the authentication will be session-based or basic. +# Allowed values: +# 0: enables basic Authentication +# 1: enables session-based Authentication +# Default value: 0 +isiAuthType: 0 + +# isiAccessZone: The name of the access zone a volume can be created in. 
+# If storageclass is missing with AccessZone parameter, then value of isiAccessZone is used for the same. +# Default value: System +# Examples: System, zone1 +isiAccessZone: System + +# enableQuota: Indicates whether the provisioner should attempt to set (later unset) quota +# on a newly provisioned volume. +# This requires SmartQuotas to be enabled on PowerScale cluster. +# Allowed values: +# true: set quota for volume +# false: do not set quota for volume +enableQuota: true + +# isiPath: The base path for the volumes to be created on PowerScale cluster. +# This value acts as a default value for isiPath, if not specified for a cluster config in secret +# Ensure that this path exists on PowerScale cluster. +# Allowed values: unix absolute path +# Default value: /ifs +# Examples: /ifs/data/csi, /ifs/engineering +isiPath: /ifs/data/csi + +# isiVolumePathPermissions: The permissions for isi volume directory path +# This value acts as a default value for isiVolumePathPermissions, if not specified for a cluster config in secret +# Allowed values: valid octal mode number +# Default value: "0777" +# Examples: "0777", "777", "0755" +isiVolumePathPermissions: "0777" + +# ignoreUnresolvableHosts: Ignore unresolvable hosts on the OneFS +# When set to true, OneFS allows new host to add to existing export list though any of the existing hosts from the +# same exports are unresolvable/doesn't exist anymore. +# Allowed values: +# true: ignore existing unresolvable hosts and append new host to the existing export +# false: exhibits OneFS default behavior i.e. if any of existing hosts are unresolvable while adding new one it fails +# Default value: false +ignoreUnresolvableHosts: false + +# noProbeOnStart: Indicates whether the controller/node should probe all the PowerScale clusters during driver initialization +# When set to true, the driver will not set node labels, please manually add +# the label <provisionerName>.dellemc.com/<powerscalefqdnorip>: <provisionerName> on the nodes for each of the clusters reachable from the node.
+# Allowed values: +# true : do not probe all PowerScale clusters during driver initialization +# false: probe all PowerScale clusters during driver initialization +# Default value: false +noProbeOnStart: false + +# autoProbe: automatically probe the PowerScale cluster if not done already during CSI calls. +# Allowed values: +# true : enable auto probe. +# false: disable auto probe. +# Default value: false +autoProbe: true + +authorization: + enabled: false + # proxyHost: hostname of the csm-authorization server + # Default value: None + proxyHost: + # skipCertificateValidation: certificate validation of the csm-authorization server + # Allowed Values: + # "true" - TLS certificate verification will be skipped + # "false" - TLS certificate will be verified + # Default value: "true" + skipCertificateValidation: true + +# Storage Capacity Tracking +# Note: Capacity tracking is supported in kubernetes v1.24 and above, this feature will be automatically disabled in older versions. +storageCapacity: + # enabled : Enable/Disable storage capacity tracking + # Allowed values: + # true: enable storage capacity tracking + # false: disable storage capacity tracking + # Default value: true + enabled: true + # pollInterval : Configure how often external-provisioner polls the driver to detect changed capacity + # Allowed values: 1m,2m,3m,...,10m,...,60m etc + # Default value: 5m + pollInterval: 5m + +# Enable this feature only after contact support for additional information +podmon: + enabled: false + controller: + args: + - "--csisock=unix:/var/run/csi/csi.sock" + - "--labelvalue=csi-isilon" + - "--arrayConnectivityPollRate=60" + - "--driverPath=csi-isilon.dellemc.com" + - "--mode=controller" + - "--skipArrayConnectionValidation=false" + - "--driver-config-params=/csi-isilon-config-params/driver-config-params.yaml" + - "--driverPodLabelValue=dell-storage" + - "--ignoreVolumelessPods=false" + + node: + args: + - "--csisock=unix:/var/lib/kubelet/plugins/csi-isilon/csi_sock" + - 
"--labelvalue=csi-isilon" + - "--arrayConnectivityPollRate=60" + - "--driverPath=csi-isilon.dellemc.com" + - "--mode=node" + - "--leaderelection=false" + - "--driver-config-params=/csi-isilon-config-params/driver-config-params.yaml" + - "--driverPodLabelValue=dell-storage" + - "--ignoreVolumelessPods=false" diff --git a/examples/powerscale_reference_files/powerscale_metrics/values.yaml b/examples/powerscale_reference_files/powerscale_metrics/values.yaml new file mode 100644 index 0000000000..a89148cd79 --- /dev/null +++ b/examples/powerscale_reference_files/powerscale_metrics/values.yaml @@ -0,0 +1,221 @@ +karaviMetricsPowerflex: + image: quay.io/dell/container-storage-modules/csm-metrics-powerflex:v1.15.0 + enabled: false + collectorAddr: otel-collector:55680 + # comma separated list of provisioner names (ex: csi-vxflexos.dellemc.com) + provisionerNames: csi-vxflexos.dellemc.com + # set sdcMetricsEnabled to "false" to disable collection of SDC metrics + sdcMetricsEnabled: "true" + # set polling frequency to the PowerFlex array to get metrics data + sdcPollFrequencySeconds: 10 + volumePollFrequencySeconds: 10 + # set volumeMetricsEnabled to "false" to disable collection of Volume metrics + volumeMetricsEnabled: "true" + # set storageClassPoolMetricsEnabled to "false" to disable collection of storage class/pool metrics + storageClassPoolMetricsEnabled: "true" + # set the polling frequency to configure the interval which storage class/pool metrics are gathered + storageClassPoolPollFrequencySeconds: 10 + # set topologyMetricsEnabled to "false" to disable collection of topology metrics + topologyMetricsEnabled: "true" + # set polling frequency to get topology metrics + topologyMetricsPollFrequencySeconds: 30 + # set the default max concurrent queries to PowerFlex + concurrentPowerflexQueries: 10 + # set the default endpoint for PowerFlex service + endpoint: karavi-metrics-powerflex + service: + type: ClusterIP + logLevel: INFO + logFormat: text + authorization: +
enabled: false + # sidecarProxy.image: the container image used for the csm-authorization-sidecar. + # Default value: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0 + sidecarProxy: + image: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0 + # proxyHost: hostname of the csm-authorization server + # Default value: None + proxyHost: + # skipCertificateValidation: certificate validation of the csm-authorization server + # Allowed Values: + # "true" - TLS certificate verification will be skipped + # "false" - TLS certificate will be verified + # Default value: "true" + skipCertificateValidation: true + +karaviMetricsPowerstore: + image: quay.io/dell/container-storage-modules/csm-metrics-powerstore:v1.15.0 + enabled: false + collectorAddr: otel-collector:55680 + # comma separated list of provisioner names (ex: csi-powerstore.dellemc.com) + provisionerNames: csi-powerstore.dellemc.com + # set polling frequency to the PowerStore array to get metrics data + volumePollFrequencySeconds: 20 + spacePollFrequencySeconds: 300 + arrayPollFrequencySeconds: 300 + filesystemPollFrequencySeconds: 20 + # apiTimeout: Defines the timeout for PowerStore API calls in seconds + # Allowed values: Number followed by unit (s,m,h) + # Examples: 60s, 5m, 1h + # Default value: 120s + apiTimeout: "120s" + # set volumeMetricsEnabled to "false" to disable collection of Volume metrics + volumeMetricsEnabled: "true" + # set the default max concurrent queries to PowerStore + concurrentPowerstoreQueries: 10 + # set topologyMetricsEnabled to "false" to disable collection of topology metrics + topologyMetricsEnabled: "true" + # set polling frequency to get topology metrics + topologyMetricsPollFrequencySeconds: 30 + # set the default endpoint for PowerStore service + endpoint: karavi-metrics-powerstore + service: + type: ClusterIP + logLevel: INFO + logFormat: text + zipkin: + uri: "" + serviceName: metrics-powerstore + probability: 0.0 + authorization:
+ enabled: false + # sidecarProxy.image: the container image used for the csm-authorization-sidecar. + # Default value: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0 + sidecarProxy: + image: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0 + # proxyHost: hostname of the csm-authorization server + # Default value: None + proxyHost: + # skipCertificateValidation: certificate validation of the csm-authorization server + # Allowed Values: + # "true" - TLS certificate verification will be skipped + # "false" - TLS certificate will be verified + # Default value: "true" + skipCertificateValidation: true + +karaviMetricsPowerscale: + image: quay.io/dell/container-storage-modules/csm-metrics-powerscale:v1.12.0 + enabled: true + collectorAddr: otel-collector:55680 + # comma separated list of provisioner names (ex: csi-isilon.dellemc.com) + provisionerNames: csi-isilon.dellemc.com + # set capacityMetricsEnabled to "false" to disable collection of capacity metrics + capacityMetricsEnabled: "true" + # set performanceMetricsEnabled to "false" to disable collection of performance metrics + performanceMetricsEnabled: "true" + # set topologyMetricsEnabled to "false" to disable collection of topology metrics + topologyMetricsEnabled: "true" + # set polling frequency to get cluster capacity metrics data + clusterCapacityPollFrequencySeconds: 30 + # set polling frequency to get cluster performance data + clusterPerformancePollFrequencySeconds: 20 + # set polling frequency to get quota capacity metrics data + quotaCapacityPollFrequencySeconds: 30 + # set polling frequency to get topology metrics + topologyMetricsPollFrequencySeconds: 30 + # set the default max concurrent queries to PowerScale + concurrentPowerscaleQueries: 10 + # set the default endpoint for PowerScale service + endpoint: karavi-metrics-powerscale + service: + type: ClusterIP + logLevel: INFO + logFormat: text + # isiClientOptions to access Powerscale OneFS API server + 
isiClientOptions: + # set isiSkipCertificateValidation to true/false to skip/verify OneFS API server's certificates + # default isiSkipCertificateValidation: true to skip OneFS API server's certificates + isiSkipCertificateValidation: true + # set isiAuthType to 0/1 to enables session-based/basic Authentication + # default isiAuthType: 0 to use session-based Authentication + isiAuthType: 1 + # set isiLogVerbose to 0/1/2 decide High/Medium/Low content of the OneFS REST API message should be logged in debug level logs + # default isiLogVerbose: 0 to log full content of the HTTP request and response + isiLogVerbose: 0 + authorization: + enabled: false + # sidecarProxy.image: the container image used for the csm-authorization-sidecar. + # Default value: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0 + sidecarProxy: + image: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0 + # proxyHost: hostname of the csm-authorization server + # Default value: None + proxyHost: + # skipCertificateValidation: certificate validation of the csm-authorization server + # Allowed Values: + # "true" - TLS certificate verification will be skipped + # "false" - TLS certificate will be verified + # Default value: "true" + skipCertificateValidation: true + +karaviMetricsPowermax: + image: quay.io/dell/container-storage-modules/csm-metrics-powermax:v1.10.0 + enabled: false + collectorAddr: otel-collector:55680 + # comma separated list of provisioner names (ex: csi-powermax.dellemc.com) + provisionerNames: csi-powermax.dellemc.com + # set capacityMetricsEnabled to "false" to disable collection of capacity metrics + capacityMetricsEnabled: "true" + # set performanceMetricsEnabled to "false" to disable collection of performance metrics + performanceMetricsEnabled: "true" + # set polling frequency to get capacity metrics data for volume, storagegroup, srp and array + capacityPollFrequencySeconds: 3600 + # set polling frequency to get performance 
metrics data for volume, storagegroup + performancePollFrequencySeconds: 300 + # set the default max concurrent queries to PowerMax + concurrentPowermaxQueries: 10 + # set topologyMetricsEnabled to "false" to disable collection of topology metrics + topologyMetricsEnabled: "true" + # set polling frequency to get topology metrics + topologyMetricsPollFrequencySeconds: 300 + # set the default endpoint for PowerMax service + endpoint: karavi-metrics-powermax + # useSecret + # Defines if a Secret should be used to provide Unisphere for PowerMax endpoints + # and login credentials instead of the deprecated powermax-reverseproxy-config ConfigMap. + # If set to true, the contents of the secret specified by defaultCredentialsSecret + # will be used, in the new format, to specify Unisphere for PowerMax endpoints, array IDs, + # and login credentials. If set to false, the deprecated ConfigMap will be automatically + # created and used. + # Default value: false + useSecret: false + # defaultCredentialsSecret + # The name of the Kubernetes Secret containing the details of the PowerMax arrays, + # their Unisphere endpoints and their login credentials if useSecret is set to true. + # Default value: "" + defaultCredentialsSecret: "" + service: + type: ClusterIP + logLevel: INFO + logFormat: text + authorization: + enabled: false + # sidecarProxy.image: the container image used for the csm-authorization-sidecar. 
+ # Default value: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0 + sidecarProxy: + image: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.5.0 + # proxyHost: hostname of the csm-authorization server + # Default value: None + proxyHost: + # skipCertificateValidation: certificate validation of the csm-authorization server + # Allowed Values: + # "true" - TLS certificate verification will be skipped + # "false" - TLS certificate will be verified + # Default value: "true" + skipCertificateValidation: true + +otelCollector: + image: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector:0.150.1 + service: + type: ClusterIP + nginxProxy: + image: nginxinc/nginx-unprivileged:1.29 +# Karavi-observability requires cert-manager. If cert-manager is already present in cluster, set enabled to false not to install it. +cert-manager: + enabled: true + startupapicheck: + enabled: false + serviceAccount: + create: false +# Optionally, uncomment and specify the name of the pre-created namespace to install the module in it +# namespace: \ No newline at end of file From f6dfbd062266ea50d0f42f2bf15c396b7bb558ce Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Wed, 17 Jun 2026 14:58:52 +0530 Subject: [PATCH 23/33] remove old files Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- .../powerscale_reference_files/secret.yaml | 90 ---- .../powerscale_reference_files/values.yaml | 437 ------------------ 2 files changed, 527 deletions(-) delete mode 100644 examples/powerscale_reference_files/secret.yaml delete mode 100644 examples/powerscale_reference_files/values.yaml diff --git a/examples/powerscale_reference_files/secret.yaml b/examples/powerscale_reference_files/secret.yaml deleted file mode 100644 index 75888d6023..0000000000 --- a/examples/powerscale_reference_files/secret.yaml +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright © 2020-2025 
Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -isilonClusters: - # logical name of PowerScale Cluster - - clusterName: "cluster1" - - # username for connecting to PowerScale OneFS API server - # if authorization is enabled, username will be ignored - # Default value: None - username: "user" - - # password for connecting to PowerScale OneFS API server - # if authorization is enabled, password will be ignored - password: "password" - - # HTTPS endpoint of the PowerScale OneFS API server - # if authorization is enabled, the endpont should be the localhost address of the csm-authorization-sidecar - # Default value: None - # Examples: "1.2.3.4", "https://1.2.3.4", "https://abc.myonefs.com" - endpoint: "1.2.3.4" - - # endpointPort: Specify the HTTPs port number of the PowerScale OneFS API server - # Formerly this attribute was named as "isiPort" - # If authorization is enabled, endpointPort must match the port specified in the endpoint parameter of the karavi-authorization-config secret - # Allowed value: valid port number - # Default value: 8080 - # endpointPort: 8080 - - # Is this a default cluster (would be used by storage classes without ClusterName parameter) - # Allowed values: - # true: mark this cluster config as default - # false: mark this cluster config as not default - # Default value: false - isDefault: true - - # Specify whether the PowerScale OneFS API server's certificate chain and host name should be 
verified. - # Allowed values: - # true: skip OneFS API server's certificate verification - # false: verify OneFS API server's certificates - # Default value: default value specified in values.yaml - # skipCertificateValidation: true - - # The base path for the volumes to be created on PowerScale cluster - # This will be used if a storage class does not have the IsiPath parameter specified. - # Ensure that this path exists on PowerScale cluster. - # Allowed values: unix absolute path - # Default value: default value specified in values.yaml - # Examples: "/ifs/data/csi", "/ifs/engineering" - # isiPath: "/ifs/data/csi" - - # The permissions for isi volume directory path - # This will be used if a storage class does not have the IsiVolumePathPermissions parameter specified. - # Allowed values: valid octal mode number - # Default value: "0777" - # Examples: "0777", "777", "0755" - # isiVolumePathPermissions: "0777" - - # ignoreUnresolvableHosts: Ignore unresolvable hosts on the OneFS - # When set to true, OneFS allows new host to add to existing export list though any of the existing hosts from the - # same exports are unresolvable/doesn't exist anymore. - # Allowed values: - # true: ignore existing unresolvable hosts and append new host to the existing export - # false: exhibits OneFS default behavior i.e. 
if any of existing hosts are unresolvable while adding new one it fails - # Default value: false - # ignoreUnresolvableHosts: false - - # Unique ID if the certificate is used to encrypt replication policy - # This will be used if a replication encrypted is enabled, leave empty in case you use unecrypted replication - # Allowed values: string, unique id of the certificate - # Default value: "" - # Examples: "dd9c736cc17e6dd5f7d85fe13528cfc20f3b4b0af4f26595d22328c8d1f461af" - # replicationCertificateID: "" - - # To add more PowerScale systems, uncomment the following lines and provide the required values - # - clusterName: "cluster2" - # username: "user" - # password: "password" - # endpoint: "1.2.3.4" - # endpointPort: "8080" diff --git a/examples/powerscale_reference_files/values.yaml b/examples/powerscale_reference_files/values.yaml deleted file mode 100644 index 2b612e02ea..0000000000 --- a/examples/powerscale_reference_files/values.yaml +++ /dev/null @@ -1,437 +0,0 @@ -## K8S/DRIVER ATTRIBUTES -######################## -# version: version of this values file -# Note: Do not change this value -version: "v2.16.0" - -images: - # "driver" defines the container image, used for the driver container. 
- driver: - image: quay.io/dell/container-storage-modules/csi-isilon:v2.16.0 - # CSI sidecars - attacher: - image: registry.k8s.io/sig-storage/csi-attacher:v4.10.0 - provisioner: - image: registry.k8s.io/sig-storage/csi-provisioner:v6.1.0 - snapshotter: - image: registry.k8s.io/sig-storage/csi-snapshotter:v8.4.0 - resizer: - image: registry.k8s.io/sig-storage/csi-resizer:v2.0.0 - registrar: - image: registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.15.0 - healthmonitor: - image: registry.k8s.io/sig-storage/csi-external-health-monitor-controller:v0.16.0 - - # CSM sidecars - replication: - image: quay.io/dell/container-storage-modules/dell-csi-replicator:v1.14.0 - podmon: - image: quay.io/dell/container-storage-modules/podmon:v1.15.0 - authorization: - image: quay.io/dell/container-storage-modules/csm-authorization-sidecar:v2.4.0 - metadataretriever: - image: quay.io/dell/container-storage-modules/csi-metadata-retriever:v1.13.0 - -# CSI driver log level -# Allowed values: "error", "warn"/"warning", "info", "debug" -# Default value: "info" -logLevel: "info" - -# certSecretCount: Represents number of certificate secrets, which user is going to create for -# ssl authentication. (isilon-cert-0..isilon-cert-n) -# Allowed values: n, where n > 0 -# Default value: None -certSecretCount: 1 - -# allowedNetworks: Custom networks for PowerScale export -# Specify list of networks which can be used for NFS I/O traffic; CIDR format should be used. -# Allowed values: list of one or more networks -# Default value: None -# Examples: [192.168.1.0/24, 192.168.100.0/22] -allowedNetworks: [] - -# maxIsilonVolumesPerNode: Specify default value for maximum number of volumes that controller can publish to the node. -# If value is zero CO SHALL decide how many volumes of this type can be published by the controller to the node. -# This limit is applicable to all the nodes in the cluster for which node label 'max-isilon-volumes-per-node' is not set. 
-# Allowed values: n, where n >= 0 -# Default value: 0 -maxIsilonVolumesPerNode: 0 - -# imagePullPolicy: Policy to determine if the image should be pulled prior to starting the container. -# Allowed values: -# Always: Always pull the image. -# IfNotPresent: Only pull the image if it does not already exist on the node. -# Never: Never pull the image. -# Default value: None -imagePullPolicy: IfNotPresent - -# verbose: Indicates what content of the OneFS REST API message should be logged in debug level logs -# Allowed Values: -# 0: log full content of the HTTP request and response -# 1: log without the HTTP response body -# 2: log only 1st line of the HTTP request and response -# Default value: 0 -verbose: 1 - -# Specify kubelet config dir path. -# Ensure that the config.yaml file is present at this path. -# Default value: /var/lib/kubelet -kubeletConfigDir: /var/lib/kubelet - -# enableCustomTopology: Specify if custom topology label .dellemc.com/: -# has to be used for making connection to backend PowerScale Array. -# If enableCustomTopology is set to true, then do not specify allowedTopologies in storage class. -# Allowed values: -# true : enable custom topology -# false: disable custom topology -# Default value: false -enableCustomTopology: false - -# fsGroupPolicy: Defines if the underlying volume supports changing ownership and permission of the volume before being mounted. -# Allowed values: -# ReadWriteOnceWithFSType: supports volume ownership and permissions change only if the fsType is defined -# and the volume's accessModes contains ReadWriteOnce. -# File: kubernetes may use fsGroup to change permissions and ownership of the volume -# to match user requested fsGroup in the pod's security policy regardless of fstype or access mode. -# None: volumes will be mounted with no modifications. 
-# Default value: ReadWriteOnceWithFSType -fsGroupPolicy: ReadWriteOnceWithFSType - -# podmonAPIPort: Defines the port to be used within the kubernetes cluster -# Allowed values: -# Any valid and free port. -# Default value: 8083 -podmonAPIPort: 8083 - -# maxPathLen: this parameter is used for setting the maximum Path length for the given volume. -# Default value: 192 -# Examples: 192, 256 -maxPathLen: 192 - -# azReconcileInterval: Interval to monitor and reconcile network interface labels on nodes. -# Allowed values: Number followed by unit of time (s,m,h) -# Default value: 1h -azReconcileInterval: 1h - -# controller: configure controller pod specific parameters -controller: - # controllerCount: defines the number of csi-powerscale controller pods to deploy to - # the Kubernetes release. - # Allowed values: n, where n > 0 - # Default value: None - controllerCount: 2 - - # volumeNamePrefix: Prefix of PersistentVolume names created - # Allowed values: string - # Default value: csivol - # Examples: "k8s", "app1" - volumeNamePrefix: csivol - - # leaderElection: configure leader election parameters - leaderElection: - # Duration, that non-leader candidates will wait to force acquire leadership - # Allowed values: Duration, in seconds. Must be greater than leaderElectionRenewDeadline - # Default value: 15s - leaderElectionLeaseDuration: 15s - - # Duration, that the acting leader will retry refreshing leadership before giving up - # Allowed values: Duration, in seconds. Must be greater than leaderElectionRetryPeriod - # Default value: 10s - leaderElectionRenewDeadline: 10s - - # Duration, the LeaderElector clients should wait between tries of actions. 
- # Allowed values: Duration, in seconds - # Default value: 5s - leaderElectionRetryPeriod: 5s - - # replication: allows to configure replication - # Replication CRDs must be installed before installing driver - replication: - # enabled: Enable/Disable replication feature - # Allowed values: - # true: enable replication feature(install dell-csi-replicator sidecar) - # false: disable replication feature(do not install dell-csi-replicator sidecar) - # Default value: false - enabled: false - - # replicationContextPrefix: prefix to use for naming of resources created by replication feature - # Allowed values: string - # Default value: powerscale - replicationContextPrefix: "powerscale" - - # replicationPrefix: prefix to prepend to storage classes parameters - # Allowed values: string - # Default value: replication.storage.dell.com - replicationPrefix: "replication.storage.dell.com" - - snapshot: - # enabled: Enable/Disable volume snapshot feature - # Allowed values: - # true: enable volume snapshot feature(install snapshotter sidecar) - # false: disable volume snapshot feature(do not install snapshotter sidecar) - # Default value: None - enabled: true - - # snapNamePrefix: Prefix to apply to the names of a created snapshots - # Allowed values: string - # Default value: csi-snap - # Examples: "snap", "snapshot" - snapNamePrefix: csi-snap - - resizer: - # enabled: Enable/Disable volume expansion feature - # Allowed values: - # true: enable volume expansion feature(install resizer sidecar) - # false: disable volume snapshot feature(do not install resizer sidecar) - # Default value: None - enabled: true - - healthMonitor: - # enabled: Enable/Disable health monitor of CSI volumes- volume status, volume condition - # Allowed values: - # true: enable checking of health condition of CSI volumes - # false: disable checking of health condition of CSI volumes - # Default value: None - enabled: false - - # interval: Interval of monitoring volume health condition - # Allowed 
values: Number followed by unit of time (s,m,h) - # Default value: 60s - interval: 60s - - # nodeSelector: Define node selection constraints for pods of controller deployment. - # For the pod to be eligible to run on a node, the node must have each - # of the indicated key-value pairs as labels. - # Leave as blank to consider all nodes - # Allowed values: map of key-value pairs - # Default value: None - nodeSelector: - # Uncomment if nodes you wish to use have the node-role.kubernetes.io/master taint - # node-role.kubernetes.io/master: "" - # Uncomment if nodes you wish to use have the node-role.kubernetes.io/control-plane taint - # node-role.kubernetes.io/control-plane: "" - - # tolerations: Define tolerations for the controller deployment, if required. - # Default value: None - # Uncomment if nodes you wish to use have the node-role.kubernetes.io/master taint - tolerations: - # - key: "node-role.kubernetes.io/master" - # operator: "Exists" - # effect: "NoSchedule" - # Uncomment if nodes you wish to use have the node-role.kubernetes.io/control-plane taint - # tolerations: - # - key: "node-role.kubernetes.io/control-plane" - # operator: "Exists" - # effect: "NoSchedule" - -# node: configure node pod specific parameters -node: - # nodeSelector: Define node selection constraints for pods of node daemonset - # For the pod to be eligible to run on a node, the node must have each - # of the indicated key-value pairs as labels. - # Leave as blank to consider all nodes - # Allowed values: map of key-value pairs - # Default value: None - nodeSelector: - # Uncomment if nodes you wish to use have the node-role.kubernetes.io/master taint - # node-role.kubernetes.io/master: "" - # Uncomment if nodes you wish to use have the node-role.kubernetes.io/control-plane taint - # node-role.kubernetes.io/control-plane: "" - - # tolerations: Define tolerations for the node daemonset, if required. 
- # Default value: None - # Uncomment if nodes you wish to use have the node-role.kubernetes.io/master taint - tolerations: - # - key: "node.kubernetes.io/memory-pressure" - # operator: "Exists" - # effect: "NoExecute" - # - key: "node.kubernetes.io/disk-pressure" - # operator: "Exists" - # effect: "NoExecute" - # - key: "node.kubernetes.io/network-unavailable" - # operator: "Exists" - # effect: "NoExecute" - # - key: "node-role.kubernetes.io/master" - # operator: "Exists" - # effect: "NoSchedule" - # Uncomment if nodes you wish to use have the node-role.kubernetes.io/control-plane taint - # tolerations: - # - key: "node-role.kubernetes.io/control-plane" - # operator: "Exists" - # effect: "NoSchedule" - - # Uncomment if CSM for Resiliency and CSI Driver pods monitor are enabled - # tolerations: - # - key: "offline.vxflexos.storage.dell.com" - # operator: "Exists" - # effect: "NoSchedule" - # - key: "vxflexos.podmon.storage.dell.com" - # operator: "Exists" - # effect: "NoSchedule" - # - key: "offline.unity.storage.dell.com" - # operator: "Exists" - # effect: "NoSchedule" - # - key: "unity.podmon.storage.dell.com" - # operator: "Exists" - # effect: "NoSchedule" - # - key: "offline.isilon.storage.dell.com" - # operator: "Exists" - # effect: "NoSchedule" - # - key: "isilon.podmon.storage.dell.com" - # operator: "Exists" - # effect: "NoSchedule" - - # dnsPolicy: Determines the DNS Policy of the Node service. - # Allowed values: - # Default: The Pod inherits the name resolution configuration from the node that the pods run on. - # ClusterFirst: Any DNS query that does not match the configured cluster domain suffix, such as "www.kubernetes.io", - # is forwarded to the upstream nameserver inherited from the node. - # ClusterFirstWithHostNet: For Pods running with hostNetwork, you should explicitly set this DNS policy. - # None: It allows a Pod to ignore DNS settings from the Kubernetes environment. 
- # All DNS settings are supposed to be provided using the dnsConfig field in the Pod Spec. - # Default value: ClusterFirst - # ClusterFirstWithHostNet is the recommended DNS policy. - # Prior to v1.5 of the driver, the default DNS policy was ClusterFirst. - # In certain scenarios, users might need to change the default dnsPolicy. - dnsPolicy: ClusterFirstWithHostNet - - healthMonitor: - # enabled: Enable/Disable health monitor of CSI volumes- volume usage, volume condition - # Allowed values: - # true: enable checking of health condition of CSI volumes - # false: disable checking of health condition of CSI volumes - # Default value: None - enabled: false - -## PLATFORM ATTRIBUTES -###################### -# endpointPort: Specify the HTTPs port number of the PowerScale OneFS API server -# Formerly this attribute was named as "isiPort" -# This value acts as a default value for endpointPort, if not specified for a cluster config in secret -# If authorization is enabled, endpointPort must match the port specified in the endpointPort parameter of the isilon-creds secret -# Allowed value: valid port number -# Default value: 8080 -endpointPort: 8080 - -# skipCertificateValidation: Specify whether the PowerScale OneFS API server's certificate chain and host name should be verified. -# Formerly this attribute was named as "isiInsecure" -# This value acts as a default value for skipCertificateValidation, if not specified for a cluster config in secret -# Allowed values: -# true: skip OneFS API server's certificate verification -# false: verify OneFS API server's certificates -# Default value: false -skipCertificateValidation: true - -# isiAuthType: Indicates whether the authentication will be session-based or basic. -# Allowed values: -# 0: enables basic Authentication -# 1: enables session-based Authentication -# Default value: 0 -isiAuthType: 0 - -# isiAccessZone: The name of the access zone a volume can be created in. 
-# If storageclass is missing with AccessZone parameter, then value of isiAccessZone is used for the same. -# Default value: System -# Examples: System, zone1 -isiAccessZone: System - -# enableQuota: Indicates whether the provisioner should attempt to set (later unset) quota -# on a newly provisioned volume. -# This requires SmartQuotas to be enabled on PowerScale cluster. -# Allowed values: -# true: set quota for volume -# false: do not set quota for volume -enableQuota: true - -# isiPath: The base path for the volumes to be created on PowerScale cluster. -# This value acts as a default value for isiPath, if not specified for a cluster config in secret -# Ensure that this path exists on PowerScale cluster. -# Allowed values: unix absolute path -# Default value: /ifs -# Examples: /ifs/data/csi, /ifs/engineering -isiPath: /ifs/data/csi - -# isiVolumePathPermissions: The permissions for isi volume directory path -# This value acts as a default value for isiVolumePathPermissions, if not specified for a cluster config in secret -# Allowed values: valid octal mode number -# Default value: "0777" -# Examples: "0777", "777", "0755" -isiVolumePathPermissions: "0777" - -# ignoreUnresolvableHosts: Ignore unresolvable hosts on the OneFS -# When set to true, OneFS allows new host to add to existing export list though any of the existing hosts from the -# same exports are unresolvable/doesn't exist anymore. -# Allowed values: -# true: ignore existing unresolvable hosts and append new host to the existing export -# false: exhibits OneFS default behavior i.e. if any of existing hosts are unresolvable while adding new one it fails -# Default value: false -ignoreUnresolvableHosts: false - -# noProbeOnStart: Indicates whether the controller/node should probe all the PowerScale clusters during driver initialization -# When set to true, the driver will not set node labels, please manually add -# the label .dellemc.com/: on the nodes for each of the clusters reachable from the node. 
-# Allowed values: -# true : do not probe all PowerScale clusters during driver initialization -# false: probe all PowerScale clusters during driver initialization -# Default value: false -noProbeOnStart: false - -# autoProbe: automatically probe the PowerScale cluster if not done already during CSI calls. -# Allowed values: -# true : enable auto probe. -# false: disable auto probe. -# Default value: false -autoProbe: true - -authorization: - enabled: false - # proxyHost: hostname of the csm-authorization server - # Default value: None - proxyHost: - # skipCertificateValidation: certificate validation of the csm-authorization server - # Allowed Values: - # "true" - TLS certificate verification will be skipped - # "false" - TLS certificate will be verified - # Default value: "true" - skipCertificateValidation: true - -# Storage Capacity Tracking -# Note: Capacity tracking is supported in kubernetes v1.24 and above, this feature will be automatically disabled in older versions. -storageCapacity: - # enabled : Enable/Disable storage capacity tracking - # Allowed values: - # true: enable storage capacity tracking - # false: disable storage capacity tracking - # Default value: true - enabled: true - # pollInterval : Configure how often external-provisioner polls the driver to detect changed capacity - # Allowed values: 1m,2m,3m,...,10m,...,60m etc - # Default value: 5m - pollInterval: 5m - -# Enable this feature only after contact support for additional information -podmon: - enabled: false - controller: - args: - - "--csisock=unix:/var/run/csi/csi.sock" - - "--labelvalue=csi-isilon" - - "--arrayConnectivityPollRate=60" - - "--driverPath=csi-isilon.dellemc.com" - - "--mode=controller" - - "--skipArrayConnectionValidation=false" - - "--driver-config-params=/csi-isilon-config-params/driver-config-params.yaml" - - "--driverPodLabelValue=dell-storage" - - "--ignoreVolumelessPods=false" - - node: - args: - - "--csisock=unix:/var/lib/kubelet/plugins/csi-isilon/csi_sock" - - 
"--labelvalue=csi-isilon" - - "--arrayConnectivityPollRate=60" - - "--driverPath=csi-isilon.dellemc.com" - - "--mode=node" - - "--leaderelection=false" - - "--driver-config-params=/csi-isilon-config-params/driver-config-params.yaml" - - "--driverPodLabelValue=dell-storage" - - "--ignoreVolumelessPods=false" From e25cab028c312308357613dd7e2bb87ba83f1f4e Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Thu, 18 Jun 2026 08:39:03 +0530 Subject: [PATCH 24/33] Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> Fix for victoria loadbalacer IP preservation --- .../tasks/apply_victoria_crs.yml | 130 +++++++++++++++++- .../tasks/migrate_statefulset.yml | 7 +- upgrade/roles/upgrade_telemetry/vars/main.yml | 27 ++++ 3 files changed, 156 insertions(+), 8 deletions(-) diff --git a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml index d725cf067a..f557a7d0e7 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml @@ -117,11 +117,7 @@ - name: Display LoadBalancer IP injection status ansible.builtin.debug: - msg: >- - {{ victoria_lb_ips_preserved - if (preserved_vminsert_ip | default('') | length > 0) - or (preserved_vmselect_ip | default('') | length > 0) - else victoria_lb_ips_not_preserved }} + msg: "{{ victoria_lb_ip_injection_status }}" # ── Apply main CR (VMCluster only — 2.2 cluster mode only) ── - name: Apply VMCluster CR (cluster mode only) with retry @@ -135,6 +131,130 @@ delegate_to: "{{ kube_vip }}" connection: ssh +# ── Wait for VMCluster LoadBalancer IPs and reclaim if stolen ── +# The operator creates vminsert/vmselect services asynchronously after the CR is applied. 
+# We MUST wait for these services to get their LoadBalancer IPs BEFORE Phase 3 +# (telemetry.sh) runs, because telemetry.sh also creates VictoriaLogs services via +# kubectl apply -k. If VL services are created before VM services exist, MetalLB +# assigns the freed IPs to VL services, leaving VM services in a pending state. +# +# If the preserved IPs got assigned to wrong services, we reclaim them: +# 1. Find services holding the preserved IPs that are NOT vminsert/vmselect +# 2. Delete those services to free the IPs +# 3. Wait for vminsert/vmselect to claim the preserved IPs + +- name: Initial wait for vminsert LoadBalancer IP + ansible.builtin.shell: | + kubectl -n {{ telemetry_namespace }} get svc vminsert-{{ new_vmcluster_name }} \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo "" + register: vminsert_lb_ip + until: vminsert_lb_ip.stdout | trim | length > 0 + retries: 24 + delay: 5 + changed_when: false + failed_when: false + delegate_to: "{{ kube_vip }}" + connection: ssh + +- name: Initial wait for vmselect LoadBalancer IP + ansible.builtin.shell: | + kubectl -n {{ telemetry_namespace }} get svc vmselect-{{ new_vmcluster_name }} \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo "" + register: vmselect_lb_ip + until: vmselect_lb_ip.stdout | trim | length > 0 + retries: 24 + delay: 5 + changed_when: false + failed_when: false + delegate_to: "{{ kube_vip }}" + connection: ssh + +# ── Reclaim stolen IPs if VMCluster services are still pending ── +- name: Reclaim preserved IPs from wrong services + when: + - preserved_vminsert_ip | default('') | length > 0 or preserved_vmselect_ip | default('') | length > 0 + - vminsert_lb_ip.stdout | trim | length == 0 or vmselect_lb_ip.stdout | trim | length == 0 + block: + - name: Find services holding preserved IPs that are not VMCluster services + ansible.builtin.shell: | + set -o pipefail + PRESERVED_IPS="{{ preserved_vminsert_ip | default('') }} {{ preserved_vmselect_ip |
default('') }}" + VMCLUSTER_SVCS="vminsert-{{ new_vmcluster_name }} vmselect-{{ new_vmcluster_name }}" + kubectl -n {{ telemetry_namespace }} get svc -o json 2>/dev/null | \ + python3 -c " + import json, sys + data = json.load(sys.stdin) + preserved = set('${PRESERVED_IPS}'.split()) + vmcluster = set('${VMCLUSTER_SVCS}'.split()) + for svc in data.get('items', []): + name = svc['metadata']['name'] + if name in vmcluster: + continue + ingress = svc.get('status', {}).get('loadBalancer', {}).get('ingress', []) + for ing in ingress: + ip = ing.get('ip', '') + if ip in preserved: + print(name) + break + " || true + register: ip_thieves + changed_when: false + failed_when: false + delegate_to: "{{ kube_vip }}" + connection: ssh + + - name: Display services holding preserved IPs + ansible.builtin.debug: + msg: "{{ victoria_lb_ip_thieves_found }}" + when: ip_thieves.stdout_lines | default([]) | select() | list | length > 0 + + - name: Delete services that stole preserved IPs + ansible.builtin.command: + cmd: kubectl -n {{ telemetry_namespace }} delete svc {{ item }} --timeout=30s + loop: "{{ ip_thieves.stdout_lines | default([]) | select() | list }}" + changed_when: true + failed_when: false + when: ip_thieves.stdout_lines | default([]) | select() | list | length > 0 + delegate_to: "{{ kube_vip }}" + connection: ssh + + - name: Wait for vminsert to reclaim preserved IP + ansible.builtin.shell: | + kubectl -n {{ telemetry_namespace }} get svc vminsert-{{ new_vmcluster_name }} \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo "" + register: vminsert_lb_ip + until: vminsert_lb_ip.stdout | trim | length > 0 + retries: 30 + delay: 5 + changed_when: false + failed_when: false + delegate_to: "{{ kube_vip }}" + connection: ssh + + - name: Wait for vmselect to reclaim preserved IP + ansible.builtin.shell: | + kubectl -n {{ telemetry_namespace }} get svc vmselect-{{ new_vmcluster_name }} \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || 
echo "" + register: vmselect_lb_ip + until: vmselect_lb_ip.stdout | trim | length > 0 + retries: 30 + delay: 5 + changed_when: false + failed_when: false + delegate_to: "{{ kube_vip }}" + connection: ssh + +- name: Display confirmed LoadBalancer IPs + ansible.builtin.debug: + msg: "{{ victoria_lb_ip_confirmed }}" + +- name: Warn if LoadBalancer IPs still not assigned after reclaim + ansible.builtin.debug: + msg: "{{ victoria_lb_ip_reclaim_failed }}" + when: >- + (vminsert_lb_ip is defined and vminsert_lb_ip.stdout is defined and vminsert_lb_ip.stdout | trim | length == 0) or + (vmselect_lb_ip is defined and vmselect_lb_ip.stdout is defined and vmselect_lb_ip.stdout | trim | length == 0) + # ── Apply scrape and agent CRs ── - name: Check for VMScrape manifest ansible.builtin.stat: diff --git a/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml b/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml index 23648e2ed6..b290bc639a 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml @@ -252,6 +252,8 @@ # ── Cleanup old pre-operator services and deployments ── # The operator creates new services with different names (e.g. vminsert-victoria-cluster), # so the old standalone services become stale and waste LoadBalancer IPs. + # Old services MUST be deleted BEFORE applying VMCluster CR so MetalLB can + # assign the same IPs to the new operator-managed services via loadBalancerIP. 
- name: Find old pre-operator services ansible.builtin.shell: | set -o pipefail @@ -285,6 +287,5 @@ - name: Display old resource cleanup summary ansible.builtin.debug: - msg: - - "Old services deleted: {{ old_services.stdout_lines | default([]) | select() | list }}" - - "Old vmagent deployment cleanup attempted: {{ old_vmagent_deployment }}" + msg: "{{ victoria_old_svc_cleanup_summary }}" + verbosity: 2 diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml b/upgrade/roles/upgrade_telemetry/vars/main.yml index a869e5a52a..457c6f5bd2 100644 --- a/upgrade/roles/upgrade_telemetry/vars/main.yml +++ b/upgrade/roles/upgrade_telemetry/vars/main.yml @@ -105,7 +105,34 @@ victoria_lb_ips_preserved: >- LoadBalancer IPs injected into VMCluster manifest - vminsert: {{ preserved_vminsert_ip | default('N/A') }}, vmselect: {{ preserved_vmselect_ip | default('N/A') }} +victoria_lb_ip_injection_status: >- + {{ victoria_lb_ips_preserved + if (preserved_vminsert_ip | default('') | length > 0) + or (preserved_vmselect_ip | default('') | length > 0) + else victoria_lb_ips_not_preserved }} victoria_lb_ips_not_preserved: "No old LoadBalancer IPs found to preserve (fresh deploy or already operator-managed)" +victoria_lb_ip_confirmed: >- + VMCluster LoadBalancer IPs confirmed - + vminsert-{{ new_vmcluster_name }}: {{ vminsert_lb_ip.stdout | default('PENDING') | trim }}, + vmselect-{{ new_vmcluster_name }}: {{ vmselect_lb_ip.stdout | default('PENDING') | trim }} +victoria_lb_ip_reclaim_needed: >- + VMCluster services still pending after initial wait. + Checking if preserved IPs were assigned to wrong services... 
+victoria_lb_ip_thieves_found: >- + Services holding preserved IPs (will be deleted and re-created by telemetry.sh): + {{ ip_thieves.stdout_lines | default([]) | select() | list }} +victoria_lb_ip_reclaim_success: >- + Successfully reclaimed preserved IPs for VMCluster services - + vminsert-{{ new_vmcluster_name }}: {{ vminsert_lb_ip.stdout | default('PENDING') | trim }}, + vmselect-{{ new_vmcluster_name }}: {{ vmselect_lb_ip.stdout | default('PENDING') | trim }} +victoria_lb_ip_reclaim_failed: >- + WARNING: VMCluster services still do not have LoadBalancer IPs after reclaim attempt. + vminsert: {{ vminsert_lb_ip.stdout | default('NONE') | trim }}, + vmselect: {{ vmselect_lb_ip.stdout | default('NONE') | trim }}. + Please use new assigned IPs. +victoria_old_svc_cleanup_summary: >- + Old services deleted: {{ old_services.stdout_lines | default([]) | select() | list }}. + Old vmagent deployment cleanup attempted: {{ old_vmagent_deployment }} victoria_pods_not_ready: "Telemetry upgrade FAILED: Some pods are not ready. {{ pods_not_ready.stdout | int }} pod(s) not in Running state." victoria_pods_ready_after_wait: "All telemetry pods are ready after waiting" telemetry_upgrade_success: "Telemetry upgrade COMPLETED: All telemetry pods are running and ready." 
From 1bd65fa73fc99a45ef91e233d1711acefcf59a6e Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Thu, 18 Jun 2026 08:57:39 +0530 Subject: [PATCH 25/33] Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> address review comments --- .../tasks/apply_victoria_crs.yml | 60 +++++++------------ upgrade/roles/upgrade_telemetry/vars/main.yml | 8 ++- 2 files changed, 26 insertions(+), 42 deletions(-) diff --git a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml index f557a7d0e7..2a1c195ddb 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml @@ -131,7 +131,7 @@ delegate_to: "{{ kube_vip }}" connection: ssh -# ── Wait for VMCluster LoadBalancer IPs and reclaim if stolen ── +# ── Wait for VMCluster LoadBalancer IPs and reclaim if reassigned ── # The operator creates vminsert/vmselect services asynchronously after the CR is applied. # We MUST wait for these services to get their LoadBalancer IPs BEFORE Phase 3 # (telemetry.sh) runs, because telemetry.sh also creates VictoriaLogs services via @@ -140,8 +140,8 @@ # # If the preserved IPs got assigned to wrong services, we reclaim them: # 1. Find services holding the preserved IPs that are NOT vminsert/vmselect -# 2. Delete those services to free the IPs -# 3. Wait for vminsert/vmselect to claim the preserved IPs +# 2. Delete those conflicting services to free the IPs +# 3. 
Wait for vminsert/vmselect to reclaim the preserved IPs - name: Initial wait for vminsert LoadBalancer IP ansible.builtin.shell: | @@ -149,8 +149,8 @@ -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo "" register: vminsert_lb_ip until: vminsert_lb_ip.stdout | trim | length > 0 - retries: 24 - delay: 5 + retries: "{{ lb_ip_wait_retries }}" + delay: "{{ lb_ip_wait_delay }}" changed_when: false failed_when: false delegate_to: "{{ kube_vip }}" @@ -162,42 +162,22 @@ -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo "" register: vmselect_lb_ip until: vmselect_lb_ip.stdout | trim | length > 0 - retries: 24 - delay: 5 + retries: "{{ lb_ip_wait_retries }}" + delay: "{{ lb_ip_wait_delay }}" changed_when: false failed_when: false delegate_to: "{{ kube_vip }}" connection: ssh -# ── Reclaim stolen IPs if VMCluster services are still pending ── -- name: Reclaim preserved IPs from wrong services +# ── Reclaim reassigned IPs if VMCluster services are still pending ── +- name: Reclaim preserved IPs from conflicting services when: - preserved_vminsert_ip | default('') | length > 0 or preserved_vmselect_ip | default('') | length > 0 - vminsert_lb_ip.stdout | trim | length == 0 or vmselect_lb_ip.stdout | trim | length == 0 block: - name: Find services holding preserved IPs that are not VMCluster services - ansible.builtin.shell: | - set -o pipefail - PRESERVED_IPS="{{ preserved_vminsert_ip | default('') }} {{ preserved_vmselect_ip | default('') }}" - VMCLUSTER_SVCS="vminsert-{{ new_vmcluster_name }} vmselect-{{ new_vmcluster_name }}" - kubectl -n {{ telemetry_namespace }} get svc -o json 2>/dev/null | \ - python3 -c " - import json, sys - data = json.load(sys.stdin) - preserved = set('${PRESERVED_IPS}'.split()) - vmcluster = set('${VMCLUSTER_SVCS}'.split()) - for svc in data.get('items', []): - name = svc['metadata']['name'] - if name in vmcluster: - continue - ingress = svc.get('status', {}).get('loadBalancer', {}).get('ingress', []) - for 
ing in ingress: - ip = ing.get('ip', '') - if ip in preserved: - print(name) - break - " || true - register: ip_thieves + ansible.builtin.shell: "{{ lookup('template', 'find_ip_conflict_svcs.sh.j2') }}" + register: ip_conflict_svcs changed_when: false failed_when: false delegate_to: "{{ kube_vip }}" @@ -205,16 +185,16 @@ - name: Display services holding preserved IPs ansible.builtin.debug: - msg: "{{ victoria_lb_ip_thieves_found }}" - when: ip_thieves.stdout_lines | default([]) | select() | list | length > 0 + msg: "{{ victoria_lb_ip_conflict_svcs_found }}" + when: ip_conflict_svcs.stdout_lines | default([]) | select() | list | length > 0 - - name: Delete services that stole preserved IPs + - name: Delete conflicting services holding preserved IPs ansible.builtin.command: cmd: kubectl -n {{ telemetry_namespace }} delete svc {{ item }} --timeout=30s - loop: "{{ ip_thieves.stdout_lines | default([]) | select() | list }}" + loop: "{{ ip_conflict_svcs.stdout_lines | default([]) | select() | list }}" changed_when: true failed_when: false - when: ip_thieves.stdout_lines | default([]) | select() | list | length > 0 + when: ip_conflict_svcs.stdout_lines | default([]) | select() | list | length > 0 delegate_to: "{{ kube_vip }}" connection: ssh @@ -224,8 +204,8 @@ -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo "" register: vminsert_lb_ip until: vminsert_lb_ip.stdout | trim | length > 0 - retries: 30 - delay: 5 + retries: "{{ lb_ip_wait_retries }}" + delay: "{{ lb_ip_wait_delay }}" changed_when: false failed_when: false delegate_to: "{{ kube_vip }}" @@ -237,8 +217,8 @@ -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo "" register: vmselect_lb_ip until: vmselect_lb_ip.stdout | trim | length > 0 - retries: 30 - delay: 5 + retries: "{{ lb_ip_wait_retries }}" + delay: "{{ lb_ip_wait_delay }}" changed_when: false failed_when: false delegate_to: "{{ kube_vip }}" diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml 
b/upgrade/roles/upgrade_telemetry/vars/main.yml index 457c6f5bd2..ce5129396c 100644 --- a/upgrade/roles/upgrade_telemetry/vars/main.yml +++ b/upgrade/roles/upgrade_telemetry/vars/main.yml @@ -45,6 +45,10 @@ pod_wait_delay: 15 idrac_rollout_retries: 3 idrac_rollout_delay: 30 +# LoadBalancer IP wait configuration +lb_ip_wait_retries: 30 +lb_ip_wait_delay: 5 + # Victoria operator configuration # victoria_operator_pkg is loaded dynamically from service_k8s JSON in include_required_input.yml victoria_operator_release_name: victoria-metrics-operator @@ -118,9 +122,9 @@ victoria_lb_ip_confirmed: >- victoria_lb_ip_reclaim_needed: >- VMCluster services still pending after initial wait. Checking if preserved IPs were assigned to wrong services... -victoria_lb_ip_thieves_found: >- +victoria_lb_ip_conflict_svcs_found: >- Services holding preserved IPs (will be deleted and re-created by telemetry.sh): - {{ ip_thieves.stdout_lines | default([]) | select() | list }} + {{ ip_conflict_svcs.stdout_lines | default([]) | select() | list }} victoria_lb_ip_reclaim_success: >- Successfully reclaimed preserved IPs for VMCluster services - vminsert-{{ new_vmcluster_name }}: {{ vminsert_lb_ip.stdout | default('PENDING') | trim }}, From 2221e37516821dcd69177f36ab17196f9b3152bd Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Thu, 18 Jun 2026 09:01:43 +0530 Subject: [PATCH 26/33] adding shell script Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- .../templates/find_ip_conflict_svcs.sh.j2 | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 upgrade/roles/upgrade_telemetry/templates/find_ip_conflict_svcs.sh.j2 diff --git a/upgrade/roles/upgrade_telemetry/templates/find_ip_conflict_svcs.sh.j2 b/upgrade/roles/upgrade_telemetry/templates/find_ip_conflict_svcs.sh.j2 new file mode 100644 index 0000000000..c21217afad --- /dev/null +++ 
b/upgrade/roles/upgrade_telemetry/templates/find_ip_conflict_svcs.sh.j2 @@ -0,0 +1,45 @@ +#!/bin/bash +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Find services in the telemetry namespace that are holding LoadBalancer IPs +# which should belong to VMCluster services (vminsert/vmselect). +# This can happen when MetalLB reassigns freed IPs to other services +# before the VMCluster services are created by the operator. 
+# +# Usage: bash find_ip_conflict_svcs.sh +# Output: One service name per line (services holding conflicting IPs) + +set -o pipefail + +PRESERVED_IPS="{{ preserved_vminsert_ip | default('') }} {{ preserved_vmselect_ip | default('') }}" +VMCLUSTER_SVCS="vminsert-{{ new_vmcluster_name }} vmselect-{{ new_vmcluster_name }}" + +kubectl -n {{ telemetry_namespace }} get svc -o json 2>/dev/null | \ + python3 -c " +import json, sys +data = json.load(sys.stdin) +preserved = set('${PRESERVED_IPS}'.split()) +vmcluster = set('${VMCLUSTER_SVCS}'.split()) +for svc in data.get('items', []): + name = svc['metadata']['name'] + if name in vmcluster: + continue + ingress = svc.get('status', {}).get('loadBalancer', {}).get('ingress', []) + for ing in ingress: + ip = ing.get('ip', '') + if ip in preserved: + print(name) + break +" || true From c40786454077700eb7e12b289572f1f94c5c72d1 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Thu, 18 Jun 2026 09:23:49 +0530 Subject: [PATCH 27/33] address ansible lint issues Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> ansible lint fixes --- .../tasks/apply_victoria_crs.yml | 17 ++++++++++++++++- upgrade/roles/upgrade_telemetry/vars/main.yml | 1 + 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml index 2a1c195ddb..addac543ab 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml @@ -175,14 +175,29 @@ - preserved_vminsert_ip | default('') | length > 0 or preserved_vmselect_ip | default('') | length > 0 - vminsert_lb_ip.stdout | trim | length == 0 or vmselect_lb_ip.stdout | trim | length == 0 block: + - name: Stage IP conflict detection script + ansible.builtin.template: + src: find_ip_conflict_svcs.sh.j2 + dest: "{{ ip_conflict_script_path }}" + 
mode: "{{ executable_mode }}" + delegate_to: "{{ kube_vip }}" + connection: ssh + - name: Find services holding preserved IPs that are not VMCluster services - ansible.builtin.shell: "{{ lookup('template', 'find_ip_conflict_svcs.sh.j2') }}" + ansible.builtin.command: "{{ ip_conflict_script_path }}" register: ip_conflict_svcs changed_when: false failed_when: false delegate_to: "{{ kube_vip }}" connection: ssh + - name: Remove IP conflict detection script + ansible.builtin.file: + path: "{{ ip_conflict_script_path }}" + state: absent + delegate_to: "{{ kube_vip }}" + connection: ssh + - name: Display services holding preserved IPs ansible.builtin.debug: msg: "{{ victoria_lb_ip_conflict_svcs_found }}" diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml b/upgrade/roles/upgrade_telemetry/vars/main.yml index ce5129396c..5d51a1a057 100644 --- a/upgrade/roles/upgrade_telemetry/vars/main.yml +++ b/upgrade/roles/upgrade_telemetry/vars/main.yml @@ -48,6 +48,7 @@ idrac_rollout_delay: 30 # LoadBalancer IP wait configuration lb_ip_wait_retries: 30 lb_ip_wait_delay: 5 +ip_conflict_script_path: /tmp/find_ip_conflict_svcs.sh # Victoria operator configuration # victoria_operator_pkg is loaded dynamically from service_k8s JSON in include_required_input.yml From 3f47ba9ff51d8101750248252641460635ba88f4 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Thu, 18 Jun 2026 23:16:52 +0530 Subject: [PATCH 28/33] update until condition Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- .../tasks/apply_victoria_crs.yml | 61 +++++++------------ .../templates/inject_vm_lb_ips.py.j2 | 27 ++++++++ upgrade/roles/upgrade_telemetry/vars/main.yml | 1 + 3 files changed, 49 insertions(+), 40 deletions(-) create mode 100644 upgrade/roles/upgrade_telemetry/templates/inject_vm_lb_ips.py.j2 diff --git a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml 
b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml index addac543ab..1e8b7aa3f0 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml @@ -55,38 +55,11 @@ # and the operator creates new ones. To preserve IPs, we inject loadBalancerIP # directly into the VMCluster CR's serviceSpec BEFORE applying, so the operator # creates services with the correct IPs from the start (no race condition). -- name: Create LoadBalancer IP injection script - ansible.builtin.copy: - dest: /tmp/inject_vm_lb_ips.py - mode: "0755" - content: | - #!/usr/bin/env python3 - import yaml - import sys - manifest_path = sys.argv[1] - vmselect_ip = sys.argv[2] if len(sys.argv) > 2 and sys.argv[2] else "" - vminsert_ip = sys.argv[3] if len(sys.argv) > 3 and sys.argv[3] else "" - with open(manifest_path) as f: - doc = yaml.safe_load(f) - spec = doc.get("spec", {}) - changed = False - if vmselect_ip and "vmselect" in spec: - svc = spec["vmselect"].setdefault("serviceSpec", {}).setdefault("spec", {}) - if svc.get("loadBalancerIP") != vmselect_ip: - svc["loadBalancerIP"] = vmselect_ip - changed = True - if vminsert_ip and "vminsert" in spec: - svc = spec["vminsert"].setdefault("serviceSpec", {}).setdefault("spec", {}) - if svc.get("loadBalancerIP") != vminsert_ip: - svc["loadBalancerIP"] = vminsert_ip - changed = True - if changed: - with open(manifest_path, "w") as f: - yaml.dump(doc, f, default_flow_style=False, sort_keys=False) - print("Injected vmselect=" + vmselect_ip + " vminsert=" + vminsert_ip) - else: - print("IPs already present - no change needed") - sys.exit(0 if changed else 2) +- name: Stage LoadBalancer IP injection script + ansible.builtin.template: + src: inject_vm_lb_ips.py.j2 + dest: "{{ ip_inject_script_path }}" + mode: "{{ executable_mode }}" delegate_to: "{{ kube_vip }}" connection: ssh when: @@ -95,7 +68,7 @@ - name: Inject preserved LoadBalancer IPs into VMCluster manifest 
ansible.builtin.command: cmd: >- - python3 /tmp/inject_vm_lb_ips.py + python3 {{ ip_inject_script_path }} "{{ telemetry_deploy_dir }}/deployments/victoria-operator-vmcluster.yaml" "{{ preserved_vmselect_ip | default('') }}" "{{ preserved_vminsert_ip | default('') }}" @@ -109,7 +82,7 @@ - name: Clean up LoadBalancer IP injection script ansible.builtin.file: - path: /tmp/inject_vm_lb_ips.py + path: "{{ ip_inject_script_path }}" state: absent delegate_to: "{{ kube_vip }}" connection: ssh @@ -148,7 +121,9 @@ kubectl -n {{ telemetry_namespace }} get svc vminsert-{{ new_vmcluster_name }} \ -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo "" register: vminsert_lb_ip - until: vminsert_lb_ip.stdout | trim | length > 0 + until: > + (vminsert_lb_ip is defined) and + ((vminsert_lb_ip.stdout | default('') | trim | length) > 0) retries: "{{ lb_ip_wait_retries }}" delay: "{{ lb_ip_wait_delay }}" changed_when: false @@ -161,7 +136,9 @@ kubectl -n {{ telemetry_namespace }} get svc vmselect-{{ new_vmcluster_name }} \ -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo "" register: vmselect_lb_ip - until: vmselect_lb_ip.stdout | trim | length > 0 + until: > + (vmselect_lb_ip is defined) and + ((vmselect_lb_ip.stdout | default('') | trim | length) > 0) retries: "{{ lb_ip_wait_retries }}" delay: "{{ lb_ip_wait_delay }}" changed_when: false @@ -218,7 +195,9 @@ kubectl -n {{ telemetry_namespace }} get svc vminsert-{{ new_vmcluster_name }} \ -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo "" register: vminsert_lb_ip - until: vminsert_lb_ip.stdout | trim | length > 0 + until: > + (vminsert_lb_ip is defined) and + ((vminsert_lb_ip.stdout | default('') | trim | length) > 0) retries: "{{ lb_ip_wait_retries }}" delay: "{{ lb_ip_wait_delay }}" changed_when: false @@ -231,7 +210,9 @@ kubectl -n {{ telemetry_namespace }} get svc vmselect-{{ new_vmcluster_name }} \ -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo 
"" register: vmselect_lb_ip - until: vmselect_lb_ip.stdout | trim | length > 0 + until: > + (vmselect_lb_ip is defined) and + ((vmselect_lb_ip.stdout | default('') | trim | length) > 0) retries: "{{ lb_ip_wait_retries }}" delay: "{{ lb_ip_wait_delay }}" changed_when: false @@ -247,8 +228,8 @@ ansible.builtin.debug: msg: "{{ victoria_lb_ip_reclaim_failed }}" when: >- - (vminsert_lb_ip is defined and vminsert_lb_ip.stdout is defined and vminsert_lb_ip.stdout | trim | length == 0) or - (vmselect_lb_ip is defined and vmselect_lb_ip.stdout is defined and vmselect_lb_ip.stdout | trim | length == 0) + (vminsert_lb_ip is defined and vminsert_lb_ip.stdout | default('') | trim | length == 0) or + (vmselect_lb_ip is defined and vmselect_lb_ip.stdout | default('') | trim | length == 0) # ── Apply scrape and agent CRs ── - name: Check for VMScrape manifest diff --git a/upgrade/roles/upgrade_telemetry/templates/inject_vm_lb_ips.py.j2 b/upgrade/roles/upgrade_telemetry/templates/inject_vm_lb_ips.py.j2 new file mode 100644 index 0000000000..a6299d49d7 --- /dev/null +++ b/upgrade/roles/upgrade_telemetry/templates/inject_vm_lb_ips.py.j2 @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +import yaml +import sys +manifest_path = sys.argv[1] +vmselect_ip = sys.argv[2] if len(sys.argv) > 2 and sys.argv[2] else "" +vminsert_ip = sys.argv[3] if len(sys.argv) > 3 and sys.argv[3] else "" +with open(manifest_path) as f: + doc = yaml.safe_load(f) +spec = doc.get("spec", {}) +changed = False +if vmselect_ip and "vmselect" in spec: + svc = spec["vmselect"].setdefault("serviceSpec", {}).setdefault("spec", {}) + if svc.get("loadBalancerIP") != vmselect_ip: + svc["loadBalancerIP"] = vmselect_ip + changed = True +if vminsert_ip and "vminsert" in spec: + svc = spec["vminsert"].setdefault("serviceSpec", {}).setdefault("spec", {}) + if svc.get("loadBalancerIP") != vminsert_ip: + svc["loadBalancerIP"] = vminsert_ip + changed = True +if changed: + with open(manifest_path, "w") as f: + yaml.dump(doc, f, 
default_flow_style=False, sort_keys=False) + print("Injected vmselect=" + vmselect_ip + " vminsert=" + vminsert_ip) +else: + print("IPs already present - no change needed") +sys.exit(0 if changed else 2) diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml b/upgrade/roles/upgrade_telemetry/vars/main.yml index 5d51a1a057..b726a4be10 100644 --- a/upgrade/roles/upgrade_telemetry/vars/main.yml +++ b/upgrade/roles/upgrade_telemetry/vars/main.yml @@ -49,6 +49,7 @@ idrac_rollout_delay: 30 lb_ip_wait_retries: 30 lb_ip_wait_delay: 5 ip_conflict_script_path: /tmp/find_ip_conflict_svcs.sh +ip_inject_script_path: /tmp/inject_vm_lb_ips.py # Victoria operator configuration # victoria_operator_pkg is loaded dynamically from service_k8s JSON in include_required_input.yml From 247a99525f060acc659b1cdae996127df330c3fd Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Fri, 19 Jun 2026 00:25:29 +0530 Subject: [PATCH 29/33] fixed stdout check Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- .../templates/inject_vm_lb_ips.py.j2 | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/upgrade/roles/upgrade_telemetry/templates/inject_vm_lb_ips.py.j2 b/upgrade/roles/upgrade_telemetry/templates/inject_vm_lb_ips.py.j2 index a6299d49d7..21e390ae29 100644 --- a/upgrade/roles/upgrade_telemetry/templates/inject_vm_lb_ips.py.j2 +++ b/upgrade/roles/upgrade_telemetry/templates/inject_vm_lb_ips.py.j2 @@ -1,4 +1,27 @@ #!/usr/bin/env python3 + +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Find services in the telemetry namespace that are holding LoadBalancer IPs +# which should belong to VMCluster services (vminsert/vmselect). +# This can happen when MetalLB reassigns freed IPs to other services +# before the VMCluster services are created by the operator. +# +# Usage: bash find_ip_conflict_svcs.sh +# Output: One service name per line (services holding conflicting IPs) + import yaml import sys manifest_path = sys.argv[1] From 9eabf15f3d8be241eae359dbfc326f193d8a7444 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Fri, 19 Jun 2026 00:48:01 +0530 Subject: [PATCH 30/33] merge Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> merge --- .../orchestrator/common/result_poller.py | 51 ++++ .../module_utils/local_repo/software_utils.py | 11 + common/library/modules/parallel_tasks.py | 10 + input/telemetry_config.yml | 2 +- input_validation/validate_config.yml | 9 +- local_repo/local_repo.yml | 4 + .../tasks/validate_additional_cloud_init.yml | 6 + .../tasks/apply_telemetry_on_upgrade.yml | 240 ------------------ .../tasks/derive_sink_support_flags.yml | 74 ++++-- provision/roles/telemetry/tasks/main.yml | 40 +-- .../telemetry/tasks/read_software_config.yml | 21 +- 11 files changed, 165 insertions(+), 303 deletions(-) delete mode 100644 provision/roles/telemetry/tasks/apply_telemetry_on_upgrade.yml diff --git a/build_stream/orchestrator/common/result_poller.py b/build_stream/orchestrator/common/result_poller.py index 6d35738773..6f40a91965 100644 --- 
a/build_stream/orchestrator/common/result_poller.py +++ b/build_stream/orchestrator/common/result_poller.py @@ -362,6 +362,11 @@ def _on_result_received(self, result: PlaybookResult) -> None: # S12: On restart failure, still persist node_results.json if result.stage_name == "restart": self._on_restart_completed(result) + self._on_restart_failure(result) + + # On deploy failure, mark ImageGroup FAILED + if result.stage_name == "deploy": + self._on_deploy_failure(result) # On validate failure, mark ImageGroup FAILED if result.stage_name == "validate": @@ -968,3 +973,49 @@ def _on_deploy_failure(self, result: PlaybookResult) -> None: job_id=str(result.job_id), exc_info=True, ) + + def _on_restart_failure(self, result: PlaybookResult) -> None: + """Transition ImageGroup from RESTARTING to FAILED on restart failure.""" + if self._image_group_repo is None: + log_secure_info( + "warning", + f"ImageGroup repo not available; skipping restart failure " + f"update for job={result.job_id}", + job_id=str(result.job_id), + ) + return + + try: + image_group = self._image_group_repo.find_by_job_id( + JobId(str(result.job_id)) + ) + if image_group is None: + log_secure_info( + "error", + f"Restart failure callback: No ImageGroup found for job={result.job_id}.", + job_id=str(result.job_id), + ) + return + + self._image_group_repo.update_status( + image_group_id=image_group.id, + new_status=ImageGroupStatus.FAILED, + ) + + if hasattr(self._image_group_repo, 'session'): + self._image_group_repo.session.commit() + + log_secure_info( + "warning", + f"Restart FAILED for job={result.job_id}. 
" + f"ImageGroup '{image_group.id}' -> FAILED.", + job_id=str(result.job_id), + ) + except Exception as exc: # pylint: disable=broad-except + log_secure_info( + "error", + "Failed to update ImageGroup status on restart " + f"failure for job={result.job_id}: {exc}", + job_id=str(result.job_id), + exc_info=True, + ) diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py index da20edea12..af3c1ffab9 100644 --- a/common/library/module_utils/local_repo/software_utils.py +++ b/common/library/module_utils/local_repo/software_utils.py @@ -38,6 +38,7 @@ CSV_COLUMNS, SOFTWARE_CONFIG_SUBDIR, DEFAULT_STATUS_FILENAME, + STATUS_CSV_HEADER, RPM_LABEL_TEMPLATE, RHEL_OS_URL, SOFTWARES_KEY, @@ -853,6 +854,16 @@ def check_csv_existence(path): def read_status_csv(csv_path): """Reads the status.csv file and returns a list of row dictionaries.""" + # Ensure file has valid header before reading + if os.path.exists(csv_path) and os.path.getsize(csv_path) > 0: + with open(csv_path, 'r', encoding='utf-8') as file: + lines = file.readlines() + if lines and lines[0].strip() != STATUS_CSV_HEADER.strip(): + # Header missing or invalid - prepend header to existing data + with open(csv_path, 'w', encoding='utf-8') as wfile: + wfile.write(STATUS_CSV_HEADER) + wfile.writelines(lines) + with open(csv_path, mode='r', newline='', encoding='utf-8') as file: reader = csv.DictReader(file) return [row for row in reader] diff --git a/common/library/modules/parallel_tasks.py b/common/library/modules/parallel_tasks.py index 20268b10fa..99cc28652a 100644 --- a/common/library/modules/parallel_tasks.py +++ b/common/library/modules/parallel_tasks.py @@ -160,9 +160,19 @@ def determine_function( # Construct the status file path using DEFAULT_STATUS_FILENAME. 
status_file = os.path.join(csv_file_path, DEFAULT_STATUS_FILENAME) + + # Ensure file exists with valid header if not os.path.exists(status_file) or os.stat(status_file).st_size == 0: with open(status_file, 'w', encoding="utf-8") as file: file.write(STATUS_CSV_HEADER) + else: + with open(status_file, 'r', encoding="utf-8") as file: + lines = file.readlines() + if lines and lines[0].strip() != STATUS_CSV_HEADER.strip(): + # Header missing or invalid - prepend header to existing data + with open(status_file, 'w', encoding="utf-8") as wfile: + wfile.write(STATUS_CSV_HEADER) + wfile.writelines(lines) task_type = task.get("type") diff --git a/input/telemetry_config.yml b/input/telemetry_config.yml index bfc2980c0e..765b227786 100644 --- a/input/telemetry_config.yml +++ b/input/telemetry_config.yml @@ -424,7 +424,7 @@ powerscale_configurations: # Path to the CSM Observability (Karavi Observability) values.yaml file # Required when powerscale_configurations.powerscale_telemetry_support: true - # Reference: https://raw.githubusercontent.com/dell/helm-charts/refs/heads/release-v1.16.3/charts/karavi-observability/values.yaml + # Reference: https://raw.githubusercontent.com/dell/helm-charts/refs/heads/release-v1.17.1/charts/karavi-observability/values.yaml csm_observability_values_file_path: "" # -------------------------------------------------------------------------- diff --git a/input_validation/validate_config.yml b/input_validation/validate_config.yml index dc9dfa3913..f3d5469f8a 100644 --- a/input_validation/validate_config.yml +++ b/input_validation/validate_config.yml @@ -50,6 +50,11 @@ tags: - always tasks: + - name: Enable subscription check when validate_config.yml is run directly + ansible.builtin.set_fact: + run_subscription_check: true + when: run_subscription_check is not defined and omnia_run_tags is not defined + - name: Run subscription validation tasks when: "'local_repo' in (omnia_run_tags | default(ansible_run_tags | default([]) | list)) or 'all' in 
(ansible_run_tags | default([]) | list)" block: @@ -94,7 +99,7 @@ ansible.builtin.include_role: name: validate_subscription tasks_from: check_rhel_subscription.yml - when: "'local_repo' in (hostvars['localhost']['omnia_run_tags'] | default([]))" + when: "hostvars['localhost']['run_subscription_check'] | default(false) | bool" - name: Configure RHEL repository URLs hosts: localhost @@ -107,7 +112,7 @@ ansible.builtin.include_role: name: validate_subscription tasks_from: configure_rhel_os_urls.yml - when: "'local_repo' in (omnia_run_tags | default([]))" + when: "run_subscription_check | default(false) | bool" - name: Validate omnia input config hosts: localhost diff --git a/local_repo/local_repo.yml b/local_repo/local_repo.yml index d4bb1d488d..e6fea817a0 100644 --- a/local_repo/local_repo.yml +++ b/local_repo/local_repo.yml @@ -29,6 +29,10 @@ omnia_run_tags: "{{ (ansible_run_tags | default([]) | list + ['local_repo']) | unique }}" cacheable: true + - name: Enable subscription check for local_repo + ansible.builtin.set_fact: + run_subscription_check: true + - name: Include metadata vars ansible.builtin.include_vars: "/opt/omnia/.data/oim_metadata.yml" register: include_metadata diff --git a/provision/roles/configure_ochami/tasks/validate_additional_cloud_init.yml b/provision/roles/configure_ochami/tasks/validate_additional_cloud_init.yml index 65747e39a4..50b9545c53 100644 --- a/provision/roles/configure_ochami/tasks/validate_additional_cloud_init.yml +++ b/provision/roles/configure_ochami/tasks/validate_additional_cloud_init.yml @@ -33,6 +33,12 @@ additional_cloud_init_fg_names: [] when: additional_cloud_init_file_path == '' +- name: Create cloud-init directory + ansible.builtin.file: + path: "{{ cloud_init_dir }}" + state: directory + mode: "{{ hostvars['localhost']['dir_permissions_755'] }}" + - name: Load additional cloud-init config when: additional_cloud_init_file_path != '' block: diff --git a/provision/roles/telemetry/tasks/apply_telemetry_on_upgrade.yml 
b/provision/roles/telemetry/tasks/apply_telemetry_on_upgrade.yml deleted file mode 100644 index 0cdb4bd2cb..0000000000 --- a/provision/roles/telemetry/tasks/apply_telemetry_on_upgrade.yml +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Apply telemetry configurations for upgrade - when: - - kube_vip is defined - - kube_vip | length > 0 - - idrac_telemetry_support | default(false) | bool - block: - - name: Check if telemetry deployment file exists - ansible.builtin.stat: - path: "{{ idrac_telemetry_statefulset_path }}" - register: telemetry_stat - - - name: Get current iDRAC telemetry StatefulSet configuration - kubernetes.core.k8s_info: - api_version: apps/v1 - kind: StatefulSet - name: idrac-telemetry - namespace: "{{ telemetry_namespace }}" - register: current_idrac_statefulset - failed_when: false - when: - - telemetry_stat.stat.exists | default(false) - - - name: Set replica count as fact - ansible.builtin.set_fact: - preserved_replica_count: "{{ current_idrac_statefulset.resources[0].spec.replicas | default(1) }}" - when: - - current_idrac_statefulset.resources is defined and current_idrac_statefulset.resources | length > 0 - - - name: Show current replica count - ansible.builtin.debug: - msg: "Current replica count: {{ preserved_replica_count }}" - verbosity: 2 - when: - - preserved_replica_count is defined - - - name: Read iDRAC telemetry 
StatefulSet YAML file - ansible.builtin.slurp: - src: "{{ idrac_telemetry_statefulset_path }}" - register: idrac_statefulset_yaml - - - name: Update StatefulSet definition with preserved replica count - ansible.builtin.set_fact: - updated_statefulset_definition: "{{ idrac_statefulset_yaml.content | b64decode | regex_replace('---\\n', '') | from_yaml | combine({'spec': {'replicas': preserved_replica_count | int}}, recursive=true) }}" # noqa: yaml[line-length] - when: - - telemetry_stat.stat.exists | default(false) - - preserved_replica_count is defined - - - name: Apply iDRAC telemetry StatefulSet with preserved replica count - kubernetes.core.k8s: - state: present - definition: "{{ updated_statefulset_definition }}" - register: kubectl_apply_result - when: - - updated_statefulset_definition is defined - - telemetry_stat.stat.exists | default(false) - - - name: Display kubectl apply result - ansible.builtin.debug: - msg: "{{ kubectl_apply_result }}" - when: - - kubectl_apply_result is defined - - - name: Wait for idrac telemetry receiver to be ready - kubernetes.core.k8s_info: - api_version: v1 - kind: Pod - namespace: "{{ telemetry_namespace }}" - label_selectors: - - "app=idrac-telemetry-receiver" - wait: true - wait_condition: - type: Ready - status: "True" - wait_timeout: 120 - delegate_to: "{{ kube_vip }}" - register: idrac_telemetry_receiver_ready - failed_when: false - when: - - idrac_telemetry_support | default(false) | bool - - - name: Display idrac telemetry receiver ready status - ansible.builtin.debug: - msg: "{{ idrac_telemetry_receiver_ready }}" - when: - - idrac_telemetry_support | default(false) | bool - - idrac_telemetry_receiver_ready is defined - -- name: Apply LDMS configurations for upgrade - when: - - kube_vip is defined - - kube_vip | length > 0 - - ldms_support | default(false) | bool - block: - - name: Check if LDMS aggregator is running on service k8s cluster - kubernetes.core.k8s_info: - api_version: apps/v1 - kind: StatefulSet - name: 
nersc-ldms-aggr - namespace: "{{ telemetry_namespace }}" - delegate_to: "{{ kube_vip }}" - register: ldms_statefulset_info - failed_when: false - - - name: Set LDMS running state - ansible.builtin.set_fact: - ldms_running: "{{ ldms_statefulset_info.resources is defined and ldms_statefulset_info.resources | length > 0 }}" - - - name: Check if LDMS store daemon is running on service k8s cluster - kubernetes.core.k8s_info: - api_version: v1 - kind: Pod - namespace: "{{ telemetry_namespace }}" - label_selectors: - - "app=nersc-ldms-store" - delegate_to: "{{ kube_vip }}" - register: ldms_store_pod_info - failed_when: false - when: - - ldms_running | default(false) | bool - - - name: Set LDMS store daemon running state - ansible.builtin.set_fact: - ldms_store_running: "{{ ldms_store_pod_info.resources is defined and ldms_store_pod_info.resources | length > 0 }}" - when: - - ldms_running | default(false) | bool - - - name: Restart LDMS store daemon pod - kubernetes.core.k8s: - state: absent - api_version: v1 - kind: Pod - name: "{{ ldms_store_pod_info.resources[0].metadata.name }}" - namespace: "{{ telemetry_namespace }}" - delegate_to: "{{ kube_vip }}" - failed_when: false - when: - - ldms_store_running | default(false) | bool - - - name: Wait for LDMS store daemon pod to be ready after restart - kubernetes.core.k8s_info: - api_version: v1 - kind: Pod - namespace: "{{ telemetry_namespace }}" - label_selectors: - - "app=nersc-ldms-store" - wait: true - wait_condition: - type: Ready - status: "True" - wait_timeout: 120 - delegate_to: "{{ kube_vip }}" - register: ldms_store_pod_ready - failed_when: false - when: - - ldms_store_running | default(false) | bool - - - name: Display LDMS store daemon restart status - ansible.builtin.debug: - msg: > - {{ ldms_store_pod_ready_msg - if (ldms_store_pod_ready.resources | default([]) | length > 0) - else ldms_store_pod_not_ready_msg }} - when: - - ldms_store_running | default(false) | bool - - - name: Check if decomp.json exists - 
ansible.builtin.stat: - path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/scripts/decomp.json" - register: decomp_json_stat - - - name: Copy decompose.json if it doesn't exist - ansible.builtin.copy: - src: files/scripts/decomp.json - dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/scripts/decomp.json" - mode: "{{ hostvars['localhost']['file_permissions_644'] }}" - when: not decomp_json_stat.stat.exists - - - name: Restart LDMS aggregator StatefulSet - kubernetes.core.k8s: - state: present - definition: - apiVersion: apps/v1 - kind: StatefulSet - metadata: - name: nersc-ldms-aggr - namespace: "{{ telemetry_namespace }}" - spec: - template: - metadata: - annotations: - kubectl.kubernetes.io/restartedAt: "{{ ansible_date_time.iso8601 }}" - delegate_to: "{{ kube_vip }}" - failed_when: false - when: - - ldms_running | default(false) | bool - - ldms_conf_file.stat.exists | default(false) - - ldms_bin_file.stat.exists | default(false) - - - name: Wait for LDMS aggregator pod to be ready after restart - kubernetes.core.k8s_info: - api_version: v1 - kind: Pod - namespace: "{{ telemetry_namespace }}" - label_selectors: - - "app=nersc-ldms-aggr" - wait: true - wait_condition: - type: Ready - status: "True" - wait_timeout: 120 - delegate_to: "{{ kube_vip }}" - register: ldms_pod_ready - failed_when: false - when: - - ldms_running | default(false) | bool - - ldms_conf_file.stat.exists | default(false) - - ldms_bin_file.stat.exists | default(false) - - - name: Display LDMS aggregator restart status - ansible.builtin.debug: - msg: "{{ ldms_pod_ready_msg if (ldms_pod_ready.resources | default([]) | length > 0) else ldms_pod_not_ready_msg }}" - when: - - ldms_running | default(false) | bool - - ldms_conf_file.stat.exists | default(false) - - ldms_bin_file.stat.exists | default(false) diff --git a/provision/roles/telemetry/tasks/derive_sink_support_flags.yml 
b/provision/roles/telemetry/tasks/derive_sink_support_flags.yml index 3e59602e44..7f2767d20a 100644 --- a/provision/roles/telemetry/tasks/derive_sink_support_flags.yml +++ b/provision/roles/telemetry/tasks/derive_sink_support_flags.yml @@ -68,34 +68,52 @@ additional_remote_write_endpoints: "{{ telemetry_config.powerscale_configurations.additional_remote_write_endpoints | default([]) }}" when: telemetry_config.powerscale_configurations is defined -- name: Check if any source targets victoria_metrics +- name: Check if any enabled source targets victoria_metrics ansible.builtin.set_fact: victoria_metrics_support: true cacheable: true when: >- - 'victoria_metrics' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([])) or - 'victoria_metrics' in (telemetry_config.telemetry_sources.powerscale.collection_targets | default([])) or - 'victoria_metrics' in (telemetry_config.telemetry_sources.ufm.collection_targets | default([])) or - 'victoria_metrics' in (telemetry_config.telemetry_sources.vast.collection_targets | default([])) + ((telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) and + 'victoria_metrics' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([]))) or + (((telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool)) and + 'victoria_metrics' in (telemetry_config.telemetry_sources.powerscale.collection_targets | default([]))) or + (((telemetry_config.telemetry_sources.ufm.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ufm.logs_enabled | default(false) | bool)) and + 'victoria_metrics' in (telemetry_config.telemetry_sources.ufm.collection_targets | default([]))) or + (((telemetry_config.telemetry_sources.vast.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.vast.logs_enabled | default(false) | bool)) and 
+ 'victoria_metrics' in (telemetry_config.telemetry_sources.vast.collection_targets | default([]))) -- name: Check if any source targets victoria_logs +- name: Check if any enabled source targets victoria_logs ansible.builtin.set_fact: victoria_logs_support: true cacheable: true when: >- - 'victoria_logs' in (telemetry_config.telemetry_sources.powerscale.collection_targets | default([])) or - 'victoria_logs' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([])) or - 'victoria_logs' in (telemetry_config.telemetry_sources.ufm.collection_targets | default([])) or - 'victoria_logs' in (telemetry_config.telemetry_sources.vast.collection_targets | default([])) + (((telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool)) and + 'victoria_logs' in (telemetry_config.telemetry_sources.powerscale.collection_targets | default([]))) or + ((telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) and + 'victoria_logs' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([]))) or + (((telemetry_config.telemetry_sources.ufm.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ufm.logs_enabled | default(false) | bool)) and + 'victoria_logs' in (telemetry_config.telemetry_sources.ufm.collection_targets | default([]))) or + (((telemetry_config.telemetry_sources.vast.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.vast.logs_enabled | default(false) | bool)) and + 'victoria_logs' in (telemetry_config.telemetry_sources.vast.collection_targets | default([]))) -- name: Check if any source targets Kafka +- name: Check if any enabled source targets Kafka ansible.builtin.set_fact: kafka_support: true cacheable: true when: >- - 'kafka' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([])) or - 'kafka' in 
(telemetry_config.telemetry_sources.ldms.collection_targets | default([])) or - 'kafka' in (telemetry_config.telemetry_sources.ome.collection_targets | default([])) + ((telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) and + 'kafka' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([]))) or + ((telemetry_config.telemetry_sources.ldms.metrics_enabled | default(false) | bool) and + 'kafka' in (telemetry_config.telemetry_sources.ldms.collection_targets | default([]))) or + (((telemetry_config.telemetry_sources.ome.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ome.logs_enabled | default(false) | bool)) and + 'kafka' in (telemetry_config.telemetry_sources.ome.collection_targets | default([]))) # ============================================================================= # VECTOR BRIDGE LOGIC - Determine sink requirements based on Vector bridges @@ -142,17 +160,17 @@ - name: Set global variable for telemetry_enabled ansible.builtin.set_fact: telemetry_enabled: true - when: > - idrac_telemetry_support or - powerscale_metrics_enabled or - powerscale_log_enabled or - victoria_metrics_support or - victoria_logs_support or - ldms_support or - kafka_support or - ufm_telemetry_support or - ufm_log_enabled or - vast_telemetry_support or - vast_log_enabled or - ome_metrics_enabled or - ome_logs_enabled + when: >- + (telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ldms.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ufm.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ufm.logs_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.vast.metrics_enabled 
| default(false) | bool) or + (telemetry_config.telemetry_sources.vast.logs_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ome.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ome.logs_enabled | default(false) | bool) or + (telemetry_config.telemetry_bridges.vector_ldms.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_bridges.vector_ome.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_bridges.vector_ome.logs_enabled | default(false) | bool) diff --git a/provision/roles/telemetry/tasks/main.yml b/provision/roles/telemetry/tasks/main.yml index c513480a37..5bdd8cc86b 100644 --- a/provision/roles/telemetry/tasks/main.yml +++ b/provision/roles/telemetry/tasks/main.yml @@ -27,8 +27,28 @@ - name: Derive sink support flags from collection_targets ansible.builtin.include_tasks: derive_sink_support_flags.yml +- name: Set pulp server facts for cloud-init templates + when: + - hostvars['localhost']['service_k8s_support'] | default(false) | bool + block: + - name: Run pulp status command on omnia_core container + ansible.builtin.command: /usr/local/bin/pulp status + delegate_to: localhost + changed_when: false + register: pulp_status_output + + - name: Set pulp content origin value + ansible.builtin.set_fact: + pulp_content_origin: "{{ (pulp_status_output.stdout | from_json).content_settings.content_origin }}" + + - name: Set pulp_server_ip fact + ansible.builtin.set_fact: + pulp_server_ip: "{{ pulp_content_origin | urlsplit('hostname') }}" + - name: Configure service_k8s telemetry services - when: hostvars['localhost']['service_k8s_support'] | default(false) | bool + when: + - hostvars['localhost']['service_k8s_support'] | default(false) | bool + - telemetry_enabled | default(false) | bool block: - name: Read telemetry packages from software config ansible.builtin.include_tasks: read_software_config.yml @@ -45,18 +65,7 @@ - name: Configure of k8s telemetry service 
prerequisites when: - - >- - (telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) or - (telemetry_config.telemetry_sources.ldms.metrics_enabled | default(false) | bool) or - (telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or - (telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool) or - (telemetry_config.telemetry_sources.ufm.metrics_enabled | default(false) | bool) or - (telemetry_config.telemetry_sources.ufm.logs_enabled | default(false) | bool) or - (telemetry_config.telemetry_sources.vast.metrics_enabled | default(false) | bool) or - (telemetry_config.telemetry_sources.vast.logs_enabled | default(false) | bool) or - (telemetry_config.telemetry_sources.ome.metrics_enabled | default(false) | bool) or - (telemetry_config.telemetry_sources.ome.logs_enabled | default(false) | bool) or - ldms_support | default(false) | bool + - telemetry_enabled | default(false) | bool block: - name: Set NFS info fact ansible.builtin.set_fact: @@ -148,8 +157,3 @@ - telemetry_enabled | default(false) | bool tags: - telemetry_deployment - - # - name: Apply telemetry configurations on upgrade - # ansible.builtin.include_tasks: apply_telemetry_on_upgrade.yml - # when: - # - hostvars['localhost']['upgrade_enabled'] | default(false) | bool diff --git a/provision/roles/telemetry/tasks/read_software_config.yml b/provision/roles/telemetry/tasks/read_software_config.yml index a50607e4ed..e49bd45587 100644 --- a/provision/roles/telemetry/tasks/read_software_config.yml +++ b/provision/roles/telemetry/tasks/read_software_config.yml @@ -13,20 +13,6 @@ # limitations under the License. 
--- -- name: Run pulp status command on omnia_core container - ansible.builtin.command: /usr/local/bin/pulp status - delegate_to: localhost - changed_when: false - register: pulp_status_output - -- name: Set pulp content origin value - ansible.builtin.set_fact: - pulp_content_origin: "{{ (pulp_status_output.stdout | from_json).content_settings.content_origin }}" - -- name: Set fact for pulp protocol - ansible.builtin.set_fact: - pulp_server_ip: "{{ pulp_content_origin | urlsplit('hostname') }}" - - name: Get cluster_os_type from software_config.json ansible.builtin.set_fact: cluster_os_type: "{{ software_config['cluster_os_type'] }}" @@ -51,3 +37,10 @@ | map(attribute='package') | list) | unique }} + +- name: Extract individual pip module versions from service_k8s.json + ansible.builtin.set_fact: + kubernetes_pip_version: "{{ (telemetry_packages['service_k8s']['cluster'] | selectattr('type', 'equalto', 'pip_module') | selectattr('package', 'search', '^kubernetes==') | map(attribute='package') | first).split('==')[1] }}" # noqa: yaml[line-length] + prometheus_client_pip_version: "{{ (telemetry_packages['service_k8s']['cluster'] | selectattr('type', 'equalto', 'pip_module') | selectattr('package', 'search', '^prometheus_client==') | map(attribute='package') | first).split('==')[1] }}" # noqa: yaml[line-length] + pyyaml_pip_version: "{{ (telemetry_packages['service_k8s']['cluster'] | selectattr('type', 'equalto', 'pip_module') | selectattr('package', 'search', '^pyyaml==') | map(attribute='package') | first).split('==')[1] }}" # noqa: yaml[line-length] + cffi_pip_version: "{{ (telemetry_packages['service_k8s']['cluster'] | selectattr('type', 'equalto', 'pip_module') | selectattr('package', 'search', '^cffi==') | map(attribute='package') | first).split('==')[1] }}" # noqa: yaml[line-length] From a8ac01502e07a2bb8a7bfabfec3a2b643b614ed7 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Fri, 19 Jun 2026 00:55:21 +0530 
Subject: [PATCH 31/33] resolve merge conflict Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- provision/roles/telemetry/tasks/main.yml | 15 +++++++++++++-- .../telemetry/tasks/read_software_config.yml | 9 +-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/provision/roles/telemetry/tasks/main.yml b/provision/roles/telemetry/tasks/main.yml index 5bdd8cc86b..4ec8aea05c 100644 --- a/provision/roles/telemetry/tasks/main.yml +++ b/provision/roles/telemetry/tasks/main.yml @@ -65,7 +65,18 @@ - name: Configure of k8s telemetry service prerequisites when: - - telemetry_enabled | default(false) | bool + - >- + (telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ldms.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ufm.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ufm.logs_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.vast.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.vast.logs_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ome.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ome.logs_enabled | default(false) | bool) or + ldms_support | default(false) | bool block: - name: Set NFS info fact ansible.builtin.set_fact: @@ -156,4 +167,4 @@ when: - telemetry_enabled | default(false) | bool tags: - - telemetry_deployment + - telemetry_deployment \ No newline at end of file diff --git a/provision/roles/telemetry/tasks/read_software_config.yml b/provision/roles/telemetry/tasks/read_software_config.yml index e49bd45587..6963618579 100644 --- 
a/provision/roles/telemetry/tasks/read_software_config.yml +++ b/provision/roles/telemetry/tasks/read_software_config.yml @@ -36,11 +36,4 @@ | selectattr('type', 'equalto', 'pip_module') | map(attribute='package') | list) - | unique }} - -- name: Extract individual pip module versions from service_k8s.json - ansible.builtin.set_fact: - kubernetes_pip_version: "{{ (telemetry_packages['service_k8s']['cluster'] | selectattr('type', 'equalto', 'pip_module') | selectattr('package', 'search', '^kubernetes==') | map(attribute='package') | first).split('==')[1] }}" # noqa: yaml[line-length] - prometheus_client_pip_version: "{{ (telemetry_packages['service_k8s']['cluster'] | selectattr('type', 'equalto', 'pip_module') | selectattr('package', 'search', '^prometheus_client==') | map(attribute='package') | first).split('==')[1] }}" # noqa: yaml[line-length] - pyyaml_pip_version: "{{ (telemetry_packages['service_k8s']['cluster'] | selectattr('type', 'equalto', 'pip_module') | selectattr('package', 'search', '^pyyaml==') | map(attribute='package') | first).split('==')[1] }}" # noqa: yaml[line-length] - cffi_pip_version: "{{ (telemetry_packages['service_k8s']['cluster'] | selectattr('type', 'equalto', 'pip_module') | selectattr('package', 'search', '^cffi==') | map(attribute='package') | first).split('==')[1] }}" # noqa: yaml[line-length] + | unique }} \ No newline at end of file From 8f3c13d8c3ba4ecdea960bbfa384716a33abb085 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Fri, 19 Jun 2026 00:58:11 +0530 Subject: [PATCH 32/33] resolve merge conflict Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- provision/roles/telemetry/tasks/main.yml | 2 +- provision/roles/telemetry/tasks/read_software_config.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/provision/roles/telemetry/tasks/main.yml b/provision/roles/telemetry/tasks/main.yml index 4ec8aea05c..8a7e9f6ab2 100644 --- 
a/provision/roles/telemetry/tasks/main.yml +++ b/provision/roles/telemetry/tasks/main.yml @@ -167,4 +167,4 @@ when: - telemetry_enabled | default(false) | bool tags: - - telemetry_deployment \ No newline at end of file + - telemetry_deployment diff --git a/provision/roles/telemetry/tasks/read_software_config.yml b/provision/roles/telemetry/tasks/read_software_config.yml index 6963618579..36300d0a52 100644 --- a/provision/roles/telemetry/tasks/read_software_config.yml +++ b/provision/roles/telemetry/tasks/read_software_config.yml @@ -36,4 +36,4 @@ | selectattr('type', 'equalto', 'pip_module') | map(attribute='package') | list) - | unique }} \ No newline at end of file + | unique }} From f3b05adeb93def9f3c87d6bd7d86868cc7522c19 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Mon, 22 Jun 2026 15:41:55 +0530 Subject: [PATCH 33/33] input file name in message Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> --- .../common_utils/en_us_validation_msg.py | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index dbaa2acc94..bf63fecbd1 100644 --- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -367,13 +367,13 @@ def switch_snmp3_username_fail_msg(min_username_length, max_length): # PowerScale telemetry validation messages POWERSCALE_VICTORIA_REQUIRED_MSG = ( "PowerScale telemetry requires VictoriaMetrics to be deployed. 
" - "When telemetry_sources.powerscale.metrics_enabled is true, " + "When telemetry_sources.powerscale.metrics_enabled is true in telemetry_config.yml, " "'victoria_metrics' must be included in collection_targets " "(e.g., 'victoria_metrics' or 'victoria_metrics,victoria_logs')." ) POWERSCALE_VICTORIA_LOGS_REQUIRED_MSG = ( "PowerScale logs collection requires VictoriaLogs to be deployed. " - "When telemetry_sources.powerscale.logs_enabled is true, " + "When telemetry_sources.powerscale.logs_enabled is true in telemetry_config.yml, " "'victoria_logs' must be included in collection_targets " "(e.g., 'victoria_metrics,victoria_logs')." ) @@ -386,15 +386,15 @@ def switch_snmp3_username_fail_msg(min_username_length, max_length): "PowerScale telemetry requires a service cluster." ) POWERSCALE_CONFIGURATIONS_MISSING_MSG = ( - "powerscale_configurations section is required when " + "powerscale_configurations section is required in telemetry_config.yml when " "telemetry_sources.powerscale.metrics_enabled is true. " "It must contain csm_observability_values_file_path." ) POWERSCALE_OTEL_STORAGE_SIZE_INVALID_MSG = ( - "must be a non-empty string in format 'XGi' (e.g., '5Gi')" + "must be a non-empty string in format 'XGi' (e.g., '5Gi') in telemetry_config.yml" ) POWERSCALE_CSM_VALUES_PATH_REQUIRED_MSG = ( - "csm_observability_values_file_path is required when " + "csm_observability_values_file_path is required in telemetry_config.yml when " "telemetry_sources.powerscale.metrics_enabled is true. " "Please provide the path to the CSM Observability values.yaml file." ) @@ -402,34 +402,34 @@ def powerscale_csm_values_not_found_msg(path): """Returns error message when CSM Observability values.yaml file is not found.""" return ( f"CSM Observability values.yaml file not found at '{path}'. " - "Please verify the file path is correct." + "Please verify the file path is correct in telemetry_config.yml (csm_observability_values_file_path)." 
) POWERSCALE_CSM_VALUES_INVALID_YAML_MSG = ( - "CSM Observability values.yaml must contain a valid YAML dictionary." + "CSM Observability values.yaml (path specified in telemetry_config.yml) must contain a valid YAML dictionary." ) def powerscale_csm_values_parse_error_msg(error): """Returns error message when CSM Observability values.yaml fails to parse.""" return f"Failed to parse CSM Observability values.yaml: {error}" POWERSCALE_CSM_VALUES_MISSING_KARAVI_SECTION_MSG = ( - "CSM Observability values.yaml is missing 'karaviMetricsPowerscale' section." + "CSM Observability values.yaml (path specified in telemetry_config.yml) is missing 'karaviMetricsPowerscale' section." ) POWERSCALE_CSM_METRICS_IMAGE_MISSING_MSG = ( - "CSM Metrics PowerScale image is required in CSM Observability values.yaml." + "CSM Metrics PowerScale image is required in CSM Observability values.yaml (path specified in telemetry_config.yml)." ) POWERSCALE_OTEL_COLLECTOR_IMAGE_MISSING_MSG = ( - "OTEL Collector image is required in CSM Observability values.yaml." + "OTEL Collector image is required in CSM Observability values.yaml (path specified in telemetry_config.yml)." ) ADDITIONAL_METRIC_ENDPOINTS_URL_EMPTY_MSG = ( - "Each additional_metric_remote_write_endpoint must have a non-empty 'url' field." + "Each additional_metric_remote_write_endpoint in telemetry_config.yml must have a non-empty 'url' field." ) ADDITIONAL_METRIC_ENDPOINTS_URL_INVALID_MSG = ( - "URL must start with 'http://' or 'https://'." + "URL in telemetry_config.yml must start with 'http://' or 'https://'." ) ADDITIONAL_LOG_ENDPOINTS_URL_EMPTY_MSG = ( - "Each additional_log_write_endpoint must have a non-empty 'url' field." + "Each additional_log_write_endpoint in telemetry_config.yml must have a non-empty 'url' field." ) ADDITIONAL_LOG_ENDPOINTS_URL_INVALID_MSG = ( - "URL must start with 'http://' or 'https://'." + "URL in telemetry_config.yml must start with 'http://' or 'https://'." 
) def powerscale_image_version_mismatch_msg(image_name, values_image, service_k8s_image): """Returns error message when CSM values.yaml image version doesn't match service_k8s (versioned).""" @@ -457,13 +457,13 @@ def powerscale_image_version_mismatch_msg(image_name, values_image, service_k8s_ "PowerScale telemetry requires a service cluster." ) POWERSCALE_CONFIGURATIONS_MISSING_MSG = ( - "powerscale_configurations section is required and must contain powerscale_telemetry_support." + "powerscale_configurations section is required in telemetry_config.yml and must contain powerscale_telemetry_support." ) POWERSCALE_OTEL_STORAGE_SIZE_INVALID_MSG = ( - "must be a non-empty string in format 'XGi' (e.g., '5Gi')" + "must be a non-empty string in format 'XGi' (e.g., '5Gi') in telemetry_config.yml" ) POWERSCALE_CSM_VALUES_PATH_REQUIRED_MSG = ( - "csm_observability_values_file_path is required when powerscale_configurations.powerscale_telemetry_support is true. " + "csm_observability_values_file_path is required in telemetry_config.yml when powerscale_configurations.powerscale_telemetry_support is true. " "Please provide the path to the CSM Observability values.yaml file." ) POWERSCALE_AUTH_PROXY_HOST_MISSING_MSG = ( @@ -475,28 +475,28 @@ def powerscale_csm_values_not_found_msg(path): """Returns error message when CSM Observability values.yaml file is not found.""" return ( f"CSM Observability values.yaml file not found at '{path}'. " - "Please verify the file path is correct." + "Please verify the file path is correct in telemetry_config.yml (csm_observability_values_file_path)." ) POWERSCALE_CSM_VALUES_INVALID_YAML_MSG = ( - "CSM Observability values.yaml must contain a valid YAML dictionary." + "CSM Observability values.yaml (path specified in telemetry_config.yml) must contain a valid YAML dictionary." 
) def powerscale_csm_values_parse_error_msg(error): """Returns error message when CSM Observability values.yaml fails to parse.""" return f"Failed to parse CSM Observability values.yaml: {error}" POWERSCALE_CSM_VALUES_MISSING_KARAVI_SECTION_MSG = ( - "CSM Observability values.yaml is missing 'karaviMetricsPowerscale' section." + "CSM Observability values.yaml (path specified in telemetry_config.yml) is missing 'karaviMetricsPowerscale' section." ) POWERSCALE_CSM_METRICS_IMAGE_MISSING_MSG = ( - "CSM Metrics PowerScale image is required in CSM Observability values.yaml." + "CSM Metrics PowerScale image is required in CSM Observability values.yaml (path specified in telemetry_config.yml)." ) POWERSCALE_OTEL_COLLECTOR_IMAGE_MISSING_MSG = ( - "OTEL Collector image is required in CSM Observability values.yaml." + "OTEL Collector image is required in CSM Observability values.yaml (path specified in telemetry_config.yml)." ) POWERSCALE_ADDITIONAL_ENDPOINTS_URL_EMPTY_MSG = ( - "Each additional_remote_write_endpoint must have a non-empty 'url' field." + "Each additional_remote_write_endpoint in telemetry_config.yml must have a non-empty 'url' field." ) POWERSCALE_ADDITIONAL_ENDPOINTS_URL_INVALID_MSG = ( - "URL must start with 'http://' or 'https://'." + "URL in telemetry_config.yml must start with 'http://' or 'https://'." ) def powerscale_image_version_mismatch_msg(image_name, values_image, service_k8s_image): """Returns error message when CSM values.yaml image version doesn't match service_k8s.json."""