From bf790b2c5d05904f90bfedcccf706ae57f6d1cde Mon Sep 17 00:00:00 2001 From: Kratika_Patidar Date: Wed, 17 Jun 2026 09:36:16 +0000 Subject: [PATCH 1/4] provison playbook fix when telemetry disabled but service_k8s is true --- .../tasks/derive_sink_support_flags.yml | 74 ++++++++++++------- provision/roles/telemetry/tasks/main.yml | 22 +++++- .../telemetry/tasks/read_software_config.yml | 14 ---- 3 files changed, 67 insertions(+), 43 deletions(-) diff --git a/provision/roles/telemetry/tasks/derive_sink_support_flags.yml b/provision/roles/telemetry/tasks/derive_sink_support_flags.yml index 3e59602e44..7f2767d20a 100644 --- a/provision/roles/telemetry/tasks/derive_sink_support_flags.yml +++ b/provision/roles/telemetry/tasks/derive_sink_support_flags.yml @@ -68,34 +68,52 @@ additional_remote_write_endpoints: "{{ telemetry_config.powerscale_configurations.additional_remote_write_endpoints | default([]) }}" when: telemetry_config.powerscale_configurations is defined -- name: Check if any source targets victoria_metrics +- name: Check if any enabled source targets victoria_metrics ansible.builtin.set_fact: victoria_metrics_support: true cacheable: true when: >- - 'victoria_metrics' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([])) or - 'victoria_metrics' in (telemetry_config.telemetry_sources.powerscale.collection_targets | default([])) or - 'victoria_metrics' in (telemetry_config.telemetry_sources.ufm.collection_targets | default([])) or - 'victoria_metrics' in (telemetry_config.telemetry_sources.vast.collection_targets | default([])) + ((telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) and + 'victoria_metrics' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([]))) or + (((telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool)) and + 'victoria_metrics' in 
(telemetry_config.telemetry_sources.powerscale.collection_targets | default([]))) or + (((telemetry_config.telemetry_sources.ufm.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ufm.logs_enabled | default(false) | bool)) and + 'victoria_metrics' in (telemetry_config.telemetry_sources.ufm.collection_targets | default([]))) or + (((telemetry_config.telemetry_sources.vast.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.vast.logs_enabled | default(false) | bool)) and + 'victoria_metrics' in (telemetry_config.telemetry_sources.vast.collection_targets | default([]))) -- name: Check if any source targets victoria_logs +- name: Check if any enabled source targets victoria_logs ansible.builtin.set_fact: victoria_logs_support: true cacheable: true when: >- - 'victoria_logs' in (telemetry_config.telemetry_sources.powerscale.collection_targets | default([])) or - 'victoria_logs' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([])) or - 'victoria_logs' in (telemetry_config.telemetry_sources.ufm.collection_targets | default([])) or - 'victoria_logs' in (telemetry_config.telemetry_sources.vast.collection_targets | default([])) + (((telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool)) and + 'victoria_logs' in (telemetry_config.telemetry_sources.powerscale.collection_targets | default([]))) or + ((telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) and + 'victoria_logs' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([]))) or + (((telemetry_config.telemetry_sources.ufm.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ufm.logs_enabled | default(false) | bool)) and + 'victoria_logs' in (telemetry_config.telemetry_sources.ufm.collection_targets | default([]))) or + 
(((telemetry_config.telemetry_sources.vast.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.vast.logs_enabled | default(false) | bool)) and + 'victoria_logs' in (telemetry_config.telemetry_sources.vast.collection_targets | default([]))) -- name: Check if any source targets Kafka +- name: Check if any enabled source targets Kafka ansible.builtin.set_fact: kafka_support: true cacheable: true when: >- - 'kafka' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([])) or - 'kafka' in (telemetry_config.telemetry_sources.ldms.collection_targets | default([])) or - 'kafka' in (telemetry_config.telemetry_sources.ome.collection_targets | default([])) + ((telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) and + 'kafka' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([]))) or + ((telemetry_config.telemetry_sources.ldms.metrics_enabled | default(false) | bool) and + 'kafka' in (telemetry_config.telemetry_sources.ldms.collection_targets | default([]))) or + (((telemetry_config.telemetry_sources.ome.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ome.logs_enabled | default(false) | bool)) and + 'kafka' in (telemetry_config.telemetry_sources.ome.collection_targets | default([]))) # ============================================================================= # VECTOR BRIDGE LOGIC - Determine sink requirements based on Vector bridges @@ -142,17 +160,17 @@ - name: Set global variable for telemetry_enabled ansible.builtin.set_fact: telemetry_enabled: true - when: > - idrac_telemetry_support or - powerscale_metrics_enabled or - powerscale_log_enabled or - victoria_metrics_support or - victoria_logs_support or - ldms_support or - kafka_support or - ufm_telemetry_support or - ufm_log_enabled or - vast_telemetry_support or - vast_log_enabled or - ome_metrics_enabled or - ome_logs_enabled + when: >- + 
(telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ldms.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ufm.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ufm.logs_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.vast.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.vast.logs_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ome.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ome.logs_enabled | default(false) | bool) or + (telemetry_config.telemetry_bridges.vector_ldms.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_bridges.vector_ome.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_bridges.vector_ome.logs_enabled | default(false) | bool) diff --git a/provision/roles/telemetry/tasks/main.yml b/provision/roles/telemetry/tasks/main.yml index c513480a37..5e8b92e8fc 100644 --- a/provision/roles/telemetry/tasks/main.yml +++ b/provision/roles/telemetry/tasks/main.yml @@ -27,8 +27,28 @@ - name: Derive sink support flags from collection_targets ansible.builtin.include_tasks: derive_sink_support_flags.yml +- name: Set pulp server facts for cloud-init templates + when: + - hostvars['localhost']['service_k8s_support'] | default(false) | bool + block: + - name: Run pulp status command on omnia_core container + ansible.builtin.command: /usr/local/bin/pulp status + delegate_to: localhost + changed_when: false + register: pulp_status_output + + - name: Set pulp content origin value + ansible.builtin.set_fact: + pulp_content_origin: "{{ (pulp_status_output.stdout | 
from_json).content_settings.content_origin }}" + + - name: Set pulp_server_ip fact + ansible.builtin.set_fact: + pulp_server_ip: "{{ pulp_content_origin | urlsplit('hostname') }}" + - name: Configure service_k8s telemetry services - when: hostvars['localhost']['service_k8s_support'] | default(false) | bool + when: + - hostvars['localhost']['service_k8s_support'] | default(false) | bool + - telemetry_enabled | default(false) | bool block: - name: Read telemetry packages from software config ansible.builtin.include_tasks: read_software_config.yml diff --git a/provision/roles/telemetry/tasks/read_software_config.yml b/provision/roles/telemetry/tasks/read_software_config.yml index a50607e4ed..36300d0a52 100644 --- a/provision/roles/telemetry/tasks/read_software_config.yml +++ b/provision/roles/telemetry/tasks/read_software_config.yml @@ -13,20 +13,6 @@ # limitations under the License. --- -- name: Run pulp status command on omnia_core container - ansible.builtin.command: /usr/local/bin/pulp status - delegate_to: localhost - changed_when: false - register: pulp_status_output - -- name: Set pulp content origin value - ansible.builtin.set_fact: - pulp_content_origin: "{{ (pulp_status_output.stdout | from_json).content_settings.content_origin }}" - -- name: Set fact for pulp protocol - ansible.builtin.set_fact: - pulp_server_ip: "{{ pulp_content_origin | urlsplit('hostname') }}" - - name: Get cluster_os_type from software_config.json ansible.builtin.set_fact: cluster_os_type: "{{ software_config['cluster_os_type'] }}" From 7816d1a3243be7f4a1e36a38e0a023092848ab8f Mon Sep 17 00:00:00 2001 From: Kratika_Patidar Date: Wed, 17 Jun 2026 09:41:14 +0000 Subject: [PATCH 2/4] remvoing unsed commented task Signed-off-by: Kratika_Patidar --- .../tasks/apply_telemetry_on_upgrade.yml | 240 ------------------ provision/roles/telemetry/tasks/main.yml | 5 - 2 files changed, 245 deletions(-) delete mode 100644 provision/roles/telemetry/tasks/apply_telemetry_on_upgrade.yml diff --git 
a/provision/roles/telemetry/tasks/apply_telemetry_on_upgrade.yml b/provision/roles/telemetry/tasks/apply_telemetry_on_upgrade.yml deleted file mode 100644 index 0cdb4bd2cb..0000000000 --- a/provision/roles/telemetry/tasks/apply_telemetry_on_upgrade.yml +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Apply telemetry configurations for upgrade - when: - - kube_vip is defined - - kube_vip | length > 0 - - idrac_telemetry_support | default(false) | bool - block: - - name: Check if telemetry deployment file exists - ansible.builtin.stat: - path: "{{ idrac_telemetry_statefulset_path }}" - register: telemetry_stat - - - name: Get current iDRAC telemetry StatefulSet configuration - kubernetes.core.k8s_info: - api_version: apps/v1 - kind: StatefulSet - name: idrac-telemetry - namespace: "{{ telemetry_namespace }}" - register: current_idrac_statefulset - failed_when: false - when: - - telemetry_stat.stat.exists | default(false) - - - name: Set replica count as fact - ansible.builtin.set_fact: - preserved_replica_count: "{{ current_idrac_statefulset.resources[0].spec.replicas | default(1) }}" - when: - - current_idrac_statefulset.resources is defined and current_idrac_statefulset.resources | length > 0 - - - name: Show current replica count - ansible.builtin.debug: - msg: "Current replica count: {{ preserved_replica_count }}" - verbosity: 2 - when: - - 
preserved_replica_count is defined - - - name: Read iDRAC telemetry StatefulSet YAML file - ansible.builtin.slurp: - src: "{{ idrac_telemetry_statefulset_path }}" - register: idrac_statefulset_yaml - - - name: Update StatefulSet definition with preserved replica count - ansible.builtin.set_fact: - updated_statefulset_definition: "{{ idrac_statefulset_yaml.content | b64decode | regex_replace('---\\n', '') | from_yaml | combine({'spec': {'replicas': preserved_replica_count | int}}, recursive=true) }}" # noqa: yaml[line-length] - when: - - telemetry_stat.stat.exists | default(false) - - preserved_replica_count is defined - - - name: Apply iDRAC telemetry StatefulSet with preserved replica count - kubernetes.core.k8s: - state: present - definition: "{{ updated_statefulset_definition }}" - register: kubectl_apply_result - when: - - updated_statefulset_definition is defined - - telemetry_stat.stat.exists | default(false) - - - name: Display kubectl apply result - ansible.builtin.debug: - msg: "{{ kubectl_apply_result }}" - when: - - kubectl_apply_result is defined - - - name: Wait for idrac telemetry receiver to be ready - kubernetes.core.k8s_info: - api_version: v1 - kind: Pod - namespace: "{{ telemetry_namespace }}" - label_selectors: - - "app=idrac-telemetry-receiver" - wait: true - wait_condition: - type: Ready - status: "True" - wait_timeout: 120 - delegate_to: "{{ kube_vip }}" - register: idrac_telemetry_receiver_ready - failed_when: false - when: - - idrac_telemetry_support | default(false) | bool - - - name: Display idrac telemetry receiver ready status - ansible.builtin.debug: - msg: "{{ idrac_telemetry_receiver_ready }}" - when: - - idrac_telemetry_support | default(false) | bool - - idrac_telemetry_receiver_ready is defined - -- name: Apply LDMS configurations for upgrade - when: - - kube_vip is defined - - kube_vip | length > 0 - - ldms_support | default(false) | bool - block: - - name: Check if LDMS aggregator is running on service k8s cluster - 
kubernetes.core.k8s_info: - api_version: apps/v1 - kind: StatefulSet - name: nersc-ldms-aggr - namespace: "{{ telemetry_namespace }}" - delegate_to: "{{ kube_vip }}" - register: ldms_statefulset_info - failed_when: false - - - name: Set LDMS running state - ansible.builtin.set_fact: - ldms_running: "{{ ldms_statefulset_info.resources is defined and ldms_statefulset_info.resources | length > 0 }}" - - - name: Check if LDMS store daemon is running on service k8s cluster - kubernetes.core.k8s_info: - api_version: v1 - kind: Pod - namespace: "{{ telemetry_namespace }}" - label_selectors: - - "app=nersc-ldms-store" - delegate_to: "{{ kube_vip }}" - register: ldms_store_pod_info - failed_when: false - when: - - ldms_running | default(false) | bool - - - name: Set LDMS store daemon running state - ansible.builtin.set_fact: - ldms_store_running: "{{ ldms_store_pod_info.resources is defined and ldms_store_pod_info.resources | length > 0 }}" - when: - - ldms_running | default(false) | bool - - - name: Restart LDMS store daemon pod - kubernetes.core.k8s: - state: absent - api_version: v1 - kind: Pod - name: "{{ ldms_store_pod_info.resources[0].metadata.name }}" - namespace: "{{ telemetry_namespace }}" - delegate_to: "{{ kube_vip }}" - failed_when: false - when: - - ldms_store_running | default(false) | bool - - - name: Wait for LDMS store daemon pod to be ready after restart - kubernetes.core.k8s_info: - api_version: v1 - kind: Pod - namespace: "{{ telemetry_namespace }}" - label_selectors: - - "app=nersc-ldms-store" - wait: true - wait_condition: - type: Ready - status: "True" - wait_timeout: 120 - delegate_to: "{{ kube_vip }}" - register: ldms_store_pod_ready - failed_when: false - when: - - ldms_store_running | default(false) | bool - - - name: Display LDMS store daemon restart status - ansible.builtin.debug: - msg: > - {{ ldms_store_pod_ready_msg - if (ldms_store_pod_ready.resources | default([]) | length > 0) - else ldms_store_pod_not_ready_msg }} - when: - - 
ldms_store_running | default(false) | bool - - - name: Check if decomp.json exists - ansible.builtin.stat: - path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/scripts/decomp.json" - register: decomp_json_stat - - - name: Copy decompose.json if it doesn't exist - ansible.builtin.copy: - src: files/scripts/decomp.json - dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/scripts/decomp.json" - mode: "{{ hostvars['localhost']['file_permissions_644'] }}" - when: not decomp_json_stat.stat.exists - - - name: Restart LDMS aggregator StatefulSet - kubernetes.core.k8s: - state: present - definition: - apiVersion: apps/v1 - kind: StatefulSet - metadata: - name: nersc-ldms-aggr - namespace: "{{ telemetry_namespace }}" - spec: - template: - metadata: - annotations: - kubectl.kubernetes.io/restartedAt: "{{ ansible_date_time.iso8601 }}" - delegate_to: "{{ kube_vip }}" - failed_when: false - when: - - ldms_running | default(false) | bool - - ldms_conf_file.stat.exists | default(false) - - ldms_bin_file.stat.exists | default(false) - - - name: Wait for LDMS aggregator pod to be ready after restart - kubernetes.core.k8s_info: - api_version: v1 - kind: Pod - namespace: "{{ telemetry_namespace }}" - label_selectors: - - "app=nersc-ldms-aggr" - wait: true - wait_condition: - type: Ready - status: "True" - wait_timeout: 120 - delegate_to: "{{ kube_vip }}" - register: ldms_pod_ready - failed_when: false - when: - - ldms_running | default(false) | bool - - ldms_conf_file.stat.exists | default(false) - - ldms_bin_file.stat.exists | default(false) - - - name: Display LDMS aggregator restart status - ansible.builtin.debug: - msg: "{{ ldms_pod_ready_msg if (ldms_pod_ready.resources | default([]) | length > 0) else ldms_pod_not_ready_msg }}" - when: - - ldms_running | default(false) | bool - - ldms_conf_file.stat.exists | default(false) - - ldms_bin_file.stat.exists | default(false) diff --git 
a/provision/roles/telemetry/tasks/main.yml b/provision/roles/telemetry/tasks/main.yml index 5e8b92e8fc..8a7e9f6ab2 100644 --- a/provision/roles/telemetry/tasks/main.yml +++ b/provision/roles/telemetry/tasks/main.yml @@ -168,8 +168,3 @@ - telemetry_enabled | default(false) | bool tags: - telemetry_deployment - - # - name: Apply telemetry configurations on upgrade - # ansible.builtin.include_tasks: apply_telemetry_on_upgrade.yml - # when: - # - hostvars['localhost']['upgrade_enabled'] | default(false) | bool From 19a008adbd2ad8842c60eca6786c0575fed57eb5 Mon Sep 17 00:00:00 2001 From: Kratika_Patidar Date: Tue, 23 Jun 2026 13:22:12 +0000 Subject: [PATCH 3/4] telemetry components version updates Signed-off-by: Kratika_Patidar --- input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json | 8 ++++---- .../templates/telemetry/kafka/kafka.kafka.yaml.j2 | 10 +++++----- .../telemetry/kafka/kafka.kafka_bridge.yaml.j2 | 3 +-- .../telemetry/kafka/kafka.kafkapump_user.yaml.j2 | 2 +- .../templates/telemetry/kafka/kafka.topic.yaml.j2 | 2 +- .../telemetry/vector/vector-ome-kafkauser.yaml.j2 | 2 +- provision/roles/telemetry/vars/main.yml | 6 +++--- 7 files changed, 16 insertions(+), 17 deletions(-) diff --git a/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json index 8f21b2bf51..8c18422fcc 100644 --- a/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json +++ b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json @@ -33,8 +33,8 @@ { "package": "cffi==1.17.1", "type": "pip_module" }, { "package": "prometheus_client==0.20.0", "type": "pip_module" }, { "package": "kubernetes==33.1.0", "type": "pip_module" }, - { "package": "quay.io/strimzi/operator", "tag": "0.48.0", "type": "image" }, - { "package": "quay.io/strimzi/kafka", "tag": "0.48.0-kafka-4.1.0", "type": "image" }, + { "package": "quay.io/strimzi/operator", "tag": "1.0.1", "type": "image" }, + { "package": "quay.io/strimzi/kafka", "tag": "1.0.1-kafka-4.2.0", "type": 
"image" }, { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.1", "type": "image" }, { "package": "quay.io/dell/container-storage-modules/csm-metrics-powerscale", "tag": "v1.12.0", "type": "image" }, { "package": "ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector", "tag": "0.150.1", "type": "image" }, @@ -46,8 +46,8 @@ { "package": "quay.io/jetstack/cert-manager-webhook", "tag": "v1.10.0", "type": "image" }, { "package": "quay.io/jetstack/cert-manager-acmesolver", "tag": "v1.10.0", "type": "image" }, { "package": "cert-manager-v1.10.0", "type": "tarball", "url": "https://charts.jetstack.io/charts/cert-manager-v1.10.0.tgz" }, - { "package": "strimzi-kafka-operator-helm-3-chart-0.48.0", "type": "tarball", "url": "https://github.com/strimzi/strimzi-kafka-operator/releases/download/0.48.0/strimzi-kafka-operator-helm-3-chart-0.48.0.tgz" }, - { "package": "quay.io/strimzi/kafka-bridge", "tag": "0.33.1", "type": "image" }, + { "package": "strimzi-kafka-operator-helm-3-chart-1.0.1", "type": "tarball", "url": "https://github.com/strimzi/strimzi-kafka-operator/releases/download/1.0.1/strimzi-kafka-operator-helm-3-chart-1.0.1.tgz" }, + { "package": "quay.io/strimzi/kafka-bridge", "tag": "1.0.0", "type": "image" }, { "package": "docker.io/victoriametrics/operator", "tag": "v0.68.3", "type": "image" }, { "package": "docker.io/victoriametrics/operator", "tag": "config-reloader-v0.68.3", "type": "image" }, { "package": "victoria-metrics-operator-0.59.3", "type": "tarball", "url": "https://github.com/VictoriaMetrics/helm-charts/releases/download/victoria-metrics-operator-0.59.3/victoria-metrics-operator-0.59.3.tgz" }, diff --git a/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2 index 929af037c4..a9a0f6196f 100644 --- a/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2 +++ 
b/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2 @@ -1,4 +1,4 @@ -apiVersion: kafka.strimzi.io/v1beta2 +apiVersion: kafka.strimzi.io/v1 kind: KafkaNodePool metadata: name: controller @@ -19,7 +19,7 @@ spec: deleteClaim: false --- -apiVersion: kafka.strimzi.io/v1beta2 +apiVersion: kafka.strimzi.io/v1 kind: KafkaNodePool metadata: name: broker @@ -40,7 +40,7 @@ spec: deleteClaim: false --- -apiVersion: kafka.strimzi.io/v1beta2 +apiVersion: kafka.strimzi.io/v1 kind: Kafka metadata: name: kafka @@ -50,8 +50,8 @@ metadata: strimzi.io/kraft: enabled spec: kafka: - version: 4.1.0 - metadataVersion: 4.1-IV0 + version: 4.2.0 + metadataVersion: 4.2-IV0 listeners: - name: internal port: 9092 diff --git a/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka_bridge.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka_bridge.yaml.j2 index 35a0862cf3..b4fdb5689c 100644 --- a/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka_bridge.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka_bridge.yaml.j2 @@ -1,12 +1,11 @@ --- -apiVersion: kafka.strimzi.io/v1beta2 +apiVersion: kafka.strimzi.io/v1 kind: KafkaBridge metadata: name: bridge namespace: telemetry spec: bootstrapServers: kafka-kafka-bootstrap:9093 - enableMetrics: true http: port: 8080 # Enable TLS for Kafka connection diff --git a/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafkapump_user.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafkapump_user.yaml.j2 index 413a7fe72d..df1b9f9bae 100644 --- a/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafkapump_user.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafkapump_user.yaml.j2 @@ -13,7 +13,7 @@ # limitations under the License. 
--- -apiVersion: kafka.strimzi.io/v1beta2 +apiVersion: kafka.strimzi.io/v1 kind: KafkaUser metadata: name: kafkapump diff --git a/provision/roles/telemetry/templates/telemetry/kafka/kafka.topic.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kafka/kafka.topic.yaml.j2 index 9ae180ecfd..974712e78f 100644 --- a/provision/roles/telemetry/templates/telemetry/kafka/kafka.topic.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kafka/kafka.topic.yaml.j2 @@ -1,4 +1,4 @@ -apiVersion: kafka.strimzi.io/v1beta2 +apiVersion: kafka.strimzi.io/v1 kind: KafkaTopic metadata: name: {{ topic_name }} diff --git a/provision/roles/telemetry/templates/telemetry/vector/vector-ome-kafkauser.yaml.j2 b/provision/roles/telemetry/templates/telemetry/vector/vector-ome-kafkauser.yaml.j2 index 2cad636058..28e69319f3 100644 --- a/provision/roles/telemetry/templates/telemetry/vector/vector-ome-kafkauser.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/vector/vector-ome-kafkauser.yaml.j2 @@ -9,7 +9,7 @@ # OME is an external producer with a different security domain, so it gets a # dedicated, least-privilege KafkaUser. 
-apiVersion: kafka.strimzi.io/v1beta2 +apiVersion: kafka.strimzi.io/v1 kind: KafkaUser metadata: name: {{ vector.ome.kafka_user }} diff --git a/provision/roles/telemetry/vars/main.yml b/provision/roles/telemetry/vars/main.yml index 4f1f29f9fa..1337efb093 100644 --- a/provision/roles/telemetry/vars/main.yml +++ b/provision/roles/telemetry/vars/main.yml @@ -116,9 +116,9 @@ kafka: lb_service_name: "kafka-loadbalancer" container_port1: 9093 # Kafka images from service_k8s_v.json - operator_image: "{{ telemetry_images['strimzi/operator'] | default('quay.io/strimzi/operator:0.48.0') }}" - kafka_image: "{{ telemetry_images['strimzi/kafka'] | default('quay.io/strimzi/kafka:0.48.0-kafka-4.1.0') }}" - bridge_image: "{{ telemetry_images['strimzi/kafka-bridge'] | default('quay.io/strimzi/kafka-bridge:0.33.1') }}" + operator_image: "{{ telemetry_images['strimzi/operator'] | default('quay.io/strimzi/operator:1.0.1') }}" + kafka_image: "{{ telemetry_images['strimzi/kafka'] | default('quay.io/strimzi/kafka:1.0.1-kafka-4.2.0') }}" + bridge_image: "{{ telemetry_images['strimzi/kafka-bridge'] | default('quay.io/strimzi/kafka-bridge:1.0.0') }}" container_port2: 9093 image: "apache/kafka:4.1.0" cluster_id: "kafka-cluster-id" From a0739d342394af230cc646374c338e9c48bedce2 Mon Sep 17 00:00:00 2001 From: Kratika_Patidar Date: Wed, 24 Jun 2026 19:25:33 +0000 Subject: [PATCH 4/4] changes for upgrade of telemetry Signed-off-by: Kratika_Patidar --- .../templates/coredhcp/coredhcp.yaml.j2 | 2 +- .../templates/telemetry/telemetry.sh.j2 | 9 ++ .../files/migrate_strimzi_crds.sh | 103 ++++++++++++++++++ .../tasks/apply_victoria_crs.yml | 2 +- .../tasks/execute_telemetry_sh.yml | 16 +++ 5 files changed, 130 insertions(+), 2 deletions(-) create mode 100644 upgrade/roles/upgrade_telemetry/files/migrate_strimzi_crds.sh diff --git a/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp/coredhcp.yaml.j2 b/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp/coredhcp.yaml.j2 index 
edfcb8c583..1cf09db3f9 100644 --- a/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp/coredhcp.yaml.j2 +++ b/prepare_oim/roles/deploy_containers/openchami/templates/coredhcp/coredhcp.yaml.j2 @@ -41,7 +41,7 @@ server4: # ------------------------------------------------------------------- # Multi-subnet configuration (requires coresmd v0.6.x+) # To enable multi-subnet DHCP: - # 1. Pull the new coresmd image: podman pull ghcr.io/openchami/coresmd:v0.6.x + # 1. Pull the new coresmd image: podman pull ghcr.io/openchami/coresmd:v0.6.3 # 2. Comment out the single-subnet coresmd and bootloop lines above # 3. Uncomment the multi-subnet coresmd and bootloop blocks below # 4. Replace the new coresmd image version in files: /etc/containers/systemd/coresmd-coredhcp.container /etc/containers/systemd/coresmd-coredns.container with the old version diff --git a/provision/roles/telemetry/templates/telemetry/telemetry.sh.j2 b/provision/roles/telemetry/templates/telemetry/telemetry.sh.j2 index c711863f48..8321e8e61d 100644 --- a/provision/roles/telemetry/templates/telemetry/telemetry.sh.j2 +++ b/provision/roles/telemetry/templates/telemetry/telemetry.sh.j2 @@ -19,6 +19,15 @@ else helm -n telemetry install strimzi-cluster-operator "${DEPLOY_DIR}/{{ strimzi_kafka_pkg }}.tar.gz" fi +# Helm 3 does NOT update CRDs on 'helm upgrade'. Explicitly apply CRDs +# from the chart tarball so that new API versions are registered before +# kubectl apply -k attempts to create Kafka resources. +echo " Applying Strimzi CRDs from chart (Helm 3 does not update CRDs on upgrade)..." +_STRIMZI_CRD_TMP=$(mktemp -d) +tar -xzf "${DEPLOY_DIR}/{{ strimzi_kafka_pkg }}.tar.gz" -C "$_STRIMZI_CRD_TMP" +kubectl apply -f "$_STRIMZI_CRD_TMP/strimzi-kafka-operator/crds/" --server-side --force-conflicts +rm -rf "$_STRIMZI_CRD_TMP" + # Wait for Strimzi operator to be ready echo " Waiting for Strimzi operator deployment..." 
kubectl wait --for=condition=available --timeout=300s deployment/strimzi-cluster-operator -n telemetry
diff --git a/upgrade/roles/upgrade_telemetry/files/migrate_strimzi_crds.sh b/upgrade/roles/upgrade_telemetry/files/migrate_strimzi_crds.sh
new file mode 100644
index 0000000000..c9c98e828b
--- /dev/null
+++ b/upgrade/roles/upgrade_telemetry/files/migrate_strimzi_crds.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+# migrate_strimzi_crds.sh — Strimzi CRD major version migration
+#
+# Handles the upgrade from Strimzi 0.x (v1beta2) to 1.x (v1-only). Strimzi 1.0.x
+# completely dropped the v1beta2 API. Kubernetes cannot remove a served version
+# when objects stored in that version still exist in etcd. This script:
+#
+#   1. Detects whether migration is needed
+#   2. Temporarily re-enables v1beta2 on CRDs so stuck CRs are readable
+#   3. Deletes existing Kafka CRs (they will be recreated by telemetry.sh)
+#   4. Deletes old PVCs and the kafka-cluster-id secret (new cluster ID makes old data incompatible)
+#   5. Removes CRDs (handles stuck cleanup finalizers)
+#
+# telemetry.sh then recreates CRDs + CRs from the new chart. This script is fully idempotent —
+# it is a no-op when CRDs are already healthy, absent, or running v1 without issues.
+#
+# Usage: bash migrate_strimzi_crds.sh [namespace]   (namespace defaults to "telemetry"; needs kubectl and jq)
+# Exit codes: 0 = success or no migration needed
+
+set -euo pipefail
+
+NS="${1:-telemetry}"
+
+# ── Phase 1: Detect ─────────────────────────────────────────────
+needs_migration=false
+
+# Check if any Strimzi CRD still lists v1beta2 in storedVersions
+for crd in $(kubectl get crd -o name 2>/dev/null | grep -E '\.kafka\.strimzi\.io|\.core\.strimzi\.io'); do
+    if kubectl get "$crd" -o jsonpath='{.status.storedVersions}' 2>/dev/null | grep -q 'v1beta2'; then
+        echo "[MIGRATE] $crd has v1beta2 in storedVersions"
+        needs_migration=true
+        break
+    fi
+done
+
+# Check if CRs are stuck (v1-only CRDs but objects stored as v1beta2)
+if [ "$needs_migration" = "false" ] && kubectl get crd kafkas.kafka.strimzi.io >/dev/null 2>&1; then
+    # kubectl exits non-zero when it prints this error; '|| true' stops pipefail from masking grep's match.
+    if { kubectl get kafka -n "$NS" 2>&1 || true; } | grep -q 'convert CR from an invalid group/version'; then
+        echo "[MIGRATE] CRs stuck — conversion error detected"
+        needs_migration=true
+    fi
+fi
+
+if [ "$needs_migration" = "false" ]; then
+    echo "[MIGRATE] No Strimzi CRD migration needed."
+    exit 0
+fi
+
+echo "[MIGRATE] Starting Strimzi CRD migration (v1beta2 → v1)..."
+
+# ── Phase 2: Make stuck CRs readable ────────────────────────────
+STRIMZI_CRDS=$(kubectl get crd -o name 2>/dev/null \
+    | grep -E '\.kafka\.strimzi\.io|\.core\.strimzi\.io' \
+    | sed 's|customresourcedefinition.apiextensions.k8s.io/||')
+
+if [ -n "$STRIMZI_CRDS" ]; then
+    echo "[MIGRATE] Temporarily adding v1beta2 to CRDs..."
+    for crd in $STRIMZI_CRDS; do
+        kubectl get crd "$crd" -o json 2>/dev/null \
+            | jq '.spec.versions += [(.spec.versions[0] | .name = "v1beta2" | .served = true | .storage = false)]' \
+            | kubectl apply -f - --server-side --force-conflicts >/dev/null 2>&1 || true
+    done
+fi
+
+# ── Phase 3: Delete existing CRs ────────────────────────────────
+echo "[MIGRATE] Deleting existing Kafka CRs..."
+for kind in kafka kafkanodepool kafkabridge kafkatopic kafkauser strimzipodset; do
+    for item in $(kubectl get "$kind" -n "$NS" -o name 2>/dev/null); do
+        kubectl patch "$item" -n "$NS" --type=merge \
+            -p '{"metadata":{"finalizers":[]}}' 2>/dev/null || true
+        kubectl delete "$item" -n "$NS" --wait=false 2>/dev/null || true
+    done
+done
+sleep 5
+
+# ── Phase 4: Delete old Kafka PVCs ───────────────────────────────
+echo "[MIGRATE] Deleting old Kafka PVCs (new cluster ID makes old data incompatible)..."
+kubectl delete pvc -n "$NS" -l strimzi.io/cluster=kafka --wait=false 2>/dev/null || true
+
+# ── Phase 5: Delete cluster-id secret (operator will regenerate) ─
+kubectl delete secret kafka-cluster-id -n "$NS" 2>/dev/null || true
+
+# ── Phase 6: Delete CRDs ────────────────────────────────────────
+if [ -n "$STRIMZI_CRDS" ]; then
+    echo "[MIGRATE] Deleting Strimzi CRDs..."
+    kubectl delete crd $STRIMZI_CRDS --wait=false --timeout=30s 2>&1 || true
+    sleep 5
+    # Remove cleanup finalizers from any CRDs stuck in Terminating
+    for crd in $(kubectl get crd -o name 2>/dev/null | grep -E '\.strimzi\.io'); do
+        kubectl patch "$crd" --type=merge \
+            -p '{"metadata":{"finalizers":[]}}' 2>/dev/null || true
+    done
+    # Wait for CRDs to fully disappear
+    for i in $(seq 1 24); do
+        # grep -c already prints "0" (and exits 1) on no match, so only neutralise the exit status here.
+        remaining=$(kubectl get crd -o name 2>/dev/null | grep -cE '\.strimzi\.io' || true)
+        [ "${remaining:-0}" -eq 0 ] && break
+        sleep 5
+    done
+fi
+
+echo "[MIGRATE] Strimzi CRD migration complete. telemetry.sh will recreate CRDs and CRs."
diff --git a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml index 1e8b7aa3f0..085801b9ab 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/apply_victoria_crs.yml @@ -150,7 +150,7 @@ - name: Reclaim preserved IPs from conflicting services when: - preserved_vminsert_ip | default('') | length > 0 or preserved_vmselect_ip | default('') | length > 0 - - vminsert_lb_ip.stdout | trim | length == 0 or vmselect_lb_ip.stdout | trim | length == 0 + - (vminsert_lb_ip.stdout | default('') | trim | length == 0) or (vmselect_lb_ip.stdout | default('') | trim | length == 0) block: - name: Stage IP conflict detection script ansible.builtin.template: diff --git a/upgrade/roles/upgrade_telemetry/tasks/execute_telemetry_sh.yml b/upgrade/roles/upgrade_telemetry/tasks/execute_telemetry_sh.yml index 1dac883990..54a2ae922f 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/execute_telemetry_sh.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/execute_telemetry_sh.yml @@ -67,6 +67,22 @@ ansible.builtin.debug: msg: "{{ pods_before_upgrade.stdout_lines }}" + # ── Pre-CRD migration: Strimzi major version upgrade (0.x → 1.x) ── + # See files/migrate_strimzi_crds.sh for full details. + # The script is idempotent — no-op when CRDs are already healthy or absent. + - name: Run Strimzi CRD migration if needed (v1beta2 → v1) + ansible.builtin.script: + cmd: migrate_strimzi_crds.sh {{ telemetry_namespace }} + delegate_to: "{{ kube_vip }}" + connection: ssh + register: strimzi_migration_result + changed_when: "'Starting Strimzi CRD migration' in strimzi_migration_result.stdout" + failed_when: strimzi_migration_result.rc != 0 + + - name: Display Strimzi migration result + ansible.builtin.debug: + msg: "{{ strimzi_migration_result.stdout_lines }}" + # ── Execute telemetry.sh ── - name: Execute telemetry.sh on kube_vip ansible.builtin.command: