From 37492f649008122075abb1374a75f1a1140d213d Mon Sep 17 00:00:00 2001 From: ostempel Date: Wed, 22 Oct 2025 16:11:25 +0200 Subject: [PATCH 01/24] basic config, promtail and prometheus --- control-plane/kind.yaml | 10 ++++++---- deploy_control_plane.yaml | 4 ++++ deploy_partition.yaml | 16 ++++++++++++++++ inventories/group_vars/all/control_plane.yaml | 10 ++++++++++ .../group_vars/control-plane/ingress.yaml | 3 +++ inventories/group_vars/partition/promtail.yaml | 14 ++++++++++++++ mini-lab.sonic.yaml | 4 ++++ 7 files changed, 57 insertions(+), 4 deletions(-) create mode 100644 inventories/group_vars/partition/promtail.yaml diff --git a/control-plane/kind.yaml b/control-plane/kind.yaml index 949e46d5..7c08f040 100644 --- a/control-plane/kind.yaml +++ b/control-plane/kind.yaml @@ -11,16 +11,18 @@ nodes: extraPortMappings: - containerPort: 4443 hostPort: 4443 - listenAddress: 0.0.0.0 - containerPort: 8080 hostPort: 8080 - listenAddress: 0.0.0.0 - containerPort: 4150 hostPort: 4150 - listenAddress: 0.0.0.0 - containerPort: 50051 hostPort: 50051 - listenAddress: 0.0.0.0 + - containerPort: 3100 + hostPort: 3100 + - containerPort: 3000 + hostPort: 3000 + - containerPort: 9090 + hostPort: 9090 # if you want to run gardener operator + metal-stack, you need more pods kubeadmConfigPatches: - | diff --git a/deploy_control_plane.yaml b/deploy_control_plane.yaml index 2b0312cc..33133fb8 100644 --- a/deploy_control_plane.yaml +++ b/deploy_control_plane.yaml @@ -29,6 +29,10 @@ tags: auth - name: metal-roles/control-plane/roles/metal tags: metal + - name: metal-roles/control-plane/roles/logging + tags: logging + - name: metal-roles/control-plane/roles/monitoring + tags: monitoring - name: deploy gardener import_playbook: deploy_gardener.yaml diff --git a/deploy_partition.yaml b/deploy_partition.yaml index c8299608..b297b3db 100644 --- a/deploy_partition.yaml +++ b/deploy_partition.yaml @@ -32,6 +32,22 @@ - name: metal-roles/partition/roles/pixiecore tags: pixiecore +- name: Deploy monitoring on leaf01 + hosts: leaf01 + roles: + - name: ansible-common + tags: always + - name: metal-roles/partition/roles/monitoring/prometheus + tags: prometheus + +- name: Deploy monitoring on leaves + hosts: leaves + roles: + - name: ansible-common + tags: always + - name: metal-roles/partition/roles/promtail + tags: promtail + - name: Deploy metal-core hosts: leaves any_errors_fatal: true diff --git a/inventories/group_vars/all/control_plane.yaml b/inventories/group_vars/all/control_plane.yaml index c843eecb..86a15730 100644 --- a/inventories/group_vars/all/control_plane.yaml +++ b/inventories/group_vars/all/control_plane.yaml @@ -7,3 +7,13 @@ metal_control_plane_image_pull_policy: Always helm_additional_params: - '--debug' + +monitoring_grafana_github_oauth: + enabled: false +monitoring_grafana_extra_secret_mounts: [] +monitoring_thanos_receive_enabled: false +monitoring_thanos_receive_ingress_enabled: false + +monitoring_thanos_receive_ingress_basic_auth: {} +monitoring_thanos_receive_ingress_annotations: [] +monitoring_thanos_receive_ingress_tls: {} \ No newline at end of file diff --git a/inventories/group_vars/control-plane/ingress.yaml b/inventories/group_vars/control-plane/ingress.yaml index 0ef8644b..d92cf164 100644 --- a/inventories/group_vars/control-plane/ingress.yaml +++ b/inventories/group_vars/control-plane/ingress.yaml @@ -2,3 +2,6 @@ ingress_tcp_service_exposals: "4150": "{{ metal_control_plane_namespace }}/nsqd:4150" "50051": "{{ metal_control_plane_namespace }}/metal-api:50051" + "3000": "{{ monitoring_namespace }}/kube-prometheus-stack-grafana:80" + "3100": "{{ monitoring_namespace }}/loki:3100" + "9090": "{{ monitoring_namespace }}/kube-prometheus-stack-prometheus:9090" \ No newline at end of file diff --git a/inventories/group_vars/partition/promtail.yaml b/inventories/group_vars/partition/promtail.yaml new file mode 100644 index 00000000..d71f58f0 --- /dev/null +++ b/inventories/group_vars/partition/promtail.yaml @@ -0,0 +1,14 @@ +promtail_clients: + - url: http://{{ metal_control_plane_ingress_dns }}:3100/loki/api/v1/push + +promtail_scrape_configs: + - job_name: leave-node-docker + docker_sd_configs: + - host: 'unix:///var/run/docker.sock' + refresh_interval: 5s + relabel_configs: + - regex: /(.*) + source_labels: [__meta_docker_container_name] + target_label: container + - replacement: "{{ inventory_hostname }}" + target_label: node_name \ No newline at end of file diff --git a/mini-lab.sonic.yaml b/mini-lab.sonic.yaml index eb33236f..a22c0c46 100644 --- a/mini-lab.sonic.yaml +++ b/mini-lab.sonic.yaml @@ -32,11 +32,15 @@ topology: image: ${MINI_LAB_SONIC_IMAGE} binds: - files/ssh/id_rsa.pub:/authorized_keys + env: + QEMU_MEMORY: 4096 leaf02: group: leaves image: ${MINI_LAB_SONIC_IMAGE} binds: - files/ssh/id_rsa.pub:/authorized_keys + env: + QEMU_MEMORY: 4096 machine01: group: machines image: ${MINI_LAB_VM_IMAGE} From ec81a8a85542cedf9a95ae4858c62459a2925d2c Mon Sep 17 00:00:00 2001 From: ostempel Date: Wed, 22 Oct 2025 16:28:47 +0200 Subject: [PATCH 02/24] remote write to cp prometheus of leaf --- inventories/group_vars/partition/prometheus.yaml | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 inventories/group_vars/partition/prometheus.yaml diff --git a/inventories/group_vars/partition/prometheus.yaml b/inventories/group_vars/partition/prometheus.yaml new file mode 100644 index 00000000..f6213351 --- /dev/null +++ b/inventories/group_vars/partition/prometheus.yaml @@ -0,0 +1,2 @@ +prometheus_remote_write: + - url: "http://{{ metal_control_plane_ingress_dns }}:9090/api/v1/write" From 5544dd3656267332d98f31a32276bd980ade8706 Mon Sep 17 00:00:00 2001 From: ostempel Date: Wed, 22 Oct 2025 17:01:00 +0200 Subject: [PATCH 03/24] node-exporter configuration --- deploy_partition.yaml | 2 ++ inventories/group_vars/partition/monitoring.yaml | 4 ++++ 2 files changed, 6 insertions(+) create mode 100644 inventories/group_vars/partition/monitoring.yaml diff --git a/deploy_partition.yaml b/deploy_partition.yaml index b297b3db..4337a47e 100644 --- a/deploy_partition.yaml +++ b/deploy_partition.yaml @@ -47,6 +47,8 @@ tags: always - name: metal-roles/partition/roles/promtail tags: promtail + - name: metal-roles/partition/roles/monitoring/node-exporter + tags: node-exporter - name: Deploy metal-core hosts: leaves diff --git a/inventories/group_vars/partition/monitoring.yaml b/inventories/group_vars/partition/monitoring.yaml new file mode 100644 index 00000000..a294859f --- /dev/null +++ b/inventories/group_vars/partition/monitoring.yaml @@ -0,0 +1,4 @@ +monitoring_node_exporter_port: 9100 +prometheus_node_exporter_targets: + - "{{ hostvars['leaf01'].ansible_host }}:{{ monitoring_node_exporter_port }}" + - "{{ hostvars['leaf02'].ansible_host }}:{{ monitoring_node_exporter_port }}" \ No newline at end of file From b7a3517217c949322221fef4f3f105b77bbf23a9 Mon Sep 17 00:00:00 2001 From: ostempel Date: Thu, 23 Oct 2025 09:23:05 +0200 Subject: [PATCH 04/24] add blackbox exporter --- deploy_partition.yaml | 2 ++ inventories/group_vars/partition/monitoring.yaml | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/deploy_partition.yaml b/deploy_partition.yaml index 4337a47e..3f0aea3d 100644 --- a/deploy_partition.yaml +++ b/deploy_partition.yaml @@ -49,6 +49,8 @@ tags: promtail - name: metal-roles/partition/roles/monitoring/node-exporter tags: node-exporter + - name: metal-roles/partition/roles/monitoring/blackbox-exporter + tags: blackbox-exporter - name: Deploy metal-core hosts: leaves diff --git a/inventories/group_vars/partition/monitoring.yaml b/inventories/group_vars/partition/monitoring.yaml index a294859f..6b989d8c 100644 --- a/inventories/group_vars/partition/monitoring.yaml +++ b/inventories/group_vars/partition/monitoring.yaml @@ -1,4 +1,10 @@ monitoring_node_exporter_port: 9100 prometheus_node_exporter_targets: - "{{ hostvars['leaf01'].ansible_host }}:{{ monitoring_node_exporter_port }}" - - "{{ hostvars['leaf02'].ansible_host }}:{{ monitoring_node_exporter_port }}" \ No newline at end of file + - "{{ hostvars['leaf02'].ansible_host }}:{{ monitoring_node_exporter_port }}" + + +monitoring_blackbox_exporter_port: 9115 +prometheus_blackbox_exporter_targets: + - "{{ hostvars['leaf01'].ansible_host }}:{{ monitoring_blackbox_exporter_port }}" + - "{{ hostvars['leaf02'].ansible_host }}:{{ monitoring_blackbox_exporter_port }}" \ No newline at end of file From 7abbd90329321f51733e821d7a975082c0c13f66 Mon Sep 17 00:00:00 2001 From: ostempel Date: Thu, 23 Oct 2025 09:50:13 +0200 Subject: [PATCH 05/24] add sonic-exporter --- deploy_partition.yaml | 2 ++ inventories/group_vars/partition/monitoring.yaml | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/deploy_partition.yaml b/deploy_partition.yaml index 3f0aea3d..f3eb8c65 100644 --- a/deploy_partition.yaml +++ b/deploy_partition.yaml @@ -51,6 +51,8 @@ tags: node-exporter - name: metal-roles/partition/roles/monitoring/blackbox-exporter tags: blackbox-exporter + - name: metal-roles/partition/roles/monitoring/sonic-exporter + tags: sonic-exporter - name: Deploy metal-core hosts: leaves diff --git a/inventories/group_vars/partition/monitoring.yaml b/inventories/group_vars/partition/monitoring.yaml index 6b989d8c..97d67e43 100644 --- a/inventories/group_vars/partition/monitoring.yaml +++ b/inventories/group_vars/partition/monitoring.yaml @@ -7,4 +7,9 @@ prometheus_node_exporter_targets: monitoring_blackbox_exporter_port: 9115 prometheus_blackbox_exporter_targets: - "{{ hostvars['leaf01'].ansible_host }}:{{ monitoring_blackbox_exporter_port }}" - - "{{ hostvars['leaf02'].ansible_host }}:{{ monitoring_blackbox_exporter_port }}" \ No newline at end of file + - "{{ hostvars['leaf02'].ansible_host }}:{{ monitoring_blackbox_exporter_port }}" + +monitoring_sonic_exporter_port: 9101 +prometheus_sonic_exporter_targets: + - "{{ hostvars['leaf01'].ansible_host }}:{{ monitoring_sonic_exporter_port }}" + - "{{ hostvars['leaf02'].ansible_host }}:{{ monitoring_sonic_exporter_port }}" \ No newline at end of file From 3e2bb7bb96cbc5d18967dadaa96b2f58383e6956 Mon Sep 17 00:00:00 2001 From: ostempel Date: Thu, 23 Oct 2025 11:49:15 +0200 Subject: [PATCH 06/24] add ipmi-exporter --- deploy_partition.yaml | 2 ++ inventories/group_vars/partition/monitoring.yaml | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/deploy_partition.yaml b/deploy_partition.yaml index f3eb8c65..5ee6f08b 100644 --- a/deploy_partition.yaml +++ b/deploy_partition.yaml @@ -39,6 +39,8 @@ tags: always - name: metal-roles/partition/roles/monitoring/prometheus tags: prometheus + - name: metal-roles/partition/roles/monitoring/ipmi-exporter + tags: ipmi-exporter - name: Deploy monitoring on leaves hosts: leaves diff --git a/inventories/group_vars/partition/monitoring.yaml b/inventories/group_vars/partition/monitoring.yaml index 97d67e43..dfb11cc7 100644 --- a/inventories/group_vars/partition/monitoring.yaml +++ b/inventories/group_vars/partition/monitoring.yaml @@ -12,4 +12,10 @@ prometheus_blackbox_exporter_targets: monitoring_sonic_exporter_port: 9101 prometheus_sonic_exporter_targets: - "{{ hostvars['leaf01'].ansible_host }}:{{ monitoring_sonic_exporter_port }}" - - "{{ hostvars['leaf02'].ansible_host }}:{{ monitoring_sonic_exporter_port }}" \ No newline at end of file + - "{{ hostvars['leaf02'].ansible_host }}:{{ monitoring_sonic_exporter_port }}" + +prometheus_ipmi_exporter_targets: + - "{{ hostvars['machine01'].ansible_host }}" + - "{{ hostvars['machine02'].ansible_host }}" + +metal_api_bmc_superuser_enabled: false \ No newline at end of file From b3e56afb4bddba2f5d4ea94d8a2007aef4461a46 Mon Sep 17 00:00:00 2001 From: ostempel Date: Thu, 23 Oct 2025 14:46:15 +0200 Subject: [PATCH 07/24] fix pdb of metal-api --- inventories/group_vars/control-plane/metal.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inventories/group_vars/control-plane/metal.yml b/inventories/group_vars/control-plane/metal.yml index 9ec398f3..7b441c9d 100644 --- a/inventories/group_vars/control-plane/metal.yml +++ b/inventories/group_vars/control-plane/metal.yml @@ -5,7 +5,7 @@ metal_api_headscale_control_plane_address: "http://headscale.{{ metal_control_pl # metal_helm_chart_local_path: /helm-charts/charts/metal-control-plane -metal_api_replicas: 1 +metal_api_replicas: 2 metal_api_view_key: metal-view metal_api_edit_key: metal-edit metal_api_admin_key: metal-admin From a33af8a465aab0b8ef7789aa3db383f361772dd8 Mon Sep 17 00:00:00 2001 From: ostempel Date: Thu, 30 Oct 2025 13:54:41 +0100 Subject: [PATCH 08/24] add ntp-server for sonic switches --- inventories/group_vars/leaves/main.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/inventories/group_vars/leaves/main.yaml b/inventories/group_vars/leaves/main.yaml index cc36a1e6..d06812e3 100644 --- a/inventories/group_vars/leaves/main.yaml +++ b/inventories/group_vars/leaves/main.yaml @@ -28,3 +28,9 @@ sonic_vteps: vni: 103999 metal_core_cidr_mask: 25 + +sonic_ntpservers: +- 0.europe.pool.ntp.org +- 1.europe.pool.ntp.org +- 2.europe.pool.ntp.org +- 3.europe.pool.ntp.org \ No newline at end of file From b37400c9b80acc9e2e63fbcf93b4cf6bfea4dbc8 Mon Sep 17 00:00:00 2001 From: ostempel Date: Thu, 27 Nov 2025 10:11:15 +0100 Subject: [PATCH 09/24] fix ntp and blackbox probes --- inventories/group_vars/all/control_plane.yaml | 5 +++-- inventories/group_vars/partition/common.yaml | 1 + inventories/group_vars/partition/monitoring.yaml | 7 +++++-- inventories/group_vars/partition/router.yaml | 1 + 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/inventories/group_vars/all/control_plane.yaml b/inventories/group_vars/all/control_plane.yaml index 86a15730..6e2e2cba 100644 --- a/inventories/group_vars/all/control_plane.yaml +++ b/inventories/group_vars/all/control_plane.yaml @@ -1,4 +1,5 @@ --- +metal_control_plane_ip: 172.17.0.1 metal_control_plane_provider_tenant: metal-stack metal_control_plane_ingress_dns: 172.17.0.1.nip.io metal_control_plane_stage_name: test @@ -6,7 +7,7 @@ metal_control_plane_namespace: metal-control-plane metal_control_plane_image_pull_policy: Always helm_additional_params: - - '--debug' + - "--debug" monitoring_grafana_github_oauth: enabled: false @@ -16,4 +17,4 @@ monitoring_thanos_receive_ingress_enabled: false monitoring_thanos_receive_ingress_basic_auth: {} monitoring_thanos_receive_ingress_annotations: [] -monitoring_thanos_receive_ingress_tls: {} \ No newline at end of file +monitoring_thanos_receive_ingress_tls: {} diff --git a/inventories/group_vars/partition/common.yaml b/inventories/group_vars/partition/common.yaml index 61da08a0..cc484596 100644 --- a/inventories/group_vars/partition/common.yaml +++ b/inventories/group_vars/partition/common.yaml @@ -3,6 +3,7 @@ metal_partition_timezone: Europe/Berlin metal_partition_id: mini-lab metal_partition_metal_api_protocol: http +metal_partition_metal_ip: "{{ metal_control_plane_ip }}" metal_partition_metal_api_addr: api.{{ metal_control_plane_ingress_dns }} metal_partition_metal_api_port: 8080 metal_partition_metal_api_basepath: /metal/ diff --git a/inventories/group_vars/partition/monitoring.yaml b/inventories/group_vars/partition/monitoring.yaml index dfb11cc7..908c4811 100644 --- a/inventories/group_vars/partition/monitoring.yaml +++ b/inventories/group_vars/partition/monitoring.yaml @@ -3,7 +3,6 @@ prometheus_node_exporter_targets: - "{{ hostvars['leaf01'].ansible_host }}:{{ monitoring_node_exporter_port }}" - "{{ hostvars['leaf02'].ansible_host }}:{{ monitoring_node_exporter_port }}" - monitoring_blackbox_exporter_port: 9115 prometheus_blackbox_exporter_targets: - "{{ hostvars['leaf01'].ansible_host }}:{{ monitoring_blackbox_exporter_port }}" @@ -18,4 +17,8 @@ prometheus_ipmi_exporter_targets: - "{{ hostvars['machine01'].ansible_host }}" - "{{ hostvars['machine02'].ansible_host }}" -metal_api_bmc_superuser_enabled: false \ No newline at end of file +metal_api_bmc_superuser_enabled: false +metal_api_bmc_superuser_pwd: change-me + +prometheus_blackbox_exporter_metal_api_probe_url: "{{ metal_partition_metal_api_protocol }}://{{ metal_control_plane_ip }}:{{ metal_partition_metal_api_port }}{{ metal_partition_metal_api_basepath }}v1/version" +prometheus_blackbox_exporter_dns: 8.8.8.8 diff --git a/inventories/group_vars/partition/router.yaml b/inventories/group_vars/partition/router.yaml index bdcbc460..99f801fd 100644 --- a/inventories/group_vars/partition/router.yaml +++ b/inventories/group_vars/partition/router.yaml @@ -3,3 +3,4 @@ router_nameservers: - 172.17.0.1 - 1.1.1.1 - 1.0.0.1 + - 8.8.8.8 From 48eaa5bbf421144d32a853f07c6e631e85cf0733 Mon Sep 17 00:00:00 2001 From: ostempel Date: Thu, 27 Nov 2025 11:56:07 +0100 Subject: [PATCH 10/24] fix hosts on mgmt-server/leaf --- deploy_partition.yaml | 2 ++ .../group_vars/partition/monitoring.yaml | 18 ++++++++++++------ roles/monitoring/tasks/main.yaml | 10 ++++++++++ 3 files changed, 24 insertions(+), 6 deletions(-) create mode 100644 roles/monitoring/tasks/main.yaml diff --git a/deploy_partition.yaml b/deploy_partition.yaml index 5ee6f08b..c4954801 100644 --- a/deploy_partition.yaml +++ b/deploy_partition.yaml @@ -37,6 +37,8 @@ roles: - name: ansible-common tags: always + - name: monitoring + tags: monitoring - name: metal-roles/partition/roles/monitoring/prometheus tags: prometheus - name: metal-roles/partition/roles/monitoring/ipmi-exporter diff --git a/inventories/group_vars/partition/monitoring.yaml b/inventories/group_vars/partition/monitoring.yaml index 908c4811..e6d9c919 100644 --- a/inventories/group_vars/partition/monitoring.yaml +++ b/inventories/group_vars/partition/monitoring.yaml @@ -1,17 +1,23 @@ monitoring_node_exporter_port: 9100 prometheus_node_exporter_targets: - - "{{ hostvars['leaf01'].ansible_host }}:{{ monitoring_node_exporter_port }}" - - "{{ hostvars['leaf02'].ansible_host }}:{{ monitoring_node_exporter_port }}" + # - "{{ hostvars['leaf01'].ansible_host }}:{{ monitoring_node_exporter_port }}" + # - "{{ hostvars['leaf02'].ansible_host }}:{{ monitoring_node_exporter_port }}" + - "leaf01:{{ monitoring_node_exporter_port }}" + - "leaf02:{{ monitoring_node_exporter_port }}" monitoring_blackbox_exporter_port: 9115 prometheus_blackbox_exporter_targets: - - "{{ hostvars['leaf01'].ansible_host }}:{{ monitoring_blackbox_exporter_port }}" - - "{{ hostvars['leaf02'].ansible_host }}:{{ monitoring_blackbox_exporter_port }}" + # - "{{ hostvars['leaf01'].ansible_host }}:{{ monitoring_blackbox_exporter_port }}" + # - "{{ hostvars['leaf02'].ansible_host }}:{{ monitoring_blackbox_exporter_port }}" + - "leaf01:{{ monitoring_blackbox_exporter_port }}" + - "leaf02:{{ monitoring_blackbox_exporter_port }}" monitoring_sonic_exporter_port: 9101 prometheus_sonic_exporter_targets: - - "{{ hostvars['leaf01'].ansible_host }}:{{ monitoring_sonic_exporter_port }}" - - "{{ hostvars['leaf02'].ansible_host }}:{{ monitoring_sonic_exporter_port }}" + # - "{{ hostvars['leaf01'].ansible_host }}:{{ monitoring_sonic_exporter_port }}" + # - "{{ hostvars['leaf02'].ansible_host }}:{{ monitoring_sonic_exporter_port }}" + - "leaf01:{{ monitoring_sonic_exporter_port }}" + - "leaf02:{{ monitoring_sonic_exporter_port }}" prometheus_ipmi_exporter_targets: - "{{ hostvars['machine01'].ansible_host }}" diff --git a/roles/monitoring/tasks/main.yaml b/roles/monitoring/tasks/main.yaml new file mode 100644 index 00000000..3a653847 --- /dev/null +++ b/roles/monitoring/tasks/main.yaml @@ -0,0 +1,10 @@ +--- +- name: Add leaf hosts to /etc/hosts + become: yes + lineinfile: + path: /etc/hosts + regexp: "^{{ hostvars[item].ansible_host }}\\s+{{ item }}$" + line: "{{ hostvars[item].ansible_host }} {{ item }}" + loop: + - leaf01 + - leaf02 From 1e750a29a889febf83fad2a6d558182341b00497 Mon Sep 17 00:00:00 2001 From: ostempel Date: Wed, 17 Dec 2025 12:56:33 +0100 Subject: [PATCH 11/24] make etc/hosts to prometheus var --- deploy_partition.yaml | 14 ++++++-------- inventories/group_vars/all/control_plane.yaml | 2 ++ inventories/group_vars/all/release_vector.yaml | 2 +- inventories/group_vars/partition/monitoring.yaml | 10 ++-------- inventories/group_vars/partition/prometheus.yaml | 6 ++++++ roles/monitoring/tasks/main.yaml | 10 ---------- 6 files changed, 17 insertions(+), 27 deletions(-) delete mode 100644 roles/monitoring/tasks/main.yaml diff --git a/deploy_partition.yaml b/deploy_partition.yaml index 02527b38..633776c6 100644 --- a/deploy_partition.yaml +++ b/deploy_partition.yaml @@ -92,8 +92,6 @@ roles: - name: ansible-common tags: always - - name: monitoring - tags: monitoring - name: metal-roles/partition/roles/monitoring/prometheus tags: prometheus - name: metal-roles/partition/roles/monitoring/ipmi-exporter @@ -118,9 +116,9 @@ any_errors_fatal: true become: true pre_tasks: - - name: Wait some time - pause: - seconds: 120 + - name: Wait some time + pause: + seconds: 120 roles: - name: ansible-common tags: always @@ -156,8 +154,8 @@ any_errors_fatal: true gather_facts: false pre_tasks: - - name: Wait until no route entries have "queued" - include_tasks: tasks/check_queued.yaml + - name: Wait until no route entries have "queued" + include_tasks: tasks/check_queued.yaml - name: Configure IPv6 and LLDP ports (Enterprise SONiC) hosts: dell_sonic @@ -167,7 +165,7 @@ - name: Enable IPv6 to also have LLA at VLAN interfaces sysctl: name: net.ipv6.conf.default.disable_ipv6 - value: '0' + value: "0" state: present sysctl_file: /etc/sysctl.conf - name: Configure LLDP port IDs and descriptions diff --git a/inventories/group_vars/all/control_plane.yaml b/inventories/group_vars/all/control_plane.yaml index 6e2e2cba..81cb189d 100644 --- a/inventories/group_vars/all/control_plane.yaml +++ b/inventories/group_vars/all/control_plane.yaml @@ -6,6 +6,8 @@ metal_control_plane_stage_name: test metal_control_plane_namespace: metal-control-plane metal_control_plane_image_pull_policy: Always +metal_api_bmc_superuser_pwd: change-me + helm_additional_params: - "--debug" diff --git a/inventories/group_vars/all/release_vector.yaml b/inventories/group_vars/all/release_vector.yaml index 24138ded..60f752f6 100644 --- a/inventories/group_vars/all/release_vector.yaml +++ b/inventories/group_vars/all/release_vector.yaml @@ -15,7 +15,7 @@ metal_stack_release_vectors: # metal_api_image_name: # metal_api_image_tag: # metal_apiserver_image_name: -# metal_apiserver_image_tag: +metal_apiserver_image_tag: v0.1.1 # metal_metalctl_image_name: # metal_metalctl_image_tag: # metal_masterdata_api_image_name: diff --git a/inventories/group_vars/partition/monitoring.yaml b/inventories/group_vars/partition/monitoring.yaml index e6d9c919..2040e38f 100644 --- a/inventories/group_vars/partition/monitoring.yaml +++ b/inventories/group_vars/partition/monitoring.yaml @@ -1,21 +1,15 @@ monitoring_node_exporter_port: 9100 prometheus_node_exporter_targets: - # - "{{ hostvars['leaf01'].ansible_host }}:{{ monitoring_node_exporter_port }}" - # - "{{ hostvars['leaf02'].ansible_host }}:{{ monitoring_node_exporter_port }}" - "leaf01:{{ monitoring_node_exporter_port }}" - "leaf02:{{ monitoring_node_exporter_port }}" monitoring_blackbox_exporter_port: 9115 prometheus_blackbox_exporter_targets: - # - "{{ hostvars['leaf01'].ansible_host }}:{{ monitoring_blackbox_exporter_port }}" - # - "{{ hostvars['leaf02'].ansible_host }}:{{ monitoring_blackbox_exporter_port }}" - "leaf01:{{ monitoring_blackbox_exporter_port }}" - "leaf02:{{ monitoring_blackbox_exporter_port }}" monitoring_sonic_exporter_port: 9101 prometheus_sonic_exporter_targets: - # - "{{ hostvars['leaf01'].ansible_host }}:{{ monitoring_sonic_exporter_port }}" - # - "{{ hostvars['leaf02'].ansible_host }}:{{ monitoring_sonic_exporter_port }}" - "leaf01:{{ monitoring_sonic_exporter_port }}" - "leaf02:{{ monitoring_sonic_exporter_port }}" @@ -23,8 +17,8 @@ prometheus_ipmi_exporter_targets: - "{{ hostvars['machine01'].ansible_host }}" - "{{ hostvars['machine02'].ansible_host }}" -metal_api_bmc_superuser_enabled: false -metal_api_bmc_superuser_pwd: change-me +# IPMI should use the same credentials as metal-api +monitoring_ipmi_bmc_superuser_pwd: "{{ metal_api_bmc_superuser_pwd }}" prometheus_blackbox_exporter_metal_api_probe_url: "{{ metal_partition_metal_api_protocol }}://{{ metal_control_plane_ip }}:{{ metal_partition_metal_api_port }}{{ metal_partition_metal_api_basepath }}v1/version" prometheus_blackbox_exporter_dns: 8.8.8.8 diff --git a/inventories/group_vars/partition/prometheus.yaml b/inventories/group_vars/partition/prometheus.yaml index f6213351..4eccf8e7 100644 --- a/inventories/group_vars/partition/prometheus.yaml +++ b/inventories/group_vars/partition/prometheus.yaml @@ -1,2 +1,8 @@ prometheus_remote_write: - url: "http://{{ metal_control_plane_ingress_dns }}:9090/api/v1/write" + +prometheus_hosts_content: | + 127.0.0.1 localhost localhost.localdomain localhost4 localhost4.localdomain4 + ::1 localhost localhost.localdomain localhost6 localhost6.localdomain6 + "{{ hostvars['leaf01'].ansible_host }}" leaf01 + "{{ hostvars['leaf02'].ansible_host }}" leaf02 diff --git a/roles/monitoring/tasks/main.yaml b/roles/monitoring/tasks/main.yaml deleted file mode 100644 index 3a653847..00000000 --- a/roles/monitoring/tasks/main.yaml +++ /dev/null @@ -1,10 +0,0 @@ ---- -- name: Add leaf hosts to /etc/hosts - become: yes - lineinfile: - path: /etc/hosts - regexp: "^{{ hostvars[item].ansible_host }}\\s+{{ item }}$" - line: "{{ hostvars[item].ansible_host }} {{ item }}" - loop: - - leaf01 - - leaf02 From 90ba87eef8b3b85df9636afce1b9cccc84417911 Mon Sep 17 00:00:00 2001 From: ostempel Date: Wed, 17 Dec 2025 14:21:18 +0100 Subject: [PATCH 12/24] fix etc/hosts for prometheus --- inventories/group_vars/partition/prometheus.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inventories/group_vars/partition/prometheus.yaml b/inventories/group_vars/partition/prometheus.yaml index 4eccf8e7..66bd0b79 100644 --- a/inventories/group_vars/partition/prometheus.yaml +++ b/inventories/group_vars/partition/prometheus.yaml @@ -4,5 +4,5 @@ prometheus_remote_write: prometheus_hosts_content: | 127.0.0.1 localhost localhost.localdomain localhost4 localhost4.localdomain4 ::1 localhost localhost.localdomain localhost6 localhost6.localdomain6 - "{{ hostvars['leaf01'].ansible_host }}" leaf01 - "{{ hostvars['leaf02'].ansible_host }}" leaf02 + {{ hostvars['leaf01'].ansible_host }} leaf01 + {{ hostvars['leaf02'].ansible_host }} leaf02 From b37176f221771df6c78248df57364bf2a4d84a4c Mon Sep 17 00:00:00 2001 From: ostempel Date: Wed, 17 Dec 2025 14:34:59 +0100 Subject: [PATCH 13/24] fix sonic-exporter initial state_db --- roles/sonic/tasks/main.yaml | 4 ++++ roles/sonic/tasks/mock-platform.yaml | 17 +++++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 roles/sonic/tasks/mock-platform.yaml diff --git a/roles/sonic/tasks/main.yaml b/roles/sonic/tasks/main.yaml index cbf1cada..2aa72b05 100644 --- a/roles/sonic/tasks/main.yaml +++ b/roles/sonic/tasks/main.yaml @@ -25,3 +25,7 @@ reload: no sysctl_set: yes value: "1" + +# We need to fill some values for the sonic-exporter (uses the STATE_DB) +- name: Mock sonic platform for kvm + ansible.builtin.import_tasks: mock-platform.yaml diff --git a/roles/sonic/tasks/mock-platform.yaml b/roles/sonic/tasks/mock-platform.yaml new file mode 100644 index 00000000..c6339781 --- /dev/null +++ b/roles/sonic/tasks/mock-platform.yaml @@ -0,0 +1,17 @@ +--- +- name: Check if CHASSIS_INFO exists + shell: sonic-db-cli STATE_DB exists 'CHASSIS_INFO|chassis0' + register: chassis_exists + changed_when: false + +- name: Set chassis info in STATE_DB (only if missing) + shell: | + sonic-db-cli STATE_DB hmset 'CHASSIS_INFO|chassis0' \ + part_num 'PN-MINI-LAB' \ + serial_num '{{ inventory_hostname }}' \ + base_mac_addr 'aa:bb:cc:dd:ee:ff' \ + onie_version 'unknown' \ + platform_name 'kvm_x86_64' \ + hardware_revision '0' \ + product_name 'SONiC-KVM' + when: chassis_exists.stdout.strip() == "0" From 87ae50ec418ead6312520be0218afc51b0a9be2d Mon Sep 17 00:00:00 2001 From: ostempel Date: Wed, 14 Jan 2026 16:08:53 +0100 Subject: [PATCH 14/24] cleanup --- inventories/group_vars/all/control_plane.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inventories/group_vars/all/control_plane.yaml b/inventories/group_vars/all/control_plane.yaml index 81cb189d..1b1c78d0 100644 --- a/inventories/group_vars/all/control_plane.yaml +++ b/inventories/group_vars/all/control_plane.yaml @@ -1,7 +1,7 @@ --- metal_control_plane_ip: 172.17.0.1 metal_control_plane_provider_tenant: metal-stack -metal_control_plane_ingress_dns: 172.17.0.1.nip.io +metal_control_plane_ingress_dns: "{{ metal_control_plane_ip }}.nip.io" metal_control_plane_stage_name: test metal_control_plane_namespace: metal-control-plane metal_control_plane_image_pull_policy: Always From 8e32107a95aff226ee1875f6e008046926b708b6 Mon Sep 17 00:00:00 2001 From: ostempel Date: Wed, 14 Jan 2026 16:45:04 +0100 Subject: [PATCH 15/24] cleanup --- inventories/group_vars/all/release_vector.yaml | 3 +-- inventories/group_vars/partition/monitoring.yaml | 3 +-- inventories/group_vars/partition/router.yaml | 1 - 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/inventories/group_vars/all/release_vector.yaml b/inventories/group_vars/all/release_vector.yaml index 60f752f6..fcc07328 100644 --- a/inventories/group_vars/all/release_vector.yaml +++ b/inventories/group_vars/all/release_vector.yaml @@ -6,7 +6,6 @@ metal_stack_release_vectors: variable_mapping_path: metal_stack_release.mapping include_role_defaults: metal-roles/common/roles/defaults oci_cosign_verify_key: "{{ lookup('file', 'cosign.pub') }}" - ## ## for development purposes, you can override releases from our image vector here ## @@ -15,7 +14,7 @@ metal_stack_release_vectors: # metal_api_image_name: # metal_api_image_tag: # metal_apiserver_image_name: -metal_apiserver_image_tag: v0.1.1 +# metal_apiserver_image_tag: # metal_metalctl_image_name: # metal_metalctl_image_tag: # metal_masterdata_api_image_name: diff --git a/inventories/group_vars/partition/monitoring.yaml b/inventories/group_vars/partition/monitoring.yaml index 2040e38f..611e2624 100644 --- a/inventories/group_vars/partition/monitoring.yaml +++ b/inventories/group_vars/partition/monitoring.yaml @@ -20,5 +20,4 @@ prometheus_ipmi_exporter_targets: # IPMI should use the same credentials as metal-api monitoring_ipmi_bmc_superuser_pwd: "{{ metal_api_bmc_superuser_pwd }}" -prometheus_blackbox_exporter_metal_api_probe_url: "{{ metal_partition_metal_api_protocol }}://{{ metal_control_plane_ip }}:{{ metal_partition_metal_api_port }}{{ metal_partition_metal_api_basepath }}v1/version" -prometheus_blackbox_exporter_dns: 8.8.8.8 +prometheus_blackbox_exporter_metal_api_probe_url: "{{ metal_partition_metal_api_protocol }}://{{ metal_partition_metal_api_addr }}:{{ metal_partition_metal_api_port }}{{ metal_partition_metal_api_basepath }}v1/version" diff --git a/inventories/group_vars/partition/router.yaml b/inventories/group_vars/partition/router.yaml index 99f801fd..bdcbc460 100644 --- a/inventories/group_vars/partition/router.yaml +++ b/inventories/group_vars/partition/router.yaml @@ -3,4 +3,3 @@ router_nameservers: - 172.17.0.1 - 1.1.1.1 - 1.0.0.1 - - 8.8.8.8 From 2f4a87c802933ed203df201aac6fc25754631aa5 Mon Sep 17 00:00:00 2001 From: ostempel Date: Wed, 28 Jan 2026 11:53:52 +0100 Subject: [PATCH 16/24] feedback --- control-plane/kind.yaml | 6 ------ inventories/group_vars/all/control_plane.yaml | 12 ------------ inventories/group_vars/control-plane/metal.yml | 9 ++++++--- inventories/group_vars/control-plane/monitoring.yaml | 11 +++++++++++ 4 files changed, 17 insertions(+), 21 deletions(-) create mode 100644 inventories/group_vars/control-plane/monitoring.yaml diff --git a/control-plane/kind.yaml b/control-plane/kind.yaml index 7c08f040..a55b4aaf 100644 --- a/control-plane/kind.yaml +++ b/control-plane/kind.yaml @@ -17,12 +17,6 @@ nodes: hostPort: 4150 - containerPort: 50051 hostPort: 50051 - - containerPort: 3100 - hostPort: 3100 - - containerPort: 3000 - hostPort: 3000 - - containerPort: 9090 - hostPort: 9090 # if you want to run gardener operator + metal-stack, you need more pods kubeadmConfigPatches: - | diff --git a/inventories/group_vars/all/control_plane.yaml b/inventories/group_vars/all/control_plane.yaml index 1b1c78d0..69c3655e 100644 --- a/inventories/group_vars/all/control_plane.yaml +++ b/inventories/group_vars/all/control_plane.yaml @@ -6,17 +6,5 @@ metal_control_plane_stage_name: test metal_control_plane_namespace: metal-control-plane metal_control_plane_image_pull_policy: Always -metal_api_bmc_superuser_pwd: change-me - helm_additional_params: - "--debug" - -monitoring_grafana_github_oauth: - enabled: false -monitoring_grafana_extra_secret_mounts: [] -monitoring_thanos_receive_enabled: false -monitoring_thanos_receive_ingress_enabled: false - -monitoring_thanos_receive_ingress_basic_auth: {} -monitoring_thanos_receive_ingress_annotations: [] -monitoring_thanos_receive_ingress_tls: {} diff --git a/inventories/group_vars/control-plane/metal.yml b/inventories/group_vars/control-plane/metal.yml index c0ab9f1a..d0609b0f 100644 --- a/inventories/group_vars/control-plane/metal.yml +++ b/inventories/group_vars/control-plane/metal.yml @@ -3,15 +3,16 @@ metal_set_resource_limits: no metal_check_api_health_endpoint: http://api.{{ metal_control_plane_ingress_dns }}:8080/metal/v1/health metal_api_headscale_control_plane_address: "http://headscale.{{ metal_control_plane_ingress_dns }}:8080" -# metal_helm_chart_local_path: /helm-charts/charts/metal-control-plane - -metal_api_replicas: 2 +metal_api_pdb_min_available: 1 +metal_api_replicas: 1 metal_api_view_key: metal-view metal_api_edit_key: metal-edit metal_api_admin_key: metal-admin metal_api_nsq_tcp_address: nsqd:4150 +metal_apiserver_pdb_min_available: 1 + metal_apiserver_enabled: true metal_apiserver_url: http://v2.api.{{ metal_control_plane_ingress_dns }}:8080 @@ -22,6 +23,8 @@ metal_apiserver_oidc_end_session_url: "https://zitadel.{{ metal_control_plane_in metal_apiserver_redis_password: change-me-soon metal_apiserver_admin_subjects: "admin@metal-stack.zitadel.172.17.0.1.nip.io@openid-connect" +metal_api_bmc_superuser_pwd: change-me + metal_api_images: - id: firewall-ubuntu-3.0 name: Firewall 3 Ubuntu diff --git a/inventories/group_vars/control-plane/monitoring.yaml b/inventories/group_vars/control-plane/monitoring.yaml new file mode 100644 index 00000000..85dd2942 --- /dev/null +++ b/inventories/group_vars/control-plane/monitoring.yaml @@ -0,0 +1,11 @@ +monitoring_ingress_grafana_tls: no + +monitoring_grafana_github_oauth: + enabled: false +monitoring_grafana_extra_secret_mounts: [] +monitoring_thanos_receive_enabled: false +monitoring_thanos_receive_ingress_enabled: false + +monitoring_thanos_receive_ingress_basic_auth: {} +monitoring_thanos_receive_ingress_annotations: [] +monitoring_thanos_receive_ingress_tls: {} From 6f02a3fffbf9c81a295ae3461e9382eb74077410 Mon Sep 17 00:00:00 2001 From: ostempel Date: Wed, 28 Jan 2026 13:02:40 +0100 Subject: [PATCH 17/24] feedback --- inventories/group_vars/all/control_plane.yaml | 6 ++++-- inventories/group_vars/control-plane/metal.yml | 2 -- .../group_vars/control-plane/monitoring.yaml | 8 +++----- inventories/group_vars/leaves/main.yaml | 17 ++++++++--------- inventories/group_vars/partition/common.yaml | 1 - 5 files changed, 15 insertions(+), 19 deletions(-) diff --git a/inventories/group_vars/all/control_plane.yaml b/inventories/group_vars/all/control_plane.yaml index 69c3655e..ae4c74ee 100644 --- a/inventories/group_vars/all/control_plane.yaml +++ b/inventories/group_vars/all/control_plane.yaml @@ -1,10 +1,12 @@ --- -metal_control_plane_ip: 172.17.0.1 metal_control_plane_provider_tenant: metal-stack -metal_control_plane_ingress_dns: "{{ metal_control_plane_ip }}.nip.io" +metal_control_plane_ingress_dns: 172.17.0.1.nip.io metal_control_plane_stage_name: test metal_control_plane_namespace: metal-control-plane metal_control_plane_image_pull_policy: Always helm_additional_params: - "--debug" + +# needs to be in all -> paritition and control-plane +metal_api_bmc_superuser_pwd: change-me diff --git a/inventories/group_vars/control-plane/metal.yml b/inventories/group_vars/control-plane/metal.yml index d0609b0f..9e5568d9 100644 --- a/inventories/group_vars/control-plane/metal.yml +++ b/inventories/group_vars/control-plane/metal.yml @@ -23,8 +23,6 @@ metal_apiserver_oidc_end_session_url: "https://zitadel.{{ metal_control_plane_in metal_apiserver_redis_password: change-me-soon metal_apiserver_admin_subjects: "admin@metal-stack.zitadel.172.17.0.1.nip.io@openid-connect" -metal_api_bmc_superuser_pwd: change-me - metal_api_images: - id: firewall-ubuntu-3.0 name: Firewall 3 Ubuntu diff --git a/inventories/group_vars/control-plane/monitoring.yaml b/inventories/group_vars/control-plane/monitoring.yaml index 85dd2942..4eb5fa3e 100644 --- a/inventories/group_vars/control-plane/monitoring.yaml +++ b/inventories/group_vars/control-plane/monitoring.yaml @@ -3,9 +3,7 @@ monitoring_ingress_grafana_tls: no monitoring_grafana_github_oauth: enabled: false monitoring_grafana_extra_secret_mounts: [] -monitoring_thanos_receive_enabled: false -monitoring_thanos_receive_ingress_enabled: false -monitoring_thanos_receive_ingress_basic_auth: {} -monitoring_thanos_receive_ingress_annotations: [] -monitoring_thanos_receive_ingress_tls: {} +monitoring_thanos_receive_enabled: true +monitoring_thanos_receive_ingress_enabled: true +monitoring_thanos_receive_size: 5Gi diff --git a/inventories/group_vars/leaves/main.yaml b/inventories/group_vars/leaves/main.yaml index d06812e3..e8f50576 100644 --- a/inventories/group_vars/leaves/main.yaml +++ b/inventories/group_vars/leaves/main.yaml @@ -2,6 +2,7 @@ dhcp_listening_interfaces: - Vlan4000 +metal_core_cidr_mask: 25 metal_core_spine_uplinks: - Ethernet120 @@ -23,14 +24,12 @@ sonic_vlans: # Dummy VTEP so that basic VXLAN config will be deployed by sonic role: sonic_vteps: -- comment: Dummy - vlan: Vlan3999 - vni: 103999 - -metal_core_cidr_mask: 25 + - comment: Dummy + vlan: Vlan3999 + vni: 103999 sonic_ntpservers: -- 0.europe.pool.ntp.org -- 1.europe.pool.ntp.org -- 2.europe.pool.ntp.org -- 3.europe.pool.ntp.org \ No newline at end of file + - 0.europe.pool.ntp.org + - 1.europe.pool.ntp.org + - 2.europe.pool.ntp.org + - 3.europe.pool.ntp.org diff --git a/inventories/group_vars/partition/common.yaml b/inventories/group_vars/partition/common.yaml index cc484596..61da08a0 100644 --- a/inventories/group_vars/partition/common.yaml +++ b/inventories/group_vars/partition/common.yaml @@ -3,7 +3,6 @@ metal_partition_timezone: Europe/Berlin metal_partition_id: mini-lab metal_partition_metal_api_protocol: http -metal_partition_metal_ip: "{{ metal_control_plane_ip }}" metal_partition_metal_api_addr: api.{{ metal_control_plane_ingress_dns }} metal_partition_metal_api_port: 8080 metal_partition_metal_api_basepath: /metal/ From 35304e34c90bb66315a05a162958372c363e3ac5 Mon Sep 17 00:00:00 2001 From: ostempel Date: Wed, 28 Jan 2026 13:03:47 +0100 Subject: [PATCH 18/24] feedback --- inventories/group_vars/control-plane/ingress.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/inventories/group_vars/control-plane/ingress.yaml b/inventories/group_vars/control-plane/ingress.yaml index d92cf164..0ef8644b 100644 --- a/inventories/group_vars/control-plane/ingress.yaml +++ b/inventories/group_vars/control-plane/ingress.yaml @@ -2,6 +2,3 @@ ingress_tcp_service_exposals: "4150": "{{ metal_control_plane_namespace }}/nsqd:4150" "50051": "{{ metal_control_plane_namespace }}/metal-api:50051" - "3000": "{{ monitoring_namespace }}/kube-prometheus-stack-grafana:80" - "3100": "{{ monitoring_namespace }}/loki:3100" - "9090": "{{ monitoring_namespace }}/kube-prometheus-stack-prometheus:9090" \ No newline at end of file From c3885d36af608c0fbae9fa3f399f98dc905af2b8 Mon Sep 17 00:00:00 2001 From: ostempel Date: Wed, 28 Jan 2026 14:06:25 +0100 Subject: [PATCH 19/24] feedback --- deploy_control_plane.yaml | 2 ++ inventories/group_vars/all/monitoring.yaml | 2 ++ .../group_vars/control-plane/logging.yaml | 2 ++ .../group_vars/control-plane/monitoring.yaml | 3 +++ .../group_vars/partition/monitoring.yaml | 15 ----------- .../group_vars/partition/prometheus.yaml | 25 ++++++++++++++++++- .../group_vars/partition/promtail.yaml | 20 +++++++-------- roles/sonic/tasks/main.yaml | 6 +++++ roles/sonic/tasks/mock-platform.yaml | 18 ++++++------- 9 files changed, 58 insertions(+), 35 deletions(-) create mode 100644 inventories/group_vars/all/monitoring.yaml create mode 100644 inventories/group_vars/control-plane/logging.yaml diff --git a/deploy_control_plane.yaml b/deploy_control_plane.yaml index d24af542..6fe5a2b4 100644 --- a/deploy_control_plane.yaml +++ b/deploy_control_plane.yaml @@ -32,8 +32,10 @@ - name: metal-roles/control-plane/roles/metal tags: metal - name: metal-roles/control-plane/roles/logging + when: monitoring_enabled tags: logging - name: metal-roles/control-plane/roles/monitoring + when: monitoring_enabled tags: monitoring - name: deploy gardener diff --git a/inventories/group_vars/all/monitoring.yaml b/inventories/group_vars/all/monitoring.yaml new file mode 100644 index 00000000..6b781a5b --- /dev/null +++ b/inventories/group_vars/all/monitoring.yaml @@ -0,0 +1,2 @@ +--- +monitoring_enabled: true diff --git a/inventories/group_vars/control-plane/logging.yaml b/inventories/group_vars/control-plane/logging.yaml new file mode 100644 index 00000000..4e931d1c --- /dev/null +++ b/inventories/group_vars/control-plane/logging.yaml @@ -0,0 +1,2 @@ +--- +logging_ingress_loki_tls: no diff --git a/inventories/group_vars/control-plane/monitoring.yaml b/inventories/group_vars/control-plane/monitoring.yaml index 4eb5fa3e..db54d9be 100644 --- a/inventories/group_vars/control-plane/monitoring.yaml +++ b/inventories/group_vars/control-plane/monitoring.yaml @@ -1,3 +1,4 @@ +--- monitoring_ingress_grafana_tls: no monitoring_grafana_github_oauth: @@ -7,3 +8,5 @@ monitoring_grafana_extra_secret_mounts: [] monitoring_thanos_receive_enabled: true monitoring_thanos_receive_ingress_enabled: true monitoring_thanos_receive_size: 5Gi + +monitoring_prometheus_core_dns_enabled: true diff --git a/inventories/group_vars/partition/monitoring.yaml b/inventories/group_vars/partition/monitoring.yaml index 611e2624..2c3b07be 100644 --- a/inventories/group_vars/partition/monitoring.yaml +++ b/inventories/group_vars/partition/monitoring.yaml @@ -1,23 +1,8 @@ monitoring_node_exporter_port: 9100 -prometheus_node_exporter_targets: - - "leaf01:{{ monitoring_node_exporter_port }}" - - "leaf02:{{ monitoring_node_exporter_port }}" monitoring_blackbox_exporter_port: 9115 -prometheus_blackbox_exporter_targets: - - "leaf01:{{ monitoring_blackbox_exporter_port }}" - - "leaf02:{{ monitoring_blackbox_exporter_port }}" monitoring_sonic_exporter_port: 9101 -prometheus_sonic_exporter_targets: - - "leaf01:{{ monitoring_sonic_exporter_port }}" - - "leaf02:{{ monitoring_sonic_exporter_port }}" - -prometheus_ipmi_exporter_targets: - - "{{ hostvars['machine01'].ansible_host }}" - - "{{ hostvars['machine02'].ansible_host }}" # IPMI should use the same credentials as metal-api monitoring_ipmi_bmc_superuser_pwd: "{{ metal_api_bmc_superuser_pwd }}" - -prometheus_blackbox_exporter_metal_api_probe_url: "{{ metal_partition_metal_api_protocol }}://{{ metal_partition_metal_api_addr }}:{{ metal_partition_metal_api_port }}{{ metal_partition_metal_api_basepath }}v1/version" diff --git a/inventories/group_vars/partition/prometheus.yaml b/inventories/group_vars/partition/prometheus.yaml index 66bd0b79..8e691dae 100644 --- a/inventories/group_vars/partition/prometheus.yaml +++ b/inventories/group_vars/partition/prometheus.yaml @@ -1,8 +1,31 @@ +--- +# exporters + +prometheus_sonic_exporter_targets: + - "leaf01:{{ monitoring_sonic_exporter_port }}" + - "leaf02:{{ monitoring_sonic_exporter_port }}" + +prometheus_blackbox_exporter_targets: + - "leaf01:{{ monitoring_blackbox_exporter_port }}" + - "leaf02:{{ monitoring_blackbox_exporter_port }}" + +prometheus_node_exporter_targets: + - "leaf01:{{ monitoring_node_exporter_port }}" + - "leaf02:{{ monitoring_node_exporter_port }}" + +prometheus_ipmi_exporter_targets: + - "{{ hostvars['machine01'].ansible_host }}" + - "{{ hostvars['machine02'].ansible_host }}" + +prometheus_blackbox_exporter_metal_api_probe_url: "{{ metal_partition_metal_api_protocol }}://{{ metal_partition_metal_api_addr }}:{{ metal_partition_metal_api_port }}{{ metal_partition_metal_api_basepath }}v1/version" + prometheus_remote_write: - - url: "http://{{ metal_control_plane_ingress_dns }}:9090/api/v1/write" + - url: "http://thanos-receive.{{ metal_control_plane_ingress_dns }}:8080/api/v1/receive" prometheus_hosts_content: | 127.0.0.1 localhost localhost.localdomain localhost4 localhost4.localdomain4 ::1 localhost localhost.localdomain localhost6 localhost6.localdomain6 {{ hostvars['leaf01'].ansible_host }} leaf01 {{ hostvars['leaf02'].ansible_host }} leaf02 + +prometheus_haproxy_enabled: false diff --git a/inventories/group_vars/partition/promtail.yaml b/inventories/group_vars/partition/promtail.yaml index d71f58f0..fda39638 100644 --- a/inventories/group_vars/partition/promtail.yaml +++ b/inventories/group_vars/partition/promtail.yaml @@ -1,14 +1,14 @@ promtail_clients: - - url: http://{{ metal_control_plane_ingress_dns }}:3100/loki/api/v1/push + - url: http://loki.{{ metal_control_plane_ingress_dns }}:8080/loki/api/v1/push promtail_scrape_configs: - - job_name: leave-node-docker - docker_sd_configs: - - host: 'unix:///var/run/docker.sock' - refresh_interval: 5s - relabel_configs: - - regex: /(.*) - source_labels: [__meta_docker_container_name] - target_label: container + - job_name: leaf-node-docker + docker_sd_configs: + - host: "unix:///var/run/docker.sock" + refresh_interval: 5s + relabel_configs: + - regex: /(.*) + source_labels: [__meta_docker_container_name] + target_label: container - replacement: "{{ inventory_hostname }}" - target_label: node_name \ No newline at end of file + target_label: node_name diff --git a/roles/sonic/tasks/main.yaml b/roles/sonic/tasks/main.yaml index 2aa72b05..c8ee8460 100644 --- a/roles/sonic/tasks/main.yaml +++ b/roles/sonic/tasks/main.yaml @@ -29,3 +29,9 @@ # We need to fill some values for the sonic-exporter (uses the STATE_DB) - name: Mock sonic platform for kvm ansible.builtin.import_tasks: mock-platform.yaml + +# ntp restarting for monitoring -> otherwise some NodeTimeOutOfSync error +- name: restart chrony + systemd: + name: chrony + state: restarted diff --git a/roles/sonic/tasks/mock-platform.yaml b/roles/sonic/tasks/mock-platform.yaml index c6339781..2363d5c4 100644 --- a/roles/sonic/tasks/mock-platform.yaml +++ b/roles/sonic/tasks/mock-platform.yaml @@ -1,17 +1,17 @@ --- - name: Check if CHASSIS_INFO exists - shell: sonic-db-cli STATE_DB exists 'CHASSIS_INFO|chassis0' + command: sonic-db-cli STATE_DB exists 'CHASSIS_INFO|chassis0' register: chassis_exists changed_when: false - name: Set chassis info in STATE_DB (only if missing) - shell: | - sonic-db-cli STATE_DB hmset 'CHASSIS_INFO|chassis0' \ - part_num 'PN-MINI-LAB' \ - serial_num '{{ inventory_hostname }}' \ - base_mac_addr 'aa:bb:cc:dd:ee:ff' \ - onie_version 'unknown' \ - platform_name 'kvm_x86_64' \ - hardware_revision '0' \ + command: | + sonic-db-cli STATE_DB hmset 'CHASSIS_INFO|chassis0' + part_num 'PN-MINI-LAB' + serial_num '{{ inventory_hostname }}' + base_mac_addr 'aa:bb:cc:dd:ee:ff' + onie_version 'unknown' + platform_name 'kvm_x86_64' + hardware_revision '0' product_name 'SONiC-KVM' when: chassis_exists.stdout.strip() == "0" From c5558deb31877c2b19908a6b9ce73d320b695cd3 Mon Sep 17 00:00:00 2001 From: ostempel Date: Wed, 28 Jan 2026 14:35:47 +0100 Subject: [PATCH 20/24] feedback --- deploy_partition.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/deploy_partition.yaml b/deploy_partition.yaml index e79b096a..55528790 100644 --- a/deploy_partition.yaml +++ b/deploy_partition.yaml @@ -93,8 +93,10 @@ - name: ansible-common tags: always - name: metal-roles/partition/roles/monitoring/prometheus + when: monitoring_enabled tags: prometheus - name: metal-roles/partition/roles/monitoring/ipmi-exporter + when: monitoring_enabled tags: ipmi-exporter - name: Deploy monitoring on leaves @@ -103,12 +105,16 @@ - name: ansible-common tags: always - name: metal-roles/partition/roles/promtail + when: monitoring_enabled tags: promtail - name: metal-roles/partition/roles/monitoring/node-exporter + when: monitoring_enabled tags: node-exporter - name: metal-roles/partition/roles/monitoring/blackbox-exporter + when: monitoring_enabled tags: blackbox-exporter - name: metal-roles/partition/roles/monitoring/sonic-exporter + when: monitoring_enabled tags: sonic-exporter - name: Deploy metal-core From 34106acde504019ebd8a892fa61a0030099f6edc Mon Sep 17 00:00:00 2001 From: ostempel Date: Wed, 28 Jan 2026 14:48:00 +0100 Subject: [PATCH 21/24] feedback --- deploy_partition.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deploy_partition.yaml b/deploy_partition.yaml index 55528790..d9a7e08a 100644 --- a/deploy_partition.yaml +++ b/deploy_partition.yaml @@ -89,6 +89,7 @@ - name: Deploy monitoring on leaf01 hosts: leaf01 + become: true roles: - name: ansible-common tags: always @@ -101,6 +102,7 @@ - name: Deploy monitoring on leaves hosts: leaves + become: true roles: - name: ansible-common tags: always From 4f7137d95ae12a57e98abcba3d20903f1b6bef87 Mon Sep 17 00:00:00 2001 From: Gerrit Date: Wed, 28 Jan 2026 15:10:51 +0100 Subject: [PATCH 22/24] Only deploy monitoring for sonic flavor. --- Makefile | 1 + compose.yaml | 2 ++ inventories/group_vars/all/monitoring.yaml | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 02a2729e..055a79e4 100644 --- a/Makefile +++ b/Makefile @@ -30,6 +30,7 @@ MAX_RETRIES := 30 # Machine flavors ifeq ($(MINI_LAB_FLAVOR),sonic) LAB_TOPOLOGY=mini-lab.sonic.yaml +MONITORING_ENABLED=true else ifeq ($(MINI_LAB_FLAVOR),dell_sonic) LAB_TOPOLOGY=mini-lab.dell_sonic.yaml MINI_LAB_SONIC_IMAGE=r.metal-stack.io/vrnetlab/dell_sonic:$(MINI_LAB_DELL_SONIC_VERSION) diff --git a/compose.yaml b/compose.yaml index 54a6b6f4..8c2aa709 100644 --- a/compose.yaml +++ b/compose.yaml @@ -21,6 +21,7 @@ services: - DOCKER_HUB_USER=${DOCKER_HUB_USER} - DOCKER_HUB_TOKEN=${DOCKER_HUB_TOKEN} - GARDENER_ENABLED=${GARDENER_ENABLED:-} + - MONITORING_ENABLED=${MONITORING_ENABLED:-} network_mode: host working_dir: /mini-lab entrypoint: @@ -48,6 +49,7 @@ services: - CI=${CI} - DOCKER_HUB_USER=${DOCKER_HUB_USER} - DOCKER_HUB_TOKEN=${DOCKER_HUB_TOKEN} + - MONITORING_ENABLED=${MONITORING_ENABLED:-} network_mode: host working_dir: /mini-lab entrypoint: diff --git a/inventories/group_vars/all/monitoring.yaml b/inventories/group_vars/all/monitoring.yaml index 6b781a5b..ef630d21 100644 --- a/inventories/group_vars/all/monitoring.yaml +++ b/inventories/group_vars/all/monitoring.yaml @@ -1,2 +1,2 @@ --- -monitoring_enabled: true +monitoring_enabled: "{{ lookup('env', 'MONITORING_ENABLED') | default('', false) }}" From 8638854a120bd6a83a6bef71f4f4f93aa40316a9 Mon Sep 17 00:00:00 2001 From: ostempel Date: Thu, 29 Jan 2026 08:49:04 +0100 Subject: [PATCH 23/24] test pipeline with old metal-core --- inventories/group_vars/all/release_vector.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inventories/group_vars/all/release_vector.yaml b/inventories/group_vars/all/release_vector.yaml index f801fffc..80d5ee5f 100644 --- a/inventories/group_vars/all/release_vector.yaml +++ b/inventories/group_vars/all/release_vector.yaml @@ -21,8 +21,8 @@ metal_stack_release_vectors: # metal_masterdata_api_image_tag: # metal_console_image_name: # metal_console_image_tag: -# metal_core_image_name: -# metal_core_image_tag: +metal_core_image_name: ghcr.io/metal-stack/metal-core +metal_core_image_tag: v0.15.0 # headscale_image_tag: # headscale_db_backup_restore_sidecar_image_tag: # headscale_db_backup_restore_sidecar_image_name: From cb15b367bcb8eb02aee5da5318904ff78aae6fc7 Mon Sep 17 00:00:00 2001 From: ostempel Date: Thu, 29 Jan 2026 11:14:12 +0100 Subject: [PATCH 24/24] revert test --- inventories/group_vars/all/release_vector.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inventories/group_vars/all/release_vector.yaml b/inventories/group_vars/all/release_vector.yaml index 80d5ee5f..f801fffc 100644 --- a/inventories/group_vars/all/release_vector.yaml +++ b/inventories/group_vars/all/release_vector.yaml @@ -21,8 +21,8 @@ metal_stack_release_vectors: # metal_masterdata_api_image_tag: # metal_console_image_name: # metal_console_image_tag: -metal_core_image_name: ghcr.io/metal-stack/metal-core -metal_core_image_tag: v0.15.0 +# metal_core_image_name: +# metal_core_image_tag: # headscale_image_tag: # headscale_db_backup_restore_sidecar_image_tag: # headscale_db_backup_restore_sidecar_image_name: