diff --git a/Makefile b/Makefile index 02a2729e..055a79e4 100644 --- a/Makefile +++ b/Makefile @@ -30,6 +30,7 @@ MAX_RETRIES := 30 # Machine flavors ifeq ($(MINI_LAB_FLAVOR),sonic) LAB_TOPOLOGY=mini-lab.sonic.yaml +MONITORING_ENABLED=true else ifeq ($(MINI_LAB_FLAVOR),dell_sonic) LAB_TOPOLOGY=mini-lab.dell_sonic.yaml MINI_LAB_SONIC_IMAGE=r.metal-stack.io/vrnetlab/dell_sonic:$(MINI_LAB_DELL_SONIC_VERSION) diff --git a/compose.yaml b/compose.yaml index 54a6b6f4..8c2aa709 100644 --- a/compose.yaml +++ b/compose.yaml @@ -21,6 +21,7 @@ services: - DOCKER_HUB_USER=${DOCKER_HUB_USER} - DOCKER_HUB_TOKEN=${DOCKER_HUB_TOKEN} - GARDENER_ENABLED=${GARDENER_ENABLED:-} + - MONITORING_ENABLED=${MONITORING_ENABLED:-} network_mode: host working_dir: /mini-lab entrypoint: @@ -48,6 +49,7 @@ services: - CI=${CI} - DOCKER_HUB_USER=${DOCKER_HUB_USER} - DOCKER_HUB_TOKEN=${DOCKER_HUB_TOKEN} + - MONITORING_ENABLED=${MONITORING_ENABLED:-} network_mode: host working_dir: /mini-lab entrypoint: diff --git a/control-plane/kind.yaml b/control-plane/kind.yaml index 949e46d5..a55b4aaf 100644 --- a/control-plane/kind.yaml +++ b/control-plane/kind.yaml @@ -11,16 +11,12 @@ nodes: extraPortMappings: - containerPort: 4443 hostPort: 4443 - listenAddress: 0.0.0.0 - containerPort: 8080 hostPort: 8080 - listenAddress: 0.0.0.0 - containerPort: 4150 hostPort: 4150 - listenAddress: 0.0.0.0 - containerPort: 50051 hostPort: 50051 - listenAddress: 0.0.0.0 # if you want to run gardener operator + metal-stack, you need more pods kubeadmConfigPatches: - | diff --git a/deploy_control_plane.yaml b/deploy_control_plane.yaml index 84436081..6fe5a2b4 100644 --- a/deploy_control_plane.yaml +++ b/deploy_control_plane.yaml @@ -31,6 +31,12 @@ tags: auth - name: metal-roles/control-plane/roles/metal tags: metal + - name: metal-roles/control-plane/roles/logging + when: monitoring_enabled + tags: logging + - name: metal-roles/control-plane/roles/monitoring + when: monitoring_enabled + tags: monitoring - name: deploy gardener 
import_playbook: deploy_gardener.yaml diff --git a/deploy_partition.yaml b/deploy_partition.yaml index 58c8b6a4..d9a7e08a 100644 --- a/deploy_partition.yaml +++ b/deploy_partition.yaml @@ -87,14 +87,46 @@ - name: metal-roles/partition/roles/pixiecore tags: pixiecore +- name: Deploy monitoring on leaf01 + hosts: leaf01 + become: true + roles: + - name: ansible-common + tags: always + - name: metal-roles/partition/roles/monitoring/prometheus + when: monitoring_enabled + tags: prometheus + - name: metal-roles/partition/roles/monitoring/ipmi-exporter + when: monitoring_enabled + tags: ipmi-exporter + +- name: Deploy monitoring on leaves + hosts: leaves + become: true + roles: + - name: ansible-common + tags: always + - name: metal-roles/partition/roles/promtail + when: monitoring_enabled + tags: promtail + - name: metal-roles/partition/roles/monitoring/node-exporter + when: monitoring_enabled + tags: node-exporter + - name: metal-roles/partition/roles/monitoring/blackbox-exporter + when: monitoring_enabled + tags: blackbox-exporter + - name: metal-roles/partition/roles/monitoring/sonic-exporter + when: monitoring_enabled + tags: sonic-exporter + - name: Deploy metal-core hosts: leaves any_errors_fatal: true become: true pre_tasks: - - name: Wait some time - pause: - seconds: 120 + - name: Wait some time + pause: + seconds: 120 roles: - name: ansible-common tags: always @@ -130,8 +162,8 @@ any_errors_fatal: true gather_facts: false pre_tasks: - - name: Wait until no route entries have "queued" - include_tasks: tasks/check_queued.yaml + - name: Wait until no route entries have "queued" + include_tasks: tasks/check_queued.yaml - name: Configure IPv6 and LLDP ports (Enterprise SONiC) hosts: dell_sonic @@ -141,7 +173,7 @@ - name: Enable IPv6 to also have LLA at VLAN interfaces sysctl: name: net.ipv6.conf.default.disable_ipv6 - value: '0' + value: "0" state: present sysctl_file: /etc/sysctl.conf - name: Configure LLDP port IDs and descriptions diff --git 
a/inventories/group_vars/all/control_plane.yaml b/inventories/group_vars/all/control_plane.yaml index c843eecb..ae4c74ee 100644 --- a/inventories/group_vars/all/control_plane.yaml +++ b/inventories/group_vars/all/control_plane.yaml @@ -6,4 +6,7 @@ metal_control_plane_namespace: metal-control-plane metal_control_plane_image_pull_policy: Always helm_additional_params: - - '--debug' + - "--debug" + +# needs to be in all -> partition and control-plane +metal_api_bmc_superuser_pwd: change-me diff --git a/inventories/group_vars/all/monitoring.yaml b/inventories/group_vars/all/monitoring.yaml new file mode 100644 index 00000000..ef630d21 --- /dev/null +++ b/inventories/group_vars/all/monitoring.yaml @@ -0,0 +1,2 @@ +--- +monitoring_enabled: "{{ lookup('env', 'MONITORING_ENABLED') | default('', false) }}" diff --git a/inventories/group_vars/all/release_vector.yaml b/inventories/group_vars/all/release_vector.yaml index a66214d0..f801fffc 100644 --- a/inventories/group_vars/all/release_vector.yaml +++ b/inventories/group_vars/all/release_vector.yaml @@ -6,7 +6,6 @@ metal_stack_release_vectors: variable_mapping_path: metal_stack_release.mapping include_role_defaults: metal-roles/common/roles/defaults oci_cosign_verify_key: "{{ lookup('file', 'cosign.pub') }}" - ## ## for development purposes, you can override releases from our image vector here ## diff --git a/inventories/group_vars/control-plane/logging.yaml b/inventories/group_vars/control-plane/logging.yaml new file mode 100644 index 00000000..4e931d1c --- /dev/null +++ b/inventories/group_vars/control-plane/logging.yaml @@ -0,0 +1,2 @@ +--- +logging_ingress_loki_tls: no diff --git a/inventories/group_vars/control-plane/metal.yml b/inventories/group_vars/control-plane/metal.yml index 441d6fce..9e5568d9 100644 --- a/inventories/group_vars/control-plane/metal.yml +++ b/inventories/group_vars/control-plane/metal.yml @@ -3,6 +3,7 @@ metal_set_resource_limits: no metal_check_api_health_endpoint: http://api.{{ 
metal_control_plane_ingress_dns }}:8080/metal/v1/health metal_api_headscale_control_plane_address: "http://headscale.{{ metal_control_plane_ingress_dns }}:8080" +metal_api_pdb_min_available: 1 metal_api_replicas: 1 metal_api_view_key: metal-view metal_api_edit_key: metal-edit @@ -10,6 +11,8 @@ metal_api_admin_key: metal-admin metal_api_nsq_tcp_address: nsqd:4150 +metal_apiserver_pdb_min_available: 1 + metal_apiserver_enabled: true metal_apiserver_url: http://v2.api.{{ metal_control_plane_ingress_dns }}:8080 @@ -21,39 +24,39 @@ metal_apiserver_redis_password: change-me-soon metal_apiserver_admin_subjects: "admin@metal-stack.zitadel.172.17.0.1.nip.io@openid-connect" metal_api_images: -- id: firewall-ubuntu-3.0 - name: Firewall 3 Ubuntu - description: Firewall 3 Ubuntu Latest Release - url: https://images.metal-stack.io/metal-os/stable/firewall/3.0-ubuntu/img.tar.lz4 - features: - - firewall -- id: ubuntu-24.4 - name: Ubuntu 24.04 - description: Ubuntu 24.04 Latest Release - url: https://images.metal-stack.io/metal-os/stable/ubuntu/24.04/img.tar.lz4 - features: - - machine -- id: debian-12.0 - name: Debian 12 - description: Debian 12 Latest Release - url: https://images.metal-stack.io/metal-os/stable/debian/12/img.tar.lz4 - features: - - machine + - id: firewall-ubuntu-3.0 + name: Firewall 3 Ubuntu + description: Firewall 3 Ubuntu Latest Release + url: https://images.metal-stack.io/metal-os/stable/firewall/3.0-ubuntu/img.tar.lz4 + features: + - firewall + - id: ubuntu-24.4 + name: Ubuntu 24.04 + description: Ubuntu 24.04 Latest Release + url: https://images.metal-stack.io/metal-os/stable/ubuntu/24.04/img.tar.lz4 + features: + - machine + - id: debian-12.0 + name: Debian 12 + description: Debian 12 Latest Release + url: https://images.metal-stack.io/metal-os/stable/debian/12/img.tar.lz4 + features: + - machine metal_api_sizes: -- id: v1-small-x86 - name: v1-small-x86 - description: The Tiny Virtual - constraints: - - type: cores - min: 1 - max: 4 - - type: memory - 
min: "{{ '500MB' | humanfriendly }}" - max: "{{ '4GB' | humanfriendly }}" - - type: storage - min: "{{ '1GB' | humanfriendly }}" - max: "{{ '100GB' | humanfriendly }}" + - id: v1-small-x86 + name: v1-small-x86 + description: The Tiny Virtual + constraints: + - type: cores + min: 1 + max: 4 + - type: memory + min: "{{ '500MB' | humanfriendly }}" + max: "{{ '4GB' | humanfriendly }}" + - type: storage + min: "{{ '1GB' | humanfriendly }}" + max: "{{ '100GB' | humanfriendly }}" metal_api_partitions: - id: mini-lab @@ -66,61 +69,61 @@ metal_api_partitions: privatenetworkprefixlength: 22 metal_api_networks: -- id: tenant-super-network-mini-lab - name: "Project Super Network" - description: "Super network of all project networks" - nat: false - privatesuper: true - underlay: false - destinationprefixes: [] - partitionid: mini-lab - defaultchildprefixlength: - IPv4: 22 - IPv6: 96 - prefixes: - - 10.0.0.0/16 - - 2001:db8:0:10::/64 - additionalAnnouncableCIDRs: - - 10.240.0.0/12 -- id: internet-mini-lab - name: "Virtual Internet Network" - description: "Virtual Internet Network for mini-lab" - nat: true - privatesuper: false - underlay: false - destinationprefixes: - - 0.0.0.0/0 - - ::/0 - partitionid: "mini-lab" - vrf: 104009 - prefixes: - - 203.0.113.128/25 - - 2001:db8:0:113::/64 - labels: - network.metal-stack.io/default: "" - network.metal-stack.io/default-external: "" -- id: underlay-mini-lab - name: "Underlay Network" - description: "Underlay Network for mini-lab" - nat: false - privatesuper: false - underlay: true - destinationprefixes: [] - partitionid: "mini-lab" - prefixes: - - 10.1.0.0/24 + - id: tenant-super-network-mini-lab + name: "Project Super Network" + description: "Super network of all project networks" + nat: false + privatesuper: true + underlay: false + destinationprefixes: [] + partitionid: mini-lab + defaultchildprefixlength: + IPv4: 22 + IPv6: 96 + prefixes: + - 10.0.0.0/16 + - 2001:db8:0:10::/64 + additionalAnnouncableCIDRs: + - 10.240.0.0/12 + - 
id: internet-mini-lab + name: "Virtual Internet Network" + description: "Virtual Internet Network for mini-lab" + nat: true + privatesuper: false + underlay: false + destinationprefixes: + - 0.0.0.0/0 + - ::/0 + partitionid: "mini-lab" + vrf: 104009 + prefixes: + - 203.0.113.128/25 + - 2001:db8:0:113::/64 + labels: + network.metal-stack.io/default: "" + network.metal-stack.io/default-external: "" + - id: underlay-mini-lab + name: "Underlay Network" + description: "Underlay Network for mini-lab" + nat: false + privatesuper: false + underlay: true + destinationprefixes: [] + partitionid: "mini-lab" + prefixes: + - 10.1.0.0/24 metal_api_ips: -- name: "reserve v4" - description: "Reserve IPv4 (for testing purposes)" - networkid: internet-mini-lab - projectid: 00000000-0000-0000-0000-000000000000 - ipaddress: "203.0.113.142" -- name: "reserve v6" - description: "Reserve IPv6 (for testing purposes)" - networkid: internet-mini-lab - projectid: 00000000-0000-0000-0000-000000000000 - ipaddress: "2001:db8:0:113::142" + - name: "reserve v4" + description: "Reserve IPv4 (for testing purposes)" + networkid: internet-mini-lab + projectid: 00000000-0000-0000-0000-000000000000 + ipaddress: "203.0.113.142" + - name: "reserve v6" + description: "Reserve IPv6 (for testing purposes)" + networkid: internet-mini-lab + projectid: 00000000-0000-0000-0000-000000000000 + ipaddress: "2001:db8:0:113::142" metal_masterdata_api_tls_ca: "{{ lookup('file', 'certs/ca.pem') }}" metal_masterdata_api_tls_cert: "{{ lookup('file', 'certs/masterdata-api/server.pem') }}" @@ -132,23 +135,23 @@ metal_masterdata_api_tls_client_key: "{{ lookup('file', 'certs/masterdata-api/cl metal_masterdata_api_port: 8445 metal_masterdata_api_tenants: -- meta: - id: metal-stack - kind: Tenant - apiversion: v1 - version: 0 - name: metal-stack - iam_config: - description: metal-stack tenant, which is provider + - meta: + id: metal-stack + kind: Tenant + apiversion: v1 + version: 0 + name: metal-stack + iam_config: + 
description: metal-stack tenant, which is provider metal_masterdata_api_projects: -- meta: - id: 00000000-0000-0000-0000-000000000001 - kind: Project - apiversion: v1 - version: 0 - name: sample-project - description: Sample project with static id + - meta: + id: 00000000-0000-0000-0000-000000000001 + kind: Project + apiversion: v1 + version: 0 + name: sample-project + description: Sample project with static id metal_console_enabled: false diff --git a/inventories/group_vars/control-plane/monitoring.yaml b/inventories/group_vars/control-plane/monitoring.yaml new file mode 100644 index 00000000..db54d9be --- /dev/null +++ b/inventories/group_vars/control-plane/monitoring.yaml @@ -0,0 +1,12 @@ +--- +monitoring_ingress_grafana_tls: no + +monitoring_grafana_github_oauth: + enabled: false +monitoring_grafana_extra_secret_mounts: [] + +monitoring_thanos_receive_enabled: true +monitoring_thanos_receive_ingress_enabled: true +monitoring_thanos_receive_size: 5Gi + +monitoring_prometheus_core_dns_enabled: true diff --git a/inventories/group_vars/leaves/main.yaml b/inventories/group_vars/leaves/main.yaml index 79e2bdcc..fbc83ac4 100644 --- a/inventories/group_vars/leaves/main.yaml +++ b/inventories/group_vars/leaves/main.yaml @@ -2,6 +2,7 @@ dhcp_listening_interfaces: - Vlan4000 +metal_core_cidr_mask: 25 metal_core_spine_uplinks: - Ethernet120 @@ -23,4 +24,9 @@ sonic_config_vlans: sonic_config_vtep: enabled: true -metal_core_cidr_mask: 25 +sonic_config_ntp: + servers: + - 0.europe.pool.ntp.org + - 1.europe.pool.ntp.org + - 2.europe.pool.ntp.org + - 3.europe.pool.ntp.org diff --git a/inventories/group_vars/partition/monitoring.yaml b/inventories/group_vars/partition/monitoring.yaml new file mode 100644 index 00000000..2c3b07be --- /dev/null +++ b/inventories/group_vars/partition/monitoring.yaml @@ -0,0 +1,8 @@ +monitoring_node_exporter_port: 9100 + +monitoring_blackbox_exporter_port: 9115 + +monitoring_sonic_exporter_port: 9101 + +# IPMI should use the same credentials as 
metal-api +monitoring_ipmi_bmc_superuser_pwd: "{{ metal_api_bmc_superuser_pwd }}" diff --git a/inventories/group_vars/partition/prometheus.yaml b/inventories/group_vars/partition/prometheus.yaml new file mode 100644 index 00000000..8e691dae --- /dev/null +++ b/inventories/group_vars/partition/prometheus.yaml @@ -0,0 +1,31 @@ +--- +# exporters + +prometheus_sonic_exporter_targets: + - "leaf01:{{ monitoring_sonic_exporter_port }}" + - "leaf02:{{ monitoring_sonic_exporter_port }}" + +prometheus_blackbox_exporter_targets: + - "leaf01:{{ monitoring_blackbox_exporter_port }}" + - "leaf02:{{ monitoring_blackbox_exporter_port }}" + +prometheus_node_exporter_targets: + - "leaf01:{{ monitoring_node_exporter_port }}" + - "leaf02:{{ monitoring_node_exporter_port }}" + +prometheus_ipmi_exporter_targets: + - "{{ hostvars['machine01'].ansible_host }}" + - "{{ hostvars['machine02'].ansible_host }}" + +prometheus_blackbox_exporter_metal_api_probe_url: "{{ metal_partition_metal_api_protocol }}://{{ metal_partition_metal_api_addr }}:{{ metal_partition_metal_api_port }}{{ metal_partition_metal_api_basepath }}v1/version" + +prometheus_remote_write: + - url: "http://thanos-receive.{{ metal_control_plane_ingress_dns }}:8080/api/v1/receive" + +prometheus_hosts_content: | + 127.0.0.1 localhost localhost.localdomain localhost4 localhost4.localdomain4 + ::1 localhost localhost.localdomain localhost6 localhost6.localdomain6 + {{ hostvars['leaf01'].ansible_host }} leaf01 + {{ hostvars['leaf02'].ansible_host }} leaf02 + +prometheus_haproxy_enabled: false diff --git a/inventories/group_vars/partition/promtail.yaml b/inventories/group_vars/partition/promtail.yaml new file mode 100644 index 00000000..fda39638 --- /dev/null +++ b/inventories/group_vars/partition/promtail.yaml @@ -0,0 +1,14 @@ +promtail_clients: + - url: http://loki.{{ metal_control_plane_ingress_dns }}:8080/loki/api/v1/push + +promtail_scrape_configs: + - job_name: leaf-node-docker + docker_sd_configs: + - host: 
"unix:///var/run/docker.sock" + refresh_interval: 5s + relabel_configs: + - regex: /(.*) + source_labels: [__meta_docker_container_name] + target_label: container + - replacement: "{{ inventory_hostname }}" + target_label: node_name diff --git a/mini-lab.sonic.yaml b/mini-lab.sonic.yaml index eb33236f..a22c0c46 100644 --- a/mini-lab.sonic.yaml +++ b/mini-lab.sonic.yaml @@ -32,11 +32,15 @@ topology: image: ${MINI_LAB_SONIC_IMAGE} binds: - files/ssh/id_rsa.pub:/authorized_keys + env: + QEMU_MEMORY: 4096 leaf02: group: leaves image: ${MINI_LAB_SONIC_IMAGE} binds: - files/ssh/id_rsa.pub:/authorized_keys + env: + QEMU_MEMORY: 4096 machine01: group: machines image: ${MINI_LAB_VM_IMAGE} diff --git a/roles/sonic/tasks/main.yaml b/roles/sonic/tasks/main.yaml index cbf1cada..c8ee8460 100644 --- a/roles/sonic/tasks/main.yaml +++ b/roles/sonic/tasks/main.yaml @@ -25,3 +25,13 @@ reload: no sysctl_set: yes value: "1" + +# We need to fill some values for the sonic-exporter (uses the STATE_DB) +- name: Mock sonic platform for kvm + ansible.builtin.import_tasks: mock-platform.yaml + +# Restart chrony (NTP) for monitoring -> otherwise we get a NodeTimeOutOfSync error +- name: restart chrony + systemd: + name: chrony + state: restarted diff --git a/roles/sonic/tasks/mock-platform.yaml b/roles/sonic/tasks/mock-platform.yaml new file mode 100644 index 00000000..2363d5c4 --- /dev/null +++ b/roles/sonic/tasks/mock-platform.yaml @@ -0,0 +1,17 @@ +--- +- name: Check if CHASSIS_INFO exists + command: sonic-db-cli STATE_DB exists 'CHASSIS_INFO|chassis0' + register: chassis_exists + changed_when: false + +- name: Set chassis info in STATE_DB (only if missing) + command: | + sonic-db-cli STATE_DB hmset 'CHASSIS_INFO|chassis0' + part_num 'PN-MINI-LAB' + serial_num '{{ inventory_hostname }}' + base_mac_addr 'aa:bb:cc:dd:ee:ff' + onie_version 'unknown' + platform_name 'kvm_x86_64' + hardware_revision '0' + product_name 'SONiC-KVM' + when: chassis_exists.stdout.strip() == "0"