From 03a6d1db798ab396f59acee50096990efc43de8d Mon Sep 17 00:00:00 2001 From: Joshua Gilman Date: Mon, 29 Dec 2025 21:55:13 -0800 Subject: [PATCH 1/3] feat(talos): add justfile tooling and fix CP-1 network selector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add justfile with recipes for common Talos operations: - genconfig: Generate machine configs via talhelper - bootstrap: Initialize first control plane node - kubeconfig: Fetch cluster credentials - dashboard: Open talosctl dashboard - status/services/logs: Node inspection commands - apply-config: Apply config updates to nodes - Flexible node targeting (cp-1, cp-2, cp-3, or all) - Fix CP-1 network interface selector: - Change from wildcard (*) to specific MAC address (38:05:25:34:25:d0) - Wildcard was matching multiple interfaces (bond0, dummy0, enp2s0) - This caused VLAN interfaces on wrong devices and gateway route issues - Add .gitignore for generated clusterconfig/ directory 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- infrastructure/compute/talos/.gitignore | 3 + infrastructure/compute/talos/justfile | 252 ++++++++++++++++++++ infrastructure/compute/talos/talconfig.yaml | 3 +- 3 files changed, 257 insertions(+), 1 deletion(-) create mode 100644 infrastructure/compute/talos/.gitignore create mode 100644 infrastructure/compute/talos/justfile diff --git a/infrastructure/compute/talos/.gitignore b/infrastructure/compute/talos/.gitignore new file mode 100644 index 0000000..3d78f2a --- /dev/null +++ b/infrastructure/compute/talos/.gitignore @@ -0,0 +1,3 @@ +# Generated by talhelper genconfig +# These contain sensitive data (certs, keys) and should not be committed +clusterconfig/ diff --git a/infrastructure/compute/talos/justfile b/infrastructure/compute/talos/justfile new file mode 100644 index 0000000..e562776 --- /dev/null +++ b/infrastructure/compute/talos/justfile @@ -0,0 +1,252 @@ +# Talos Platform Cluster 
Management +# +# Prerequisites: +# - talhelper (brew install talhelper) +# - talosctl (brew install siderolabs/tap/talosctl) +# - sops (brew install sops) - for decrypting secrets +# +# Usage: +# just genconfig # Generate machine configs +# just bootstrap # Bootstrap CP-1 (first run only) +# just kubeconfig # Fetch kubeconfig +# just dashboard # Open talosctl dashboard (all nodes) +# just status cp-1 # Check status of a specific node +# just logs kubelet cp-2 # View logs from a service on a node + +# Node name to IP mapping +# Use these names with any command that takes a 'node' parameter +node_cp1 := "10.10.30.10" +node_cp2 := "10.10.30.11" +node_cp3 := "10.10.30.12" + +# All control plane nodes (comma-separated for talosctl) +all_nodes := node_cp1 + "," + node_cp2 + "," + node_cp3 + +# Generated config paths +config_dir := "clusterconfig" +talosconfig := config_dir / "talosconfig" + +# Helper to resolve node name to IP +[private] +_resolve node: + #!/usr/bin/env bash + case "{{ node }}" in + cp-1|cp1|1) echo "{{ node_cp1 }}" ;; + cp-2|cp2|2) echo "{{ node_cp2 }}" ;; + cp-3|cp3|3) echo "{{ node_cp3 }}" ;; + all) echo "{{ all_nodes }}" ;; + 10.10.30.*) echo "{{ node }}" ;; + *) echo "Error: Unknown node '{{ node }}'. 
Use cp-1, cp-2, cp-3, or all" >&2; exit 1 ;; + esac + +# Generate machine configurations using talhelper +genconfig: + talhelper genconfig --config-file talconfig.yaml --secret-file talsecret.sops.yaml --out-dir {{ config_dir }} + @echo "Configs generated in {{ config_dir }}/" + +# Bootstrap the first control plane node (CP-1) +# Only run this ONCE when initializing a new cluster +bootstrap: _require-config + talosctl bootstrap \ + --talosconfig {{ talosconfig }} \ + --nodes {{ node_cp1 }} + +# Fetch kubeconfig and merge into default location +kubeconfig: _require-config + talosctl kubeconfig \ + --talosconfig {{ talosconfig }} \ + --nodes {{ node_cp1 }} \ + --force + @echo "Kubeconfig merged into ~/.kube/config" + +# Fetch kubeconfig to a specific file +kubeconfig-file file: _require-config + talosctl kubeconfig {{ file }} \ + --talosconfig {{ talosconfig }} \ + --nodes {{ node_cp1 }} \ + --force + @echo "Kubeconfig written to {{ file }}" + +# Open talosctl dashboard +# Usage: just dashboard [node] +# Examples: +# just dashboard # All nodes +# just dashboard cp-1 # Single node +# just dashboard all # Explicit all nodes +dashboard node="all": _require-config + #!/usr/bin/env bash + set -euo pipefail + nodes=$(just _resolve {{ node }}) + talosctl dashboard \ + --talosconfig {{ talosconfig }} \ + --nodes "$nodes" + +# Check cluster health (all nodes) +health: _require-config + talosctl health \ + --talosconfig {{ talosconfig }} \ + --nodes {{ all_nodes }} + +# Get machine status +# Usage: just status [node] +status node="all": _require-config + #!/usr/bin/env bash + set -euo pipefail + nodes=$(just _resolve {{ node }}) + talosctl get machinestatus \ + --talosconfig {{ talosconfig }} \ + --nodes "$nodes" + +# Get detailed machine config from a node +get-config node="cp-1": _require-config + #!/usr/bin/env bash + set -euo pipefail + nodes=$(just _resolve {{ node }}) + talosctl get machineconfig \ + --talosconfig {{ talosconfig }} \ + --nodes "$nodes" \ + -o yaml + +# 
View services on node(s) +# Usage: just services [node] +services node="all": _require-config + #!/usr/bin/env bash + set -euo pipefail + nodes=$(just _resolve {{ node }}) + talosctl services \ + --talosconfig {{ talosconfig }} \ + --nodes "$nodes" + +# View logs from a service +# Usage: just logs <service> [node] +# Examples: +# just logs kubelet # kubelet logs from cp-1 +# just logs etcd cp-2 # etcd logs from cp-2 +# just logs apid all # apid logs from all nodes +logs service node="cp-1": _require-config + #!/usr/bin/env bash + set -euo pipefail + nodes=$(just _resolve {{ node }}) + talosctl logs {{ service }} \ + --talosconfig {{ talosconfig }} \ + --nodes "$nodes" + +# Follow logs from a service (streaming) +logs-follow service node="cp-1": _require-config + #!/usr/bin/env bash + set -euo pipefail + nodes=$(just _resolve {{ node }}) + talosctl logs {{ service }} \ + --talosconfig {{ talosconfig }} \ + --nodes "$nodes" \ + --follow + +# Apply configuration to a node +# Usage: just apply-config <node> +apply-config node: _require-config + #!/usr/bin/env bash + set -euo pipefail + + # Resolve node name to IP and config file + case "{{ node }}" in + cp-1|cp1|1|{{ node_cp1 }}) + node_ip="{{ node_cp1 }}" + config_file="{{ config_dir }}/platform-cp-1.yaml" + ;; + cp-2|cp2|2|{{ node_cp2 }}) + node_ip="{{ node_cp2 }}" + config_file="{{ config_dir }}/platform-cp-2.yaml" + ;; + cp-3|cp3|3|{{ node_cp3 }}) + node_ip="{{ node_cp3 }}" + config_file="{{ config_dir }}/platform-cp-3.yaml" + ;; + *) + echo "Error: Unknown node '{{ node }}'. Use cp-1, cp-2, or cp-3" + exit 1 + ;; + esac + + echo "Applying $config_file to $node_ip..." 
+ talosctl apply-config \ + --talosconfig {{ talosconfig }} \ + --nodes "$node_ip" \ + --file "$config_file" + +# Upgrade Talos on a node +# Usage: just upgrade <node> <version> +# Example: just upgrade cp-1 v1.12.0 +upgrade node version: _require-config + #!/usr/bin/env bash + set -euo pipefail + nodes=$(just _resolve {{ node }}) + talosctl upgrade \ + --talosconfig {{ talosconfig }} \ + --nodes "$nodes" \ + --image "ghcr.io/siderolabs/installer:{{ version }}" + +# Reset a node (DANGEROUS - removes from cluster) +[confirm("This will RESET the node and remove it from the cluster. Continue?")] +reset node: _require-config + #!/usr/bin/env bash + set -euo pipefail + nodes=$(just _resolve {{ node }}) + talosctl reset \ + --talosconfig {{ talosconfig }} \ + --nodes "$nodes" \ + --graceful=false \ + --reboot + +# Get dmesg from node(s) +dmesg node="cp-1": _require-config + #!/usr/bin/env bash + set -euo pipefail + nodes=$(just _resolve {{ node }}) + talosctl dmesg \ + --talosconfig {{ talosconfig }} \ + --nodes "$nodes" + +# Get memory/cpu info from node(s) +resources node="all": _require-config + #!/usr/bin/env bash + set -euo pipefail + nodes=$(just _resolve {{ node }}) + talosctl get cpu,memory \ + --talosconfig {{ talosconfig }} \ + --nodes "$nodes" + +# List disks on node(s) +disks node="all": _require-config + #!/usr/bin/env bash + set -euo pipefail + nodes=$(just _resolve {{ node }}) + talosctl disks \ + --talosconfig {{ talosconfig }} \ + --nodes "$nodes" + +# Get etcd member list +etcd-members: _require-config + talosctl etcd members \ + --talosconfig {{ talosconfig }} \ + --nodes {{ node_cp1 }} + +# Get etcd status +etcd-status: _require-config + talosctl etcd status \ + --talosconfig {{ talosconfig }} \ + --nodes {{ all_nodes }} + +# List all available recipes +help: + @just --list + +# Run any talosctl command with correct config +# Usage: just talosctl <args> +# Example: just talosctl get members --nodes 10.10.30.10 +talosctl *args: _require-config + talosctl --talosconfig {{ 
talosconfig }} {{ args }} + +# Internal: Check that configs have been generated +[private] +_require-config: + @test -f {{ talosconfig }} || (echo "Error: Run 'just genconfig' first" && exit 1) diff --git a/infrastructure/compute/talos/talconfig.yaml b/infrastructure/compute/talos/talconfig.yaml index b4bc297..0e3dcd9 100644 --- a/infrastructure/compute/talos/talconfig.yaml +++ b/infrastructure/compute/talos/talconfig.yaml @@ -41,9 +41,10 @@ nodes: type: nvme # UM760 uses hybrid trunk: VLAN 20 native, VLAN 30 tagged # Configure VLAN 30 sub-interface for platform traffic + # MAC address is for enp2s0 (2.5GbE port connected to VyOS eth2) networkInterfaces: - deviceSelector: - hardwareAddr: "*" + hardwareAddr: "38:05:25:34:25:d0" dhcp: false vlans: - vlanId: 30 From 081d66928448b3406aa9c050fb2f24503ff70e7f Mon Sep 17 00:00:00 2001 From: Joshua Gilman Date: Mon, 29 Dec 2025 21:58:38 -0800 Subject: [PATCH 2/3] feat(vyos): add bridge br30 for VLAN 30 platform network MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bridge eth1.30 (switch trunk) and eth2.30 (UM760 direct connect) to allow the UM760 platform anchor node to participate in VLAN 30 without requiring the traffic to traverse the switch. 
- Add br30 bridge with gateway IP 10.10.30.1/24 - Move IP from eth1.30 to br30 (eth1.30 now bridge member) - Add eth2.30 VLAN interface as bridge member - Update documentation with bridge architecture 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .../network/vyos/configs/gateway.conf | 27 ++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/infrastructure/network/vyos/configs/gateway.conf b/infrastructure/network/vyos/configs/gateway.conf index b55c2cd..34f4664 100644 --- a/infrastructure/network/vyos/configs/gateway.conf +++ b/infrastructure/network/vyos/configs/gateway.conf @@ -19,10 +19,15 @@ * VLAN Architecture: * 10 - LAB_MGMT (10.10.10.0/24) - Infrastructure management * 20 - LAB_PROV (10.10.20.0/24) - Provisioning (PXE) - * 30 - LAB_PLATFORM (10.10.30.0/24) - Platform cluster + * 30 - LAB_PLATFORM (10.10.30.0/24) - Platform cluster (via br30 bridge) * 40 - LAB_CLUSTER (10.10.40.0/24) - Tenant clusters * 50 - LAB_SERVICE (10.10.50.0/24) - Service VIPs (BGP) * 60 - LAB_STORAGE (10.10.60.0/24) - Storage replication + * + * Bridge Architecture: + * br30 - Bridges eth1.30 (switch trunk) and eth2.30 (UM760 direct connect) + * This allows the UM760 to participate in VLAN 30 via eth2 + * while other devices access VLAN 30 via the switch trunk */ firewall { @@ -180,6 +185,20 @@ firewall { } } interfaces { + /* Bridge for VLAN 30 - allows UM760 (eth2) to join platform network + * Both eth1.30 (switch trunk) and eth2.30 (UM760) are bridge members + * The gateway IP lives on the bridge, not the individual interfaces + */ + bridge br30 { + address 10.10.30.1/24 + description "LAB_PLATFORM - Platform Cluster" + member { + interface eth1.30 { + } + interface eth2.30 { + } + } + } ethernet eth0 { address 10.0.0.2/30 description "WAN - Transit to Home (CCR2004)" @@ -195,8 +214,7 @@ interfaces { description "LAB_PROV - Provisioning (PXE)" } vif 30 { - address 10.10.30.1/24 - description 
"LAB_PLATFORM - Platform Cluster" + description "LAB_PLATFORM - Bridge member (br30)" } vif 40 { address 10.10.40.1/24 @@ -213,6 +231,9 @@ interfaces { } ethernet eth2 { description "LAN - UM760 Platform Anchor Node" + vif 30 { + description "LAB_PLATFORM - Bridge member (br30)" + } } } nat { From 11adb926c3283f01091f6668e9a720e2de40c676 Mon Sep 17 00:00:00 2001 From: Joshua Gilman Date: Mon, 29 Dec 2025 22:13:22 -0800 Subject: [PATCH 3/3] fix(vyos): update tests for VLAN 30 bridge architecture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The platform network (VLAN 30) now uses a bridge (br30) to allow the UM760 anchor node to participate via a direct connection. Update the operational tests to reflect this: - Remove VLAN 30 from parametrized test_vlan_interface_up - Add dedicated test_vlan30_bridge_interface_up that validates: - VLAN interface is up and is a bridge member - Bridge br30 is up with the gateway IP 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .../network/vyos/tests/test_operational.py | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/infrastructure/network/vyos/tests/test_operational.py b/infrastructure/network/vyos/tests/test_operational.py index 25b8d5f..6522665 100644 --- a/infrastructure/network/vyos/tests/test_operational.py +++ b/infrastructure/network/vyos/tests/test_operational.py @@ -31,7 +31,6 @@ def test_trunk_interface_up(self, vyos_show, test_topology): [ ("10", "10.10.10.1"), ("20", "10.10.20.1"), - ("30", "10.10.30.1"), ("40", "10.10.40.1"), ("50", "10.10.50.1"), ("60", "10.10.60.1"), @@ -45,6 +44,27 @@ def test_vlan_interface_up(self, vyos_show, test_topology, vif, gateway_ip): assert "up" in output.lower(), f"VLAN {vif} interface is not up" assert gateway_ip in output, f"VLAN {vif} missing IP {gateway_ip}" + def test_vlan30_bridge_interface_up(self, vyos_show, test_topology): + """VLAN 30 uses a bridge 
for the platform network. + + The platform network (VLAN 30) is bridged to allow the UM760 anchor node + to participate via a direct connection (eth4 in test, eth2 in production). + The gateway IP lives on br30, not on the VLAN interface directly. + """ + # Check that the VLAN interface is up and is a bridge member + vlan_output = vyos_show( + f"show interfaces ethernet {test_topology.trunk_iface} vif 30" + ) + assert "up" in vlan_output.lower(), "VLAN 30 interface is not up" + assert "br30" in vlan_output, "VLAN 30 should be a member of br30" + + # Check that the bridge has the gateway IP + bridge_output = vyos_show("show interfaces bridge br30") + assert "up" in bridge_output.lower(), "Bridge br30 is not up" + assert test_topology.platform_gateway in bridge_output, ( + f"Bridge br30 missing IP {test_topology.platform_gateway}" + ) + class TestRoutingState: """Test routing table state."""