diff --git a/compute/incusos/Justfile b/compute/incusos/Justfile
index 2a27de5..af9edd9 100644
--- a/compute/incusos/Justfile
+++ b/compute/incusos/Justfile
@@ -30,6 +30,66 @@ apply host="um760":
 delete host="um760":
     @proto run cue -- export . -e kubernetesYAML --out text -t host={{host}} | kubectl delete --ignore-not-found=true -f -
 
+# Report the UM760 link mode, bridge membership, listener ownership, and Tinkerbell state.
+um760-status:
+    #!/usr/bin/env bash
+    set -euo pipefail
+    link_mode="../../network/vyos/scripts/um760_link_mode.py"
+    ssh_key="${VYOS_SSH_KEY:-$HOME/.ssh/vyos-gateway}"
+    ssh_target="${VYOS_USER:-vyos}@${VYOS_HOST:-10.0.0.2}"
+    ssh_base=(ssh -i "$ssh_key" -o BatchMode=yes -o ConnectTimeout=10 -o StrictHostKeyChecking=accept-new "$ssh_target")
+    kubectl="sudo podman exec bootstrap-k0s k0s kubectl"
+
+    python3 "$link_mode" status
+    echo
+    "${ssh_base[@]}" "$kubectl -n tinkerbell get deployment,pod,workflow,hardware,template -o wide"
+
+# Preflight the bootstrap stack and move the UM760-facing link to untagged LAB_PROV.
+um760-provision host="um760":
+    #!/usr/bin/env bash
+    set -euo pipefail
+    link_mode="../../network/vyos/scripts/um760_link_mode.py"
+    ssh_key="${VYOS_SSH_KEY:-$HOME/.ssh/vyos-gateway}"
+    ssh_target="${VYOS_USER:-vyos}@${VYOS_HOST:-10.0.0.2}"
+    ssh_base=(ssh -i "$ssh_key" -o BatchMode=yes -o ConnectTimeout=10 -o StrictHostKeyChecking=accept-new "$ssh_target")
+    kubectl="sudo podman exec bootstrap-k0s k0s kubectl"
+    artifact_url="$(proto run cue -- export . -e imageBuildConfig.image.artifactURL --out text -t host={{host}})"
+
+    "${ssh_base[@]}" "$kubectl -n tinkerbell rollout status deployment/tinkerbell --timeout=120s"
+    "${ssh_base[@]}" "$kubectl -n tinkerbell rollout status deployment/hookos --timeout=120s"
+
+    workflow_state="$("${ssh_base[@]}" "$kubectl -n tinkerbell get workflow incusos-operation-{{host}} -o jsonpath='{.status.state}'")"
+    case "$workflow_state" in
+        PENDING|RUNNING) ;;
+        *)
+            echo "workflow incusos-operation-{{host}} is not ready to provision: state=$workflow_state" >&2
+            exit 1
+            ;;
+    esac
+
+    "${ssh_base[@]}" "curl -fsSI --max-time 10 '$artifact_url' >/dev/null"
+    python3 "$link_mode" provision
+    echo
+    status_output="$(python3 "$link_mode" status)"
+    printf '%s\n' "$status_output"
+    grep -Eq '^br20 members:.*eth2' <<<"$status_output"
+    if grep -Eq '^br10 members:.*eth2\.10' <<<"$status_output"; then echo "eth2.10 is still a member of br10 after switching to provision mode" >&2; exit 1; fi
+
+# Restore the UM760-facing link to tagged LAB_MGMT and report current reachability evidence.
+um760-mgmt:
+    #!/usr/bin/env bash
+    set -euo pipefail
+    link_mode="../../network/vyos/scripts/um760_link_mode.py"
+    python3 "$link_mode" mgmt
+    echo
+    status_output="$(python3 "$link_mode" status)"
+    printf '%s\n' "$status_output"
+    grep -Eq '^br10 members:.*eth2\.10' <<<"$status_output"
+    if grep -Eq '^br20 members:.*eth2(,|$)' <<<"$status_output"; then echo "eth2 is still a member of br20 after restoring mgmt mode" >&2; exit 1; fi
+    if ! grep -qi '38:05:25:34:25:d0' <<<"$status_output"; then
+        echo "UM760 MAC is not observed yet; it may still be off, booting, or not using LAB_MGMT." >&2
+    fi
+
 # Remove disposable local build state.
clean: @rm -rf .state diff --git a/network/vyos/ansible/templates/bootstrap-k0s.env.j2 b/network/vyos/ansible/templates/bootstrap-k0s.env.j2 index 677dd1c..fc9fefb 100644 --- a/network/vyos/ansible/templates/bootstrap-k0s.env.j2 +++ b/network/vyos/ansible/templates/bootstrap-k0s.env.j2 @@ -1,5 +1,6 @@ TINKERBELL_PUBLIC_IP={{ bootstrap_k0s_provisioning_ip }} TINKERBELL_ARTIFACTS_FILE_SERVER={{ bootstrap_k0s_artifacts_file_server }} +TINKERBELL_DHCP_BIND_ADDR={{ bootstrap_k0s_dhcp_bind_addr }} TINKERBELL_DHCP_BIND_INTERFACE={{ bootstrap_k0s_dhcp_bind_interface }} TINKERBELL_TRUSTED_PROXIES={{ bootstrap_k0s_trusted_proxies }} K0S_POD_CIDR={{ bootstrap_k0s_pod_cidr }} diff --git a/network/vyos/ansible/vars/bootstrap_k0s.yml b/network/vyos/ansible/vars/bootstrap_k0s.yml index 3bc4ee5..b36ccbd 100644 --- a/network/vyos/ansible/vars/bootstrap_k0s.yml +++ b/network/vyos/ansible/vars/bootstrap_k0s.yml @@ -1,8 +1,9 @@ --- -bootstrap_k0s_image: ghcr.io/gilmanlab/platform/bootstrap-k0s:0.2.1 +bootstrap_k0s_image: ghcr.io/gilmanlab/platform/bootstrap-k0s:0.2.2 bootstrap_k0s_provisioning_ip: 10.10.20.1 bootstrap_k0s_artifacts_file_server: http://10.10.20.1:7173 -bootstrap_k0s_dhcp_bind_interface: eth1.20 +bootstrap_k0s_dhcp_bind_addr: 10.10.20.1 +bootstrap_k0s_dhcp_bind_interface: br20 bootstrap_k0s_pod_cidr: 10.244.0.0/16 bootstrap_k0s_service_cidr: 10.96.0.0/12 bootstrap_k0s_trusted_proxies: 10.244.0.0/16,10.96.0.0/12 diff --git a/network/vyos/configs/gateway.conf b/network/vyos/configs/gateway.conf index 1b32a00..c09b5da 100644 --- a/network/vyos/configs/gateway.conf +++ b/network/vyos/configs/gateway.conf @@ -24,8 +24,9 @@ * * Bridge Architecture: * br10 - Bridges eth1.10 (switch trunk) and eth2.10 (UM760 direct connect) - * This allows the UM760 to participate in VLAN 10 via eth2 - * while other devices access VLAN 10 via the switch trunk + * for steady-state UM760 management/platform access. 
+ * br20 - Bridges eth1.20 (switch trunk) and, temporarily, physical eth2 + * while provisioning the UM760 via untagged PXE/Tinkerbell. */ firewall { @@ -446,6 +447,17 @@ interfaces { } } } + /* Bridge for VLAN 20 - Tinkerbell provisioning anchor. + * eth1.20 is permanent; physical eth2 is added only during UM760 PXE. + */ + bridge br20 { + address 10.10.20.1/24 + description "LAB_PROV - Provisioning (PXE)" + member { + interface eth1.20 { + } + } + } ethernet eth0 { address 10.0.0.2/30 description "WAN - Transit to Home (CCR2004)" @@ -456,8 +468,7 @@ interfaces { description "LAB_MGMT - Bridge member (br10)" } vif 20 { - address 10.10.20.1/24 - description "LAB_PROV - Provisioning (PXE)" + description "LAB_PROV - Bridge member (br20)" } vif 40 { address 10.10.40.1/24 @@ -551,6 +562,8 @@ protocols { } service { dhcp-server { + listen-address 10.10.10.1 + listen-address 10.10.70.1 dynamic-dns-update { enable conflict-resolution enable diff --git a/network/vyos/moon.yml b/network/vyos/moon.yml index bfef644..4ea6a5f 100644 --- a/network/vyos/moon.yml +++ b/network/vyos/moon.yml @@ -32,10 +32,20 @@ tasks: cache: false runInCI: true + check-scripts: + command: 'python3 -m py_compile scripts/um760_link_mode.py' + toolchains: 'system' + inputs: + - 'scripts/um760_link_mode.py' + options: + cache: false + runInCI: true + check: deps: - 'check-init' - 'check-playbook' + - 'check-scripts' command: 'true' toolchains: 'system' inputs: [] diff --git a/network/vyos/scripts/um760_link_mode.py b/network/vyos/scripts/um760_link_mode.py new file mode 100755 index 0000000..e22dc34 --- /dev/null +++ b/network/vyos/scripts/um760_link_mode.py @@ -0,0 +1,272 @@ +#!/usr/bin/env python3 +"""Switch the UM760-facing VyOS link between management and PXE provisioning.""" + +from __future__ import annotations + +import argparse +import json +import os +import re +import shlex +import subprocess +import sys +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import 
Iterable  # continuation of the "from typing import" statement that ends the preceding line


# Connection defaults; each can be overridden by a CLI flag or VYOS_* variable.
DEFAULT_VYOS_HOST = "10.0.0.2"
DEFAULT_VYOS_USER = "vyos"
DEFAULT_VYOS_SSH_KEY = "~/.ssh/vyos-gateway"

# Bridges and interfaces that the two link modes move between.
BR_MGMT = "br10"
BR_PROV = "br20"
IFACE_DIRECT = "eth2"
IFACE_MGMT = "eth2.10"
VIF_MGMT = "10"

# Identifiers used to find UM760 evidence in the fdb and neighbor tables.
UM760_MAC = "38:05:25:34:25:d0"
UM760_MGMT_IP = "10.10.10.10"
UM760_PROV_IP = "10.10.20.10"

# UDP ports reported by ``status`` (labelled DHCP/PXE/syslog in its output).
LISTENER_PORTS = ("67", "69", "514")


@dataclass(frozen=True)
class LinkStatus:
    """Snapshot of the UM760-facing link as collected by ``collect_status``."""

    # Result of infer_mode(): "mgmt", "provision", "mixed", or "unknown".
    mode: str
    # Member interfaces configured on br10 / br20, sorted and de-duplicated.
    br10_members: list[str]
    br20_members: list[str]
    # True when the eth2 vif 10 sub-interface exists in the configuration.
    eth2_vif10_present: bool
    # Raw ``bridge fdb show`` lines that mention the UM760 MAC address.
    um760_fdb: list[str]
    # Raw ``ip neigh show`` lines mentioning the UM760 MAC or either UM760 IP.
    um760_neighbors: list[str]
    # Raw ``ss -H -lunp`` lines whose local port is one of LISTENER_PORTS.
    listeners: list[str]


class CommandError(RuntimeError):
    """Raised when a remote command exits non-zero and ``check`` is enabled.

    Attributes:
        command: The shell-quoted local ssh command line that was executed.
        result: The completed process, including captured stdout and stderr.
    """

    def __init__(self, command: str, result: subprocess.CompletedProcess[str]) -> None:
        super().__init__(f"command failed ({result.returncode}): {command}\n{result.stderr.strip()}")
        self.command = command
        self.result = result


class VyOSClient:
    """Run commands on the VyOS gateway over non-interactive ssh."""

    def __init__(self, host: str, user: str, ssh_key: str, ssh_opts: str) -> None:
        """Store connection settings.

        Args:
            host: Gateway host name or address.
            user: Remote login user.
            ssh_key: Path to the private key; ``~`` is expanded.
            ssh_opts: Extra ssh arguments as one shell-style string.
        """
        self.host = host
        self.user = user
        self.ssh_key = str(Path(ssh_key).expanduser())
        self.ssh_opts = shlex.split(ssh_opts)

    def run(self, remote_command: str, *, input_text: str | None = None, check: bool = True) -> subprocess.CompletedProcess[str]:
        """Execute ``remote_command`` on the gateway and capture its output.

        Args:
            remote_command: Command string passed to the remote shell.
            input_text: Optional text fed to the remote command's stdin.
            check: When true, a non-zero exit status raises ``CommandError``.

        Returns:
            The completed process with text-mode stdout and stderr.

        Raises:
            CommandError: The command failed and ``check`` is true.
        """
        command = [
            "ssh",
            "-i",
            self.ssh_key,
            "-o",
            "BatchMode=yes",
            "-o",
            "ConnectTimeout=10",
            "-o",
            "StrictHostKeyChecking=accept-new",
            *self.ssh_opts,
            f"{self.user}@{self.host}",
            remote_command,
        ]
        result = subprocess.run(
            command,
            input=input_text,
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
        if check and result.returncode != 0:
            raise CommandError(" ".join(shlex.quote(part) for part in command), result)
        return result

    def read_config_commands(self) -> str:
        """Return the output of ``show configuration commands`` from the gateway."""
        command = f"/bin/vbash -ic {shlex.quote('show configuration commands')}"
        return self.run(command).stdout

    def apply_config_commands(self, commands: list[str]) -> None:
        """Apply ``commands`` in one configure/commit/save session.

        The script is streamed to ``/bin/vbash -s`` on stdin.

        Raises:
            CommandError: The remote shell exited non-zero.
        """
        # NOTE(review): the script keeps going after a failed ``commit``;
        # confirm the remote exit status actually reflects commit failures.
        script = "\n".join(
            [
                "source /opt/vyatta/etc/functions/script-template",
                "configure",
                *commands,
                "commit",
                "save",
                "exit",
                "exit",
            ]
        )
        self.run("/bin/vbash -s", input_text=f"{script}\n")


def _config_has_path(config: str, *path: str) -> bool:
    """Return True when a config command starts with exactly the tokens in ``path``.

    Tokens are compared whole (surrounding single quotes stripped), so
    ``br20`` does not match ``br200`` and ``vif 10`` does not match ``vif 100``.
    """
    wanted = list(path)
    depth = len(wanted)
    for line in config.splitlines():
        tokens = [token.strip("'") for token in line.split()[:depth]]
        if tokens == wanted:
            return True
    return False


def bridge_members(config: str, bridge: str) -> list[str]:
    """Return the sorted, de-duplicated member interfaces set on ``bridge``."""
    pattern = re.compile(rf"^set interfaces bridge {re.escape(bridge)} member interface '?([^'\s]+)'?", re.MULTILINE)
    return sorted(set(pattern.findall(config)))


def bridge_exists(config: str, bridge: str) -> bool:
    """Return True when ``config`` contains any command for exactly ``bridge``."""
    return _config_has_path(config, "set", "interfaces", "bridge", bridge)


def eth2_vif10_present(config: str) -> bool:
    """Return True when ``config`` defines vif ``VIF_MGMT`` on ``IFACE_DIRECT``."""
    return _config_has_path(config, "set", "interfaces", "ethernet", IFACE_DIRECT, "vif", VIF_MGMT)


def infer_mode(br10_members: Iterable[str], br20_members: Iterable[str]) -> str:
    """Classify the link from bridge membership.

    Returns:
        ``"mgmt"`` when only eth2.10 is bridged into br10, ``"provision"`` when
        only eth2 is bridged into br20, ``"mixed"`` when both are present, and
        ``"unknown"`` when neither is.
    """
    has_mgmt = IFACE_MGMT in set(br10_members)
    has_prov = IFACE_DIRECT in set(br20_members)
    if has_mgmt and not has_prov:
        return "mgmt"
    if has_prov and not has_mgmt:
        return "provision"
    if has_mgmt and has_prov:
        return "mixed"
    return "unknown"


def collect_status(client: VyOSClient) -> LinkStatus:
    """Gather configuration, fdb, neighbor, and UDP listener evidence.

    Only the configuration read is allowed to raise; the three diagnostic
    commands run with ``check=False`` so a failure yields empty evidence.
    """
    config = client.read_config_commands()
    br10 = bridge_members(config, BR_MGMT)
    br20 = bridge_members(config, BR_PROV)

    # Each probe tries an absolute path first and falls back to a second form.
    fdb = client.run("sudo /usr/sbin/bridge fdb show 2>/dev/null || sudo bridge fdb show", check=False).stdout
    neigh = client.run("sudo /usr/sbin/ip neigh show 2>/dev/null || sudo ip neigh show", check=False).stdout
    listeners = client.run("sudo /usr/bin/ss -H -lunp 2>/dev/null || sudo /usr/sbin/ss -H -lunp", check=False).stdout

    # Match ":<port>" only when followed by whitespace or end of line, so
    # port 67 does not match 670.
    listener_pattern = re.compile(r":(" + "|".join(re.escape(port) for port in LISTENER_PORTS) + r")(\s|$)")
    return LinkStatus(
        mode=infer_mode(br10, br20),
        br10_members=br10,
        br20_members=br20,
        eth2_vif10_present=eth2_vif10_present(config),
        um760_fdb=[line for line in fdb.splitlines() if UM760_MAC in line.lower()],
        um760_neighbors=[
            line
            for line in neigh.splitlines()
            if UM760_MAC in line.lower() or UM760_MGMT_IP in line or UM760_PROV_IP in line
        ],
        listeners=[line for line in listeners.splitlines() if listener_pattern.search(line)],
    )


def print_status(status: LinkStatus) -> None:
    """Print ``status`` as a human-readable report on stdout."""

    def render_members(members: list[str]) -> str:
        return ", ".join(members) if members else "(none)"

    # NOTE(review): string literals are reproduced as visible in the reviewed
    # text, which collapses whitespace runs; confirm the leading indentation
    # of the detail lines against the original file.
    print(f"UM760 link mode: {status.mode}")
    print(f"{BR_MGMT} members: {render_members(status.br10_members)}")
    print(f"{BR_PROV} members: {render_members(status.br20_members)}")
    print(f"{IFACE_DIRECT} vif {VIF_MGMT} present: {'yes' if status.eth2_vif10_present else 'no'}")
    print()
    print(f"UM760 MAC/IP evidence ({UM760_MAC}, {UM760_MGMT_IP}, {UM760_PROV_IP}):")
    if status.um760_fdb:
        for line in status.um760_fdb:
            print(f" fdb: {line}")
    else:
        print(" fdb: not observed")
    if status.um760_neighbors:
        for line in status.um760_neighbors:
            print(f" neigh: {line}")
    else:
        print(" neigh: not observed")
    print()
    print("DHCP/PXE/syslog UDP listeners (:67, :69, :514):")
    if status.listeners:
        for line in status.listeners:
            print(f" {line}")
    else:
        print(" none observed")


def provision_commands(config: str) -> list[str]:
    """Return the config commands that move eth2 into br20 for provisioning.

    Only the steps still missing from ``config`` are returned, so an empty
    list means the link is already in provision mode.

    Raises:
        RuntimeError: br20 is not configured on the gateway.
    """
    if not bridge_exists(config, BR_PROV):
        raise RuntimeError(f"{BR_PROV} is not configured; deploy the durable VyOS bridge config before flipping the link")

    commands: list[str] = []
    if IFACE_MGMT in bridge_members(config, BR_MGMT):
        commands.append(f"delete interfaces bridge {BR_MGMT} member interface {IFACE_MGMT}")
    if eth2_vif10_present(config):
        commands.append(f"delete interfaces ethernet {IFACE_DIRECT} vif {VIF_MGMT}")
    if IFACE_DIRECT not in bridge_members(config, BR_PROV):
        commands.append(f"set interfaces bridge {BR_PROV} member interface {IFACE_DIRECT}")
    return commands


def mgmt_commands(config: str) -> list[str]:
    """Return the config commands that restore eth2.10 into br10.

    Only the steps still missing from ``config`` are returned, so an empty
    list means the link is already in management mode.

    Raises:
        RuntimeError: br10 is not configured on the gateway.
    """
    if not bridge_exists(config, BR_MGMT):
        raise RuntimeError(f"{BR_MGMT} is not configured; cannot restore the UM760 management link")

    commands: list[str] = []
    if IFACE_DIRECT in bridge_members(config, BR_PROV):
        commands.append(f"delete interfaces bridge {BR_PROV} member interface {IFACE_DIRECT}")
    if not eth2_vif10_present(config):
        commands.append(f"set interfaces ethernet {IFACE_DIRECT} vif {VIF_MGMT} description 'LAB_MGMT - Bridge member (br10)'")
    if IFACE_MGMT not in bridge_members(config, BR_MGMT):
        commands.append(f"set interfaces bridge {BR_MGMT} member interface {IFACE_MGMT}")
    return commands


def run_transition(client: VyOSClient, target: str, dry_run: bool) -> None:
    """Switch the link to ``target`` mode, or print the plan when ``dry_run``.

    Args:
        client: Connection used to read and apply configuration.
        target: ``"provision"`` selects provisioning; any other value selects
            management mode.
        dry_run: Print the commands instead of applying them.
    """
    config = client.read_config_commands()
    commands = provision_commands(config) if target == "provision" else mgmt_commands(config)

    if not commands:
        print(f"UM760 link is already in {target} mode; no changes needed.")
        return

    if dry_run:
        print("\n".join(["configure", *commands, "commit", "save", "exit"]))
        return

    client.apply_config_commands(commands)
    print(f"UM760 link switched to {target} mode.")


def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser with ``status``, ``provision``, and ``mgmt`` subcommands.

    Connection options default to the VYOS_HOST, VYOS_USER, VYOS_SSH_KEY, and
    VYOS_SSH_OPTS environment variables when those are set.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--host", default=os.environ.get("VYOS_HOST", DEFAULT_VYOS_HOST))
    parser.add_argument("--user", default=os.environ.get("VYOS_USER", DEFAULT_VYOS_USER))
    parser.add_argument("--ssh-key", default=os.environ.get("VYOS_SSH_KEY", DEFAULT_VYOS_SSH_KEY))
    parser.add_argument("--ssh-opts", default=os.environ.get("VYOS_SSH_OPTS", ""))

    subparsers = parser.add_subparsers(dest="command", required=True)

    status = subparsers.add_parser("status", help="report current UM760 link mode and PXE ownership")
    status.add_argument("--json", action="store_true", help="emit status as JSON")

    provision = subparsers.add_parser("provision", help="move physical eth2 into LAB_PROV/br20 for PXE")
    provision.add_argument("--dry-run", action="store_true", help="print VyOS config commands without applying them")

    mgmt = subparsers.add_parser("mgmt", help="restore eth2.10 into LAB_MGMT/br10")
    mgmt.add_argument("--dry-run", action="store_true", help="print VyOS config commands without applying them")

    return parser


def main(argv: list[str]) -> int:
    """Run the CLI and return a process exit code (0 on success, 1 on error)."""
    parser = build_parser()
    args = parser.parse_args(argv)
    client = VyOSClient(args.host, args.user, args.ssh_key, args.ssh_opts)

    try:
        if args.command == "status":
            status = collect_status(client)
            if args.json:
                print(json.dumps(asdict(status), indent=2, sort_keys=True))
            else:
                print_status(status)
            return 0

        run_transition(client, args.command, args.dry_run)
        return 0
    # CommandError subclasses RuntimeError; OSError covers a local ssh binary
    # that is missing or cannot be executed.
    except (RuntimeError, OSError) as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 1


if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))