From 3df5636fba36b35baa75a3e4b43b06298f011494 Mon Sep 17 00:00:00 2001
From: Jean Schmidt
Date: Sun, 12 Apr 2026 19:23:35 -0700
Subject: [PATCH] 20260412192335

Signed-off-by: Jean Schmidt
---
Reviewer notes (commentary only; not part of the commit message or the diffs):

* compactor.py: an ApiError raised by reconcile() is now handled before the
  generic handler. Status code 401 logs a warning and replaces `client` with
  a fresh Client(); any other ApiError is logged as before. Both paths still
  increment reconcile_cycles_total{status="error"}.
* test_compactor.py: adds test_main_recreates_client_on_401. The reconcile
  callback only records the client it was given and always requests shutdown
  on the second call; the client identity/order check runs after main()
  returns. Asserting inside the callback would be caught by main()'s
  `except Exception`, so a regression would loop forever instead of failing.
  The hunk still adds 44 lines; the post-image blob id on that file's index
  line was NOT recomputed.
* clusters.yaml: adds gpu_disruption_budget / cpu_disruption_budget ("20%").
* runner.yaml.tpl: ACTIONS_RUNNER_PREPARE_JOB_TIMEOUT_SECONDS 900 -> 1500.
* monitoring: new PrometheusRule NodeCompactorReconcileErrors, registered in
  kustomization.yaml.
* NOTE(review): leading indentation of the hunk lines below was reconstructed
  from syntax because the received copy had its whitespace collapsed. Confirm
  it against the target tree before applying.

 .../scripts/python/compactor.py               |  7 +++
 .../scripts/python/test_compactor.py          | 44 +++++++++++++++++++
 osdc/clusters.yaml                            |  2 +
 .../arc-runners/templates/runner.yaml.tpl     |  2 +-
 .../kubernetes/alerts/kustomization.yaml      |  1 +
 .../alerts/node-compactor-alerts.yaml         | 19 ++++++++
 6 files changed, 74 insertions(+), 1 deletion(-)
 create mode 100644 osdc/modules/monitoring/kubernetes/alerts/node-compactor-alerts.yaml

diff --git a/osdc/base/node-compactor/scripts/python/compactor.py b/osdc/base/node-compactor/scripts/python/compactor.py
index 381c487a..84869032 100644
--- a/osdc/base/node-compactor/scripts/python/compactor.py
+++ b/osdc/base/node-compactor/scripts/python/compactor.py
@@ -384,6 +384,13 @@ def handle_signal(signum, frame):
             with m.reconcile_duration_seconds.time():
                 reconcile(client, cfg, taint_times, fleet_cooldown_times)
             m.reconcile_cycles_total.labels(status="success").inc()
+        except ApiError as e:
+            m.reconcile_cycles_total.labels(status="error").inc()
+            if e.status.code == 401:
+                log.warning("Got 401 Unauthorized — recreating client (SA token likely rotated by kubelet)")
+                client = Client()
+            else:
+                log.exception("Reconciliation failed (will retry next cycle)")
         except Exception:
             m.reconcile_cycles_total.labels(status="error").inc()
             log.exception("Reconciliation failed (will retry next cycle)")
diff --git a/osdc/base/node-compactor/scripts/python/test_compactor.py b/osdc/base/node-compactor/scripts/python/test_compactor.py
index c9623ae9..d78f50d3 100644
--- a/osdc/base/node-compactor/scripts/python/test_compactor.py
+++ b/osdc/base/node-compactor/scripts/python/test_compactor.py
@@ -2037,6 +2037,50 @@ def set_shutdown(*args, **kwargs):
         assert result == 0
         mock_cleanup_reservations.assert_called_once()
 
+    @patch("compactor.cleanup_stale_taints")
+    @patch("compactor.reconcile")
+    @patch("compactor.Client")
+    @patch("compactor.Config.from_env")
+    @patch("compactor.signal.signal")
+    @patch("compactor.time.sleep")
+    def test_main_recreates_client_on_401(
+        self,
+        mock_sleep,
+        mock_signal_fn,
+        mock_from_env,
+        mock_client_cls,
+        mock_reconcile,
+        mock_cleanup,
+        _mock_http_server,
+    ):
+        """401 Unauthorized triggers client recreation (SA token rotation)."""
+        mock_from_env.return_value = make_config()
+        original_client = MagicMock(name="original_client")
+        refreshed_client = MagicMock(name="refreshed_client")
+        mock_client_cls.side_effect = [original_client, refreshed_client]
+
+        seen_clients = []
+
+        def reconcile_side_effect(client, *args, **kwargs):
+            seen_clients.append(client)
+            if len(seen_clients) == 1:
+                # First call: raise 401 to trigger client recreation
+                raise _make_api_error(401)
+            # Later calls: only request shutdown. An assert here would be caught
+            # by main()'s `except Exception` and the loop would never stop.
+            for c in mock_signal_fn.call_args_list:
+                if c[0][0] == signal.SIGTERM:
+                    c[0][1](signal.SIGTERM, None)
+
+        mock_reconcile.side_effect = reconcile_side_effect
+
+        result = main()
+
+        assert result == 0
+        assert mock_client_cls.call_count == 2
+        assert mock_reconcile.call_count == 2
+        assert seen_clients == [original_client, refreshed_client]
+
 
 # ============================================================================
 # compute_taints -- rate limiting tests
diff --git a/osdc/clusters.yaml b/osdc/clusters.yaml
index c3cbdbe1..6176de91 100644
--- a/osdc/clusters.yaml
+++ b/osdc/clusters.yaml
@@ -67,6 +67,8 @@ defaults:
     gpu_consolidate_after: "20m"
     cpu_consolidate_after: "20m"
     baremetal_consolidate_after: "1h"
+    gpu_disruption_budget: "20%"
+    cpu_disruption_budget: "20%"
   buildkit:
     replicas_per_arch: 4
   monitoring:
diff --git a/osdc/modules/arc-runners/templates/runner.yaml.tpl b/osdc/modules/arc-runners/templates/runner.yaml.tpl
index 76e0031a..dfbcc6a0 100644
--- a/osdc/modules/arc-runners/templates/runner.yaml.tpl
+++ b/osdc/modules/arc-runners/templates/runner.yaml.tpl
@@ -155,7 +155,7 @@ template:
           # Default is 600s (10 min), which is exceeded when node provisioning +
           # git-cache sync takes longer than expected under concurrent load.
           - name: ACTIONS_RUNNER_PREPARE_JOB_TIMEOUT_SECONDS
-            value: "900"
+            value: "1500"
           # Wait for startup taints to clear before creating workflow pods.
           # Prevents Karpenter-scheduler deadlock on fresh nodes where the
           # runner tolerates the taint but the workflow pod does not.
diff --git a/osdc/modules/monitoring/kubernetes/alerts/kustomization.yaml b/osdc/modules/monitoring/kubernetes/alerts/kustomization.yaml
index e60668cb..76934f04 100644
--- a/osdc/modules/monitoring/kubernetes/alerts/kustomization.yaml
+++ b/osdc/modules/monitoring/kubernetes/alerts/kustomization.yaml
@@ -5,3 +5,4 @@ resources:
   - arc-alerts.yaml
   - infrastructure-alerts.yaml
   - gpu-alerts.yaml
+  - node-compactor-alerts.yaml
diff --git a/osdc/modules/monitoring/kubernetes/alerts/node-compactor-alerts.yaml b/osdc/modules/monitoring/kubernetes/alerts/node-compactor-alerts.yaml
new file mode 100644
index 00000000..91529793
--- /dev/null
+++ b/osdc/modules/monitoring/kubernetes/alerts/node-compactor-alerts.yaml
@@ -0,0 +1,19 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: node-compactor-alerts
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/part-of: osdc-monitoring
+spec:
+  groups:
+    - name: node-compactor
+      rules:
+        - alert: NodeCompactorReconcileErrors
+          expr: rate(node_compactor_reconcile_cycles_total{status="error"}[5m]) > 0
+          for: 15m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Node compactor reconciliation is failing persistently"
+            description: "The node-compactor in namespace {{ $labels.namespace }} has had continuous reconciliation errors for 15 minutes. Burst-absorption (untainting nodes under scheduling pressure) is offline. Check for expired SA tokens (401) or RBAC issues."