Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions osdc/base/node-compactor/scripts/python/compactor.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,13 @@ def handle_signal(signum, frame):
with m.reconcile_duration_seconds.time():
reconcile(client, cfg, taint_times, fleet_cooldown_times)
m.reconcile_cycles_total.labels(status="success").inc()
except ApiError as e:
m.reconcile_cycles_total.labels(status="error").inc()
if e.status.code == 401:
log.warning("Got 401 Unauthorized — recreating client (SA token likely rotated by kubelet)")
client = Client()
else:
log.exception("Reconciliation failed (will retry next cycle)")
except Exception:
m.reconcile_cycles_total.labels(status="error").inc()
log.exception("Reconciliation failed (will retry next cycle)")
Expand Down
44 changes: 44 additions & 0 deletions osdc/base/node-compactor/scripts/python/test_compactor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2037,6 +2037,50 @@ def set_shutdown(*args, **kwargs):
assert result == 0
mock_cleanup_reservations.assert_called_once()

@patch("compactor.cleanup_stale_taints")
@patch("compactor.reconcile")
@patch("compactor.Client")
@patch("compactor.Config.from_env")
@patch("compactor.signal.signal")
@patch("compactor.time.sleep")
def test_main_recreates_client_on_401(
    self,
    mock_sleep,
    mock_signal_fn,
    mock_from_env,
    mock_client_cls,
    mock_reconcile,
    mock_cleanup,
    _mock_http_server,
):
    """401 Unauthorized triggers client recreation (SA token rotation).

    The first ``reconcile`` call raises a 401 ``ApiError``; the second call
    requests shutdown through the registered SIGTERM handler. The client
    passed to each call is recorded and checked only after ``main()``
    returns: the loop under test catches ``Exception``, so an ``assert``
    raised inside the side effect would be swallowed and, because it would
    fire before the shutdown request, leave the loop running forever
    instead of failing the test.
    """
    mock_from_env.return_value = make_config()
    original_client = MagicMock(name="original_client")
    refreshed_client = MagicMock(name="refreshed_client")
    # First Client() call yields the initial client, second the replacement.
    mock_client_cls.side_effect = [original_client, refreshed_client]

    clients_seen = []

    def reconcile_side_effect(client, *args, **kwargs):
        clients_seen.append(client)
        if len(clients_seen) == 1:
            # First call: raise 401 to trigger client recreation.
            raise _make_api_error(401)
        # Any later call: always request shutdown so the loop terminates
        # even when the wrong client was passed in.
        for c in mock_signal_fn.call_args_list:
            if c[0][0] == signal.SIGTERM:
                c[0][1](signal.SIGTERM, None)

    mock_reconcile.side_effect = reconcile_side_effect

    result = main()

    assert result == 0
    assert mock_client_cls.call_count == 2
    assert mock_reconcile.call_count == 2
    # Identity check performed outside the loop's broad exception handler.
    assert clients_seen[1] is refreshed_client


# ============================================================================
# compute_taints -- rate limiting tests
Expand Down
2 changes: 2 additions & 0 deletions osdc/clusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ defaults:
gpu_consolidate_after: "20m"
cpu_consolidate_after: "20m"
baremetal_consolidate_after: "1h"
gpu_disruption_budget: "20%"
cpu_disruption_budget: "20%"
buildkit:
replicas_per_arch: 4
monitoring:
Expand Down
2 changes: 1 addition & 1 deletion osdc/modules/arc-runners/templates/runner.yaml.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ template:
# Default is 600s (10 min), which is exceeded when node provisioning +
# git-cache sync takes longer than expected under concurrent load.
- name: ACTIONS_RUNNER_PREPARE_JOB_TIMEOUT_SECONDS
value: "900"
value: "1500"
# Wait for startup taints to clear before creating workflow pods.
# Prevents Karpenter-scheduler deadlock on fresh nodes where the
# runner tolerates the taint but the workflow pod does not.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ resources:
- arc-alerts.yaml
- infrastructure-alerts.yaml
- gpu-alerts.yaml
- node-compactor-alerts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# PrometheusRule defining a single alert on persistent node-compactor
# reconciliation failures.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: node-compactor-alerts
namespace: monitoring
labels:
app.kubernetes.io/part-of: osdc-monitoring
spec:
groups:
- name: node-compactor
rules:
- alert: NodeCompactorReconcileErrors
# True while the 5-minute rate of reconcile cycles labelled status="error"
# is non-zero, i.e. at least one failed cycle inside the trailing window.
# NOTE(review): presumably this is the counter the compactor increments via
# reconcile_cycles_total.labels(status="error") — confirm the exported name.
expr: rate(node_compactor_reconcile_cycles_total{status="error"}[5m]) > 0
# The expression must stay true for 15 minutes before the alert fires, so
# isolated failures that recover on a later cycle do not page.
for: 15m
labels:
severity: critical
annotations:
summary: "Node compactor reconciliation is failing persistently"
# NOTE(review): {{ $labels.namespace }} renders empty unless the scraped
# series carries a `namespace` label — verify against the scrape config.
description: "The node-compactor in namespace {{ $labels.namespace }} has had continuous reconciliation errors for 15 minutes. Burst-absorption (untainting nodes under scheduling pressure) is offline. Check for expired SA tokens (401) or RBAC issues."
Loading