Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .claude/skills/env-reference/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,13 @@ See `apps/api/.env.example` for the full list. Key variables:
- `NODE_AGENT_READY_TIMEOUT_MS` — Max wait for freshly provisioned node-agent health
- `NODE_AGENT_READY_POLL_INTERVAL_MS` — Polling interval for fresh-node readiness checks
- `HETZNER_API_TIMEOUT_MS` — Timeout for Hetzner Cloud API calls (default: 30000)
- `HETZNER_API_RETRY_MAX_ATTEMPTS` — Retry budget for transient Hetzner API calls (default: 3)
- `HETZNER_API_RETRY_BASE_DELAY_MS` — Base delay for transient Hetzner API retry backoff (default: 1000)
- `HETZNER_API_RETRY_MAX_DELAY_MS` — Max delay for transient Hetzner API retry backoff (default: 10000)
- `HETZNER_PLACEMENT_RETRY_DELAY_MS` — Delay before retrying Hetzner primary-location placement failures (default: 3000)
- `HETZNER_PLACEMENT_RETRY_ATTEMPTS` — Number of primary-location placement attempts before fallback locations (default: 2)
- `HETZNER_PLACEMENT_FALLBACK_ENABLED` — Whether Hetzner provisioning tries alternate locations after primary placement failures (default: true)
- `HETZNER_PLACEMENT_FALLBACK_LOCATIONS` — Ordered, comma-separated list of Hetzner fallback locations (e.g. `hel1,nbg1,ash,hil`)
- `CF_API_TIMEOUT_MS` — Timeout for Cloudflare DNS API calls (default: 30000)
- `NODE_AGENT_REQUEST_TIMEOUT_MS` — Timeout for Node Agent HTTP requests (default: 30000)

Expand Down
20 changes: 20 additions & 0 deletions apps/api/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -210,13 +210,33 @@ BASE_DOMAIN=workspaces.example.com

# External API timeouts, retries, and placement fallback (durations in milliseconds; timeouts default to 30000)
# HETZNER_API_TIMEOUT_MS=30000
# HETZNER_API_RETRY_MAX_ATTEMPTS=3
# HETZNER_API_RETRY_BASE_DELAY_MS=1000
# HETZNER_API_RETRY_MAX_DELAY_MS=10000
# HETZNER_PLACEMENT_RETRY_DELAY_MS=3000
# HETZNER_PLACEMENT_RETRY_ATTEMPTS=2
# HETZNER_PLACEMENT_FALLBACK_ENABLED=true
# HETZNER_PLACEMENT_FALLBACK_LOCATIONS=hel1,nbg1,ash,hil
# CF_API_TIMEOUT_MS=30000
# CF_API_RETRY_MAX_ATTEMPTS=3
# CF_API_RETRY_BASE_DELAY_MS=1000
# CF_API_RETRY_MAX_DELAY_MS=30000
# NODE_AGENT_REQUEST_TIMEOUT_MS=30000
# NODE_AGENT_REQUEST_RETRY_MAX_ATTEMPTS=2
# NODE_AGENT_REQUEST_RETRY_BASE_DELAY_MS=1000
# NODE_AGENT_REQUEST_RETRY_MAX_DELAY_MS=10000

# Workspace readiness (TaskRunner DO)
# TASK_RUNNER_WORKSPACE_READY_TIMEOUT_MS=1800000 # 30 minutes — max time for workspace-ready callback
# TASK_RUNNER_WORKSPACE_READY_POLL_INTERVAL_MS=30000 # 30 seconds — D1 poll interval during workspace_ready step
# PROVISIONING_TIMEOUT_MS=1800000 # 30 minutes — cron marks 'creating' workspaces as error after this
# TASK_RUNNER_STEP_MAX_RETRIES=3 # Global per-step retry budget; used when no positive per-step override below is set
# TASK_RUNNER_NODE_SELECTION_MAX_RETRIES= # Per-step override: node_selection
# TASK_RUNNER_NODE_PROVISIONING_MAX_RETRIES= # Per-step override: node_provisioning
# TASK_RUNNER_NODE_AGENT_READY_MAX_RETRIES= # Per-step override: node_agent_ready
# TASK_RUNNER_WORKSPACE_CREATION_MAX_RETRIES= # Per-step override: workspace_creation
# TASK_RUNNER_WORKSPACE_READY_MAX_RETRIES= # Per-step override: workspace_ready
# TASK_RUNNER_AGENT_SESSION_MAX_RETRIES= # Per-step override: agent_session

# Docker daemon DNS for devcontainer builds (comma-separated quoted IPs)
# DOCKER_DNS_SERVERS="1.1.1.1", "8.8.8.8"
Expand Down
5 changes: 5 additions & 0 deletions apps/api/src/durable-objects/task-runner/helpers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ export function isTransientError(err: unknown): boolean {
return false;
}

// Honor explicit retryable metadata (e.g. from provider libraries)
const retryable = (err as Error & { retryable?: boolean }).retryable;
if (retryable === true) return true;
if (retryable === false) return false;

const msg = err.message.toLowerCase();

// Network / timeout errors — always transient
Expand Down
24 changes: 22 additions & 2 deletions apps/api/src/durable-objects/task-runner/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ export class TaskRunner extends DurableObject<Env> {
durationMs,
});

if (isTransientError(err) && state.retryCount < this.getMaxRetries()) {
if (isTransientError(err) && state.retryCount < this.getMaxRetries(state.currentStep)) {
// Transient failure — retry with backoff
state.retryCount++;
await this.ctx.storage.put('state', state);
Expand Down Expand Up @@ -275,13 +275,33 @@ export class TaskRunner extends DurableObject<Env> {
// Configuration (all configurable via env vars — Constitution Principle XI)
// =========================================================================

private getMaxRetries(): number {
/**
 * Resolves the retry budget for a step.
 *
 * When `step` is given and has a per-step env key, that variable is parsed
 * first and used if it yields a positive number. In every other case the
 * global `TASK_RUNNER_STEP_MAX_RETRIES` setting (or its default) is returned.
 *
 * @param step - Optional execution step whose override should be consulted.
 * @returns The maximum number of retries allowed.
 */
private getMaxRetries(step?: TaskExecutionStep): number {
  const overrideKey = step ? this.getPerStepEnvKey(step) : null;

  if (overrideKey) {
    // Env is read by dynamic key here, so it is viewed as a plain string map.
    const rawEnv = this.env as unknown as Record<string, string | undefined>;
    const stepBudget = parseEnvInt(rawEnv[overrideKey], 0);
    // Only a positive value counts as an override; anything else falls through.
    if (stepBudget > 0) {
      return stepBudget;
    }
  }

  const globalBudget = parseEnvInt(
    this.env.TASK_RUNNER_STEP_MAX_RETRIES,
    DEFAULT_TASK_RUNNER_STEP_MAX_RETRIES,
  );
  return globalBudget;
}

/**
 * Maps an execution step to the name of the env var that holds its
 * per-step retry override.
 *
 * @param step - The execution step to look up.
 * @returns The env var name, or `null` when the step has no override key.
 */
private getPerStepEnvKey(step: TaskExecutionStep): string | null {
  switch (step) {
    case 'node_selection':
      return 'TASK_RUNNER_NODE_SELECTION_MAX_RETRIES';
    case 'node_provisioning':
      return 'TASK_RUNNER_NODE_PROVISIONING_MAX_RETRIES';
    case 'node_agent_ready':
      return 'TASK_RUNNER_NODE_AGENT_READY_MAX_RETRIES';
    case 'workspace_creation':
      return 'TASK_RUNNER_WORKSPACE_CREATION_MAX_RETRIES';
    case 'workspace_ready':
      return 'TASK_RUNNER_WORKSPACE_READY_MAX_RETRIES';
    case 'agent_session':
      return 'TASK_RUNNER_AGENT_SESSION_MAX_RETRIES';
    default:
      // Steps without a dedicated override use the global budget.
      return null;
  }
}

private getRetryBaseDelayMs(): number {
return parseEnvInt(
this.env.TASK_RUNNER_RETRY_BASE_DELAY_MS,
Expand Down
19 changes: 19 additions & 0 deletions apps/api/src/env.ts
Original file line number Diff line number Diff line change
Expand Up @@ -196,8 +196,21 @@ export interface Env {
HETZNER_BASE_IMAGE?: string;
// External API timeouts, retries, and placement fallback (durations in milliseconds)
HETZNER_API_TIMEOUT_MS?: string;
HETZNER_API_RETRY_MAX_ATTEMPTS?: string;
HETZNER_API_RETRY_BASE_DELAY_MS?: string;
HETZNER_API_RETRY_MAX_DELAY_MS?: string;
HETZNER_PLACEMENT_RETRY_DELAY_MS?: string;
HETZNER_PLACEMENT_RETRY_ATTEMPTS?: string;
HETZNER_PLACEMENT_FALLBACK_ENABLED?: string;
HETZNER_PLACEMENT_FALLBACK_LOCATIONS?: string;
CF_API_TIMEOUT_MS?: string;
CF_API_RETRY_MAX_ATTEMPTS?: string;
CF_API_RETRY_BASE_DELAY_MS?: string;
CF_API_RETRY_MAX_DELAY_MS?: string;
NODE_AGENT_REQUEST_TIMEOUT_MS?: string;
NODE_AGENT_REQUEST_RETRY_MAX_ATTEMPTS?: string;
NODE_AGENT_REQUEST_RETRY_BASE_DELAY_MS?: string;
NODE_AGENT_REQUEST_RETRY_MAX_DELAY_MS?: string;
// Project data DO limits
CACHED_COMMANDS_MAX_PER_AGENT?: string;
CACHED_COMMANDS_MAX_AGENT_TYPE_LENGTH?: string;
Expand Down Expand Up @@ -240,6 +253,12 @@ export interface Env {
HEARTBEAT_ACP_SWEEP_TIMEOUT_MS?: string;
// TaskRunner DO configuration (TDF-2: alarm-driven orchestration)
TASK_RUNNER_STEP_MAX_RETRIES?: string;
TASK_RUNNER_NODE_SELECTION_MAX_RETRIES?: string;
TASK_RUNNER_NODE_PROVISIONING_MAX_RETRIES?: string;
TASK_RUNNER_NODE_AGENT_READY_MAX_RETRIES?: string;
TASK_RUNNER_WORKSPACE_CREATION_MAX_RETRIES?: string;
TASK_RUNNER_WORKSPACE_READY_MAX_RETRIES?: string;
TASK_RUNNER_AGENT_SESSION_MAX_RETRIES?: string;
TASK_RUNNER_RETRY_BASE_DELAY_MS?: string;
TASK_RUNNER_RETRY_MAX_DELAY_MS?: string;
TASK_RUNNER_AGENT_POLL_INTERVAL_MS?: string;
Expand Down
Loading
Loading