diff --git a/.claude/skills/env-reference/SKILL.md b/.claude/skills/env-reference/SKILL.md index e46579f3a..ae07a7fe9 100644 --- a/.claude/skills/env-reference/SKILL.md +++ b/.claude/skills/env-reference/SKILL.md @@ -78,6 +78,13 @@ See `apps/api/.env.example` for the full list. Key variables: - `NODE_AGENT_READY_TIMEOUT_MS` — Max wait for freshly provisioned node-agent health - `NODE_AGENT_READY_POLL_INTERVAL_MS` — Polling interval for fresh-node readiness checks - `HETZNER_API_TIMEOUT_MS` — Timeout for Hetzner Cloud API calls (default: 30000) +- `HETZNER_API_RETRY_MAX_ATTEMPTS` — Retry budget for transient Hetzner API calls (default: 3) +- `HETZNER_API_RETRY_BASE_DELAY_MS` — Base delay for transient Hetzner API retry backoff (default: 1000) +- `HETZNER_API_RETRY_MAX_DELAY_MS` — Max delay for transient Hetzner API retry backoff (default: 10000) +- `HETZNER_PLACEMENT_RETRY_DELAY_MS` — Delay before retrying Hetzner primary-location placement failures (default: 3000) +- `HETZNER_PLACEMENT_RETRY_ATTEMPTS` — Number of primary-location placement attempts before fallback locations (default: 2) +- `HETZNER_PLACEMENT_FALLBACK_ENABLED` — Whether Hetzner provisioning tries alternate locations after primary placement failures (default: true) +- `HETZNER_PLACEMENT_FALLBACK_LOCATIONS` — Ordered comma-separated Hetzner fallback locations - `CF_API_TIMEOUT_MS` — Timeout for Cloudflare DNS API calls (default: 30000) - `NODE_AGENT_REQUEST_TIMEOUT_MS` — Timeout for Node Agent HTTP requests (default: 30000) diff --git a/apps/api/.env.example b/apps/api/.env.example index 776f34392..48ca14f21 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -210,13 +210,33 @@ BASE_DOMAIN=workspaces.example.com # External API timeouts (milliseconds, default: 30000) # HETZNER_API_TIMEOUT_MS=30000 +# HETZNER_API_RETRY_MAX_ATTEMPTS=3 +# HETZNER_API_RETRY_BASE_DELAY_MS=1000 +# HETZNER_API_RETRY_MAX_DELAY_MS=10000 +# HETZNER_PLACEMENT_RETRY_DELAY_MS=3000 +# HETZNER_PLACEMENT_RETRY_ATTEMPTS=2 +# HETZNER_PLACEMENT_FALLBACK_ENABLED=true +# HETZNER_PLACEMENT_FALLBACK_LOCATIONS=hel1,nbg1,ash,hil # CF_API_TIMEOUT_MS=30000 +# CF_API_RETRY_MAX_ATTEMPTS=3 +# CF_API_RETRY_BASE_DELAY_MS=1000 +# CF_API_RETRY_MAX_DELAY_MS=30000 # NODE_AGENT_REQUEST_TIMEOUT_MS=30000 +# NODE_AGENT_REQUEST_RETRY_MAX_ATTEMPTS=2 +# NODE_AGENT_REQUEST_RETRY_BASE_DELAY_MS=1000 +# NODE_AGENT_REQUEST_RETRY_MAX_DELAY_MS=10000 # Workspace readiness (TaskRunner DO) # TASK_RUNNER_WORKSPACE_READY_TIMEOUT_MS=1800000 # 30 minutes — max time for workspace-ready callback # TASK_RUNNER_WORKSPACE_READY_POLL_INTERVAL_MS=30000 # 30 seconds — D1 poll interval during workspace_ready step # PROVISIONING_TIMEOUT_MS=1800000 # 30 minutes — cron marks 'creating' workspaces as error after this +# TASK_RUNNER_STEP_MAX_RETRIES=3 # Global per-step retry budget +# TASK_RUNNER_NODE_SELECTION_MAX_RETRIES= # Per-step override: node_selection +# TASK_RUNNER_NODE_PROVISIONING_MAX_RETRIES= # Per-step override: node_provisioning +# TASK_RUNNER_NODE_AGENT_READY_MAX_RETRIES= # Per-step override: node_agent_ready +# TASK_RUNNER_WORKSPACE_CREATION_MAX_RETRIES= # Per-step override: workspace_creation +# TASK_RUNNER_WORKSPACE_READY_MAX_RETRIES= # Per-step override: workspace_ready +# TASK_RUNNER_AGENT_SESSION_MAX_RETRIES= # Per-step override: agent_session # Docker daemon DNS for devcontainer builds (comma-separated quoted IPs) # DOCKER_DNS_SERVERS="1.1.1.1", "8.8.8.8" diff --git a/apps/api/src/durable-objects/task-runner/helpers.ts b/apps/api/src/durable-objects/task-runner/helpers.ts index 696f1e4d7..249f05f5d 100644 --- a/apps/api/src/durable-objects/task-runner/helpers.ts +++ b/apps/api/src/durable-objects/task-runner/helpers.ts @@ -34,6 +34,11 @@ export function isTransientError(err: unknown): boolean { return false; } + // Honor explicit retryable metadata (e.g. from provider libraries) + const retryable = (err as Error & { retryable?: boolean }).retryable; + if (retryable === true) return true; + if (retryable === false) return false; + const msg = err.message.toLowerCase(); // Network / timeout errors — always transient diff --git a/apps/api/src/durable-objects/task-runner/index.ts b/apps/api/src/durable-objects/task-runner/index.ts index c960d48e3..60481da49 100644 --- a/apps/api/src/durable-objects/task-runner/index.ts +++ b/apps/api/src/durable-objects/task-runner/index.ts @@ -205,7 +205,7 @@ export class TaskRunner extends DurableObject { durationMs, }); - if (isTransientError(err) && state.retryCount < this.getMaxRetries()) { + if (isTransientError(err) && state.retryCount < this.getMaxRetries(state.currentStep)) { // Transient failure — retry with backoff state.retryCount++; await this.ctx.storage.put('state', state); @@ -275,13 +275,33 @@ export class TaskRunner extends DurableObject { // Configuration (all configurable via env vars — Constitution Principle XI) // ========================================================================= - private getMaxRetries(): number { + private getMaxRetries(step?: TaskExecutionStep): number { + // Per-step env override takes precedence over the global setting + if (step) { + const envKey = this.getPerStepEnvKey(step); + if (envKey) { + const perStep = parseEnvInt((this.env as unknown as Record)[envKey], 0); + if (perStep > 0) return perStep; + } + } return parseEnvInt( this.env.TASK_RUNNER_STEP_MAX_RETRIES, DEFAULT_TASK_RUNNER_STEP_MAX_RETRIES, ); } + private getPerStepEnvKey(step: TaskExecutionStep): string | null { + const map: Partial> = { + node_selection: 'TASK_RUNNER_NODE_SELECTION_MAX_RETRIES', + node_provisioning: 'TASK_RUNNER_NODE_PROVISIONING_MAX_RETRIES', + node_agent_ready: 'TASK_RUNNER_NODE_AGENT_READY_MAX_RETRIES', + workspace_creation: 'TASK_RUNNER_WORKSPACE_CREATION_MAX_RETRIES', + workspace_ready: 'TASK_RUNNER_WORKSPACE_READY_MAX_RETRIES', + agent_session: 'TASK_RUNNER_AGENT_SESSION_MAX_RETRIES', + }; + return map[step] ?? null; + } + private getRetryBaseDelayMs(): number { return parseEnvInt( this.env.TASK_RUNNER_RETRY_BASE_DELAY_MS, diff --git a/apps/api/src/env.ts b/apps/api/src/env.ts index 1b0b44cce..34b371496 100644 --- a/apps/api/src/env.ts +++ b/apps/api/src/env.ts @@ -196,8 +196,21 @@ export interface Env { HETZNER_BASE_IMAGE?: string; // External API timeouts (milliseconds) HETZNER_API_TIMEOUT_MS?: string; + HETZNER_API_RETRY_MAX_ATTEMPTS?: string; + HETZNER_API_RETRY_BASE_DELAY_MS?: string; + HETZNER_API_RETRY_MAX_DELAY_MS?: string; + HETZNER_PLACEMENT_RETRY_DELAY_MS?: string; + HETZNER_PLACEMENT_RETRY_ATTEMPTS?: string; + HETZNER_PLACEMENT_FALLBACK_ENABLED?: string; + HETZNER_PLACEMENT_FALLBACK_LOCATIONS?: string; CF_API_TIMEOUT_MS?: string; + CF_API_RETRY_MAX_ATTEMPTS?: string; + CF_API_RETRY_BASE_DELAY_MS?: string; + CF_API_RETRY_MAX_DELAY_MS?: string; NODE_AGENT_REQUEST_TIMEOUT_MS?: string; + NODE_AGENT_REQUEST_RETRY_MAX_ATTEMPTS?: string; + NODE_AGENT_REQUEST_RETRY_BASE_DELAY_MS?: string; + NODE_AGENT_REQUEST_RETRY_MAX_DELAY_MS?: string; // Project data DO limits CACHED_COMMANDS_MAX_PER_AGENT?: string; CACHED_COMMANDS_MAX_AGENT_TYPE_LENGTH?: string; @@ -240,6 +253,12 @@ export interface Env { HEARTBEAT_ACP_SWEEP_TIMEOUT_MS?: string; // TaskRunner DO configuration (TDF-2: alarm-driven orchestration) TASK_RUNNER_STEP_MAX_RETRIES?: string; + TASK_RUNNER_NODE_SELECTION_MAX_RETRIES?: string; + TASK_RUNNER_NODE_PROVISIONING_MAX_RETRIES?: string; + TASK_RUNNER_NODE_AGENT_READY_MAX_RETRIES?: string; + TASK_RUNNER_WORKSPACE_CREATION_MAX_RETRIES?: string; + TASK_RUNNER_WORKSPACE_READY_MAX_RETRIES?: string; + TASK_RUNNER_AGENT_SESSION_MAX_RETRIES?: string; TASK_RUNNER_RETRY_BASE_DELAY_MS?: string; TASK_RUNNER_RETRY_MAX_DELAY_MS?: string; TASK_RUNNER_AGENT_POLL_INTERVAL_MS?: string; diff --git a/apps/api/src/services/dns.ts b/apps/api/src/services/dns.ts index 306d78835..956c5b35f 100644 --- a/apps/api/src/services/dns.ts +++ b/apps/api/src/services/dns.ts @@ -1,6 +1,7 @@ import type { Env } from '../env'; import { log } from '../lib/logger'; -import { fetchWithTimeout, getTimeoutMs } from './fetch-timeout'; +import type { FetchRetryOptions } from './fetch-timeout'; +import { fetchWithTimeoutAndRetry, getRetryDelayMs, getRetryMaxAttempts, getTimeoutMs } from './fetch-timeout'; const CLOUDFLARE_API_BASE = 'https://api.cloudflare.com/client/v4'; @@ -10,6 +11,23 @@ const DEFAULT_DNS_TTL = 60; /** Default timeout for Cloudflare API calls (per Constitution Principle XI) */ const DEFAULT_CF_API_TIMEOUT_MS = 30_000; +/** Default retry config for CF API */ +const DEFAULT_CF_API_RETRY_MAX_ATTEMPTS = 3; +const DEFAULT_CF_API_RETRY_BASE_DELAY_MS = 1_000; +const DEFAULT_CF_API_RETRY_MAX_DELAY_MS = 30_000; + +/** + * Build retry options from env. + */ +function cfRetryOptions(env: Env): FetchRetryOptions { + return { + timeoutMs: getTimeoutMs(env.CF_API_TIMEOUT_MS, DEFAULT_CF_API_TIMEOUT_MS), + maxAttempts: getRetryMaxAttempts(env.CF_API_RETRY_MAX_ATTEMPTS, DEFAULT_CF_API_RETRY_MAX_ATTEMPTS), + baseDelayMs: getRetryDelayMs(env.CF_API_RETRY_BASE_DELAY_MS, DEFAULT_CF_API_RETRY_BASE_DELAY_MS), + maxDelayMs: getRetryDelayMs(env.CF_API_RETRY_MAX_DELAY_MS, DEFAULT_CF_API_RETRY_MAX_DELAY_MS), + }; +} + /** * Get DNS TTL from env or use default (per constitution principle XI). */ @@ -79,16 +97,74 @@ export class DNSService implements DNSServiceInterface { } /** - * Create a DNS A record for a workspace. - * Uses Cloudflare proxy for automatic HTTPS. + * Search for an existing DNS A record by name. + * Returns the record if found, null otherwise. + */ +async function findExistingARecord( + recordName: string, + env: Env, +): Promise<{ id: string; content: string } | null> { + const retryOpts = cfRetryOptions(env); + const searchUrl = `${CLOUDFLARE_API_BASE}/zones/${env.CF_ZONE_ID}/dns_records?name=${encodeURIComponent(recordName)}&type=A`; + + const response = await fetchWithTimeoutAndRetry( + searchUrl, + { + headers: { + Authorization: `Bearer ${env.CF_API_TOKEN}`, + }, + }, + retryOpts, + ); + + if (!response.ok) return null; + + const data = await response.json() as { result: Array<{ id: string; content: string; type: string }> }; + const records = data.result || []; + const aRecord = records.find((r) => r.type === 'A'); + return aRecord ? { id: aRecord.id, content: aRecord.content } : null; +} + +/** + * Create or upsert a DNS A record for a workspace. + * Uses idempotent upsert: searches for existing matching record and PATCHes if found. + * If CREATE returns duplicate/already-exists, re-searches and PATCHes. */ export async function createDNSRecord( workspaceId: string, ip: string, env: Env ): Promise { - const timeoutMs = getTimeoutMs(env.CF_API_TIMEOUT_MS, DEFAULT_CF_API_TIMEOUT_MS); - const response = await fetchWithTimeout( + const recordName = `ws-${workspaceId}.${env.BASE_DOMAIN}`; + return upsertARecord(recordName, ip, true, env); +} + +/** + * Idempotent DNS A record upsert. + * 1. Search for existing record matching the name. + * 2. If found, PATCH it with the new IP. + * 3. If not found, POST to create. + * 4. If create returns duplicate, re-search and PATCH. + */ +async function upsertARecord( + recordName: string, + ip: string, + proxied: boolean, + env: Env, +): Promise { + const retryOpts = cfRetryOptions(env); + const ttl = getDnsTTL(env); + + // Step 1: Search for existing record + const existing = await findExistingARecord(recordName, env); + if (existing) { + log.info('dns.upsert_existing_found', { name: recordName, existingIp: existing.content, newIp: ip }); + await patchDNSRecordContent(existing.id, ip, env); + return existing.id; + } + + // Step 2: Attempt to create + const response = await fetchWithTimeoutAndRetry( `${CLOUDFLARE_API_BASE}/zones/${env.CF_ZONE_ID}/dns_records`, { method: 'POST', @@ -98,34 +174,78 @@ export async function createDNSRecord( }, body: JSON.stringify({ type: 'A', - name: `ws-${workspaceId}`, + name: recordName, content: ip, - ttl: getDnsTTL(env), // Configurable TTL (default 1 minute for fast updates) - proxied: true, // Enable Cloudflare proxy for HTTPS + ttl, + proxied, }), }, - timeoutMs + retryOpts, + ); + + if (response.ok) { + const data = await response.json() as { result: { id: string } }; + return data.result.id; + } + + // Step 3: If duplicate/already-exists, re-search and PATCH + const errorBody = await response.json().catch(() => ({})) as { errors?: Array<{ code: number; message: string }> }; + const isDuplicate = errorBody.errors?.some( + (e) => e.code === 81057 || e.message?.toLowerCase().includes('already exists'), + ); + + if (isDuplicate) { + log.info('dns.create_duplicate_fallback', { name: recordName }); + const retryExisting = await findExistingARecord(recordName, env); + if (retryExisting) { + await patchDNSRecordContent(retryExisting.id, ip, env); + return retryExisting.id; + } + } + + const message = errorBody.errors?.[0]?.message || `Failed to create DNS record: ${response.status}`; + throw new Error(message); +} + +/** + * PATCH a DNS record with a new IP address. + */ +async function patchDNSRecordContent( + recordId: string, + ip: string, + env: Env, +): Promise { + const retryOpts = cfRetryOptions(env); + const response = await fetchWithTimeoutAndRetry( + `${CLOUDFLARE_API_BASE}/zones/${env.CF_ZONE_ID}/dns_records/${recordId}`, + { + method: 'PATCH', + headers: { + Authorization: `Bearer ${env.CF_API_TOKEN}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ content: ip }), + }, + retryOpts, ); if (!response.ok) { const error = await response.json().catch(() => ({})) as { errors?: Array<{ message: string }> }; - const message = error.errors?.[0]?.message || `Failed to create DNS record: ${response.status}`; + const message = error.errors?.[0]?.message || `Failed to update DNS record: ${response.status}`; throw new Error(message); } - - const data = await response.json() as { result: { id: string } }; - return data.result.id; } /** * Delete a DNS record by ID. + * Idempotent: ignores 404 (record already deleted). */ export async function deleteDNSRecord( recordId: string, env: Env ): Promise { - const timeoutMs = getTimeoutMs(env.CF_API_TIMEOUT_MS, DEFAULT_CF_API_TIMEOUT_MS); - const response = await fetchWithTimeout( + const retryOpts = cfRetryOptions(env); + const response = await fetchWithTimeoutAndRetry( `${CLOUDFLARE_API_BASE}/zones/${env.CF_ZONE_ID}/dns_records/${recordId}`, { method: 'DELETE', @@ -133,7 +253,7 @@ export async function deleteDNSRecord( Authorization: `Bearer ${env.CF_API_TOKEN}`, }, }, - timeoutMs + retryOpts, ); // Ignore 404 errors (record already deleted) @@ -152,27 +272,7 @@ export async function updateDNSRecord( ip: string, env: Env ): Promise { - const timeoutMs = getTimeoutMs(env.CF_API_TIMEOUT_MS, DEFAULT_CF_API_TIMEOUT_MS); - const response = await fetchWithTimeout( - `${CLOUDFLARE_API_BASE}/zones/${env.CF_ZONE_ID}/dns_records/${recordId}`, - { - method: 'PATCH', - headers: { - Authorization: `Bearer ${env.CF_API_TOKEN}`, - 'Content-Type': 'application/json', - }, - body: JSON.stringify({ - content: ip, - }), - }, - timeoutMs - ); - - if (!response.ok) { - const error = await response.json().catch(() => ({})) as { errors?: Array<{ message: string }> }; - const message = error.errors?.[0]?.message || `Failed to update DNS record: ${response.status}`; - throw new Error(message); - } + await patchDNSRecordContent(recordId, ip, env); } /** @@ -189,6 +289,7 @@ export async function cleanupWorkspaceDNSRecords( ): Promise { const baseDomain = env.BASE_DOMAIN; const id = workspaceId.toLowerCase(); + const retryOpts = cfRetryOptions(env); // Search for all possible DNS record name formats const recordNames = [ @@ -200,12 +301,11 @@ export async function cleanupWorkspaceDNSRecords( for (const recordName of recordNames) { const searchUrl = `${CLOUDFLARE_API_BASE}/zones/${env.CF_ZONE_ID}/dns_records?name=${encodeURIComponent(recordName)}`; - const cfTimeoutMs = getTimeoutMs(env.CF_API_TIMEOUT_MS, DEFAULT_CF_API_TIMEOUT_MS); - const response = await fetchWithTimeout(searchUrl, { + const response = await fetchWithTimeoutAndRetry(searchUrl, { headers: { Authorization: `Bearer ${env.CF_API_TOKEN}`, }, - }, cfTimeoutMs); + }, retryOpts); if (!response.ok) { log.error('dns.search_records_failed', { recordName, status: response.status }); @@ -248,47 +348,15 @@ export async function createBackendDNSRecord( /** * Create a proxied (orange-clouded) A record for a node VM backend. - * Cloudflare's edge handles TLS termination; the VM agent serves HTTPS - * with an Origin CA certificate that CF trusts. - * - * Uses {nodeId}.vm.{BASE_DOMAIN} (two-level subdomain) to bypass Cloudflare - * same-zone routing. The wildcard Worker route *.{domain}/* only matches - * single-level subdomains, so {nodeId}.vm.{domain} is NOT intercepted. - * This allows Worker subrequests (from DO alarms) to reach the VM directly. + * Uses idempotent upsert to recover from partial failures. */ export async function createNodeBackendDNSRecord( nodeId: string, ip: string, env: Env ): Promise { - const timeoutMs = getTimeoutMs(env.CF_API_TIMEOUT_MS, DEFAULT_CF_API_TIMEOUT_MS); - const response = await fetchWithTimeout( - `${CLOUDFLARE_API_BASE}/zones/${env.CF_ZONE_ID}/dns_records`, - { - method: 'POST', - headers: { - Authorization: `Bearer ${env.CF_API_TOKEN}`, - 'Content-Type': 'application/json', - }, - body: JSON.stringify({ - type: 'A', - name: `${nodeId.toLowerCase()}.vm`, - content: ip, - ttl: getDnsTTL(env), - proxied: true, // Orange-clouded — CF edge terminates TLS, re-encrypts to Origin CA - }), - }, - timeoutMs - ); - - if (!response.ok) { - const error = await response.json().catch(() => ({})) as { errors?: Array<{ message: string }> }; - const message = error.errors?.[0]?.message || `Failed to create backend DNS record: ${response.status}`; - throw new Error(message); - } - - const data = await response.json() as { result: { id: string } }; - return data.result.id; + const recordName = `${nodeId.toLowerCase()}.vm.${env.BASE_DOMAIN}`; + return upsertARecord(recordName, ip, true, env); } /** diff --git a/apps/api/src/services/fetch-timeout.ts b/apps/api/src/services/fetch-timeout.ts index 38fd2d65c..6ff94f1af 100644 --- a/apps/api/src/services/fetch-timeout.ts +++ b/apps/api/src/services/fetch-timeout.ts @@ -1,11 +1,18 @@ /** - * Fetch wrapper with configurable timeout. + * Fetch wrapper with configurable timeout and retry. * * Cloudflare Workers support `AbortController`/`AbortSignal` for fetch cancellation. - * This utility adds a timeout that aborts the request if it exceeds the specified duration. + * This utility adds a timeout that aborts the request if it exceeds the specified duration, + * and optional retry with bounded exponential backoff and jitter. */ const DEFAULT_API_TIMEOUT_MS = 30_000; +const DEFAULT_RETRY_MAX_ATTEMPTS = 3; +const DEFAULT_RETRY_BASE_DELAY_MS = 1_000; +const DEFAULT_RETRY_MAX_DELAY_MS = 30_000; + +/** HTTP status codes considered transient and safe to retry. */ +const RETRYABLE_STATUS_CODES = new Set([408, 425, 429, 500, 502, 503, 504]); /** * Parse a timeout from env or return the default. @@ -17,6 +24,98 @@ export function getTimeoutMs(envValue: string | undefined, defaultMs: number = D return parsed; } +/** + * Parse a retry-related env int or return the provided default. + */ +export function getRetryMaxAttempts(envValue: string | undefined, defaultVal: number = DEFAULT_RETRY_MAX_ATTEMPTS): number { + if (!envValue) return defaultVal; + const parsed = Number.parseInt(envValue, 10); + if (!Number.isFinite(parsed) || parsed < 0) return defaultVal; + return parsed; +} + +/** + * Parse a retry delay env int or return the provided default. + */ +export function getRetryDelayMs(envValue: string | undefined, defaultVal: number): number { + if (!envValue) return defaultVal; + const parsed = Number.parseInt(envValue, 10); + if (!Number.isFinite(parsed) || parsed < 0) return defaultVal; + return parsed; +} + +/** + * Compute bounded exponential backoff with jitter. + * + * delay = min(baseDelay * 2^attempt, maxDelay) + random jitter (0-25%) + */ +export function computeRetryDelayMs( + attempt: number, + baseDelayMs: number = DEFAULT_RETRY_BASE_DELAY_MS, + maxDelayMs: number = DEFAULT_RETRY_MAX_DELAY_MS, +): number { + const exponential = baseDelayMs * Math.pow(2, attempt); + const capped = Math.min(exponential, maxDelayMs); + const jitter = capped * Math.random() * 0.25; + return Math.floor(capped + jitter); +} + +/** Options for fetchWithTimeoutAndRetry. */ +export interface FetchRetryOptions { + /** Maximum number of retry attempts (0 = no retries). */ + maxAttempts?: number; + /** Base delay for exponential backoff in ms. */ + baseDelayMs?: number; + /** Maximum delay between retries in ms. */ + maxDelayMs?: number; + /** Per-request timeout in ms. */ + timeoutMs?: number; +} + +/** + * Determine whether an HTTP response or fetch error is retryable. + */ +function isRetryableResponse(response: Response): boolean { + return RETRYABLE_STATUS_CODES.has(response.status); +} + +function isRetryableError(err: unknown): boolean { + if (!(err instanceof Error)) return false; + const msg = err.message.toLowerCase(); + return ( + err.name === 'AbortError' || + msg.includes('timed out') || + msg.includes('timeout') || + msg.includes('fetch failed') || + msg.includes('network') || + msg.includes('econnrefused') || + msg.includes('econnreset') || + msg.includes('enotfound') || + msg.includes('socket hang up') + ); +} + +/** + * Parse the Retry-After header value into a delay in milliseconds. + * Supports both integer seconds and HTTP date formats. + * Returns undefined if the header is missing or unparseable. + */ +function parseRetryAfter(response: Response): number | undefined { + const header = response.headers.get('Retry-After'); + if (!header) return undefined; + const seconds = Number.parseInt(header, 10); + if (Number.isFinite(seconds) && seconds > 0) { + return seconds * 1000; + } + // Try HTTP date + const date = Date.parse(header); + if (Number.isFinite(date)) { + const delayMs = date - Date.now(); + return delayMs > 0 ? delayMs : undefined; + } + return undefined; +} + /** * Fetch with an automatic timeout. * Aborts the request if it doesn't complete within `timeoutMs`. @@ -50,3 +149,60 @@ export async function fetchWithTimeout( clearTimeout(timeoutId); } } + +/** + * Fetch with timeout and automatic retry on transient failures. + * + * Retries on: + * - HTTP status codes: 408, 425, 429, 500, 502, 503, 504 + * - Network/timeout errors (AbortError, fetch failed, ECONNREFUSED, etc.) + * + * Honors Retry-After headers when present. + * Uses bounded exponential backoff with random jitter. + */ +export async function fetchWithTimeoutAndRetry( + url: string | URL, + init?: RequestInit, + options?: FetchRetryOptions, +): Promise { + const maxAttempts = options?.maxAttempts ?? DEFAULT_RETRY_MAX_ATTEMPTS; + const baseDelayMs = options?.baseDelayMs ?? DEFAULT_RETRY_BASE_DELAY_MS; + const maxDelayMs = options?.maxDelayMs ?? DEFAULT_RETRY_MAX_DELAY_MS; + const timeoutMs = options?.timeoutMs ?? DEFAULT_API_TIMEOUT_MS; + + let lastError: Error | undefined; + let lastResponse: Response | undefined; + + for (let attempt = 0; attempt <= maxAttempts; attempt++) { + try { + const response = await fetchWithTimeout(url, init, timeoutMs); + + if (!isRetryableResponse(response) || attempt >= maxAttempts) { + return response; + } + + // Retryable status — wait and try again + lastResponse = response; + const retryAfterMs = parseRetryAfter(response); + const backoffMs = retryAfterMs ?? computeRetryDelayMs(attempt, baseDelayMs, maxDelayMs); + await sleep(backoffMs); + } catch (err) { + lastError = err instanceof Error ? err : new Error(String(err)); + + if (!isRetryableError(err) || attempt >= maxAttempts) { + throw lastError; + } + + const backoffMs = computeRetryDelayMs(attempt, baseDelayMs, maxDelayMs); + await sleep(backoffMs); + } + } + + // Should not reach here, but if it does return last response or throw last error + if (lastResponse) return lastResponse; + throw lastError ?? new Error(`fetchWithTimeoutAndRetry: exhausted ${maxAttempts} retries for ${url}`); +} + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} diff --git a/apps/api/src/services/node-agent.ts b/apps/api/src/services/node-agent.ts index 0f5593e44..318b6ab47 100644 --- a/apps/api/src/services/node-agent.ts +++ b/apps/api/src/services/node-agent.ts @@ -1,9 +1,25 @@ import type { Env } from '../env'; -import { fetchWithTimeout, getTimeoutMs } from './fetch-timeout'; +import type { FetchRetryOptions } from './fetch-timeout'; +import { fetchWithTimeoutAndRetry, getRetryDelayMs, getRetryMaxAttempts, getTimeoutMs } from './fetch-timeout'; import { signNodeManagementToken } from './jwt'; import { recordNodeRoutingMetric } from './telemetry'; const DEFAULT_NODE_AGENT_REQUEST_TIMEOUT_MS = 30_000; +const DEFAULT_NODE_AGENT_REQUEST_RETRY_MAX_ATTEMPTS = 2; +const DEFAULT_NODE_AGENT_REQUEST_RETRY_BASE_DELAY_MS = 1_000; +const DEFAULT_NODE_AGENT_REQUEST_RETRY_MAX_DELAY_MS = 10_000; + +/** + * Build retry options for node agent requests from env. + */ +function nodeAgentRetryOptions(env: Env): FetchRetryOptions { + return { + timeoutMs: getTimeoutMs(env.NODE_AGENT_REQUEST_TIMEOUT_MS, DEFAULT_NODE_AGENT_REQUEST_TIMEOUT_MS), + maxAttempts: getRetryMaxAttempts(env.NODE_AGENT_REQUEST_RETRY_MAX_ATTEMPTS, DEFAULT_NODE_AGENT_REQUEST_RETRY_MAX_ATTEMPTS), + baseDelayMs: getRetryDelayMs(env.NODE_AGENT_REQUEST_RETRY_BASE_DELAY_MS, DEFAULT_NODE_AGENT_REQUEST_RETRY_BASE_DELAY_MS), + maxDelayMs: getRetryDelayMs(env.NODE_AGENT_REQUEST_RETRY_MAX_DELAY_MS, DEFAULT_NODE_AGENT_REQUEST_RETRY_MAX_DELAY_MS), + }; +} const DEFAULT_NODE_AGENT_READY_TIMEOUT_MS = 900_000; // 15 min — cloud-init takes 8-12 min on Hetzner const DEFAULT_NODE_AGENT_READY_POLL_INTERVAL_MS = 5000; @@ -136,14 +152,11 @@ async function nodeAgentRequest( env ); - const requestTimeoutMs = getTimeoutMs( - env.NODE_AGENT_REQUEST_TIMEOUT_MS, - DEFAULT_NODE_AGENT_REQUEST_TIMEOUT_MS - ); - const response = await fetchWithTimeout(url, { + const retryOpts = nodeAgentRetryOptions(env); + const response = await fetchWithTimeoutAndRetry(url, { ...options, headers, - }, requestTimeoutMs); + }, retryOpts); recordNodeRoutingMetric( { @@ -519,8 +532,11 @@ export async function nodeAgentRawRequest( headers.set('X-SAM-Node-Id', nodeId); const DEFAULT_EXPORT_TIMEOUT_MS = 60_000; - const timeoutMs = getTimeoutMs(env.NODE_AGENT_REQUEST_TIMEOUT_MS, DEFAULT_EXPORT_TIMEOUT_MS); - return fetchWithTimeout(url, { method: 'GET', headers }, timeoutMs); + const retryOpts: FetchRetryOptions = { + ...nodeAgentRetryOptions(env), + timeoutMs: getTimeoutMs(env.NODE_AGENT_REQUEST_TIMEOUT_MS, DEFAULT_EXPORT_TIMEOUT_MS), + }; + return fetchWithTimeoutAndRetry(url, { method: 'GET', headers }, retryOpts); } export async function rebuildWorkspaceOnNode( diff --git a/apps/api/src/services/nodes.ts b/apps/api/src/services/nodes.ts index ff12eb958..1b2925bd1 100644 --- a/apps/api/src/services/nodes.ts +++ b/apps/api/src/services/nodes.ts @@ -1,4 +1,5 @@ import { generateCloudInit, validateCloudInitSize } from '@simple-agent-manager/cloud-init'; +import type { Provider } from '@simple-agent-manager/providers'; import { ProviderError } from '@simple-agent-manager/providers'; import type { CredentialProvider, TaskMode } from '@simple-agent-manager/shared'; import { and, eq } from 'drizzle-orm'; @@ -62,6 +63,34 @@ export function resolveHetznerBaseImageOverride( return trimmed ? trimmed : undefined; } +/** + * Recover an existing VM that matches the node ID labels. + * Used for idempotent provisioning: if a previous createVM succeeded but + * the subsequent DB update failed, this finds the orphaned VM so we can + * resume instead of creating a duplicate. + */ +async function recoverExistingNodeVM( + provider: Provider, + nodeId: string, +): Promise<{ id: string; ip: string | null } | null> { + try { + const vms = await provider.listVMs({ + node: nodeId.toLowerCase(), + managed: 'simple-agent-manager', + }); + const first = vms[0]; + if (first) { + return { id: first.id, ip: first.ip ?? null }; + } + } catch (err) { + log.warn('node_provisioning.recovery_search_failed', { + nodeId, + error: err instanceof Error ? err.message : String(err), + }); + } + return null; +} + export async function createNodeRecord(env: Env, input: CreateNodeInput): Promise { const db = drizzle(env.DATABASE, { schema }); const now = new Date().toISOString(); @@ -164,6 +193,7 @@ export async function provisionNode( originCaCert: env.ORIGIN_CA_CERT, originCaKey: env.ORIGIN_CA_KEY, vmAgentPort: env.VM_AGENT_PORT, + provider: targetProvider, }); if (!validateCloudInitSize(cloudInit)) { @@ -172,6 +202,49 @@ export async function provisionNode( const provider = providerResult.provider; + // Idempotent recovery: if a previous attempt already created the VM but + // the control-plane state update failed (e.g. crash between createVM and + // the DB update), recover the existing VM instead of creating a duplicate. + const existingVm = await recoverExistingNodeVM(provider, node.id); + if (existingVm) { + log.info('node_provisioning.recovered_existing_vm', { + nodeId: node.id, + providerInstanceId: existingVm.id, + ip: existingVm.ip ?? null, + }); + + if (existingVm.ip) { + let backendDnsRecordId: string | null = null; + try { + backendDnsRecordId = await createNodeBackendDNSRecord(node.id, existingVm.ip, env); + } catch (dnsErr) { + log.error('node_provisioning.dns_record_failed', { nodeId: node.id, ...serializeError(dnsErr) }); + } + await db + .update(schema.nodes) + .set({ + providerInstanceId: existingVm.id, + ipAddress: existingVm.ip, + backendDnsRecordId, + status: 'running', + healthStatus: 'stale', + updatedAt: new Date().toISOString(), + }) + .where(eq(schema.nodes.id, node.id)); + } else { + await db + .update(schema.nodes) + .set({ + providerInstanceId: existingVm.id, + status: 'creating', + errorMessage: 'Recovered existing VM — awaiting IP allocation', + updatedAt: new Date().toISOString(), + }) + .where(eq(schema.nodes.id, node.id)); + } + return; + } + const baseImageOverride = resolveHetznerBaseImageOverride( targetProvider, env.HETZNER_BASE_IMAGE, diff --git a/apps/api/src/services/provider-credentials.ts b/apps/api/src/services/provider-credentials.ts index 65c3bf97f..1d837f121 100644 --- a/apps/api/src/services/provider-credentials.ts +++ b/apps/api/src/services/provider-credentials.ts @@ -1,4 +1,4 @@ -import type { Provider, ProviderConfig } from '@simple-agent-manager/providers'; +import type { HetznerProviderConfig, Provider, ProviderConfig } from '@simple-agent-manager/providers'; import { createProvider, GcpProvider } from '@simple-agent-manager/providers'; import type { CredentialProvider, CredentialSource, GcpOidcCredential } from '@simple-agent-manager/shared'; import { and, eq } from 'drizzle-orm'; @@ -61,10 +61,20 @@ export function extractScalewaySecretKey(decryptedToken: string): string | null export function buildProviderConfig( provider: CredentialProvider, decryptedToken: string, + env?: { + HETZNER_API_TIMEOUT_MS?: string; + HETZNER_API_RETRY_MAX_ATTEMPTS?: string; + HETZNER_API_RETRY_BASE_DELAY_MS?: string; + HETZNER_API_RETRY_MAX_DELAY_MS?: string; + HETZNER_PLACEMENT_RETRY_DELAY_MS?: string; + HETZNER_PLACEMENT_RETRY_ATTEMPTS?: string; + HETZNER_PLACEMENT_FALLBACK_ENABLED?: string; + HETZNER_PLACEMENT_FALLBACK_LOCATIONS?: string; + }, ): ProviderConfig { switch (provider) { case 'hetzner': - return { provider: 'hetzner', apiToken: decryptedToken }; + return buildHetznerProviderConfig(decryptedToken, env); case 'scaleway': { let parsed: unknown; try { @@ -90,6 +100,44 @@ export function buildProviderConfig( } } +function buildHetznerProviderConfig( + apiToken: string, + env?: { + HETZNER_API_TIMEOUT_MS?: string; + HETZNER_API_RETRY_MAX_ATTEMPTS?: string; + HETZNER_API_RETRY_BASE_DELAY_MS?: string; + HETZNER_API_RETRY_MAX_DELAY_MS?: string; + HETZNER_PLACEMENT_RETRY_DELAY_MS?: string; + HETZNER_PLACEMENT_RETRY_ATTEMPTS?: string; + HETZNER_PLACEMENT_FALLBACK_ENABLED?: string; + HETZNER_PLACEMENT_FALLBACK_LOCATIONS?: string; + }, +): HetznerProviderConfig { + const config: HetznerProviderConfig = { + provider: 'hetzner', + apiToken, + }; + + const timeoutMs = parseOptionalPositiveInt(env?.HETZNER_API_TIMEOUT_MS); + if (timeoutMs !== undefined) config.timeoutMs = timeoutMs; + const apiRetryMaxAttempts = parseOptionalPositiveInt(env?.HETZNER_API_RETRY_MAX_ATTEMPTS); + if (apiRetryMaxAttempts !== undefined) config.apiRetryMaxAttempts = apiRetryMaxAttempts; + const apiRetryBaseDelayMs = parseOptionalPositiveInt(env?.HETZNER_API_RETRY_BASE_DELAY_MS); + if (apiRetryBaseDelayMs !== undefined) config.apiRetryBaseDelayMs = apiRetryBaseDelayMs; + const apiRetryMaxDelayMs = parseOptionalPositiveInt(env?.HETZNER_API_RETRY_MAX_DELAY_MS); + if (apiRetryMaxDelayMs !== undefined) config.apiRetryMaxDelayMs = apiRetryMaxDelayMs; + const placementRetryDelayMs = parseOptionalPositiveInt(env?.HETZNER_PLACEMENT_RETRY_DELAY_MS); + if (placementRetryDelayMs !== undefined) config.placementRetryDelayMs = placementRetryDelayMs; + const placementRetryAttempts = parseOptionalPositiveInt(env?.HETZNER_PLACEMENT_RETRY_ATTEMPTS); + if (placementRetryAttempts !== undefined) config.placementRetryAttempts = placementRetryAttempts; + const placementFallbackEnabled = parseOptionalBoolean(env?.HETZNER_PLACEMENT_FALLBACK_ENABLED); + if (placementFallbackEnabled !== undefined) config.placementFallbackEnabled = placementFallbackEnabled; + const placementFallbackLocations = parseOptionalList(env?.HETZNER_PLACEMENT_FALLBACK_LOCATIONS); + if (placementFallbackLocations !== undefined) config.placementFallbackLocations = placementFallbackLocations; + + return config; +} + /** * Parse a decrypted GCP credential token into structured GcpOidcCredential fields. */ @@ -179,7 +227,24 @@ export async function createProviderForUser( db: ReturnType, userId: string, encryptionKey: string, - env: { KV: KVNamespace; BASE_DOMAIN: string; JWT_PRIVATE_KEY: string; JWT_PUBLIC_KEY: string; GCP_IDENTITY_TOKEN_EXPIRY_SECONDS?: string; GCP_TOKEN_CACHE_TTL_SECONDS?: string; GCP_API_TIMEOUT_MS?: string; GCP_OPERATION_POLL_TIMEOUT_MS?: string }, + env: { + KV: KVNamespace; + BASE_DOMAIN: string; + JWT_PRIVATE_KEY: string; + JWT_PUBLIC_KEY: string; + GCP_IDENTITY_TOKEN_EXPIRY_SECONDS?: string; + GCP_TOKEN_CACHE_TTL_SECONDS?: string; + GCP_API_TIMEOUT_MS?: string; + GCP_OPERATION_POLL_TIMEOUT_MS?: string; + HETZNER_API_TIMEOUT_MS?: string; + HETZNER_API_RETRY_MAX_ATTEMPTS?: string; + HETZNER_API_RETRY_BASE_DELAY_MS?: string; + HETZNER_API_RETRY_MAX_DELAY_MS?: string; + HETZNER_PLACEMENT_RETRY_DELAY_MS?: string; + HETZNER_PLACEMENT_RETRY_ATTEMPTS?: string; + HETZNER_PLACEMENT_FALLBACK_ENABLED?: string; + HETZNER_PLACEMENT_FALLBACK_LOCATIONS?: string; + }, targetProvider?: CredentialProvider, ): Promise<{ provider: Provider; providerName: CredentialProvider; credentialSource: CredentialSource } | null> { // 1. Try user's own credential first @@ -215,7 +280,7 @@ export async function createProviderForUser( return { provider, providerName, credentialSource: 'user' }; } - const config = buildProviderConfig(providerName, decryptedToken); + const config = buildProviderConfig(providerName, decryptedToken, env); return { provider: createProvider(config), providerName, credentialSource: 'user' }; } @@ -241,10 +306,29 @@ export async function createProviderForUser( return { provider, providerName: platformProvider, credentialSource: 'platform' }; } - const config = buildProviderConfig(platformProvider, decryptedToken); + const config = buildProviderConfig(platformProvider, decryptedToken, env); return { provider: createProvider(config), providerName: platformProvider, credentialSource: 'platform' }; } +function parseOptionalPositiveInt(value: string | undefined): number | undefined { + if (!value) return undefined; + const parsed = Number.parseInt(value, 10); + return Number.isFinite(parsed) && parsed > 0 ? parsed : undefined; +} + +function parseOptionalBoolean(value: string | undefined): boolean | undefined { + if (value === undefined || value === '') return undefined; + if (value === 'true') return true; + if (value === 'false') return false; + return undefined; +} + +function parseOptionalList(value: string | undefined): string[] | undefined { + if (!value) return undefined; + const items = value.split(',').map((item) => item.trim()).filter(Boolean); + return items.length > 0 ? items : undefined; +} + /** * Lightweight credential source resolution — determines whether 'user' or 'platform' * credentials would be used for a given target provider WITHOUT decrypting tokens diff --git a/apps/api/tests/unit/fetch-timeout.test.ts b/apps/api/tests/unit/fetch-timeout.test.ts new file mode 100644 index 000000000..38837b415 --- /dev/null +++ b/apps/api/tests/unit/fetch-timeout.test.ts @@ -0,0 +1,101 @@ +/** + * Unit tests for fetch-timeout retry utilities. + * + * Tests pure helper functions (env parsing, delay computation) + * without requiring real network calls. + */ +import { describe, expect, it } from 'vitest'; + +import { + computeRetryDelayMs, + getRetryDelayMs, + getRetryMaxAttempts, + getTimeoutMs, +} from '../../src/services/fetch-timeout'; + +describe('getTimeoutMs', () => { + it('returns default when env is undefined', () => { + expect(getTimeoutMs(undefined, 5000)).toBe(5000); + }); + + it('returns default when env is empty', () => { + expect(getTimeoutMs('', 5000)).toBe(5000); + }); + + it('parses valid integer', () => { + expect(getTimeoutMs('10000', 5000)).toBe(10000); + }); + + it('returns default for non-numeric', () => { + expect(getTimeoutMs('abc', 5000)).toBe(5000); + }); + + it('returns default for zero', () => { + expect(getTimeoutMs('0', 5000)).toBe(5000); + }); + + it('returns default for negative', () => { + expect(getTimeoutMs('-100', 5000)).toBe(5000); + }); +}); + +describe('getRetryMaxAttempts', () => { + it('returns default when env is undefined', () => { + expect(getRetryMaxAttempts(undefined, 3)).toBe(3); + }); + + it('parses valid integer', () => { + expect(getRetryMaxAttempts('5', 3)).toBe(5); + }); + + it('allows zero (no retries)', () => { + expect(getRetryMaxAttempts('0', 3)).toBe(0); + }); + + it('returns default for negative', () => { + expect(getRetryMaxAttempts('-1', 3)).toBe(3); + }); +}); + +describe('getRetryDelayMs', () => { + it('returns default when env is undefined', () => { + expect(getRetryDelayMs(undefined, 1000)).toBe(1000); + }); + + it('parses valid integer', () => { + expect(getRetryDelayMs('2000', 1000)).toBe(2000); + }); + + it('allows zero delay', () => { + expect(getRetryDelayMs('0', 1000)).toBe(0); + }); +}); + +describe('computeRetryDelayMs', () => { + it('returns at least baseDelay for attempt 0', () => { + const delay = computeRetryDelayMs(0, 1000, 30000); + // base (1000) + jitter (0-25%) = 1000-1250 + expect(delay).toBeGreaterThanOrEqual(1000); + expect(delay).toBeLessThanOrEqual(1250); + }); + + it('doubles delay with each attempt (exponential)', () => { + // At attempt 3, exponential = 1000 * 2^3 = 8000 + // With jitter: 8000-10000 + const delay = computeRetryDelayMs(3, 1000, 30000); + expect(delay).toBeGreaterThanOrEqual(8000); + expect(delay).toBeLessThanOrEqual(10000); + }); + + it('caps at maxDelayMs', () => { + const delay = computeRetryDelayMs(20, 1000, 5000); + // capped at 5000 + jitter (0-25%) = 5000-6250 + expect(delay).toBeGreaterThanOrEqual(5000); + expect(delay).toBeLessThanOrEqual(6250); + }); + + it('handles zero base delay', () => { + const delay = computeRetryDelayMs(5, 0, 30000); + expect(delay).toBe(0); + }); +}); diff --git a/apps/api/tests/unit/node-agent-contract.test.ts b/apps/api/tests/unit/node-agent-contract.test.ts index 77429c9ff..d41213f6a 100644 --- a/apps/api/tests/unit/node-agent-contract.test.ts +++ b/apps/api/tests/unit/node-agent-contract.test.ts @@ -802,7 +802,7 @@ describe('Node Agent client functions send correct payloads', () => { let capturedUrl: string | null = null; vi.doMock('../../src/services/fetch-timeout', () => ({ - fetchWithTimeout: vi.fn().mockImplementation((url: string, init: RequestInit) => { + fetchWithTimeoutAndRetry: vi.fn().mockImplementation((url: string, init: RequestInit) => { capturedUrl = url; capturedHeaders = new Headers(init.headers); capturedBody = init.body as string; @@ -814,6 +814,8 @@ describe('Node Agent client functions send correct payloads', () => { ); }), getTimeoutMs: vi.fn().mockReturnValue(30000), + getRetryMaxAttempts: vi.fn().mockReturnValue(2), + getRetryDelayMs: vi.fn().mockReturnValue(1000), })); // Dynamic import to pick up mocks @@ -870,7 +872,7 @@ describe('Node Agent client functions send correct payloads', () => { let capturedUrl: string | null = null; vi.doMock('../../src/services/fetch-timeout', () => ({ - fetchWithTimeout: vi.fn().mockImplementation((url: string, init: RequestInit) => { + fetchWithTimeoutAndRetry: vi.fn().mockImplementation((url: string, init: RequestInit) => { capturedUrl = url; capturedMethod = init.method ?? 'GET'; return Promise.resolve( @@ -881,6 +883,8 @@ describe('Node Agent client functions send correct payloads', () => { ); }), getTimeoutMs: vi.fn().mockReturnValue(30000), + getRetryMaxAttempts: vi.fn().mockReturnValue(2), + getRetryDelayMs: vi.fn().mockReturnValue(1000), })); const { deleteWorkspaceOnNode } = await import('../../src/services/node-agent'); @@ -908,7 +912,7 @@ describe('Node Agent client functions send correct payloads', () => { let capturedBody: string | null = null; vi.doMock('../../src/services/fetch-timeout', () => ({ - fetchWithTimeout: vi.fn().mockImplementation((_url: string, init: RequestInit) => { + fetchWithTimeoutAndRetry: vi.fn().mockImplementation((_url: string, init: RequestInit) => { capturedBody = init.body as string; return Promise.resolve( new Response( @@ -927,6 +931,8 @@ describe('Node Agent client functions send correct payloads', () => { ); }), getTimeoutMs: vi.fn().mockReturnValue(30000), + getRetryMaxAttempts: vi.fn().mockReturnValue(2), + getRetryDelayMs: vi.fn().mockReturnValue(1000), })); const { createAgentSessionOnNode } = await import('../../src/services/node-agent'); @@ -955,13 +961,15 @@ describe('Node Agent client functions send correct payloads', () => { })); vi.doMock('../../src/services/fetch-timeout', () => ({ - fetchWithTimeout: vi.fn().mockResolvedValue( + fetchWithTimeoutAndRetry: vi.fn().mockResolvedValue( new Response(JSON.stringify({ error: 'workspace not found' }), { status: 404, headers: { 'Content-Type': 'application/json' }, }) ), getTimeoutMs: vi.fn().mockReturnValue(30000), + getRetryMaxAttempts: vi.fn().mockReturnValue(2), + getRetryDelayMs: vi.fn().mockReturnValue(1000), })); const { deleteWorkspaceOnNode } = await import('../../src/services/node-agent'); @@ -987,13 +995,15 @@ describe('Node Agent client functions send correct payloads', () => { // Simulate the API Worker's own 404 response (loop-back via wildcard DNS) vi.doMock('../../src/services/fetch-timeout', () => ({ - fetchWithTimeout: vi.fn().mockResolvedValue( + fetchWithTimeoutAndRetry: vi.fn().mockResolvedValue( new Response(JSON.stringify({ error: 'NOT_FOUND', message: 'Endpoint not found' }), { status: 404, headers: { 'Content-Type': 'application/json' }, }) ), getTimeoutMs: vi.fn().mockReturnValue(30000), + getRetryMaxAttempts: vi.fn().mockReturnValue(2), + getRetryDelayMs: vi.fn().mockReturnValue(1000), })); const { deleteWorkspaceOnNode } = await import('../../src/services/node-agent'); @@ -1018,10 +1028,12 @@ describe('Node Agent client functions send correct payloads', () => { })); vi.doMock('../../src/services/fetch-timeout', () => ({ - fetchWithTimeout: vi.fn().mockRejectedValue( + fetchWithTimeoutAndRetry: vi.fn().mockRejectedValue( new Error('Request timed out after 30000ms: https://node-abc.vm.example.com:8443/workspaces/ws-test') ), getTimeoutMs: vi.fn().mockReturnValue(30000), + getRetryMaxAttempts: vi.fn().mockReturnValue(2), + getRetryDelayMs: vi.fn().mockReturnValue(1000), })); const { deleteWorkspaceOnNode } = await import('../../src/services/node-agent'); diff --git a/apps/api/tests/unit/services/node-ip-validation.test.ts b/apps/api/tests/unit/services/node-ip-validation.test.ts index b4b1558fd..77ef73bdf 100644 --- a/apps/api/tests/unit/services/node-ip-validation.test.ts +++ b/apps/api/tests/unit/services/node-ip-validation.test.ts @@ -17,10 +17,12 @@ describe('provisionNode empty IP guard', () => { file.indexOf('export async function provisionNode'), file.indexOf('export async function stopNodeResources') ); + // Focus on the createVM path (after the recovery block) for the IP guard checks + const createVmSection = section.slice(section.indexOf('const vm = await provider.createVM')); it('checks for empty IP before creating DNS records', () => { - const ipCheckIdx = section.indexOf('if (!vm.ip)'); - const dnsCreateIdx = section.indexOf('createNodeBackendDNSRecord'); + const ipCheckIdx = createVmSection.indexOf('if (!vm.ip)'); + const dnsCreateIdx = createVmSection.indexOf('createNodeBackendDNSRecord'); expect(ipCheckIdx).toBeGreaterThan(-1); expect(dnsCreateIdx).toBeGreaterThan(-1); // IP check must come before DNS record creation @@ -28,34 +30,34 @@ describe('provisionNode empty IP guard', () => { }); it('keeps node in creating status when IP is empty (awaiting heartbeat backfill)', () => { - const ipGuardBlock = section.slice( - section.indexOf('if (!vm.ip)'), - section.indexOf('let backendDnsRecordId') + const ipGuardBlock = createVmSection.slice( + createVmSection.indexOf('if (!vm.ip)'), + createVmSection.indexOf('let backendDnsRecordId') ); expect(ipGuardBlock).toContain("status: 'creating'"); expect(ipGuardBlock).toContain('Awaiting IP allocation'); }); it('returns early after setting creating status for empty IP', () => { - const ipGuardBlock = section.slice( - section.indexOf('if (!vm.ip)'), - section.indexOf('let backendDnsRecordId') + const ipGuardBlock = createVmSection.slice( + createVmSection.indexOf('if (!vm.ip)'), + createVmSection.indexOf('let backendDnsRecordId') ); expect(ipGuardBlock).toContain('return;'); }); it('stores providerInstanceId even when IP is empty (for cleanup)', () => { - const ipGuardBlock = section.slice( - section.indexOf('if (!vm.ip)'), - section.indexOf('let backendDnsRecordId') + const ipGuardBlock = createVmSection.slice( + createVmSection.indexOf('if (!vm.ip)'), + createVmSection.indexOf('let backendDnsRecordId') ); expect(ipGuardBlock).toContain('providerInstanceId: vm.id'); }); it('logs structured info with nodeId and providerInstanceId', () => { - const ipGuardBlock = section.slice( - section.indexOf('if (!vm.ip)'), - section.indexOf('let backendDnsRecordId') + const ipGuardBlock = createVmSection.slice( + createVmSection.indexOf('if (!vm.ip)'), + createVmSection.indexOf('let backendDnsRecordId') ); expect(ipGuardBlock).toContain('nodeId: node.id'); expect(ipGuardBlock).toContain('providerInstanceId: vm.id'); diff --git a/apps/api/tests/unit/services/provider-credentials.test.ts b/apps/api/tests/unit/services/provider-credentials.test.ts index c9ef8a996..be2d68994 100644 --- a/apps/api/tests/unit/services/provider-credentials.test.ts +++ b/apps/api/tests/unit/services/provider-credentials.test.ts @@ -35,6 +35,31 @@ describe('buildProviderConfig', () => { }); }); + it('should build HetznerProviderConfig with retry and placement overrides from env', () => { + const config = buildProviderConfig('hetzner', 'my-hetzner-token', { + HETZNER_API_TIMEOUT_MS: '45000', + HETZNER_API_RETRY_MAX_ATTEMPTS: '4', + HETZNER_API_RETRY_BASE_DELAY_MS: '1500', + HETZNER_API_RETRY_MAX_DELAY_MS: '12000', + HETZNER_PLACEMENT_RETRY_DELAY_MS: '2500', + HETZNER_PLACEMENT_RETRY_ATTEMPTS: '3', + HETZNER_PLACEMENT_FALLBACK_ENABLED: 'false', + HETZNER_PLACEMENT_FALLBACK_LOCATIONS: 'hel1,nbg1', + }); + expect(config).toEqual({ + provider: 'hetzner', + apiToken: 'my-hetzner-token', + timeoutMs: 45000, + apiRetryMaxAttempts: 4, + apiRetryBaseDelayMs: 1500, + apiRetryMaxDelayMs: 12000, + placementRetryDelayMs: 2500, + placementRetryAttempts: 3, + placementFallbackEnabled: false, + placementFallbackLocations: ['hel1', 'nbg1'], + }); + }); + it('should build ScalewayProviderConfig from JSON string', () => { const token = JSON.stringify({ secretKey: 'scw-secret', diff --git a/apps/api/tests/unit/task-runner-do-helpers.test.ts b/apps/api/tests/unit/task-runner-do-helpers.test.ts index ac0753101..ae922fe22 100644 --- a/apps/api/tests/unit/task-runner-do-helpers.test.ts +++ b/apps/api/tests/unit/task-runner-do-helpers.test.ts @@ -125,4 +125,22 @@ describe('isTransientError', () => { const err = Object.assign(new Error('network error'), { permanent: true }); expect(isTransientError(err)).toBe(false); }); + + it('honors explicit retryable=true metadata', () => { + // Provider library marks an error as retryable even if the message is unknown + const err = Object.assign(new Error('some provider-specific error'), { retryable: true }); + expect(isTransientError(err)).toBe(true); + }); + + it('honors explicit retryable=false metadata', () => { + // Provider library marks a network error as non-retryable + const err = Object.assign(new Error('network error'), { retryable: false }); + expect(isTransientError(err)).toBe(false); + }); + + it('permanent flag takes precedence over retryable', () => { + // permanent should win over retryable + const err = Object.assign(new Error('some error'), { permanent: true, retryable: true }); + expect(isTransientError(err)).toBe(false); + }); }); diff --git a/apps/api/tests/unit/task-runner-do-state.test.ts b/apps/api/tests/unit/task-runner-do-state.test.ts index 7250be916..9952f73eb 100644 --- a/apps/api/tests/unit/task-runner-do-state.test.ts +++ b/apps/api/tests/unit/task-runner-do-state.test.ts @@ -144,7 +144,7 @@ describe('TaskRunner DO retry logic', () => { }); it('checks max retries before retrying', () => { - expect(doSource).toContain('state.retryCount < this.getMaxRetries()'); + expect(doSource).toContain('state.retryCount < this.getMaxRetries(state.currentStep)'); }); it('increments retryCount on transient failure', () => { @@ -429,14 +429,16 @@ describe('TaskRunner DO configuration', () => { for (const method of configMethods) { it(`has ${method}() configuration method`, () => { - expect(doSource).toContain(`private ${method}()`); + // getMaxRetries accepts an optional step parameter + const pattern = method === 'getMaxRetries' ? `private ${method}(step` : `private ${method}()`; + expect(doSource).toContain(pattern); }); it(`${method}() reads from env var with DEFAULT fallback`, () => { - const section = doSource.slice( - doSource.indexOf(`private ${method}()`), - doSource.indexOf('}', doSource.indexOf(`private ${method}()`) + 50) + 1 - ); + const searchPattern = method === 'getMaxRetries' ? `private ${method}(step` : `private ${method}()`; + const startIdx = doSource.indexOf(searchPattern); + // Extract a generous section (500 chars) to cover multi-line methods with nested blocks + const section = doSource.slice(startIdx, startIdx + 500); expect(section).toContain('parseEnvInt'); expect(section).toContain('this.env.'); expect(section).toContain('DEFAULT_TASK_RUNNER'); diff --git a/packages/cloud-init/src/generate.ts b/packages/cloud-init/src/generate.ts index 1171db868..4914ace34 100644 --- a/packages/cloud-init/src/generate.ts +++ b/packages/cloud-init/src/generate.ts @@ -108,6 +108,12 @@ export function validateCloudInitVariables(variables: CloudInitVariables): void errors.push(`logJournalMaxRetention: must match ${JOURNALD_TIME_RE} (got ${JSON.stringify(variables.logJournalMaxRetention)})`); } } + if (variables.provider !== undefined && variables.provider !== '') { + const validProviders = ['hetzner', 'scaleway', 'gcp']; + if (!validProviders.includes(variables.provider)) { + errors.push(`provider: must be one of ${validProviders.join(', ')} (got ${JSON.stringify(variables.provider)})`); + } + } if (variables.dockerDnsServers !== undefined && variables.dockerDnsServers !== '') { if (!SAFE_DNS_SERVERS_RE.test(variables.dockerDnsServers)) { errors.push(`dockerDnsServers: must contain only quoted IPs (got ${JSON.stringify(variables.dockerDnsServers)})`); @@ -162,6 +168,8 @@ export interface CloudInitVariables { vmAgentPort?: string; /** Timeout in seconds for fetching Cloudflare IP ranges at boot (default: 10) */ cfIpFetchTimeout?: string; + /** Cloud provider name (hetzner, scaleway, gcp) — passed to VM agent and used for apt mirror config */ + provider?: string; } /** @@ -204,6 +212,7 @@ export function generateCloudInit( '{{ tls_cert_path }}': variables.originCaCert ? '/etc/sam/tls/origin-ca.pem' : '', '{{ tls_key_path }}': variables.originCaCert ? '/etc/sam/tls/origin-ca-key.pem' : '', '{{ cf_ip_fetch_timeout }}': variables.cfIpFetchTimeout ?? '10', + '{{ provider }}': variables.provider ?? '', }; // Use function replacement to prevent $-pattern interpretation in values. diff --git a/packages/cloud-init/src/template.ts b/packages/cloud-init/src/template.ts index 751a2d6fd..ff576a702 100644 --- a/packages/cloud-init/src/template.ts +++ b/packages/cloud-init/src/template.ts @@ -75,6 +75,7 @@ write_files: Environment=VM_AGENT_PORT={{ vm_agent_port }} Environment=TLS_CERT_PATH={{ tls_cert_path }} Environment=TLS_KEY_PATH={{ tls_key_path }} + Environment=PROVIDER={{ provider }} ExecStart=/usr/local/bin/vm-agent Restart=always RestartSec=5 @@ -280,5 +281,12 @@ write_files: {{ origin_ca_key }} permissions: '0600' + - path: /etc/apt/apt.conf.d/80-retries + content: | + Acquire::Retries "3"; + Acquire::https::Timeout "30"; + Acquire::http::Timeout "30"; + permissions: '0644' + final_message: "Simple Agent Manager node {{ node_id }} provisioning started!" `; diff --git a/packages/cloud-init/tests/generate.test.ts b/packages/cloud-init/tests/generate.test.ts index 2a9b8679c..65fa258e5 100644 --- a/packages/cloud-init/tests/generate.test.ts +++ b/packages/cloud-init/tests/generate.test.ts @@ -1354,3 +1354,41 @@ describe('integrated size validation in generateCloudInit', () => { }); }); +describe('provider field', () => { + it('accepts valid providers', () => { + for (const p of ['hetzner', 'scaleway', 'gcp']) { + expect(() => generateCloudInit(baseVariables({ provider: p }))).not.toThrow(); + } + }); + + it('rejects invalid providers', () => { + expect(() => generateCloudInit(baseVariables({ provider: 'aws' }))).toThrow(/provider/i); + }); + + it('allows empty/missing provider', () => { + expect(() => generateCloudInit(baseVariables())).not.toThrow(); + expect(() => generateCloudInit(baseVariables({ provider: '' }))).not.toThrow(); + }); + + it('includes provider in systemd environment', () => { + const config = generateCloudInit(baseVariables({ provider: 'hetzner' })); + const parsed = YAML.parse(config); + const serviceFile = parsed.write_files.find( + (f: { path: string }) => f.path === '/etc/systemd/system/vm-agent.service', + ); + expect(serviceFile).toBeDefined(); + expect(serviceFile.content).toContain('Environment=PROVIDER=hetzner'); + }); + + it('includes apt retry config', () => { + const config = generateCloudInit(baseVariables()); + const parsed = YAML.parse(config); + const aptRetryFile = parsed.write_files.find( + (f: { path: string }) => f.path === '/etc/apt/apt.conf.d/80-retries', + ); + expect(aptRetryFile).toBeDefined(); + expect(aptRetryFile.content).toContain('Acquire::Retries "3"'); + expect(aptRetryFile.content).toContain('Acquire::https::Timeout "30"'); + }); +}); + diff --git a/packages/providers/src/hetzner.ts b/packages/providers/src/hetzner.ts index 8d86c5c2e..00e3b3f1a 100644 --- a/packages/providers/src/hetzner.ts +++ b/packages/providers/src/hetzner.ts @@ -1,7 +1,13 @@ import type { VMSize } from '@simple-agent-manager/shared'; import { DEFAULT_HETZNER_DATACENTER, DEFAULT_HETZNER_IMAGE } from '@simple-agent-manager/shared'; -import { providerFetch } from './provider-fetch'; +import { + getRetryDelayMs, + getRetryMaxAttempts, + getTimeoutMs, + providerFetch, + providerFetchWithRetry, +} from './provider-fetch'; import type { LocationMeta, Provider, SizeConfig, VMConfig, VMInstance, VMStatus } from './types'; import { ProviderError } from './types'; @@ -18,6 +24,7 @@ const HETZNER_LOCATION_META: Record = { }; export const DEFAULT_PLACEMENT_RETRY_DELAY_MS = 3_000; +export const DEFAULT_PLACEMENT_RETRY_ATTEMPTS = 2; const SIZE_CONFIGS: Record = { small: { @@ -74,20 +81,38 @@ export class HetznerProvider implements Provider { private readonly apiToken: string; private readonly datacenter: string; + private readonly timeoutMs: number; + private readonly apiRetryMaxAttempts: number; + private readonly apiRetryBaseDelayMs: number; + private readonly apiRetryMaxDelayMs: number; private readonly placementRetryDelayMs: number; + private readonly placementRetryAttempts: number; private readonly placementFallbackEnabled: boolean; + private readonly placementFallbackLocations: string[] | undefined; constructor( apiToken: string, datacenter?: string, placementRetryDelayMs?: number, placementFallbackEnabled?: boolean, + timeoutMs?: number, + apiRetryMaxAttempts?: number, + apiRetryBaseDelayMs?: number, + apiRetryMaxDelayMs?: number, + placementRetryAttempts?: number, + placementFallbackLocations?: string[], ) { this.apiToken = apiToken; this.datacenter = datacenter || DEFAULT_HETZNER_DATACENTER; this.defaultLocation = this.datacenter; - this.placementRetryDelayMs = placementRetryDelayMs ?? DEFAULT_PLACEMENT_RETRY_DELAY_MS; + this.timeoutMs = getTimeoutMs(timeoutMs === undefined ? undefined : String(timeoutMs)); + this.apiRetryMaxAttempts = getRetryMaxAttempts(apiRetryMaxAttempts); + this.apiRetryBaseDelayMs = getRetryDelayMs(apiRetryBaseDelayMs, 1_000); + this.apiRetryMaxDelayMs = getRetryDelayMs(apiRetryMaxDelayMs, 10_000); + this.placementRetryDelayMs = getRetryDelayMs(placementRetryDelayMs, DEFAULT_PLACEMENT_RETRY_DELAY_MS); + this.placementRetryAttempts = getRetryMaxAttempts(placementRetryAttempts, DEFAULT_PLACEMENT_RETRY_ATTEMPTS); this.placementFallbackEnabled = placementFallbackEnabled ?? true; + this.placementFallbackLocations = placementFallbackLocations?.filter((loc) => HETZNER_LOCATIONS.includes(loc as typeof HETZNER_LOCATIONS[number])); } async createVM(config: VMConfig): Promise { @@ -97,15 +122,20 @@ export class HetznerProvider implements Provider { } const primaryLocation = config.location || this.datacenter; - // Build attempt order: primary twice (with a delay between), then remaining locations shuffled + // Build attempt order: primary N times (with a delay between), then remaining locations. + // Fallback order can be explicitly configured for deterministic capacity handling. const fallbackLocations = this.placementFallbackEnabled - ? HETZNER_LOCATIONS + ? (this.placementFallbackLocations?.length + ? this.placementFallbackLocations + : HETZNER_LOCATIONS.filter((loc) => loc !== primaryLocation).sort(() => Math.random() - 0.5)) .filter((loc) => loc !== primaryLocation) - .sort(() => Math.random() - 0.5) : []; + const primaryAttempts = Array.from({ length: this.placementRetryAttempts }, (_, index) => ({ + location: primaryLocation, + delayMs: index === 0 ? 0 : this.placementRetryDelayMs, + })); const attemptsToTry: Array<{ location: string; delayMs: number }> = [ - { location: primaryLocation, delayMs: 0 }, - { location: primaryLocation, delayMs: this.placementRetryDelayMs }, + ...primaryAttempts, ...fallbackLocations.map((loc) => ({ location: loc, delayMs: 0 })), ]; @@ -134,7 +164,7 @@ export class HetznerProvider implements Provider { labels: config.labels || {}, start_after_create: true, }), - }); + }, this.timeoutMs); const data = (await response.json()) as HetznerServerResponse; if (attempt.location !== primaryLocation) { @@ -145,10 +175,21 @@ export class HetznerProvider implements Provider { return this.mapServerToVMInstance(data.server); } catch (err) { if (err instanceof ProviderError && err.statusCode === 412) { + const placementError = new ProviderError( + this.name, + err.statusCode, + err.message, + { + cause: err, + retryable: true, + reason: 'capacity', + idempotencyRisk: 'duplicate_resource', + }, + ); console.warn( `hetzner: placement failed in ${attempt.location} (412)`, ); - lastError = err; + lastError = placementError; continue; } throw err; // Non-placement errors are not retryable @@ -161,7 +202,7 @@ export class HetznerProvider implements Provider { async deleteVM(id: string): Promise { try { - await providerFetch(this.name, `${HETZNER_API_URL}/servers/${id}`, { + await this.fetch(`${HETZNER_API_URL}/servers/${id}`, { method: 'DELETE', headers: { Authorization: `Bearer ${this.apiToken}`, @@ -177,7 +218,7 @@ export class HetznerProvider implements Provider { async getVM(id: string): Promise { try { - const response = await providerFetch(this.name, `${HETZNER_API_URL}/servers/${id}`, { + const response = await this.fetch(`${HETZNER_API_URL}/servers/${id}`, { headers: { Authorization: `Bearer ${this.apiToken}`, }, @@ -205,7 +246,7 @@ export class HetznerProvider implements Provider { ? `${HETZNER_API_URL}/servers?label_selector=${encodeURIComponent(labelParts.join(','))}` : `${HETZNER_API_URL}/servers`; - const response = await providerFetch(this.name, url, { + const response = await this.fetch(url, { headers: { Authorization: `Bearer ${this.apiToken}`, }, @@ -216,7 +257,7 @@ export class HetznerProvider implements Provider { } async powerOff(id: string): Promise { - await providerFetch(this.name, `${HETZNER_API_URL}/servers/${id}/actions/poweroff`, { + await this.fetch(`${HETZNER_API_URL}/servers/${id}/actions/poweroff`, { method: 'POST', headers: { Authorization: `Bearer ${this.apiToken}`, @@ -225,7 +266,7 @@ export class HetznerProvider implements Provider { } async powerOn(id: string): Promise { - await providerFetch(this.name, `${HETZNER_API_URL}/servers/${id}/actions/poweron`, { + await this.fetch(`${HETZNER_API_URL}/servers/${id}/actions/poweron`, { method: 'POST', headers: { Authorization: `Bearer ${this.apiToken}`, @@ -234,7 +275,7 @@ export class HetznerProvider implements Provider { } async validateToken(): Promise { - await providerFetch(this.name, `${HETZNER_API_URL}/datacenters`, { + await this.fetch(`${HETZNER_API_URL}/datacenters`, { headers: { Authorization: `Bearer ${this.apiToken}`, }, @@ -242,6 +283,14 @@ export class HetznerProvider implements Provider { return true; } + private fetch(url: string | URL, init?: RequestInit): Promise { + return providerFetchWithRetry(this.name, url, init, this.timeoutMs, { + maxAttempts: this.apiRetryMaxAttempts, + baseDelayMs: this.apiRetryBaseDelayMs, + maxDelayMs: this.apiRetryMaxDelayMs, + }); + } + private mapServerToVMInstance(server: HetznerServerResponse['server']): VMInstance { return { id: String(server.id), diff --git a/packages/providers/src/index.ts b/packages/providers/src/index.ts index 02ad816c5..44e541f32 100644 --- a/packages/providers/src/index.ts +++ b/packages/providers/src/index.ts @@ -20,7 +20,15 @@ export type { export { ProviderError } from './types'; // Re-export utilities -export { getTimeoutMs,providerFetch } from './provider-fetch'; +export { + computeRetryDelayMs, + getRetryDelayMs, + getRetryMaxAttempts, + getTimeoutMs, + providerFetch, + providerFetchWithRetry, + shouldRetryProviderError, +} from './provider-fetch'; // Re-export providers export type { GcpTokenProvider } from './gcp'; @@ -40,6 +48,12 @@ export function createProvider(config: ProviderConfig): Provider { config.datacenter, config.placementRetryDelayMs, config.placementFallbackEnabled, + config.timeoutMs, + config.apiRetryMaxAttempts, + config.apiRetryBaseDelayMs, + config.apiRetryMaxDelayMs, + config.placementRetryAttempts, + config.placementFallbackLocations, ); case 'scaleway': return new ScalewayProvider( diff --git a/packages/providers/src/provider-fetch.ts b/packages/providers/src/provider-fetch.ts index 3393cc46e..369e3326e 100644 --- a/packages/providers/src/provider-fetch.ts +++ b/packages/providers/src/provider-fetch.ts @@ -1,6 +1,26 @@ -import { ProviderError } from './types'; +import { ProviderError,type ProviderErrorReason, type ProviderIdempotencyRisk } from './types'; const DEFAULT_PROVIDER_TIMEOUT_MS = 30_000; +const DEFAULT_PROVIDER_RETRY_MAX_ATTEMPTS = 3; +const DEFAULT_PROVIDER_RETRY_BASE_DELAY_MS = 1_000; +const DEFAULT_PROVIDER_RETRY_MAX_DELAY_MS = 10_000; +const DEFAULT_PROVIDER_RETRY_JITTER_RATIO = 0.25; + +export interface ProviderRetryConfig { + maxAttempts?: number; + baseDelayMs?: number; + maxDelayMs?: number; + jitterRatio?: number; + retryableStatusCodes?: readonly number[]; + idempotencyRisk?: ProviderIdempotencyRisk; +} + +interface NormalizedProviderError { + message: string; + reason: ProviderErrorReason; + retryable: boolean; + retryAfterMs?: number; +} /** * Parse a timeout from an env value string or return the default. @@ -15,6 +35,26 @@ export function getTimeoutMs( return parsed; } +export function getRetryMaxAttempts( + value: number | string | undefined, + defaultAttempts: number = DEFAULT_PROVIDER_RETRY_MAX_ATTEMPTS, +): number { + if (value === undefined || value === '') return defaultAttempts; + const parsed = typeof value === 'number' ? value : Number.parseInt(value, 10); + if (!Number.isFinite(parsed) || parsed <= 0) return defaultAttempts; + return Math.floor(parsed); +} + +export function getRetryDelayMs( + value: number | string | undefined, + defaultMs: number, +): number { + if (value === undefined || value === '') return defaultMs; + const parsed = typeof value === 'number' ? value : Number.parseInt(value, 10); + if (!Number.isFinite(parsed) || parsed < 0) return defaultMs; + return parsed; +} + /** * Fetch wrapper for provider API calls. * Adds configurable timeout via AbortController and wraps errors into ProviderError. @@ -42,24 +82,17 @@ export async function providerFetch( }); if (!response.ok) { - let errorMessage: string; - try { - const body = await response.text(); - // Try parsing as JSON for structured error messages - try { - const json = JSON.parse(body) as { error?: { message?: string }; message?: string }; - errorMessage = json.error?.message || json.message || body; - } catch { - errorMessage = body || `HTTP ${response.status}`; - } - } catch { - errorMessage = `HTTP ${response.status}`; - } + const normalized = await normalizeHttpError(response); throw new ProviderError( providerName, response.status, - `${providerName} API error (${response.status}): ${errorMessage}`, + `${providerName} API error (${response.status}): ${normalized.message}`, + { + retryable: normalized.retryable, + reason: normalized.reason, + retryAfterMs: normalized.retryAfterMs, + }, ); } @@ -72,7 +105,7 @@ export async function providerFetch( providerName, undefined, `${providerName} API request timed out after ${timeoutMs}ms: ${url}`, - { cause: err }, + { cause: err, retryable: true, reason: 'timeout' }, ); } @@ -80,9 +113,122 @@ export async function providerFetch( providerName, undefined, `${providerName} API request failed: ${err instanceof Error ? err.message : String(err)}`, - { cause: err instanceof Error ? err : undefined }, + { cause: err instanceof Error ? err : undefined, retryable: true, reason: 'network' }, ); } finally { clearTimeout(timeoutId); } } + +export async function providerFetchWithRetry( + providerName: string, + url: string | URL, + init?: RequestInit, + timeoutMs: number = DEFAULT_PROVIDER_TIMEOUT_MS, + retryConfig: ProviderRetryConfig = {}, +): Promise { + const maxAttempts = getRetryMaxAttempts(retryConfig.maxAttempts); + const baseDelayMs = getRetryDelayMs(retryConfig.baseDelayMs, DEFAULT_PROVIDER_RETRY_BASE_DELAY_MS); + const maxDelayMs = getRetryDelayMs(retryConfig.maxDelayMs, DEFAULT_PROVIDER_RETRY_MAX_DELAY_MS); + const jitterRatio = retryConfig.jitterRatio ?? DEFAULT_PROVIDER_RETRY_JITTER_RATIO; + + let lastError: ProviderError | undefined; + + for (let attempt = 1; attempt <= maxAttempts; attempt++) { + try { + return await providerFetch(providerName, url, init, timeoutMs); + } catch (err) { + if (!(err instanceof ProviderError)) throw err; + lastError = err; + if (!shouldRetryProviderError(err, retryConfig) || attempt >= maxAttempts) { + throw err; + } + + const delayMs = err.retryAfterMs ?? computeRetryDelayMs(attempt, baseDelayMs, maxDelayMs, jitterRatio); + if (delayMs > 0) { + await new Promise((resolve) => setTimeout(resolve, delayMs)); + } + } + } + + throw lastError ?? new ProviderError(providerName, undefined, `${providerName} API request failed without an error`); +} + +export function shouldRetryProviderError( + err: ProviderError, + retryConfig: ProviderRetryConfig = {}, +): boolean { + if (retryConfig.retryableStatusCodes?.includes(err.statusCode ?? -1)) { + return true; + } + return err.retryable; +} + +export function computeRetryDelayMs( + attempt: number, + baseDelayMs: number = DEFAULT_PROVIDER_RETRY_BASE_DELAY_MS, + maxDelayMs: number = DEFAULT_PROVIDER_RETRY_MAX_DELAY_MS, + jitterRatio: number = DEFAULT_PROVIDER_RETRY_JITTER_RATIO, +): number { + const exponential = Math.min(baseDelayMs * Math.pow(2, Math.max(0, attempt - 1)), maxDelayMs); + if (jitterRatio <= 0 || exponential <= 0) return exponential; + const jitterRange = exponential * jitterRatio; + const jitter = (Math.random() * jitterRange * 2) - jitterRange; + return Math.max(0, Math.round(exponential + jitter)); +} + +async function normalizeHttpError(response: Response): Promise { + let errorMessage: string; + try { + const body = await response.text(); + try { + const json = JSON.parse(body) as { error?: { message?: string }; message?: string }; + errorMessage = json.error?.message || json.message || body; + } catch { + errorMessage = body || `HTTP ${response.status}`; + } + } catch { + errorMessage = `HTTP ${response.status}`; + } + + return { + message: errorMessage, + reason: classifyHttpStatus(response.status), + retryable: isRetryableHttpStatus(response.status), + retryAfterMs: parseRetryAfterMs(response.headers.get('Retry-After')), + }; +} + +function classifyHttpStatus(status: number): ProviderErrorReason { + if (status === 408) return 'timeout'; + if (status === 409 || status === 425) return 'conflict'; + if (status === 429) return 'rate_limit'; + if (status === 401 || status === 403) return 'auth'; + if (status === 404) return 'not_found'; + if (status >= 500) return 'provider_5xx'; + if (status >= 400) return 'validation'; + return 'unknown'; +} + +function isRetryableHttpStatus(status: number): boolean { + return status === 408 || + status === 425 || + status === 429 || + status === 500 || + status === 502 || + status === 503 || + status === 504; +} + +function parseRetryAfterMs(value: string | null): number | undefined { + if (!value) return undefined; + const seconds = Number.parseInt(value, 10); + if (Number.isFinite(seconds) && seconds >= 0) { + return seconds * 1000; + } + const dateMs = Date.parse(value); + if (Number.isFinite(dateMs)) { + return Math.max(0, dateMs - Date.now()); + } + return undefined; +} diff --git a/packages/providers/src/types.ts b/packages/providers/src/types.ts index a09a6bd49..67c6792a3 100644 --- a/packages/providers/src/types.ts +++ b/packages/providers/src/types.ts @@ -135,10 +135,22 @@ export interface HetznerProviderConfig { provider: 'hetzner'; apiToken: string; datacenter?: string; + /** Per-request API timeout in ms (default: 30000) */ + timeoutMs?: number; + /** Max HTTP retry attempts for transient provider API calls (default: 3) */ + apiRetryMaxAttempts?: number; + /** Base delay in ms for transient provider API retries (default: 1000) */ + apiRetryBaseDelayMs?: number; + /** Max delay in ms for transient provider API retries (default: 10000) */ + apiRetryMaxDelayMs?: number; /** Delay in ms before retrying same location on 412 (default: 3000) */ placementRetryDelayMs?: number; + /** Number of primary-location placement attempts before fallback locations (default: 2) */ + placementRetryAttempts?: number; /** Whether to try other locations after primary fails (default: true) */ placementFallbackEnabled?: boolean; + /** Ordered fallback locations to try after primary placement attempts fail */ + placementFallbackLocations?: string[]; } export interface ScalewayProviderConfig { @@ -175,11 +187,26 @@ export class ProviderError extends Error { public readonly statusCode: number | undefined, message: string, /** Original error */ - options?: { cause?: Error }, + options?: { + cause?: Error; + retryable?: boolean; + reason?: ProviderErrorReason; + retryAfterMs?: number; + idempotencyRisk?: ProviderIdempotencyRisk; + }, ) { super(message, options); + this.retryable = options?.retryable ?? false; + this.reason = options?.reason ?? classifyProviderErrorReason(statusCode); + this.retryAfterMs = options?.retryAfterMs; + this.idempotencyRisk = options?.idempotencyRisk ?? 'none'; } + readonly retryable: boolean; + readonly reason: ProviderErrorReason; + readonly retryAfterMs: number | undefined; + readonly idempotencyRisk: ProviderIdempotencyRisk; + /** Make Error properties visible to JSON.stringify */ toJSON(): Record { return { @@ -187,7 +214,38 @@ export class ProviderError extends Error { message: this.message, provider: this.providerName, statusCode: this.statusCode, + retryable: this.retryable, + reason: this.reason, + retryAfterMs: this.retryAfterMs, + idempotencyRisk: this.idempotencyRisk, cause: this.cause instanceof Error ? this.cause.message : this.cause, }; } } + +export type ProviderErrorReason = + | 'network' + | 'timeout' + | 'rate_limit' + | 'provider_5xx' + | 'capacity' + | 'placement' + | 'quota' + | 'auth' + | 'validation' + | 'not_found' + | 'conflict' + | 'unknown'; + +export type ProviderIdempotencyRisk = 'none' | 'duplicate_resource' | 'destructive'; + +function classifyProviderErrorReason(statusCode: number | undefined): ProviderErrorReason { + if (statusCode === undefined) return 'unknown'; + if (statusCode === 401 || statusCode === 403) return 'auth'; + if (statusCode === 404) return 'not_found'; + if (statusCode === 409) return 'conflict'; + if (statusCode === 429) return 'rate_limit'; + if (statusCode >= 400 && statusCode < 500) return 'validation'; + if (statusCode >= 500) return 'provider_5xx'; + return 'unknown'; +} diff --git a/packages/providers/tests/unit/hetzner.test.ts b/packages/providers/tests/unit/hetzner.test.ts index ce3f7cd7d..eaa06f3fc 100644 --- a/packages/providers/tests/unit/hetzner.test.ts +++ b/packages/providers/tests/unit/hetzner.test.ts @@ -438,6 +438,83 @@ describe('HetznerProvider', () => { // primary (1) + primary retry (2), no fallback locations expect(fetch).toHaveBeenCalledTimes(2); }); + + it('should honor configurable primary placement retry attempts', async () => { + vi.useFakeTimers(); + const retryProvider = new HetznerProvider( + 'test-token', + 'fsn1', + 3000, + false, + undefined, + undefined, + undefined, + undefined, + 3, + ); + globalThis.fetch = vi.fn().mockResolvedValue( + new Response( + JSON.stringify({ error: { message: 'error during placement' } }), + { status: 412 }, + ), + ); + + const promise = retryProvider.createVM(vmConfig).catch((err) => err); + await vi.runAllTimersAsync(); + const result = await promise; + expect(result).toBeInstanceOf(ProviderError); + expect(fetch).toHaveBeenCalledTimes(3); + + const locations = (fetch as ReturnType).mock.calls.map( + (call) => JSON.parse((call[1] as RequestInit).body as string).location, + ); + expect(locations).toEqual(['fsn1', 'fsn1', 'fsn1']); + }); + + it('should honor configured fallback location order', async () => { + vi.useFakeTimers(); + const orderedFallbackProvider = new HetznerProvider( + 'test-token', + 'fsn1', + 3000, + true, + undefined, + undefined, + undefined, + undefined, + 1, + ['hel1', 'nbg1'], + ); + const mockFetch = vi.fn() + .mockResolvedValueOnce( + new Response( + JSON.stringify({ error: { message: 'error during placement' } }), + { status: 412 }, + ), + ) + .mockResolvedValueOnce( + new Response( + JSON.stringify({ error: { message: 'error during placement' } }), + { status: 412 }, + ), + ) + .mockResolvedValueOnce( + new Response( + JSON.stringify({ server: createMockServer({ status: 'initializing' }) }), + { status: 200 }, + ), + ); + globalThis.fetch = mockFetch; + + const promise = orderedFallbackProvider.createVM(vmConfig); + await vi.runAllTimersAsync(); + await promise; + + const locations = mockFetch.mock.calls.map( + (call) => JSON.parse((call[1] as RequestInit).body as string).location, + ); + expect(locations).toEqual(['fsn1', 'hel1', 'nbg1']); + }); }); describe('deleteVM', () => { diff --git a/packages/providers/tests/unit/provider-error-tojson.test.ts b/packages/providers/tests/unit/provider-error-tojson.test.ts index e791a9f19..a14737e71 100644 --- a/packages/providers/tests/unit/provider-error-tojson.test.ts +++ b/packages/providers/tests/unit/provider-error-tojson.test.ts @@ -12,6 +12,10 @@ describe('ProviderError.toJSON', () => { message: 'Server not found', provider: 'hetzner', statusCode: 404, + retryable: false, + reason: 'not_found', + retryAfterMs: undefined, + idempotencyRisk: 'none', cause: undefined, }); }); @@ -69,4 +73,3 @@ describe('ProviderError.toJSON', () => { expect('stack' in json).toBe(false); }); }); - diff --git a/packages/providers/tests/unit/provider-fetch.test.ts b/packages/providers/tests/unit/provider-fetch.test.ts index e9e3570a7..cd4bbc494 100644 --- a/packages/providers/tests/unit/provider-fetch.test.ts +++ b/packages/providers/tests/unit/provider-fetch.test.ts @@ -1,6 +1,12 @@ import { afterEach,describe, expect, it, vi } from 'vitest'; -import { getTimeoutMs,providerFetch } from '../../src/provider-fetch'; +import { providerFetch, providerFetchWithRetry } from '../../src/provider-fetch'; +import { + computeRetryDelayMs, + getRetryDelayMs, + getRetryMaxAttempts, + getTimeoutMs, +} from '../../src/provider-fetch'; import { ProviderError } from '../../src/types'; describe('getTimeoutMs', () => { @@ -172,6 +178,101 @@ describe('providerFetch', () => { } catch (err) { const pe = err as ProviderError; expect(pe.message).toContain('Rate limited'); + expect(pe.retryable).toBe(true); + expect(pe.reason).toBe('rate_limit'); } }); }); + +describe('retry helpers', () => { + it('parses retry attempts with fallback', () => { + expect(getRetryMaxAttempts('4')).toBe(4); + expect(getRetryMaxAttempts('0', 2)).toBe(2); + expect(getRetryMaxAttempts('abc', 2)).toBe(2); + }); + + it('parses retry delays with fallback', () => { + expect(getRetryDelayMs('2500', 1000)).toBe(2500); + expect(getRetryDelayMs('-1', 1000)).toBe(1000); + expect(getRetryDelayMs('abc', 1000)).toBe(1000); + }); + + it('computes capped exponential retry delays without jitter', () => { + expect(computeRetryDelayMs(1, 1000, 10000, 0)).toBe(1000); + expect(computeRetryDelayMs(2, 1000, 10000, 0)).toBe(2000); + expect(computeRetryDelayMs(5, 1000, 10000, 0)).toBe(10000); + }); +}); + +describe('providerFetchWithRetry', () => { + const originalFetch = globalThis.fetch; + + afterEach(() => { + vi.useRealTimers(); + globalThis.fetch = originalFetch; + }); + + it('retries transient 5xx responses and returns the eventual success', async () => { + vi.useFakeTimers(); + const mockFetch = vi.fn() + .mockResolvedValueOnce(new Response('temporarily unavailable', { status: 503 })) + .mockResolvedValueOnce(new Response(JSON.stringify({ ok: true }), { status: 200 })); + globalThis.fetch = mockFetch; + + const promise = providerFetchWithRetry( + 'hetzner', + 'https://api.example.com/test', + undefined, + 30_000, + { maxAttempts: 2, baseDelayMs: 100, maxDelayMs: 100, jitterRatio: 0 }, + ); + + await vi.runAllTimersAsync(); + const response = await promise; + + expect(response.ok).toBe(true); + expect(mockFetch).toHaveBeenCalledTimes(2); + }); + + it('honors Retry-After for 429 responses', async () => { + vi.useFakeTimers(); + const mockFetch = vi.fn() + .mockResolvedValueOnce(new Response('rate limited', { + status: 429, + headers: { 'Retry-After': '2' }, + })) + .mockResolvedValueOnce(new Response(JSON.stringify({ ok: true }), { status: 200 })); + globalThis.fetch = mockFetch; + + const promise = providerFetchWithRetry( + 'hetzner', + 'https://api.example.com/test', + undefined, + 30_000, + { maxAttempts: 2, baseDelayMs: 100, maxDelayMs: 100, jitterRatio: 0 }, + ); + + await vi.advanceTimersByTimeAsync(1_999); + expect(mockFetch).toHaveBeenCalledTimes(1); + await vi.advanceTimersByTimeAsync(1); + await promise; + expect(mockFetch).toHaveBeenCalledTimes(2); + }); + + it('does not retry auth failures', async () => { + const mockFetch = vi.fn().mockResolvedValue( + new Response(JSON.stringify({ error: { message: 'Unauthorized' } }), { status: 401 }), + ); + globalThis.fetch = mockFetch; + + await expect(providerFetchWithRetry( + 'hetzner', + 'https://api.example.com/test', + undefined, + 30_000, + { maxAttempts: 3, baseDelayMs: 1, maxDelayMs: 1, jitterRatio: 0 }, + )).rejects.toThrow(ProviderError); + + expect(mockFetch).toHaveBeenCalledTimes(1); + }); +}); diff --git a/packages/vm-agent/internal/bootstrap/bootstrap.go b/packages/vm-agent/internal/bootstrap/bootstrap.go index d330e3331..6312c527a 100644 --- a/packages/vm-agent/internal/bootstrap/bootstrap.go +++ b/packages/vm-agent/internal/bootstrap/bootstrap.go @@ -813,7 +813,15 @@ func ensureDevcontainerReady(ctx context.Context, cfg *config.Config, volumeName return false, fmt.Errorf("devcontainer CLI never became available: %w", err) } - slog.Info("Starting devcontainer for workspace", "workspaceDir", cfg.WorkspaceDir) + // Apply devcontainer build timeout so runaway builds (large Dockerfiles, + // network stalls pulling images) do not block workspace provisioning forever. + if cfg.DevcontainerBuildTimeout > 0 { + var cancel context.CancelFunc + ctx, cancel = context.WithTimeout(ctx, cfg.DevcontainerBuildTimeout) + defer cancel() + } + + slog.Info("Starting devcontainer for workspace", "workspaceDir", cfg.WorkspaceDir, "buildTimeout", cfg.DevcontainerBuildTimeout) hasConfig := hasDevcontainerConfig(cfg.WorkspaceDir) usedFallback := false diff --git a/packages/vm-agent/internal/config/config.go b/packages/vm-agent/internal/config/config.go index 31dbcd2b9..e763d4099 100644 --- a/packages/vm-agent/internal/config/config.go +++ b/packages/vm-agent/internal/config/config.go @@ -133,6 +133,14 @@ type Config struct { DefaultDevcontainerConfigPath string // Path to write the generated default config DefaultDevcontainerRemoteUser string // remoteUser for the default config (empty = omit, let image default) + // Cloud provider name (hetzner, scaleway, gcp) — set via cloud-init. + // Used for provider-specific configuration (e.g., apt mirror selection). + Provider string // (env: PROVIDER) + + // DevcontainerBuildTimeout is the maximum duration allowed for devcontainer up. + // After this timeout the build context is cancelled. + DevcontainerBuildTimeout time.Duration // (env: DEVCONTAINER_BUILD_TIMEOUT, default: 15m) + // Project linkage — set via cloud-init when the workspace belongs to a project. // If ProjectID is empty, the message reporter is disabled (no-op). ProjectID string // Linked project ID (env: PROJECT_ID) @@ -324,6 +332,12 @@ func Load() (*Config, error) { DefaultDevcontainerConfigPath: getEnv("DEFAULT_DEVCONTAINER_CONFIG_PATH", DefaultDevcontainerConfigPath), DefaultDevcontainerRemoteUser: getEnv("DEFAULT_DEVCONTAINER_REMOTE_USER", ""), // Empty = omit, use image default + // Provider awareness (set via cloud-init) + Provider: getEnv("PROVIDER", ""), + + // Devcontainer build timeout — configurable per constitution principle XI + DevcontainerBuildTimeout: getEnvDuration("DEVCONTAINER_BUILD_TIMEOUT", 15*time.Minute), + // Project linkage (set via cloud-init) ProjectID: getEnv("PROJECT_ID", ""), ChatSessionID: getEnv("CHAT_SESSION_ID", ""), diff --git a/packages/vm-agent/internal/config/config_test.go b/packages/vm-agent/internal/config/config_test.go index 392ef65a8..60564c2c1 100644 --- a/packages/vm-agent/internal/config/config_test.go +++ b/packages/vm-agent/internal/config/config_test.go @@ -866,3 +866,37 @@ func TestCloseIdleControlPlaneConnectionsFlushesPool(t *testing.T) { t.Fatalf("expected 2 total connections after flush (pool was purged), got %d", afterFlush) } } + +func TestProviderAndDevcontainerBuildTimeoutConfig(t *testing.T) { + t.Run("defaults", func(t *testing.T) { + t.Setenv("CONTROL_PLANE_URL", "https://api.example.com") + t.Setenv("NODE_ID", "test-node") + cfg, err := Load() + if err != nil { + t.Fatalf("Load() error: %v", err) + } + if cfg.Provider != "" { + t.Errorf("expected empty Provider, got %q", cfg.Provider) + } + if cfg.DevcontainerBuildTimeout != 15*time.Minute { + t.Errorf("expected DevcontainerBuildTimeout 15m, got %v", cfg.DevcontainerBuildTimeout) + } + }) + + t.Run("env overrides", func(t *testing.T) { + t.Setenv("CONTROL_PLANE_URL", "https://api.example.com") + t.Setenv("NODE_ID", "test-node") + t.Setenv("PROVIDER", "hetzner") + t.Setenv("DEVCONTAINER_BUILD_TIMEOUT", "5m") + cfg, err := Load() + if err != nil { + t.Fatalf("Load() error: %v", err) + } + if cfg.Provider != "hetzner" { + t.Errorf("expected Provider hetzner, got %q", cfg.Provider) + } + if cfg.DevcontainerBuildTimeout != 5*time.Minute { + t.Errorf("expected DevcontainerBuildTimeout 5m, got %v", cfg.DevcontainerBuildTimeout) + } + }) +}