diff --git a/.claude/skills/env-reference/SKILL.md b/.claude/skills/env-reference/SKILL.md
index e46579f3a..ae07a7fe9 100644
--- a/.claude/skills/env-reference/SKILL.md
+++ b/.claude/skills/env-reference/SKILL.md
@@ -78,6 +78,13 @@ See `apps/api/.env.example` for the full list. Key variables:
 - `NODE_AGENT_READY_TIMEOUT_MS` — Max wait for freshly provisioned node-agent health
 - `NODE_AGENT_READY_POLL_INTERVAL_MS` — Polling interval for fresh-node readiness checks
 - `HETZNER_API_TIMEOUT_MS` — Timeout for Hetzner Cloud API calls (default: 30000)
+- `HETZNER_API_RETRY_MAX_ATTEMPTS` — Retry budget for transient Hetzner API calls (default: 3)
+- `HETZNER_API_RETRY_BASE_DELAY_MS` — Base delay for transient Hetzner API retry backoff (default: 1000)
+- `HETZNER_API_RETRY_MAX_DELAY_MS` — Max delay for transient Hetzner API retry backoff (default: 10000)
+- `HETZNER_PLACEMENT_RETRY_DELAY_MS` — Delay before retrying Hetzner primary-location placement failures (default: 3000)
+- `HETZNER_PLACEMENT_RETRY_ATTEMPTS` — Number of primary-location placement attempts before fallback locations (default: 2)
+- `HETZNER_PLACEMENT_FALLBACK_ENABLED` — Whether Hetzner provisioning tries alternate locations after primary placement failures (default: true)
+- `HETZNER_PLACEMENT_FALLBACK_LOCATIONS` — Ordered comma-separated Hetzner fallback locations
 - `CF_API_TIMEOUT_MS` — Timeout for Cloudflare DNS API calls (default: 30000)
 - `NODE_AGENT_REQUEST_TIMEOUT_MS` — Timeout for Node Agent HTTP requests (default: 30000)
 
diff --git a/apps/api/.env.example b/apps/api/.env.example
index 776f34392..48ca14f21 100644
--- a/apps/api/.env.example
+++ b/apps/api/.env.example
@@ -210,13 +210,33 @@ BASE_DOMAIN=workspaces.example.com
 
 # External API timeouts (milliseconds, default: 30000)
 # HETZNER_API_TIMEOUT_MS=30000
+# HETZNER_API_RETRY_MAX_ATTEMPTS=3
+# HETZNER_API_RETRY_BASE_DELAY_MS=1000
+# HETZNER_API_RETRY_MAX_DELAY_MS=10000
+# HETZNER_PLACEMENT_RETRY_DELAY_MS=3000
+# HETZNER_PLACEMENT_RETRY_ATTEMPTS=2
+# HETZNER_PLACEMENT_FALLBACK_ENABLED=true
+# HETZNER_PLACEMENT_FALLBACK_LOCATIONS=hel1,nbg1,ash,hil
 # CF_API_TIMEOUT_MS=30000
+# CF_API_RETRY_MAX_ATTEMPTS=3
+# CF_API_RETRY_BASE_DELAY_MS=1000
+# CF_API_RETRY_MAX_DELAY_MS=30000
 # NODE_AGENT_REQUEST_TIMEOUT_MS=30000
+# NODE_AGENT_REQUEST_RETRY_MAX_ATTEMPTS=2
+# NODE_AGENT_REQUEST_RETRY_BASE_DELAY_MS=1000
+# NODE_AGENT_REQUEST_RETRY_MAX_DELAY_MS=10000
 
 # Workspace readiness (TaskRunner DO)
 # TASK_RUNNER_WORKSPACE_READY_TIMEOUT_MS=1800000  # 30 minutes — max time for workspace-ready callback
 # TASK_RUNNER_WORKSPACE_READY_POLL_INTERVAL_MS=30000  # 30 seconds — D1 poll interval during workspace_ready step
 # PROVISIONING_TIMEOUT_MS=1800000                 # 30 minutes — cron marks 'creating' workspaces as error after this
+# TASK_RUNNER_STEP_MAX_RETRIES=3                  # Global per-step retry budget
+# TASK_RUNNER_NODE_SELECTION_MAX_RETRIES=          # Per-step override: node_selection
+# TASK_RUNNER_NODE_PROVISIONING_MAX_RETRIES=       # Per-step override: node_provisioning
+# TASK_RUNNER_NODE_AGENT_READY_MAX_RETRIES=        # Per-step override: node_agent_ready
+# TASK_RUNNER_WORKSPACE_CREATION_MAX_RETRIES=      # Per-step override: workspace_creation
+# TASK_RUNNER_WORKSPACE_READY_MAX_RETRIES=         # Per-step override: workspace_ready
+# TASK_RUNNER_AGENT_SESSION_MAX_RETRIES=           # Per-step override: agent_session
 
 # Docker daemon DNS for devcontainer builds (comma-separated quoted IPs)
 # DOCKER_DNS_SERVERS="1.1.1.1", "8.8.8.8"
diff --git a/apps/api/src/durable-objects/task-runner/helpers.ts b/apps/api/src/durable-objects/task-runner/helpers.ts
index 696f1e4d7..249f05f5d 100644
--- a/apps/api/src/durable-objects/task-runner/helpers.ts
+++ b/apps/api/src/durable-objects/task-runner/helpers.ts
@@ -34,6 +34,11 @@ export function isTransientError(err: unknown): boolean {
     return false;
   }
 
+  // Honor explicit retryable metadata (e.g. from provider libraries)
+  const retryable = (err as Error & { retryable?: boolean }).retryable;
+  if (retryable === true) return true;
+  if (retryable === false) return false;
+
   const msg = err.message.toLowerCase();
 
   // Network / timeout errors — always transient
diff --git a/apps/api/src/durable-objects/task-runner/index.ts b/apps/api/src/durable-objects/task-runner/index.ts
index c960d48e3..60481da49 100644
--- a/apps/api/src/durable-objects/task-runner/index.ts
+++ b/apps/api/src/durable-objects/task-runner/index.ts
@@ -205,7 +205,7 @@ export class TaskRunner extends DurableObject<Env> {
         durationMs,
       });
 
-      if (isTransientError(err) && state.retryCount < this.getMaxRetries()) {
+      if (isTransientError(err) && state.retryCount < this.getMaxRetries(state.currentStep)) {
         // Transient failure — retry with backoff
         state.retryCount++;
         await this.ctx.storage.put('state', state);
@@ -275,13 +275,33 @@ export class TaskRunner extends DurableObject<Env> {
   // Configuration (all configurable via env vars — Constitution Principle XI)
   // =========================================================================
 
-  private getMaxRetries(): number {
+  private getMaxRetries(step?: TaskExecutionStep): number {
+    // Per-step env override takes precedence over the global setting
+    if (step) {
+      const envKey = this.getPerStepEnvKey(step);
+      if (envKey) {
+        const perStep = parseEnvInt((this.env as unknown as Record<string, string | undefined>)[envKey], 0);
+        if (perStep > 0) return perStep;
+      }
+    }
     return parseEnvInt(
       this.env.TASK_RUNNER_STEP_MAX_RETRIES,
       DEFAULT_TASK_RUNNER_STEP_MAX_RETRIES,
     );
   }
 
+  private getPerStepEnvKey(step: TaskExecutionStep): string | null {
+    const map: Partial<Record<TaskExecutionStep, string>> = {
+      node_selection: 'TASK_RUNNER_NODE_SELECTION_MAX_RETRIES',
+      node_provisioning: 'TASK_RUNNER_NODE_PROVISIONING_MAX_RETRIES',
+      node_agent_ready: 'TASK_RUNNER_NODE_AGENT_READY_MAX_RETRIES',
+      workspace_creation: 'TASK_RUNNER_WORKSPACE_CREATION_MAX_RETRIES',
+      workspace_ready: 'TASK_RUNNER_WORKSPACE_READY_MAX_RETRIES',
+      agent_session: 'TASK_RUNNER_AGENT_SESSION_MAX_RETRIES',
+    };
+    return map[step] ?? null;
+  }
+
   private getRetryBaseDelayMs(): number {
     return parseEnvInt(
       this.env.TASK_RUNNER_RETRY_BASE_DELAY_MS,
diff --git a/apps/api/src/env.ts b/apps/api/src/env.ts
index 1b0b44cce..34b371496 100644
--- a/apps/api/src/env.ts
+++ b/apps/api/src/env.ts
@@ -196,8 +196,21 @@ export interface Env {
   HETZNER_BASE_IMAGE?: string;
   // External API timeouts (milliseconds)
   HETZNER_API_TIMEOUT_MS?: string;
+  HETZNER_API_RETRY_MAX_ATTEMPTS?: string;
+  HETZNER_API_RETRY_BASE_DELAY_MS?: string;
+  HETZNER_API_RETRY_MAX_DELAY_MS?: string;
+  HETZNER_PLACEMENT_RETRY_DELAY_MS?: string;
+  HETZNER_PLACEMENT_RETRY_ATTEMPTS?: string;
+  HETZNER_PLACEMENT_FALLBACK_ENABLED?: string;
+  HETZNER_PLACEMENT_FALLBACK_LOCATIONS?: string;
   CF_API_TIMEOUT_MS?: string;
+  CF_API_RETRY_MAX_ATTEMPTS?: string;
+  CF_API_RETRY_BASE_DELAY_MS?: string;
+  CF_API_RETRY_MAX_DELAY_MS?: string;
   NODE_AGENT_REQUEST_TIMEOUT_MS?: string;
+  NODE_AGENT_REQUEST_RETRY_MAX_ATTEMPTS?: string;
+  NODE_AGENT_REQUEST_RETRY_BASE_DELAY_MS?: string;
+  NODE_AGENT_REQUEST_RETRY_MAX_DELAY_MS?: string;
   // Project data DO limits
   CACHED_COMMANDS_MAX_PER_AGENT?: string;
   CACHED_COMMANDS_MAX_AGENT_TYPE_LENGTH?: string;
@@ -240,6 +253,12 @@ export interface Env {
   HEARTBEAT_ACP_SWEEP_TIMEOUT_MS?: string;
   // TaskRunner DO configuration (TDF-2: alarm-driven orchestration)
   TASK_RUNNER_STEP_MAX_RETRIES?: string;
+  TASK_RUNNER_NODE_SELECTION_MAX_RETRIES?: string;
+  TASK_RUNNER_NODE_PROVISIONING_MAX_RETRIES?: string;
+  TASK_RUNNER_NODE_AGENT_READY_MAX_RETRIES?: string;
+  TASK_RUNNER_WORKSPACE_CREATION_MAX_RETRIES?: string;
+  TASK_RUNNER_WORKSPACE_READY_MAX_RETRIES?: string;
+  TASK_RUNNER_AGENT_SESSION_MAX_RETRIES?: string;
   TASK_RUNNER_RETRY_BASE_DELAY_MS?: string;
   TASK_RUNNER_RETRY_MAX_DELAY_MS?: string;
   TASK_RUNNER_AGENT_POLL_INTERVAL_MS?: string;
diff --git a/apps/api/src/services/dns.ts b/apps/api/src/services/dns.ts
index 306d78835..956c5b35f 100644
--- a/apps/api/src/services/dns.ts
+++ b/apps/api/src/services/dns.ts
@@ -1,6 +1,7 @@
 import type { Env } from '../env';
 import { log } from '../lib/logger';
-import { fetchWithTimeout, getTimeoutMs } from './fetch-timeout';
+import type { FetchRetryOptions } from './fetch-timeout';
+import { fetchWithTimeoutAndRetry, getRetryDelayMs, getRetryMaxAttempts, getTimeoutMs } from './fetch-timeout';
 
 const CLOUDFLARE_API_BASE = 'https://api.cloudflare.com/client/v4';
 
@@ -10,6 +11,23 @@ const DEFAULT_DNS_TTL = 60;
 /** Default timeout for Cloudflare API calls (per Constitution Principle XI) */
 const DEFAULT_CF_API_TIMEOUT_MS = 30_000;
 
+/** Default retry config for CF API */
+const DEFAULT_CF_API_RETRY_MAX_ATTEMPTS = 3;
+const DEFAULT_CF_API_RETRY_BASE_DELAY_MS = 1_000;
+const DEFAULT_CF_API_RETRY_MAX_DELAY_MS = 30_000;
+
+/**
+ * Build retry options from env.
+ */
+function cfRetryOptions(env: Env): FetchRetryOptions {
+  return {
+    timeoutMs: getTimeoutMs(env.CF_API_TIMEOUT_MS, DEFAULT_CF_API_TIMEOUT_MS),
+    maxAttempts: getRetryMaxAttempts(env.CF_API_RETRY_MAX_ATTEMPTS, DEFAULT_CF_API_RETRY_MAX_ATTEMPTS),
+    baseDelayMs: getRetryDelayMs(env.CF_API_RETRY_BASE_DELAY_MS, DEFAULT_CF_API_RETRY_BASE_DELAY_MS),
+    maxDelayMs: getRetryDelayMs(env.CF_API_RETRY_MAX_DELAY_MS, DEFAULT_CF_API_RETRY_MAX_DELAY_MS),
+  };
+}
+
 /**
  * Get DNS TTL from env or use default (per constitution principle XI).
  */
@@ -79,16 +97,74 @@ export class DNSService implements DNSServiceInterface {
 }
 
 /**
- * Create a DNS A record for a workspace.
- * Uses Cloudflare proxy for automatic HTTPS.
+ * Search for an existing DNS A record by name.
+ * Returns the record if found, null otherwise.
+ */
+async function findExistingARecord(
+  recordName: string,
+  env: Env,
+): Promise<{ id: string; content: string } | null> {
+  const retryOpts = cfRetryOptions(env);
+  const searchUrl = `${CLOUDFLARE_API_BASE}/zones/${env.CF_ZONE_ID}/dns_records?name=${encodeURIComponent(recordName)}&type=A`;
+
+  const response = await fetchWithTimeoutAndRetry(
+    searchUrl,
+    {
+      headers: {
+        Authorization: `Bearer ${env.CF_API_TOKEN}`,
+      },
+    },
+    retryOpts,
+  );
+
+  if (!response.ok) return null;
+
+  const data = await response.json() as { result: Array<{ id: string; content: string; type: string }> };
+  const records = data.result || [];
+  const aRecord = records.find((r) => r.type === 'A');
+  return aRecord ? { id: aRecord.id, content: aRecord.content } : null;
+}
+
+/**
+ * Create or upsert a DNS A record for a workspace.
+ * Uses idempotent upsert: searches for existing matching record and PATCHes if found.
+ * If CREATE returns duplicate/already-exists, re-searches and PATCHes.
  */
 export async function createDNSRecord(
   workspaceId: string,
   ip: string,
   env: Env
 ): Promise<string> {
-  const timeoutMs = getTimeoutMs(env.CF_API_TIMEOUT_MS, DEFAULT_CF_API_TIMEOUT_MS);
-  const response = await fetchWithTimeout(
+  const recordName = `ws-${workspaceId}.${env.BASE_DOMAIN}`;
+  return upsertARecord(recordName, ip, true, env);
+}
+
+/**
+ * Idempotent DNS A record upsert.
+ * 1. Search for existing record matching the name.
+ * 2. If found, PATCH it with the new IP.
+ * 3. If not found, POST to create.
+ * 4. If create returns duplicate, re-search and PATCH.
+ */
+async function upsertARecord(
+  recordName: string,
+  ip: string,
+  proxied: boolean,
+  env: Env,
+): Promise<string> {
+  const retryOpts = cfRetryOptions(env);
+  const ttl = getDnsTTL(env);
+
+  // Step 1: Search for existing record
+  const existing = await findExistingARecord(recordName, env);
+  if (existing) {
+    log.info('dns.upsert_existing_found', { name: recordName, existingIp: existing.content, newIp: ip });
+    await patchDNSRecordContent(existing.id, ip, env);
+    return existing.id;
+  }
+
+  // Step 2: Attempt to create
+  const response = await fetchWithTimeoutAndRetry(
     `${CLOUDFLARE_API_BASE}/zones/${env.CF_ZONE_ID}/dns_records`,
     {
       method: 'POST',
@@ -98,34 +174,78 @@ export async function createDNSRecord(
       },
       body: JSON.stringify({
         type: 'A',
-        name: `ws-${workspaceId}`,
+        name: recordName,
         content: ip,
-        ttl: getDnsTTL(env), // Configurable TTL (default 1 minute for fast updates)
-        proxied: true, // Enable Cloudflare proxy for HTTPS
+        ttl,
+        proxied,
       }),
     },
-    timeoutMs
+    retryOpts,
+  );
+
+  if (response.ok) {
+    const data = await response.json() as { result: { id: string } };
+    return data.result.id;
+  }
+
+  // Step 3: If duplicate/already-exists, re-search and PATCH
+  const errorBody = await response.json().catch(() => ({})) as { errors?: Array<{ code: number; message: string }> };
+  const isDuplicate = errorBody.errors?.some(
+    (e) => e.code === 81057 || e.message?.toLowerCase().includes('already exists'),
+  );
+
+  if (isDuplicate) {
+    log.info('dns.create_duplicate_fallback', { name: recordName });
+    const retryExisting = await findExistingARecord(recordName, env);
+    if (retryExisting) {
+      await patchDNSRecordContent(retryExisting.id, ip, env);
+      return retryExisting.id;
+    }
+  }
+
+  const message = errorBody.errors?.[0]?.message || `Failed to create DNS record: ${response.status}`;
+  throw new Error(message);
+}
+
+/**
+ * PATCH a DNS record with a new IP address.
+ */
+async function patchDNSRecordContent(
+  recordId: string,
+  ip: string,
+  env: Env,
+): Promise<void> {
+  const retryOpts = cfRetryOptions(env);
+  const response = await fetchWithTimeoutAndRetry(
+    `${CLOUDFLARE_API_BASE}/zones/${env.CF_ZONE_ID}/dns_records/${recordId}`,
+    {
+      method: 'PATCH',
+      headers: {
+        Authorization: `Bearer ${env.CF_API_TOKEN}`,
+        'Content-Type': 'application/json',
+      },
+      body: JSON.stringify({ content: ip }),
+    },
+    retryOpts,
   );
 
   if (!response.ok) {
     const error = await response.json().catch(() => ({})) as { errors?: Array<{ message: string }> };
-    const message = error.errors?.[0]?.message || `Failed to create DNS record: ${response.status}`;
+    const message = error.errors?.[0]?.message || `Failed to update DNS record: ${response.status}`;
     throw new Error(message);
   }
-
-  const data = await response.json() as { result: { id: string } };
-  return data.result.id;
 }
 
 /**
  * Delete a DNS record by ID.
+ * Idempotent: ignores 404 (record already deleted).
  */
 export async function deleteDNSRecord(
   recordId: string,
   env: Env
 ): Promise<void> {
-  const timeoutMs = getTimeoutMs(env.CF_API_TIMEOUT_MS, DEFAULT_CF_API_TIMEOUT_MS);
-  const response = await fetchWithTimeout(
+  const retryOpts = cfRetryOptions(env);
+  const response = await fetchWithTimeoutAndRetry(
     `${CLOUDFLARE_API_BASE}/zones/${env.CF_ZONE_ID}/dns_records/${recordId}`,
     {
       method: 'DELETE',
@@ -133,7 +253,7 @@ export async function deleteDNSRecord(
         Authorization: `Bearer ${env.CF_API_TOKEN}`,
       },
     },
-    timeoutMs
+    retryOpts,
   );
 
   // Ignore 404 errors (record already deleted)
@@ -152,27 +272,7 @@ export async function updateDNSRecord(
   ip: string,
   env: Env
 ): Promise<void> {
-  const timeoutMs = getTimeoutMs(env.CF_API_TIMEOUT_MS, DEFAULT_CF_API_TIMEOUT_MS);
-  const response = await fetchWithTimeout(
-    `${CLOUDFLARE_API_BASE}/zones/${env.CF_ZONE_ID}/dns_records/${recordId}`,
-    {
-      method: 'PATCH',
-      headers: {
-        Authorization: `Bearer ${env.CF_API_TOKEN}`,
-        'Content-Type': 'application/json',
-      },
-      body: JSON.stringify({
-        content: ip,
-      }),
-    },
-    timeoutMs
-  );
-
-  if (!response.ok) {
-    const error = await response.json().catch(() => ({})) as { errors?: Array<{ message: string }> };
-    const message = error.errors?.[0]?.message || `Failed to update DNS record: ${response.status}`;
-    throw new Error(message);
-  }
+  await patchDNSRecordContent(recordId, ip, env);
 }
 
 /**
@@ -189,6 +289,7 @@ export async function cleanupWorkspaceDNSRecords(
 ): Promise<number> {
   const baseDomain = env.BASE_DOMAIN;
   const id = workspaceId.toLowerCase();
+  const retryOpts = cfRetryOptions(env);
 
   // Search for all possible DNS record name formats
   const recordNames = [
@@ -200,12 +301,11 @@ export async function cleanupWorkspaceDNSRecords(
 
   for (const recordName of recordNames) {
     const searchUrl = `${CLOUDFLARE_API_BASE}/zones/${env.CF_ZONE_ID}/dns_records?name=${encodeURIComponent(recordName)}`;
-    const cfTimeoutMs = getTimeoutMs(env.CF_API_TIMEOUT_MS, DEFAULT_CF_API_TIMEOUT_MS);
-    const response = await fetchWithTimeout(searchUrl, {
+    const response = await fetchWithTimeoutAndRetry(searchUrl, {
       headers: {
         Authorization: `Bearer ${env.CF_API_TOKEN}`,
       },
-    }, cfTimeoutMs);
+    }, retryOpts);
 
     if (!response.ok) {
       log.error('dns.search_records_failed', { recordName, status: response.status });
@@ -248,47 +348,15 @@ export async function createBackendDNSRecord(
 
 /**
  * Create a proxied (orange-clouded) A record for a node VM backend.
- * Cloudflare's edge handles TLS termination; the VM agent serves HTTPS
- * with an Origin CA certificate that CF trusts.
- *
- * Uses {nodeId}.vm.{BASE_DOMAIN} (two-level subdomain) to bypass Cloudflare
- * same-zone routing. The wildcard Worker route *.{domain}/* only matches
- * single-level subdomains, so {nodeId}.vm.{domain} is NOT intercepted.
- * This allows Worker subrequests (from DO alarms) to reach the VM directly.
+ * Uses idempotent upsert to recover from partial failures.
  */
 export async function createNodeBackendDNSRecord(
   nodeId: string,
   ip: string,
   env: Env
 ): Promise<string> {
-  const timeoutMs = getTimeoutMs(env.CF_API_TIMEOUT_MS, DEFAULT_CF_API_TIMEOUT_MS);
-  const response = await fetchWithTimeout(
-    `${CLOUDFLARE_API_BASE}/zones/${env.CF_ZONE_ID}/dns_records`,
-    {
-      method: 'POST',
-      headers: {
-        Authorization: `Bearer ${env.CF_API_TOKEN}`,
-        'Content-Type': 'application/json',
-      },
-      body: JSON.stringify({
-        type: 'A',
-        name: `${nodeId.toLowerCase()}.vm`,
-        content: ip,
-        ttl: getDnsTTL(env),
-        proxied: true, // Orange-clouded — CF edge terminates TLS, re-encrypts to Origin CA
-      }),
-    },
-    timeoutMs
-  );
-
-  if (!response.ok) {
-    const error = await response.json().catch(() => ({})) as { errors?: Array<{ message: string }> };
-    const message = error.errors?.[0]?.message || `Failed to create backend DNS record: ${response.status}`;
-    throw new Error(message);
-  }
-
-  const data = await response.json() as { result: { id: string } };
-  return data.result.id;
+  const recordName = `${nodeId.toLowerCase()}.vm.${env.BASE_DOMAIN}`;
+  return upsertARecord(recordName, ip, true, env);
 }
 
 /**
diff --git a/apps/api/src/services/fetch-timeout.ts b/apps/api/src/services/fetch-timeout.ts
index 38fd2d65c..6ff94f1af 100644
--- a/apps/api/src/services/fetch-timeout.ts
+++ b/apps/api/src/services/fetch-timeout.ts
@@ -1,11 +1,18 @@
 /**
- * Fetch wrapper with configurable timeout.
+ * Fetch wrapper with configurable timeout and retry.
  *
  * Cloudflare Workers support `AbortController`/`AbortSignal` for fetch cancellation.
- * This utility adds a timeout that aborts the request if it exceeds the specified duration.
+ * This utility adds a timeout that aborts the request if it exceeds the specified duration,
+ * and optional retry with bounded exponential backoff and jitter.
  */
 
 const DEFAULT_API_TIMEOUT_MS = 30_000;
+const DEFAULT_RETRY_MAX_ATTEMPTS = 3;
+const DEFAULT_RETRY_BASE_DELAY_MS = 1_000;
+const DEFAULT_RETRY_MAX_DELAY_MS = 30_000;
+
+/** HTTP status codes considered transient and safe to retry. */
+const RETRYABLE_STATUS_CODES = new Set([408, 425, 429, 500, 502, 503, 504]);
 
 /**
  * Parse a timeout from env or return the default.
@@ -17,6 +24,98 @@ export function getTimeoutMs(envValue: string | undefined, defaultMs: number = D
   return parsed;
 }
 
+/**
+ * Parse a retry-related env int or return the provided default.
+ */
+export function getRetryMaxAttempts(envValue: string | undefined, defaultVal: number = DEFAULT_RETRY_MAX_ATTEMPTS): number {
+  if (!envValue) return defaultVal;
+  const parsed = Number.parseInt(envValue, 10);
+  if (!Number.isFinite(parsed) || parsed < 0) return defaultVal;
+  return parsed;
+}
+
+/**
+ * Parse a retry delay env int or return the provided default.
+ */
+export function getRetryDelayMs(envValue: string | undefined, defaultVal: number): number {
+  if (!envValue) return defaultVal;
+  const parsed = Number.parseInt(envValue, 10);
+  if (!Number.isFinite(parsed) || parsed < 0) return defaultVal;
+  return parsed;
+}
+
+/**
+ * Compute bounded exponential backoff with jitter.
+ *
+ * delay = min(baseDelay * 2^attempt, maxDelay) + random jitter (0-25%)
+ */
+export function computeRetryDelayMs(
+  attempt: number,
+  baseDelayMs: number = DEFAULT_RETRY_BASE_DELAY_MS,
+  maxDelayMs: number = DEFAULT_RETRY_MAX_DELAY_MS,
+): number {
+  const exponential = baseDelayMs * Math.pow(2, attempt);
+  const capped = Math.min(exponential, maxDelayMs);
+  const jitter = capped * Math.random() * 0.25;
+  return Math.floor(capped + jitter);
+}
+
+/** Options for fetchWithTimeoutAndRetry. */
+export interface FetchRetryOptions {
+  /** Maximum number of retry attempts (0 = no retries). */
+  maxAttempts?: number;
+  /** Base delay for exponential backoff in ms. */
+  baseDelayMs?: number;
+  /** Maximum delay between retries in ms. */
+  maxDelayMs?: number;
+  /** Per-request timeout in ms. */
+  timeoutMs?: number;
+}
+
+/**
+ * Determine whether an HTTP response or fetch error is retryable.
+ */
+function isRetryableResponse(response: Response): boolean {
+  return RETRYABLE_STATUS_CODES.has(response.status);
+}
+
+function isRetryableError(err: unknown): boolean {
+  if (!(err instanceof Error)) return false;
+  const msg = err.message.toLowerCase();
+  return (
+    err.name === 'AbortError' ||
+    msg.includes('timed out') ||
+    msg.includes('timeout') ||
+    msg.includes('fetch failed') ||
+    msg.includes('network') ||
+    msg.includes('econnrefused') ||
+    msg.includes('econnreset') ||
+    msg.includes('enotfound') ||
+    msg.includes('socket hang up')
+  );
+}
+
+/**
+ * Parse the Retry-After header value into a delay in milliseconds.
+ * Supports both integer seconds and HTTP date formats.
+ * Returns undefined if the header is missing or unparseable.
+ */
+function parseRetryAfter(response: Response): number | undefined {
+  const header = response.headers.get('Retry-After');
+  if (!header) return undefined;
+  const seconds = Number.parseInt(header, 10);
+  if (Number.isFinite(seconds) && seconds > 0) {
+    return seconds * 1000;
+  }
+  // Try HTTP date
+  const date = Date.parse(header);
+  if (Number.isFinite(date)) {
+    const delayMs = date - Date.now();
+    return delayMs > 0 ? delayMs : undefined;
+  }
+  return undefined;
+}
+
 /**
  * Fetch with an automatic timeout.
  * Aborts the request if it doesn't complete within `timeoutMs`.
@@ -50,3 +149,60 @@ export async function fetchWithTimeout(
     clearTimeout(timeoutId);
   }
 }
+
+/**
+ * Fetch with timeout and automatic retry on transient failures.
+ *
+ * Retries on:
+ *   - HTTP status codes: 408, 425, 429, 500, 502, 503, 504
+ *   - Network/timeout errors (AbortError, fetch failed, ECONNREFUSED, etc.)
+ *
+ * Honors Retry-After headers when present.
+ * Uses bounded exponential backoff with random jitter.
+ */
+export async function fetchWithTimeoutAndRetry(
+  url: string | URL,
+  init?: RequestInit,
+  options?: FetchRetryOptions,
+): Promise<Response> {
+  const maxAttempts = options?.maxAttempts ?? DEFAULT_RETRY_MAX_ATTEMPTS;
+  const baseDelayMs = options?.baseDelayMs ?? DEFAULT_RETRY_BASE_DELAY_MS;
+  const maxDelayMs = options?.maxDelayMs ?? DEFAULT_RETRY_MAX_DELAY_MS;
+  const timeoutMs = options?.timeoutMs ?? DEFAULT_API_TIMEOUT_MS;
+
+  let lastError: Error | undefined;
+  let lastResponse: Response | undefined;
+
+  for (let attempt = 0; attempt <= maxAttempts; attempt++) {
+    try {
+      const response = await fetchWithTimeout(url, init, timeoutMs);
+
+      if (!isRetryableResponse(response) || attempt >= maxAttempts) {
+        return response;
+      }
+
+      // Retryable status — wait and try again
+      lastResponse = response;
+      const retryAfterMs = parseRetryAfter(response);
+      const backoffMs = retryAfterMs ?? computeRetryDelayMs(attempt, baseDelayMs, maxDelayMs);
+      await sleep(backoffMs);
+    } catch (err) {
+      lastError = err instanceof Error ? err : new Error(String(err));
+
+      if (!isRetryableError(err) || attempt >= maxAttempts) {
+        throw lastError;
+      }
+
+      const backoffMs = computeRetryDelayMs(attempt, baseDelayMs, maxDelayMs);
+      await sleep(backoffMs);
+    }
+  }
+
+  // Should not reach here, but if it does return last response or throw last error
+  if (lastResponse) return lastResponse;
+  throw lastError ?? new Error(`fetchWithTimeoutAndRetry: exhausted ${maxAttempts} retries for ${url}`);
+}
+
+function sleep(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
diff --git a/apps/api/src/services/node-agent.ts b/apps/api/src/services/node-agent.ts
index 0f5593e44..318b6ab47 100644
--- a/apps/api/src/services/node-agent.ts
+++ b/apps/api/src/services/node-agent.ts
@@ -1,9 +1,25 @@
 import type { Env } from '../env';
-import { fetchWithTimeout, getTimeoutMs } from './fetch-timeout';
+import type { FetchRetryOptions } from './fetch-timeout';
+import { fetchWithTimeoutAndRetry, getRetryDelayMs, getRetryMaxAttempts, getTimeoutMs } from './fetch-timeout';
 import { signNodeManagementToken } from './jwt';
 import { recordNodeRoutingMetric } from './telemetry';
 
 const DEFAULT_NODE_AGENT_REQUEST_TIMEOUT_MS = 30_000;
+const DEFAULT_NODE_AGENT_REQUEST_RETRY_MAX_ATTEMPTS = 2;
+const DEFAULT_NODE_AGENT_REQUEST_RETRY_BASE_DELAY_MS = 1_000;
+const DEFAULT_NODE_AGENT_REQUEST_RETRY_MAX_DELAY_MS = 10_000;
+
+/**
+ * Build retry options for node agent requests from env.
+ */
+function nodeAgentRetryOptions(env: Env): FetchRetryOptions {
+  return {
+    timeoutMs: getTimeoutMs(env.NODE_AGENT_REQUEST_TIMEOUT_MS, DEFAULT_NODE_AGENT_REQUEST_TIMEOUT_MS),
+    maxAttempts: getRetryMaxAttempts(env.NODE_AGENT_REQUEST_RETRY_MAX_ATTEMPTS, DEFAULT_NODE_AGENT_REQUEST_RETRY_MAX_ATTEMPTS),
+    baseDelayMs: getRetryDelayMs(env.NODE_AGENT_REQUEST_RETRY_BASE_DELAY_MS, DEFAULT_NODE_AGENT_REQUEST_RETRY_BASE_DELAY_MS),
+    maxDelayMs: getRetryDelayMs(env.NODE_AGENT_REQUEST_RETRY_MAX_DELAY_MS, DEFAULT_NODE_AGENT_REQUEST_RETRY_MAX_DELAY_MS),
+  };
+}
 
 const DEFAULT_NODE_AGENT_READY_TIMEOUT_MS = 900_000; // 15 min — cloud-init takes 8-12 min on Hetzner
 const DEFAULT_NODE_AGENT_READY_POLL_INTERVAL_MS = 5000;
@@ -136,14 +152,11 @@ async function nodeAgentRequest<T>(
     env
   );
 
-  const requestTimeoutMs = getTimeoutMs(
-    env.NODE_AGENT_REQUEST_TIMEOUT_MS,
-    DEFAULT_NODE_AGENT_REQUEST_TIMEOUT_MS
-  );
-  const response = await fetchWithTimeout(url, {
+  const retryOpts = nodeAgentRetryOptions(env);
+  const response = await fetchWithTimeoutAndRetry(url, {
     ...options,
     headers,
-  }, requestTimeoutMs);
+  }, retryOpts);
 
   recordNodeRoutingMetric(
     {
@@ -519,8 +532,11 @@ export async function nodeAgentRawRequest(
   headers.set('X-SAM-Node-Id', nodeId);
 
   const DEFAULT_EXPORT_TIMEOUT_MS = 60_000;
-  const timeoutMs = getTimeoutMs(env.NODE_AGENT_REQUEST_TIMEOUT_MS, DEFAULT_EXPORT_TIMEOUT_MS);
-  return fetchWithTimeout(url, { method: 'GET', headers }, timeoutMs);
+  const retryOpts: FetchRetryOptions = {
+    ...nodeAgentRetryOptions(env),
+    timeoutMs: getTimeoutMs(env.NODE_AGENT_REQUEST_TIMEOUT_MS, DEFAULT_EXPORT_TIMEOUT_MS),
+  };
+  return fetchWithTimeoutAndRetry(url, { method: 'GET', headers }, retryOpts);
 }
 
 export async function rebuildWorkspaceOnNode(
diff --git a/apps/api/src/services/nodes.ts b/apps/api/src/services/nodes.ts
index ff12eb958..1b2925bd1 100644
--- a/apps/api/src/services/nodes.ts
+++ b/apps/api/src/services/nodes.ts
@@ -1,4 +1,5 @@
 import { generateCloudInit, validateCloudInitSize } from '@simple-agent-manager/cloud-init';
+import type { Provider } from '@simple-agent-manager/providers';
 import { ProviderError } from '@simple-agent-manager/providers';
 import type { CredentialProvider, TaskMode } from '@simple-agent-manager/shared';
 import { and, eq } from 'drizzle-orm';
@@ -62,6 +63,34 @@ export function resolveHetznerBaseImageOverride(
   return trimmed ? trimmed : undefined;
 }
 
+/**
+ * Recover an existing VM that matches the node ID labels.
+ * Used for idempotent provisioning: if a previous createVM succeeded but
+ * the subsequent DB update failed, this finds the orphaned VM so we can
+ * resume instead of creating a duplicate.
+ */
+async function recoverExistingNodeVM(
+  provider: Provider,
+  nodeId: string,
+): Promise<{ id: string; ip: string | null } | null> {
+  try {
+    const vms = await provider.listVMs({
+      node: nodeId.toLowerCase(),
+      managed: 'simple-agent-manager',
+    });
+    const first = vms[0];
+    if (first) {
+      return { id: first.id, ip: first.ip ?? null };
+    }
+  } catch (err) {
+    log.warn('node_provisioning.recovery_search_failed', {
+      nodeId,
+      error: err instanceof Error ? err.message : String(err),
+    });
+  }
+  return null;
+}
+
 export async function createNodeRecord(env: Env, input: CreateNodeInput): Promise<ProvisionedNode> {
   const db = drizzle(env.DATABASE, { schema });
   const now = new Date().toISOString();
@@ -164,6 +193,7 @@ export async function provisionNode(
       originCaCert: env.ORIGIN_CA_CERT,
       originCaKey: env.ORIGIN_CA_KEY,
       vmAgentPort: env.VM_AGENT_PORT,
+      provider: targetProvider,
     });
 
     if (!validateCloudInitSize(cloudInit)) {
@@ -172,6 +202,49 @@ export async function provisionNode(
 
     const provider = providerResult.provider;
 
+    // Idempotent recovery: if a previous attempt already created the VM but
+    // the control-plane state update failed (e.g. crash between createVM and
+    // the DB update), recover the existing VM instead of creating a duplicate.
+    const existingVm = await recoverExistingNodeVM(provider, node.id);
+    if (existingVm) {
+      log.info('node_provisioning.recovered_existing_vm', {
+        nodeId: node.id,
+        providerInstanceId: existingVm.id,
+        ip: existingVm.ip ?? null,
+      });
+
+      if (existingVm.ip) {
+        let backendDnsRecordId: string | null = null;
+        try {
+          backendDnsRecordId = await createNodeBackendDNSRecord(node.id, existingVm.ip, env);
+        } catch (dnsErr) {
+          log.error('node_provisioning.dns_record_failed', { nodeId: node.id, ...serializeError(dnsErr) });
+        }
+        await db
+          .update(schema.nodes)
+          .set({
+            providerInstanceId: existingVm.id,
+            ipAddress: existingVm.ip,
+            backendDnsRecordId,
+            status: 'running',
+            healthStatus: 'stale',
+            updatedAt: new Date().toISOString(),
+          })
+          .where(eq(schema.nodes.id, node.id));
+      } else {
+        await db
+          .update(schema.nodes)
+          .set({
+            providerInstanceId: existingVm.id,
+            status: 'creating',
+            errorMessage: 'Recovered existing VM — awaiting IP allocation',
+            updatedAt: new Date().toISOString(),
+          })
+          .where(eq(schema.nodes.id, node.id));
+      }
+      return;
+    }
+
     const baseImageOverride = resolveHetznerBaseImageOverride(
       targetProvider,
       env.HETZNER_BASE_IMAGE,
diff --git a/apps/api/src/services/provider-credentials.ts b/apps/api/src/services/provider-credentials.ts
index 65c3bf97f..1d837f121 100644
--- a/apps/api/src/services/provider-credentials.ts
+++ b/apps/api/src/services/provider-credentials.ts
@@ -1,4 +1,4 @@
-import type { Provider, ProviderConfig } from '@simple-agent-manager/providers';
+import type { HetznerProviderConfig, Provider, ProviderConfig } from '@simple-agent-manager/providers';
 import { createProvider, GcpProvider } from '@simple-agent-manager/providers';
 import type { CredentialProvider, CredentialSource, GcpOidcCredential } from '@simple-agent-manager/shared';
 import { and, eq } from 'drizzle-orm';
@@ -61,10 +61,20 @@ export function extractScalewaySecretKey(decryptedToken: string): string | null
 export function buildProviderConfig(
   provider: CredentialProvider,
   decryptedToken: string,
+  env?: {
+    HETZNER_API_TIMEOUT_MS?: string;
+    HETZNER_API_RETRY_MAX_ATTEMPTS?: string;
+    HETZNER_API_RETRY_BASE_DELAY_MS?: string;
+    HETZNER_API_RETRY_MAX_DELAY_MS?: string;
+    HETZNER_PLACEMENT_RETRY_DELAY_MS?: string;
+    HETZNER_PLACEMENT_RETRY_ATTEMPTS?: string;
+    HETZNER_PLACEMENT_FALLBACK_ENABLED?: string;
+    HETZNER_PLACEMENT_FALLBACK_LOCATIONS?: string;
+  },
 ): ProviderConfig {
   switch (provider) {
     case 'hetzner':
-      return { provider: 'hetzner', apiToken: decryptedToken };
+      return buildHetznerProviderConfig(decryptedToken, env);
     case 'scaleway': {
       let parsed: unknown;
       try {
@@ -90,6 +100,44 @@ export function buildProviderConfig(
   }
 }
 
+function buildHetznerProviderConfig(
+  apiToken: string,
+  env?: {
+    HETZNER_API_TIMEOUT_MS?: string;
+    HETZNER_API_RETRY_MAX_ATTEMPTS?: string;
+    HETZNER_API_RETRY_BASE_DELAY_MS?: string;
+    HETZNER_API_RETRY_MAX_DELAY_MS?: string;
+    HETZNER_PLACEMENT_RETRY_DELAY_MS?: string;
+    HETZNER_PLACEMENT_RETRY_ATTEMPTS?: string;
+    HETZNER_PLACEMENT_FALLBACK_ENABLED?: string;
+    HETZNER_PLACEMENT_FALLBACK_LOCATIONS?: string;
+  },
+): HetznerProviderConfig {
+  const config: HetznerProviderConfig = {
+    provider: 'hetzner',
+    apiToken,
+  };
+
+  const timeoutMs = parseOptionalPositiveInt(env?.HETZNER_API_TIMEOUT_MS);
+  if (timeoutMs !== undefined) config.timeoutMs = timeoutMs;
+  const apiRetryMaxAttempts = parseOptionalPositiveInt(env?.HETZNER_API_RETRY_MAX_ATTEMPTS);
+  if (apiRetryMaxAttempts !== undefined) config.apiRetryMaxAttempts = apiRetryMaxAttempts;
+  const apiRetryBaseDelayMs = parseOptionalPositiveInt(env?.HETZNER_API_RETRY_BASE_DELAY_MS);
+  if (apiRetryBaseDelayMs !== undefined) config.apiRetryBaseDelayMs = apiRetryBaseDelayMs;
+  const apiRetryMaxDelayMs = parseOptionalPositiveInt(env?.HETZNER_API_RETRY_MAX_DELAY_MS);
+  if (apiRetryMaxDelayMs !== undefined) config.apiRetryMaxDelayMs = apiRetryMaxDelayMs;
+  const placementRetryDelayMs = parseOptionalPositiveInt(env?.HETZNER_PLACEMENT_RETRY_DELAY_MS);
+  if (placementRetryDelayMs !== undefined) config.placementRetryDelayMs = placementRetryDelayMs;
+  const placementRetryAttempts = parseOptionalPositiveInt(env?.HETZNER_PLACEMENT_RETRY_ATTEMPTS);
+  if (placementRetryAttempts !== undefined) config.placementRetryAttempts = placementRetryAttempts;
+  const placementFallbackEnabled = parseOptionalBoolean(env?.HETZNER_PLACEMENT_FALLBACK_ENABLED);
+  if (placementFallbackEnabled !== undefined) config.placementFallbackEnabled = placementFallbackEnabled;
+  const placementFallbackLocations = parseOptionalList(env?.HETZNER_PLACEMENT_FALLBACK_LOCATIONS);
+  if (placementFallbackLocations !== undefined) config.placementFallbackLocations = placementFallbackLocations;
+
+  return config;
+}
+
 /**
  * Parse a decrypted GCP credential token into structured GcpOidcCredential fields.
  */
@@ -179,7 +227,24 @@ export async function createProviderForUser(
   db: ReturnType<typeof drizzle>,
   userId: string,
   encryptionKey: string,
-  env: { KV: KVNamespace; BASE_DOMAIN: string; JWT_PRIVATE_KEY: string; JWT_PUBLIC_KEY: string; GCP_IDENTITY_TOKEN_EXPIRY_SECONDS?: string; GCP_TOKEN_CACHE_TTL_SECONDS?: string; GCP_API_TIMEOUT_MS?: string; GCP_OPERATION_POLL_TIMEOUT_MS?: string },
+  env: {
+    KV: KVNamespace;
+    BASE_DOMAIN: string;
+    JWT_PRIVATE_KEY: string;
+    JWT_PUBLIC_KEY: string;
+    GCP_IDENTITY_TOKEN_EXPIRY_SECONDS?: string;
+    GCP_TOKEN_CACHE_TTL_SECONDS?: string;
+    GCP_API_TIMEOUT_MS?: string;
+    GCP_OPERATION_POLL_TIMEOUT_MS?: string;
+    HETZNER_API_TIMEOUT_MS?: string;
+    HETZNER_API_RETRY_MAX_ATTEMPTS?: string;
+    HETZNER_API_RETRY_BASE_DELAY_MS?: string;
+    HETZNER_API_RETRY_MAX_DELAY_MS?: string;
+    HETZNER_PLACEMENT_RETRY_DELAY_MS?: string;
+    HETZNER_PLACEMENT_RETRY_ATTEMPTS?: string;
+    HETZNER_PLACEMENT_FALLBACK_ENABLED?: string;
+    HETZNER_PLACEMENT_FALLBACK_LOCATIONS?: string;
+  },
   targetProvider?: CredentialProvider,
 ): Promise<{ provider: Provider; providerName: CredentialProvider; credentialSource: CredentialSource } | null> {
   // 1. Try user's own credential first
@@ -215,7 +280,7 @@ export async function createProviderForUser(
       return { provider, providerName, credentialSource: 'user' };
     }
 
-    const config = buildProviderConfig(providerName, decryptedToken);
+    const config = buildProviderConfig(providerName, decryptedToken, env);
     return { provider: createProvider(config), providerName, credentialSource: 'user' };
   }
 
@@ -241,10 +306,29 @@ export async function createProviderForUser(
     return { provider, providerName: platformProvider, credentialSource: 'platform' };
   }
 
-  const config = buildProviderConfig(platformProvider, decryptedToken);
+  const config = buildProviderConfig(platformProvider, decryptedToken, env);
   return { provider: createProvider(config), providerName: platformProvider, credentialSource: 'platform' };
 }
 
+function parseOptionalPositiveInt(value: string | undefined): number | undefined {
+  if (!value) return undefined;
+  const parsed = Number.parseInt(value, 10);
+  return Number.isFinite(parsed) && parsed > 0 ? parsed : undefined;
+}
+
+function parseOptionalBoolean(value: string | undefined): boolean | undefined {
+  if (value === undefined || value === '') return undefined;
+  if (value === 'true') return true;
+  if (value === 'false') return false;
+  return undefined;
+}
+
+function parseOptionalList(value: string | undefined): string[] | undefined {
+  if (!value) return undefined;
+  const items = value.split(',').map((item) => item.trim()).filter(Boolean);
+  return items.length > 0 ? items : undefined;
+}
+
 /**
  * Lightweight credential source resolution — determines whether 'user' or 'platform'
  * credentials would be used for a given target provider WITHOUT decrypting tokens
diff --git a/apps/api/tests/unit/fetch-timeout.test.ts b/apps/api/tests/unit/fetch-timeout.test.ts
new file mode 100644
index 000000000..38837b415
--- /dev/null
+++ b/apps/api/tests/unit/fetch-timeout.test.ts
@@ -0,0 +1,101 @@
+/**
+ * Unit tests for fetch-timeout retry utilities.
+ *
+ * Tests pure helper functions (env parsing, delay computation)
+ * without requiring real network calls.
+ */
+import { describe, expect, it } from 'vitest';
+
+import {
+  computeRetryDelayMs,
+  getRetryDelayMs,
+  getRetryMaxAttempts,
+  getTimeoutMs,
+} from '../../src/services/fetch-timeout';
+
+describe('getTimeoutMs', () => {
+  it('returns default when env is undefined', () => {
+    expect(getTimeoutMs(undefined, 5000)).toBe(5000);
+  });
+
+  it('returns default when env is empty', () => {
+    expect(getTimeoutMs('', 5000)).toBe(5000);
+  });
+
+  it('parses valid integer', () => {
+    expect(getTimeoutMs('10000', 5000)).toBe(10000);
+  });
+
+  it('returns default for non-numeric', () => {
+    expect(getTimeoutMs('abc', 5000)).toBe(5000);
+  });
+
+  it('returns default for zero', () => {
+    expect(getTimeoutMs('0', 5000)).toBe(5000);
+  });
+
+  it('returns default for negative', () => {
+    expect(getTimeoutMs('-100', 5000)).toBe(5000);
+  });
+});
+
+describe('getRetryMaxAttempts', () => {
+  it('returns default when env is undefined', () => {
+    expect(getRetryMaxAttempts(undefined, 3)).toBe(3);
+  });
+
+  it('parses valid integer', () => {
+    expect(getRetryMaxAttempts('5', 3)).toBe(5);
+  });
+
+  it('allows zero (no retries)', () => {
+    expect(getRetryMaxAttempts('0', 3)).toBe(0);
+  });
+
+  it('returns default for negative', () => {
+    expect(getRetryMaxAttempts('-1', 3)).toBe(3);
+  });
+});
+
+describe('getRetryDelayMs', () => {
+  it('returns default when env is undefined', () => {
+    expect(getRetryDelayMs(undefined, 1000)).toBe(1000);
+  });
+
+  it('parses valid integer', () => {
+    expect(getRetryDelayMs('2000', 1000)).toBe(2000);
+  });
+
+  it('allows zero delay', () => {
+    expect(getRetryDelayMs('0', 1000)).toBe(0);
+  });
+});
+
+describe('computeRetryDelayMs', () => {
+  it('returns at least baseDelay for attempt 0', () => {
+    const delay = computeRetryDelayMs(0, 1000, 30000);
+    // base (1000) + jitter (0-25%) = 1000-1250
+    expect(delay).toBeGreaterThanOrEqual(1000);
+    expect(delay).toBeLessThanOrEqual(1250);
+  });
+
+  it('doubles delay with each attempt (exponential)', () => {
+    // At attempt 3, exponential = 1000 * 2^3 = 8000
+    // With jitter: 8000-10000
+    const delay = computeRetryDelayMs(3, 1000, 30000);
+    expect(delay).toBeGreaterThanOrEqual(8000);
+    expect(delay).toBeLessThanOrEqual(10000);
+  });
+
+  it('caps at maxDelayMs', () => {
+    const delay = computeRetryDelayMs(20, 1000, 5000);
+    // capped at 5000 + jitter (0-25%) = 5000-6250
+    expect(delay).toBeGreaterThanOrEqual(5000);
+    expect(delay).toBeLessThanOrEqual(6250);
+  });
+
+  it('handles zero base delay', () => {
+    const delay = computeRetryDelayMs(5, 0, 30000);
+    expect(delay).toBe(0);
+  });
+});
diff --git a/apps/api/tests/unit/node-agent-contract.test.ts b/apps/api/tests/unit/node-agent-contract.test.ts
index 77429c9ff..d41213f6a 100644
--- a/apps/api/tests/unit/node-agent-contract.test.ts
+++ b/apps/api/tests/unit/node-agent-contract.test.ts
@@ -802,7 +802,7 @@ describe('Node Agent client functions send correct payloads', () => {
     let capturedUrl: string | null = null;
 
     vi.doMock('../../src/services/fetch-timeout', () => ({
-      fetchWithTimeout: vi.fn().mockImplementation((url: string, init: RequestInit) => {
+      fetchWithTimeoutAndRetry: vi.fn().mockImplementation((url: string, init: RequestInit) => {
         capturedUrl = url;
         capturedHeaders = new Headers(init.headers);
         capturedBody = init.body as string;
@@ -814,6 +814,8 @@ describe('Node Agent client functions send correct payloads', () => {
         );
       }),
       getTimeoutMs: vi.fn().mockReturnValue(30000),
+      getRetryMaxAttempts: vi.fn().mockReturnValue(2),
+      getRetryDelayMs: vi.fn().mockReturnValue(1000),
     }));
 
     // Dynamic import to pick up mocks
@@ -870,7 +872,7 @@ describe('Node Agent client functions send correct payloads', () => {
     let capturedUrl: string | null = null;
 
     vi.doMock('../../src/services/fetch-timeout', () => ({
-      fetchWithTimeout: vi.fn().mockImplementation((url: string, init: RequestInit) => {
+      fetchWithTimeoutAndRetry: vi.fn().mockImplementation((url: string, init: RequestInit) => {
         capturedUrl = url;
         capturedMethod = init.method ?? 'GET';
         return Promise.resolve(
@@ -881,6 +883,8 @@ describe('Node Agent client functions send correct payloads', () => {
         );
       }),
       getTimeoutMs: vi.fn().mockReturnValue(30000),
+      getRetryMaxAttempts: vi.fn().mockReturnValue(2),
+      getRetryDelayMs: vi.fn().mockReturnValue(1000),
     }));
 
     const { deleteWorkspaceOnNode } = await import('../../src/services/node-agent');
@@ -908,7 +912,7 @@ describe('Node Agent client functions send correct payloads', () => {
     let capturedBody: string | null = null;
 
     vi.doMock('../../src/services/fetch-timeout', () => ({
-      fetchWithTimeout: vi.fn().mockImplementation((_url: string, init: RequestInit) => {
+      fetchWithTimeoutAndRetry: vi.fn().mockImplementation((_url: string, init: RequestInit) => {
         capturedBody = init.body as string;
         return Promise.resolve(
           new Response(
@@ -927,6 +931,8 @@ describe('Node Agent client functions send correct payloads', () => {
         );
       }),
       getTimeoutMs: vi.fn().mockReturnValue(30000),
+      getRetryMaxAttempts: vi.fn().mockReturnValue(2),
+      getRetryDelayMs: vi.fn().mockReturnValue(1000),
     }));
 
     const { createAgentSessionOnNode } = await import('../../src/services/node-agent');
@@ -955,13 +961,15 @@ describe('Node Agent client functions send correct payloads', () => {
     }));
 
     vi.doMock('../../src/services/fetch-timeout', () => ({
-      fetchWithTimeout: vi.fn().mockResolvedValue(
+      fetchWithTimeoutAndRetry: vi.fn().mockResolvedValue(
         new Response(JSON.stringify({ error: 'workspace not found' }), {
           status: 404,
           headers: { 'Content-Type': 'application/json' },
         })
       ),
       getTimeoutMs: vi.fn().mockReturnValue(30000),
+      getRetryMaxAttempts: vi.fn().mockReturnValue(2),
+      getRetryDelayMs: vi.fn().mockReturnValue(1000),
     }));
 
     const { deleteWorkspaceOnNode } = await import('../../src/services/node-agent');
@@ -987,13 +995,15 @@ describe('Node Agent client functions send correct payloads', () => {
 
     // Simulate the API Worker's own 404 response (loop-back via wildcard DNS)
     vi.doMock('../../src/services/fetch-timeout', () => ({
-      fetchWithTimeout: vi.fn().mockResolvedValue(
+      fetchWithTimeoutAndRetry: vi.fn().mockResolvedValue(
         new Response(JSON.stringify({ error: 'NOT_FOUND', message: 'Endpoint not found' }), {
           status: 404,
           headers: { 'Content-Type': 'application/json' },
         })
       ),
       getTimeoutMs: vi.fn().mockReturnValue(30000),
+      getRetryMaxAttempts: vi.fn().mockReturnValue(2),
+      getRetryDelayMs: vi.fn().mockReturnValue(1000),
     }));
 
     const { deleteWorkspaceOnNode } = await import('../../src/services/node-agent');
@@ -1018,10 +1028,12 @@ describe('Node Agent client functions send correct payloads', () => {
     }));
 
     vi.doMock('../../src/services/fetch-timeout', () => ({
-      fetchWithTimeout: vi.fn().mockRejectedValue(
+      fetchWithTimeoutAndRetry: vi.fn().mockRejectedValue(
         new Error('Request timed out after 30000ms: https://node-abc.vm.example.com:8443/workspaces/ws-test')
       ),
       getTimeoutMs: vi.fn().mockReturnValue(30000),
+      getRetryMaxAttempts: vi.fn().mockReturnValue(2),
+      getRetryDelayMs: vi.fn().mockReturnValue(1000),
     }));
 
     const { deleteWorkspaceOnNode } = await import('../../src/services/node-agent');
diff --git a/apps/api/tests/unit/services/node-ip-validation.test.ts b/apps/api/tests/unit/services/node-ip-validation.test.ts
index b4b1558fd..77ef73bdf 100644
--- a/apps/api/tests/unit/services/node-ip-validation.test.ts
+++ b/apps/api/tests/unit/services/node-ip-validation.test.ts
@@ -17,10 +17,12 @@ describe('provisionNode empty IP guard', () => {
     file.indexOf('export async function provisionNode'),
     file.indexOf('export async function stopNodeResources')
   );
+  // Focus on the createVM path (after the recovery block) for the IP guard checks
+  const createVmSection = section.slice(section.indexOf('const vm = await provider.createVM'));
 
   it('checks for empty IP before creating DNS records', () => {
-    const ipCheckIdx = section.indexOf('if (!vm.ip)');
-    const dnsCreateIdx = section.indexOf('createNodeBackendDNSRecord');
+    const ipCheckIdx = createVmSection.indexOf('if (!vm.ip)');
+    const dnsCreateIdx = createVmSection.indexOf('createNodeBackendDNSRecord');
     expect(ipCheckIdx).toBeGreaterThan(-1);
     expect(dnsCreateIdx).toBeGreaterThan(-1);
     // IP check must come before DNS record creation
@@ -28,34 +30,34 @@ describe('provisionNode empty IP guard', () => {
   });
 
   it('keeps node in creating status when IP is empty (awaiting heartbeat backfill)', () => {
-    const ipGuardBlock = section.slice(
-      section.indexOf('if (!vm.ip)'),
-      section.indexOf('let backendDnsRecordId')
+    const ipGuardBlock = createVmSection.slice(
+      createVmSection.indexOf('if (!vm.ip)'),
+      createVmSection.indexOf('let backendDnsRecordId')
     );
     expect(ipGuardBlock).toContain("status: 'creating'");
     expect(ipGuardBlock).toContain('Awaiting IP allocation');
   });
 
   it('returns early after setting creating status for empty IP', () => {
-    const ipGuardBlock = section.slice(
-      section.indexOf('if (!vm.ip)'),
-      section.indexOf('let backendDnsRecordId')
+    const ipGuardBlock = createVmSection.slice(
+      createVmSection.indexOf('if (!vm.ip)'),
+      createVmSection.indexOf('let backendDnsRecordId')
     );
     expect(ipGuardBlock).toContain('return;');
   });
 
   it('stores providerInstanceId even when IP is empty (for cleanup)', () => {
-    const ipGuardBlock = section.slice(
-      section.indexOf('if (!vm.ip)'),
-      section.indexOf('let backendDnsRecordId')
+    const ipGuardBlock = createVmSection.slice(
+      createVmSection.indexOf('if (!vm.ip)'),
+      createVmSection.indexOf('let backendDnsRecordId')
     );
     expect(ipGuardBlock).toContain('providerInstanceId: vm.id');
   });
 
   it('logs structured info with nodeId and providerInstanceId', () => {
-    const ipGuardBlock = section.slice(
-      section.indexOf('if (!vm.ip)'),
-      section.indexOf('let backendDnsRecordId')
+    const ipGuardBlock = createVmSection.slice(
+      createVmSection.indexOf('if (!vm.ip)'),
+      createVmSection.indexOf('let backendDnsRecordId')
     );
     expect(ipGuardBlock).toContain('nodeId: node.id');
     expect(ipGuardBlock).toContain('providerInstanceId: vm.id');
diff --git a/apps/api/tests/unit/services/provider-credentials.test.ts b/apps/api/tests/unit/services/provider-credentials.test.ts
index c9ef8a996..be2d68994 100644
--- a/apps/api/tests/unit/services/provider-credentials.test.ts
+++ b/apps/api/tests/unit/services/provider-credentials.test.ts
@@ -35,6 +35,31 @@ describe('buildProviderConfig', () => {
     });
   });
 
+  it('should build HetznerProviderConfig with retry and placement overrides from env', () => {
+    const config = buildProviderConfig('hetzner', 'my-hetzner-token', {
+      HETZNER_API_TIMEOUT_MS: '45000',
+      HETZNER_API_RETRY_MAX_ATTEMPTS: '4',
+      HETZNER_API_RETRY_BASE_DELAY_MS: '1500',
+      HETZNER_API_RETRY_MAX_DELAY_MS: '12000',
+      HETZNER_PLACEMENT_RETRY_DELAY_MS: '2500',
+      HETZNER_PLACEMENT_RETRY_ATTEMPTS: '3',
+      HETZNER_PLACEMENT_FALLBACK_ENABLED: 'false',
+      HETZNER_PLACEMENT_FALLBACK_LOCATIONS: 'hel1,nbg1',
+    });
+    expect(config).toEqual({
+      provider: 'hetzner',
+      apiToken: 'my-hetzner-token',
+      timeoutMs: 45000,
+      apiRetryMaxAttempts: 4,
+      apiRetryBaseDelayMs: 1500,
+      apiRetryMaxDelayMs: 12000,
+      placementRetryDelayMs: 2500,
+      placementRetryAttempts: 3,
+      placementFallbackEnabled: false,
+      placementFallbackLocations: ['hel1', 'nbg1'],
+    });
+  });
+
   it('should build ScalewayProviderConfig from JSON string', () => {
     const token = JSON.stringify({
       secretKey: 'scw-secret',
diff --git a/apps/api/tests/unit/task-runner-do-helpers.test.ts b/apps/api/tests/unit/task-runner-do-helpers.test.ts
index ac0753101..ae922fe22 100644
--- a/apps/api/tests/unit/task-runner-do-helpers.test.ts
+++ b/apps/api/tests/unit/task-runner-do-helpers.test.ts
@@ -125,4 +125,22 @@ describe('isTransientError', () => {
     const err = Object.assign(new Error('network error'), { permanent: true });
     expect(isTransientError(err)).toBe(false);
   });
+
+  it('honors explicit retryable=true metadata', () => {
+    // Provider library marks an error as retryable even if the message is unknown
+    const err = Object.assign(new Error('some provider-specific error'), { retryable: true });
+    expect(isTransientError(err)).toBe(true);
+  });
+
+  it('honors explicit retryable=false metadata', () => {
+    // Provider library marks a network error as non-retryable
+    const err = Object.assign(new Error('network error'), { retryable: false });
+    expect(isTransientError(err)).toBe(false);
+  });
+
+  it('permanent flag takes precedence over retryable', () => {
+    // permanent should win over retryable
+    const err = Object.assign(new Error('some error'), { permanent: true, retryable: true });
+    expect(isTransientError(err)).toBe(false);
+  });
 });
diff --git a/apps/api/tests/unit/task-runner-do-state.test.ts b/apps/api/tests/unit/task-runner-do-state.test.ts
index 7250be916..9952f73eb 100644
--- a/apps/api/tests/unit/task-runner-do-state.test.ts
+++ b/apps/api/tests/unit/task-runner-do-state.test.ts
@@ -144,7 +144,7 @@ describe('TaskRunner DO retry logic', () => {
   });
 
   it('checks max retries before retrying', () => {
-    expect(doSource).toContain('state.retryCount < this.getMaxRetries()');
+    expect(doSource).toContain('state.retryCount < this.getMaxRetries(state.currentStep)');
   });
 
   it('increments retryCount on transient failure', () => {
@@ -429,14 +429,16 @@ describe('TaskRunner DO configuration', () => {
 
   for (const method of configMethods) {
     it(`has ${method}() configuration method`, () => {
-      expect(doSource).toContain(`private ${method}()`);
+      // getMaxRetries accepts an optional step parameter
+      const pattern = method === 'getMaxRetries' ? `private ${method}(step` : `private ${method}()`;
+      expect(doSource).toContain(pattern);
     });
 
     it(`${method}() reads from env var with DEFAULT fallback`, () => {
-      const section = doSource.slice(
-        doSource.indexOf(`private ${method}()`),
-        doSource.indexOf('}', doSource.indexOf(`private ${method}()`) + 50) + 1
-      );
+      const searchPattern = method === 'getMaxRetries' ? `private ${method}(step` : `private ${method}()`;
+      const startIdx = doSource.indexOf(searchPattern);
+      // Extract a generous section (500 chars) to cover multi-line methods with nested blocks
+      const section = doSource.slice(startIdx, startIdx + 500);
       expect(section).toContain('parseEnvInt');
       expect(section).toContain('this.env.');
       expect(section).toContain('DEFAULT_TASK_RUNNER');
diff --git a/packages/cloud-init/src/generate.ts b/packages/cloud-init/src/generate.ts
index 1171db868..4914ace34 100644
--- a/packages/cloud-init/src/generate.ts
+++ b/packages/cloud-init/src/generate.ts
@@ -108,6 +108,12 @@ export function validateCloudInitVariables(variables: CloudInitVariables): void
       errors.push(`logJournalMaxRetention: must match ${JOURNALD_TIME_RE} (got ${JSON.stringify(variables.logJournalMaxRetention)})`);
     }
   }
+  if (variables.provider !== undefined && variables.provider !== '') {
+    const validProviders = ['hetzner', 'scaleway', 'gcp'];
+    if (!validProviders.includes(variables.provider)) {
+      errors.push(`provider: must be one of ${validProviders.join(', ')} (got ${JSON.stringify(variables.provider)})`);
+    }
+  }
   if (variables.dockerDnsServers !== undefined && variables.dockerDnsServers !== '') {
     if (!SAFE_DNS_SERVERS_RE.test(variables.dockerDnsServers)) {
       errors.push(`dockerDnsServers: must contain only quoted IPs (got ${JSON.stringify(variables.dockerDnsServers)})`);
@@ -162,6 +168,8 @@ export interface CloudInitVariables {
   vmAgentPort?: string;
   /** Timeout in seconds for fetching Cloudflare IP ranges at boot (default: 10) */
   cfIpFetchTimeout?: string;
+  /** Cloud provider name (hetzner, scaleway, gcp) — passed to VM agent and used for apt mirror config */
+  provider?: string;
 }
 
 /**
@@ -204,6 +212,7 @@ export function generateCloudInit(
     '{{ tls_cert_path }}': variables.originCaCert ? '/etc/sam/tls/origin-ca.pem' : '',
     '{{ tls_key_path }}': variables.originCaCert ? '/etc/sam/tls/origin-ca-key.pem' : '',
     '{{ cf_ip_fetch_timeout }}': variables.cfIpFetchTimeout ?? '10',
+    '{{ provider }}': variables.provider ?? '',
   };
 
   // Use function replacement to prevent $-pattern interpretation in values.
diff --git a/packages/cloud-init/src/template.ts b/packages/cloud-init/src/template.ts
index 751a2d6fd..ff576a702 100644
--- a/packages/cloud-init/src/template.ts
+++ b/packages/cloud-init/src/template.ts
@@ -75,6 +75,7 @@ write_files:
       Environment=VM_AGENT_PORT={{ vm_agent_port }}
       Environment=TLS_CERT_PATH={{ tls_cert_path }}
       Environment=TLS_KEY_PATH={{ tls_key_path }}
+      Environment=PROVIDER={{ provider }}
       ExecStart=/usr/local/bin/vm-agent
       Restart=always
       RestartSec=5
@@ -280,5 +281,12 @@ write_files:
       {{ origin_ca_key }}
     permissions: '0600'
 
+  - path: /etc/apt/apt.conf.d/80-retries
+    content: |
+      Acquire::Retries "3";
+      Acquire::https::Timeout "30";
+      Acquire::http::Timeout "30";
+    permissions: '0644'
+
 final_message: "Simple Agent Manager node {{ node_id }} provisioning started!"
 `;
diff --git a/packages/cloud-init/tests/generate.test.ts b/packages/cloud-init/tests/generate.test.ts
index 2a9b8679c..65fa258e5 100644
--- a/packages/cloud-init/tests/generate.test.ts
+++ b/packages/cloud-init/tests/generate.test.ts
@@ -1354,3 +1354,41 @@ describe('integrated size validation in generateCloudInit', () => {
   });
 });
 
+describe('provider field', () => {
+  it('accepts valid providers', () => {
+    for (const p of ['hetzner', 'scaleway', 'gcp']) {
+      expect(() => generateCloudInit(baseVariables({ provider: p }))).not.toThrow();
+    }
+  });
+
+  it('rejects invalid providers', () => {
+    expect(() => generateCloudInit(baseVariables({ provider: 'aws' }))).toThrow(/provider/i);
+  });
+
+  it('allows empty/missing provider', () => {
+    expect(() => generateCloudInit(baseVariables())).not.toThrow();
+    expect(() => generateCloudInit(baseVariables({ provider: '' }))).not.toThrow();
+  });
+
+  it('includes provider in systemd environment', () => {
+    const config = generateCloudInit(baseVariables({ provider: 'hetzner' }));
+    const parsed = YAML.parse(config);
+    const serviceFile = parsed.write_files.find(
+      (f: { path: string }) => f.path === '/etc/systemd/system/vm-agent.service',
+    );
+    expect(serviceFile).toBeDefined();
+    expect(serviceFile.content).toContain('Environment=PROVIDER=hetzner');
+  });
+
+  it('includes apt retry config', () => {
+    const config = generateCloudInit(baseVariables());
+    const parsed = YAML.parse(config);
+    const aptRetryFile = parsed.write_files.find(
+      (f: { path: string }) => f.path === '/etc/apt/apt.conf.d/80-retries',
+    );
+    expect(aptRetryFile).toBeDefined();
+    expect(aptRetryFile.content).toContain('Acquire::Retries "3"');
+    expect(aptRetryFile.content).toContain('Acquire::https::Timeout "30"');
+  });
+});
+
diff --git a/packages/providers/src/hetzner.ts b/packages/providers/src/hetzner.ts
index 8d86c5c2e..00e3b3f1a 100644
--- a/packages/providers/src/hetzner.ts
+++ b/packages/providers/src/hetzner.ts
@@ -1,7 +1,13 @@
 import type { VMSize } from '@simple-agent-manager/shared';
 import { DEFAULT_HETZNER_DATACENTER, DEFAULT_HETZNER_IMAGE } from '@simple-agent-manager/shared';
 
-import { providerFetch } from './provider-fetch';
+import {
+  getRetryDelayMs,
+  getRetryMaxAttempts,
+  getTimeoutMs,
+  providerFetch,
+  providerFetchWithRetry,
+} from './provider-fetch';
 import type { LocationMeta, Provider, SizeConfig, VMConfig, VMInstance, VMStatus } from './types';
 import { ProviderError } from './types';
 
@@ -18,6 +24,7 @@ const HETZNER_LOCATION_META: Record<string, LocationMeta> = {
 };
 
 export const DEFAULT_PLACEMENT_RETRY_DELAY_MS = 3_000;
+export const DEFAULT_PLACEMENT_RETRY_ATTEMPTS = 2;
 
 const SIZE_CONFIGS: Record<VMSize, SizeConfig> = {
   small: {
@@ -74,20 +81,38 @@ export class HetznerProvider implements Provider {
 
   private readonly apiToken: string;
   private readonly datacenter: string;
+  private readonly timeoutMs: number;
+  private readonly apiRetryMaxAttempts: number;
+  private readonly apiRetryBaseDelayMs: number;
+  private readonly apiRetryMaxDelayMs: number;
   private readonly placementRetryDelayMs: number;
+  private readonly placementRetryAttempts: number;
   private readonly placementFallbackEnabled: boolean;
+  private readonly placementFallbackLocations: string[] | undefined;
 
   constructor(
     apiToken: string,
     datacenter?: string,
     placementRetryDelayMs?: number,
     placementFallbackEnabled?: boolean,
+    timeoutMs?: number,
+    apiRetryMaxAttempts?: number,
+    apiRetryBaseDelayMs?: number,
+    apiRetryMaxDelayMs?: number,
+    placementRetryAttempts?: number,
+    placementFallbackLocations?: string[],
   ) {
     this.apiToken = apiToken;
     this.datacenter = datacenter || DEFAULT_HETZNER_DATACENTER;
     this.defaultLocation = this.datacenter;
-    this.placementRetryDelayMs = placementRetryDelayMs ?? DEFAULT_PLACEMENT_RETRY_DELAY_MS;
+    this.timeoutMs = getTimeoutMs(timeoutMs === undefined ? undefined : String(timeoutMs));
+    this.apiRetryMaxAttempts = getRetryMaxAttempts(apiRetryMaxAttempts);
+    this.apiRetryBaseDelayMs = getRetryDelayMs(apiRetryBaseDelayMs, 1_000);
+    this.apiRetryMaxDelayMs = getRetryDelayMs(apiRetryMaxDelayMs, 10_000);
+    this.placementRetryDelayMs = getRetryDelayMs(placementRetryDelayMs, DEFAULT_PLACEMENT_RETRY_DELAY_MS);
+    this.placementRetryAttempts = getRetryMaxAttempts(placementRetryAttempts, DEFAULT_PLACEMENT_RETRY_ATTEMPTS);
     this.placementFallbackEnabled = placementFallbackEnabled ?? true;
+    this.placementFallbackLocations = placementFallbackLocations?.filter((loc) => HETZNER_LOCATIONS.includes(loc as typeof HETZNER_LOCATIONS[number]));
   }
 
   async createVM(config: VMConfig): Promise<VMInstance> {
@@ -97,15 +122,20 @@ export class HetznerProvider implements Provider {
     }
     const primaryLocation = config.location || this.datacenter;
 
-    // Build attempt order: primary twice (with a delay between), then remaining locations shuffled
+    // Build attempt order: primary N times (with a delay between), then remaining locations.
+    // Fallback order can be explicitly configured for deterministic capacity handling.
     const fallbackLocations = this.placementFallbackEnabled
-      ? HETZNER_LOCATIONS
+      ? (this.placementFallbackLocations?.length
+          ? this.placementFallbackLocations
+          : HETZNER_LOCATIONS.filter((loc) => loc !== primaryLocation).sort(() => Math.random() - 0.5))
           .filter((loc) => loc !== primaryLocation)
-          .sort(() => Math.random() - 0.5)
       : [];
+    const primaryAttempts = Array.from({ length: this.placementRetryAttempts }, (_, index) => ({
+      location: primaryLocation,
+      delayMs: index === 0 ? 0 : this.placementRetryDelayMs,
+    }));
     const attemptsToTry: Array<{ location: string; delayMs: number }> = [
-      { location: primaryLocation, delayMs: 0 },
-      { location: primaryLocation, delayMs: this.placementRetryDelayMs },
+      ...primaryAttempts,
       ...fallbackLocations.map((loc) => ({ location: loc, delayMs: 0 })),
     ];
 
@@ -134,7 +164,7 @@ export class HetznerProvider implements Provider {
             labels: config.labels || {},
             start_after_create: true,
           }),
-        });
+        }, this.timeoutMs);
 
         const data = (await response.json()) as HetznerServerResponse;
         if (attempt.location !== primaryLocation) {
@@ -145,10 +175,21 @@ export class HetznerProvider implements Provider {
         return this.mapServerToVMInstance(data.server);
       } catch (err) {
         if (err instanceof ProviderError && err.statusCode === 412) {
+          const placementError = new ProviderError(
+            this.name,
+            err.statusCode,
+            err.message,
+            {
+              cause: err,
+              retryable: true,
+              reason: 'capacity',
+              idempotencyRisk: 'duplicate_resource',
+            },
+          );
           console.warn(
             `hetzner: placement failed in ${attempt.location} (412)`,
           );
-          lastError = err;
+          lastError = placementError;
           continue;
         }
         throw err; // Non-placement errors are not retryable
@@ -161,7 +202,7 @@ export class HetznerProvider implements Provider {
 
   async deleteVM(id: string): Promise<void> {
     try {
-      await providerFetch(this.name, `${HETZNER_API_URL}/servers/${id}`, {
+      await this.fetch(`${HETZNER_API_URL}/servers/${id}`, {
         method: 'DELETE',
         headers: {
           Authorization: `Bearer ${this.apiToken}`,
@@ -177,7 +218,7 @@ export class HetznerProvider implements Provider {
 
   async getVM(id: string): Promise<VMInstance | null> {
     try {
-      const response = await providerFetch(this.name, `${HETZNER_API_URL}/servers/${id}`, {
+      const response = await this.fetch(`${HETZNER_API_URL}/servers/${id}`, {
         headers: {
           Authorization: `Bearer ${this.apiToken}`,
         },
@@ -205,7 +246,7 @@ export class HetznerProvider implements Provider {
       ? `${HETZNER_API_URL}/servers?label_selector=${encodeURIComponent(labelParts.join(','))}`
       : `${HETZNER_API_URL}/servers`;
 
-    const response = await providerFetch(this.name, url, {
+    const response = await this.fetch(url, {
       headers: {
         Authorization: `Bearer ${this.apiToken}`,
       },
@@ -216,7 +257,7 @@ export class HetznerProvider implements Provider {
   }
 
   async powerOff(id: string): Promise<void> {
-    await providerFetch(this.name, `${HETZNER_API_URL}/servers/${id}/actions/poweroff`, {
+    await this.fetch(`${HETZNER_API_URL}/servers/${id}/actions/poweroff`, {
       method: 'POST',
       headers: {
         Authorization: `Bearer ${this.apiToken}`,
@@ -225,7 +266,7 @@ export class HetznerProvider implements Provider {
   }
 
   async powerOn(id: string): Promise<void> {
-    await providerFetch(this.name, `${HETZNER_API_URL}/servers/${id}/actions/poweron`, {
+    await this.fetch(`${HETZNER_API_URL}/servers/${id}/actions/poweron`, {
       method: 'POST',
       headers: {
         Authorization: `Bearer ${this.apiToken}`,
@@ -234,7 +275,7 @@ export class HetznerProvider implements Provider {
   }
 
   async validateToken(): Promise<boolean> {
-    await providerFetch(this.name, `${HETZNER_API_URL}/datacenters`, {
+    await this.fetch(`${HETZNER_API_URL}/datacenters`, {
       headers: {
         Authorization: `Bearer ${this.apiToken}`,
       },
@@ -242,6 +283,14 @@ export class HetznerProvider implements Provider {
     return true;
   }
 
+  private fetch(url: string | URL, init?: RequestInit): Promise<Response> {
+    return providerFetchWithRetry(this.name, url, init, this.timeoutMs, {
+      maxAttempts: this.apiRetryMaxAttempts,
+      baseDelayMs: this.apiRetryBaseDelayMs,
+      maxDelayMs: this.apiRetryMaxDelayMs,
+    });
+  }
+
   private mapServerToVMInstance(server: HetznerServerResponse['server']): VMInstance {
     return {
       id: String(server.id),
diff --git a/packages/providers/src/index.ts b/packages/providers/src/index.ts
index 02ad816c5..44e541f32 100644
--- a/packages/providers/src/index.ts
+++ b/packages/providers/src/index.ts
@@ -20,7 +20,15 @@ export type {
 export { ProviderError } from './types';
 
 // Re-export utilities
-export { getTimeoutMs,providerFetch } from './provider-fetch';
+export {
+  computeRetryDelayMs,
+  getRetryDelayMs,
+  getRetryMaxAttempts,
+  getTimeoutMs,
+  providerFetch,
+  providerFetchWithRetry,
+  shouldRetryProviderError,
+} from './provider-fetch';
 
 // Re-export providers
 export type { GcpTokenProvider } from './gcp';
@@ -40,6 +48,12 @@ export function createProvider(config: ProviderConfig): Provider {
         config.datacenter,
         config.placementRetryDelayMs,
         config.placementFallbackEnabled,
+        config.timeoutMs,
+        config.apiRetryMaxAttempts,
+        config.apiRetryBaseDelayMs,
+        config.apiRetryMaxDelayMs,
+        config.placementRetryAttempts,
+        config.placementFallbackLocations,
       );
     case 'scaleway':
       return new ScalewayProvider(
diff --git a/packages/providers/src/provider-fetch.ts b/packages/providers/src/provider-fetch.ts
index 3393cc46e..369e3326e 100644
--- a/packages/providers/src/provider-fetch.ts
+++ b/packages/providers/src/provider-fetch.ts
@@ -1,6 +1,26 @@
-import { ProviderError } from './types';
+import { ProviderError,type ProviderErrorReason, type ProviderIdempotencyRisk } from './types';
 
 const DEFAULT_PROVIDER_TIMEOUT_MS = 30_000;
+const DEFAULT_PROVIDER_RETRY_MAX_ATTEMPTS = 3;
+const DEFAULT_PROVIDER_RETRY_BASE_DELAY_MS = 1_000;
+const DEFAULT_PROVIDER_RETRY_MAX_DELAY_MS = 10_000;
+const DEFAULT_PROVIDER_RETRY_JITTER_RATIO = 0.25;
+
+export interface ProviderRetryConfig {
+  maxAttempts?: number;
+  baseDelayMs?: number;
+  maxDelayMs?: number;
+  jitterRatio?: number;
+  retryableStatusCodes?: readonly number[];
+  idempotencyRisk?: ProviderIdempotencyRisk;
+}
+
+interface NormalizedProviderError {
+  message: string;
+  reason: ProviderErrorReason;
+  retryable: boolean;
+  retryAfterMs?: number;
+}
 
 /**
  * Parse a timeout from an env value string or return the default.
@@ -15,6 +35,26 @@ export function getTimeoutMs(
   return parsed;
 }
 
+export function getRetryMaxAttempts(
+  value: number | string | undefined,
+  defaultAttempts: number = DEFAULT_PROVIDER_RETRY_MAX_ATTEMPTS,
+): number {
+  if (value === undefined || value === '') return defaultAttempts;
+  const parsed = typeof value === 'number' ? value : Number.parseInt(value, 10);
+  if (!Number.isFinite(parsed) || parsed <= 0) return defaultAttempts;
+  return Math.floor(parsed);
+}
+
+export function getRetryDelayMs(
+  value: number | string | undefined,
+  defaultMs: number,
+): number {
+  if (value === undefined || value === '') return defaultMs;
+  const parsed = typeof value === 'number' ? value : Number.parseInt(value, 10);
+  if (!Number.isFinite(parsed) || parsed < 0) return defaultMs;
+  return parsed;
+}
+
 /**
  * Fetch wrapper for provider API calls.
  * Adds configurable timeout via AbortController and wraps errors into ProviderError.
@@ -42,24 +82,17 @@ export async function providerFetch(
     });
 
     if (!response.ok) {
-      let errorMessage: string;
-      try {
-        const body = await response.text();
-        // Try parsing as JSON for structured error messages
-        try {
-          const json = JSON.parse(body) as { error?: { message?: string }; message?: string };
-          errorMessage = json.error?.message || json.message || body;
-        } catch {
-          errorMessage = body || `HTTP ${response.status}`;
-        }
-      } catch {
-        errorMessage = `HTTP ${response.status}`;
-      }
+      const normalized = await normalizeHttpError(response);
 
       throw new ProviderError(
         providerName,
         response.status,
-        `${providerName} API error (${response.status}): ${errorMessage}`,
+        `${providerName} API error (${response.status}): ${normalized.message}`,
+        {
+          retryable: normalized.retryable,
+          reason: normalized.reason,
+          retryAfterMs: normalized.retryAfterMs,
+        },
       );
     }
 
@@ -72,7 +105,7 @@ export async function providerFetch(
         providerName,
         undefined,
         `${providerName} API request timed out after ${timeoutMs}ms: ${url}`,
-        { cause: err },
+        { cause: err, retryable: true, reason: 'timeout' },
       );
     }
 
@@ -80,9 +113,122 @@ export async function providerFetch(
       providerName,
       undefined,
       `${providerName} API request failed: ${err instanceof Error ? err.message : String(err)}`,
-      { cause: err instanceof Error ? err : undefined },
+      { cause: err instanceof Error ? err : undefined, retryable: true, reason: 'network' },
     );
   } finally {
     clearTimeout(timeoutId);
   }
 }
+
+export async function providerFetchWithRetry(
+  providerName: string,
+  url: string | URL,
+  init?: RequestInit,
+  timeoutMs: number = DEFAULT_PROVIDER_TIMEOUT_MS,
+  retryConfig: ProviderRetryConfig = {},
+): Promise<Response> {
+  const maxAttempts = getRetryMaxAttempts(retryConfig.maxAttempts);
+  const baseDelayMs = getRetryDelayMs(retryConfig.baseDelayMs, DEFAULT_PROVIDER_RETRY_BASE_DELAY_MS);
+  const maxDelayMs = getRetryDelayMs(retryConfig.maxDelayMs, DEFAULT_PROVIDER_RETRY_MAX_DELAY_MS);
+  const jitterRatio = retryConfig.jitterRatio ?? DEFAULT_PROVIDER_RETRY_JITTER_RATIO;
+
+  let lastError: ProviderError | undefined;
+
+  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
+    try {
+      return await providerFetch(providerName, url, init, timeoutMs);
+    } catch (err) {
+      if (!(err instanceof ProviderError)) throw err;
+      lastError = err;
+      if (!shouldRetryProviderError(err, retryConfig) || attempt >= maxAttempts) {
+        throw err;
+      }
+
+      const delayMs = err.retryAfterMs ?? computeRetryDelayMs(attempt, baseDelayMs, maxDelayMs, jitterRatio);
+      if (delayMs > 0) {
+        await new Promise((resolve) => setTimeout(resolve, delayMs));
+      }
+    }
+  }
+
+  throw lastError ?? new ProviderError(providerName, undefined, `${providerName} API request failed without an error`);
+}
+
+export function shouldRetryProviderError(
+  err: ProviderError,
+  retryConfig: ProviderRetryConfig = {},
+): boolean {
+  if (retryConfig.retryableStatusCodes?.includes(err.statusCode ?? -1)) {
+    return true;
+  }
+  return err.retryable;
+}
+
+export function computeRetryDelayMs(
+  attempt: number,
+  baseDelayMs: number = DEFAULT_PROVIDER_RETRY_BASE_DELAY_MS,
+  maxDelayMs: number = DEFAULT_PROVIDER_RETRY_MAX_DELAY_MS,
+  jitterRatio: number = DEFAULT_PROVIDER_RETRY_JITTER_RATIO,
+): number {
+  const exponential = Math.min(baseDelayMs * Math.pow(2, Math.max(0, attempt - 1)), maxDelayMs);
+  if (jitterRatio <= 0 || exponential <= 0) return exponential;
+  const jitterRange = exponential * jitterRatio;
+  const jitter = (Math.random() * jitterRange * 2) - jitterRange;
+  return Math.max(0, Math.round(exponential + jitter));
+}
+
+async function normalizeHttpError(response: Response): Promise<NormalizedProviderError> {
+  let errorMessage: string;
+  try {
+    const body = await response.text();
+    try {
+      const json = JSON.parse(body) as { error?: { message?: string }; message?: string };
+      errorMessage = json.error?.message || json.message || body;
+    } catch {
+      errorMessage = body || `HTTP ${response.status}`;
+    }
+  } catch {
+    errorMessage = `HTTP ${response.status}`;
+  }
+
+  return {
+    message: errorMessage,
+    reason: classifyHttpStatus(response.status),
+    retryable: isRetryableHttpStatus(response.status),
+    retryAfterMs: parseRetryAfterMs(response.headers.get('Retry-After')),
+  };
+}
+
+function classifyHttpStatus(status: number): ProviderErrorReason {
+  if (status === 408) return 'timeout';
+  if (status === 409 || status === 425) return 'conflict';
+  if (status === 429) return 'rate_limit';
+  if (status === 401 || status === 403) return 'auth';
+  if (status === 404) return 'not_found';
+  if (status >= 500) return 'provider_5xx';
+  if (status >= 400) return 'validation';
+  return 'unknown';
+}
+
+function isRetryableHttpStatus(status: number): boolean {
+  return status === 408 ||
+    status === 425 ||
+    status === 429 ||
+    status === 500 ||
+    status === 502 ||
+    status === 503 ||
+    status === 504;
+}
+
+function parseRetryAfterMs(value: string | null): number | undefined {
+  if (!value) return undefined;
+  const seconds = Number.parseInt(value, 10);
+  if (Number.isFinite(seconds) && seconds >= 0) {
+    return seconds * 1000;
+  }
+  const dateMs = Date.parse(value);
+  if (Number.isFinite(dateMs)) {
+    return Math.max(0, dateMs - Date.now());
+  }
+  return undefined;
+}
diff --git a/packages/providers/src/types.ts b/packages/providers/src/types.ts
index a09a6bd49..67c6792a3 100644
--- a/packages/providers/src/types.ts
+++ b/packages/providers/src/types.ts
@@ -135,10 +135,22 @@ export interface HetznerProviderConfig {
   provider: 'hetzner';
   apiToken: string;
   datacenter?: string;
+  /** Per-request API timeout in ms (default: 30000) */
+  timeoutMs?: number;
+  /** Max HTTP retry attempts for transient provider API calls (default: 3) */
+  apiRetryMaxAttempts?: number;
+  /** Base delay in ms for transient provider API retries (default: 1000) */
+  apiRetryBaseDelayMs?: number;
+  /** Max delay in ms for transient provider API retries (default: 10000) */
+  apiRetryMaxDelayMs?: number;
   /** Delay in ms before retrying same location on 412 (default: 3000) */
   placementRetryDelayMs?: number;
+  /** Number of primary-location placement attempts before fallback locations (default: 2) */
+  placementRetryAttempts?: number;
   /** Whether to try other locations after primary fails (default: true) */
   placementFallbackEnabled?: boolean;
+  /** Ordered fallback locations to try after primary placement attempts fail */
+  placementFallbackLocations?: string[];
 }
 
 export interface ScalewayProviderConfig {
@@ -175,11 +187,26 @@ export class ProviderError extends Error {
     public readonly statusCode: number | undefined,
     message: string,
     /** Original error */
-    options?: { cause?: Error },
+    options?: {
+      cause?: Error;
+      retryable?: boolean;
+      reason?: ProviderErrorReason;
+      retryAfterMs?: number;
+      idempotencyRisk?: ProviderIdempotencyRisk;
+    },
   ) {
     super(message, options);
+    this.retryable = options?.retryable ?? false;
+    this.reason = options?.reason ?? classifyProviderErrorReason(statusCode);
+    this.retryAfterMs = options?.retryAfterMs;
+    this.idempotencyRisk = options?.idempotencyRisk ?? 'none';
   }
 
+  readonly retryable: boolean;
+  readonly reason: ProviderErrorReason;
+  readonly retryAfterMs: number | undefined;
+  readonly idempotencyRisk: ProviderIdempotencyRisk;
+
   /** Make Error properties visible to JSON.stringify */
   toJSON(): Record<string, unknown> {
     return {
@@ -187,7 +214,38 @@ export class ProviderError extends Error {
       message: this.message,
       provider: this.providerName,
       statusCode: this.statusCode,
+      retryable: this.retryable,
+      reason: this.reason,
+      retryAfterMs: this.retryAfterMs,
+      idempotencyRisk: this.idempotencyRisk,
       cause: this.cause instanceof Error ? this.cause.message : this.cause,
     };
   }
 }
+
+export type ProviderErrorReason =
+  | 'network'
+  | 'timeout'
+  | 'rate_limit'
+  | 'provider_5xx'
+  | 'capacity'
+  | 'placement'
+  | 'quota'
+  | 'auth'
+  | 'validation'
+  | 'not_found'
+  | 'conflict'
+  | 'unknown';
+
+export type ProviderIdempotencyRisk = 'none' | 'duplicate_resource' | 'destructive';
+
+function classifyProviderErrorReason(statusCode: number | undefined): ProviderErrorReason {
+  if (statusCode === undefined) return 'unknown';
+  if (statusCode === 401 || statusCode === 403) return 'auth';
+  if (statusCode === 404) return 'not_found';
+  if (statusCode === 409) return 'conflict';
+  if (statusCode === 429) return 'rate_limit';
+  if (statusCode >= 400 && statusCode < 500) return 'validation';
+  if (statusCode >= 500) return 'provider_5xx';
+  return 'unknown';
+}
diff --git a/packages/providers/tests/unit/hetzner.test.ts b/packages/providers/tests/unit/hetzner.test.ts
index ce3f7cd7d..eaa06f3fc 100644
--- a/packages/providers/tests/unit/hetzner.test.ts
+++ b/packages/providers/tests/unit/hetzner.test.ts
@@ -438,6 +438,83 @@ describe('HetznerProvider', () => {
       // primary (1) + primary retry (2), no fallback locations
       expect(fetch).toHaveBeenCalledTimes(2);
     });
+
+    it('should honor configurable primary placement retry attempts', async () => {
+      vi.useFakeTimers();
+      const retryProvider = new HetznerProvider(
+        'test-token',
+        'fsn1',
+        3000,
+        false,
+        undefined,
+        undefined,
+        undefined,
+        undefined,
+        3,
+      );
+      globalThis.fetch = vi.fn().mockResolvedValue(
+        new Response(
+          JSON.stringify({ error: { message: 'error during placement' } }),
+          { status: 412 },
+        ),
+      );
+
+      const promise = retryProvider.createVM(vmConfig).catch((err) => err);
+      await vi.runAllTimersAsync();
+      const result = await promise;
+      expect(result).toBeInstanceOf(ProviderError);
+      expect(fetch).toHaveBeenCalledTimes(3);
+
+      const locations = (fetch as ReturnType<typeof vi.fn>).mock.calls.map(
+        (call) => JSON.parse((call[1] as RequestInit).body as string).location,
+      );
+      expect(locations).toEqual(['fsn1', 'fsn1', 'fsn1']);
+    });
+
+    it('should honor configured fallback location order', async () => {
+      vi.useFakeTimers();
+      const orderedFallbackProvider = new HetznerProvider(
+        'test-token',
+        'fsn1',
+        3000,
+        true,
+        undefined,
+        undefined,
+        undefined,
+        undefined,
+        1,
+        ['hel1', 'nbg1'],
+      );
+      const mockFetch = vi.fn()
+        .mockResolvedValueOnce(
+          new Response(
+            JSON.stringify({ error: { message: 'error during placement' } }),
+            { status: 412 },
+          ),
+        )
+        .mockResolvedValueOnce(
+          new Response(
+            JSON.stringify({ error: { message: 'error during placement' } }),
+            { status: 412 },
+          ),
+        )
+        .mockResolvedValueOnce(
+          new Response(
+            JSON.stringify({ server: createMockServer({ status: 'initializing' }) }),
+            { status: 200 },
+          ),
+        );
+      globalThis.fetch = mockFetch;
+
+      const promise = orderedFallbackProvider.createVM(vmConfig);
+      await vi.runAllTimersAsync();
+      await promise;
+
+      const locations = mockFetch.mock.calls.map(
+        (call) => JSON.parse((call[1] as RequestInit).body as string).location,
+      );
+      expect(locations).toEqual(['fsn1', 'hel1', 'nbg1']);
+    });
   });
 
   describe('deleteVM', () => {
diff --git a/packages/providers/tests/unit/provider-error-tojson.test.ts b/packages/providers/tests/unit/provider-error-tojson.test.ts
index e791a9f19..a14737e71 100644
--- a/packages/providers/tests/unit/provider-error-tojson.test.ts
+++ b/packages/providers/tests/unit/provider-error-tojson.test.ts
@@ -12,6 +12,10 @@ describe('ProviderError.toJSON', () => {
       message: 'Server not found',
       provider: 'hetzner',
       statusCode: 404,
+      retryable: false,
+      reason: 'not_found',
+      retryAfterMs: undefined,
+      idempotencyRisk: 'none',
       cause: undefined,
     });
   });
@@ -69,4 +73,3 @@ describe('ProviderError.toJSON', () => {
     expect('stack' in json).toBe(false);
   });
 });
-
diff --git a/packages/providers/tests/unit/provider-fetch.test.ts b/packages/providers/tests/unit/provider-fetch.test.ts
index e9e3570a7..cd4bbc494 100644
--- a/packages/providers/tests/unit/provider-fetch.test.ts
+++ b/packages/providers/tests/unit/provider-fetch.test.ts
@@ -1,6 +1,12 @@
 import { afterEach,describe, expect, it, vi } from 'vitest';
 
-import { getTimeoutMs,providerFetch } from '../../src/provider-fetch';
+import { providerFetch, providerFetchWithRetry } from '../../src/provider-fetch';
+import {
+  computeRetryDelayMs,
+  getRetryDelayMs,
+  getRetryMaxAttempts,
+  getTimeoutMs,
+} from '../../src/provider-fetch';
 import { ProviderError } from '../../src/types';
 
 describe('getTimeoutMs', () => {
@@ -172,6 +178,101 @@ describe('providerFetch', () => {
     } catch (err) {
       const pe = err as ProviderError;
       expect(pe.message).toContain('Rate limited');
+      expect(pe.retryable).toBe(true);
+      expect(pe.reason).toBe('rate_limit');
     }
   });
 });
+
+describe('retry helpers', () => {
+  it('parses retry attempts with fallback', () => {
+    expect(getRetryMaxAttempts('4')).toBe(4);
+    expect(getRetryMaxAttempts('0', 2)).toBe(2);
+    expect(getRetryMaxAttempts('abc', 2)).toBe(2);
+  });
+
+  it('parses retry delays with fallback', () => {
+    expect(getRetryDelayMs('2500', 1000)).toBe(2500);
+    expect(getRetryDelayMs('-1', 1000)).toBe(1000);
+    expect(getRetryDelayMs('abc', 1000)).toBe(1000);
+  });
+
+  it('computes capped exponential retry delays without jitter', () => {
+    expect(computeRetryDelayMs(1, 1000, 10000, 0)).toBe(1000);
+    expect(computeRetryDelayMs(2, 1000, 10000, 0)).toBe(2000);
+    expect(computeRetryDelayMs(5, 1000, 10000, 0)).toBe(10000);
+  });
+});
+
+describe('providerFetchWithRetry', () => {
+  const originalFetch = globalThis.fetch;
+
+  afterEach(() => {
+    vi.useRealTimers();
+    globalThis.fetch = originalFetch;
+  });
+
+  it('retries transient 5xx responses and returns the eventual success', async () => {
+    vi.useFakeTimers();
+    const mockFetch = vi.fn()
+      .mockResolvedValueOnce(new Response('temporarily unavailable', { status: 503 }))
+      .mockResolvedValueOnce(new Response(JSON.stringify({ ok: true }), { status: 200 }));
+    globalThis.fetch = mockFetch;
+
+    const promise = providerFetchWithRetry(
+      'hetzner',
+      'https://api.example.com/test',
+      undefined,
+      30_000,
+      { maxAttempts: 2, baseDelayMs: 100, maxDelayMs: 100, jitterRatio: 0 },
+    );
+
+    await vi.runAllTimersAsync();
+    const response = await promise;
+
+    expect(response.ok).toBe(true);
+    expect(mockFetch).toHaveBeenCalledTimes(2);
+  });
+
+  it('honors Retry-After for 429 responses', async () => {
+    vi.useFakeTimers();
+    const mockFetch = vi.fn()
+      .mockResolvedValueOnce(new Response('rate limited', {
+        status: 429,
+        headers: { 'Retry-After': '2' },
+      }))
+      .mockResolvedValueOnce(new Response(JSON.stringify({ ok: true }), { status: 200 }));
+    globalThis.fetch = mockFetch;
+
+    const promise = providerFetchWithRetry(
+      'hetzner',
+      'https://api.example.com/test',
+      undefined,
+      30_000,
+      { maxAttempts: 2, baseDelayMs: 100, maxDelayMs: 100, jitterRatio: 0 },
+    );
+
+    await vi.advanceTimersByTimeAsync(1_999);
+    expect(mockFetch).toHaveBeenCalledTimes(1);
+    await vi.advanceTimersByTimeAsync(1);
+    await promise;
+    expect(mockFetch).toHaveBeenCalledTimes(2);
+  });
+
+  it('does not retry auth failures', async () => {
+    const mockFetch = vi.fn().mockResolvedValue(
+      new Response(JSON.stringify({ error: { message: 'Unauthorized' } }), { status: 401 }),
+    );
+    globalThis.fetch = mockFetch;
+
+    await expect(providerFetchWithRetry(
+      'hetzner',
+      'https://api.example.com/test',
+      undefined,
+      30_000,
+      { maxAttempts: 3, baseDelayMs: 1, maxDelayMs: 1, jitterRatio: 0 },
+    )).rejects.toThrow(ProviderError);
+
+    expect(mockFetch).toHaveBeenCalledTimes(1);
+  });
+});
diff --git a/packages/vm-agent/internal/bootstrap/bootstrap.go b/packages/vm-agent/internal/bootstrap/bootstrap.go
index d330e3331..6312c527a 100644
--- a/packages/vm-agent/internal/bootstrap/bootstrap.go
+++ b/packages/vm-agent/internal/bootstrap/bootstrap.go
@@ -813,7 +813,15 @@ func ensureDevcontainerReady(ctx context.Context, cfg *config.Config, volumeName
 		return false, fmt.Errorf("devcontainer CLI never became available: %w", err)
 	}
 
-	slog.Info("Starting devcontainer for workspace", "workspaceDir", cfg.WorkspaceDir)
+	// Apply devcontainer build timeout so runaway builds (large Dockerfiles,
+	// network stalls pulling images) do not block workspace provisioning forever.
+	if cfg.DevcontainerBuildTimeout > 0 {
+		var cancel context.CancelFunc
+		ctx, cancel = context.WithTimeout(ctx, cfg.DevcontainerBuildTimeout)
+		defer cancel()
+	}
+
+	slog.Info("Starting devcontainer for workspace", "workspaceDir", cfg.WorkspaceDir, "buildTimeout", cfg.DevcontainerBuildTimeout)
 
 	hasConfig := hasDevcontainerConfig(cfg.WorkspaceDir)
 	usedFallback := false
diff --git a/packages/vm-agent/internal/config/config.go b/packages/vm-agent/internal/config/config.go
index 31dbcd2b9..e763d4099 100644
--- a/packages/vm-agent/internal/config/config.go
+++ b/packages/vm-agent/internal/config/config.go
@@ -133,6 +133,14 @@ type Config struct {
 	DefaultDevcontainerConfigPath string // Path to write the generated default config
 	DefaultDevcontainerRemoteUser string // remoteUser for the default config (empty = omit, let image default)
 
+	// Cloud provider name (hetzner, scaleway, gcp) — set via cloud-init.
+	// Used for provider-specific configuration (e.g., apt mirror selection).
+	Provider string // (env: PROVIDER)
+
+	// DevcontainerBuildTimeout is the maximum duration allowed for devcontainer up.
+	// After this timeout the build context is cancelled.
+	DevcontainerBuildTimeout time.Duration // (env: DEVCONTAINER_BUILD_TIMEOUT, default: 15m)
+
 	// Project linkage — set via cloud-init when the workspace belongs to a project.
 	// If ProjectID is empty, the message reporter is disabled (no-op).
 	ProjectID     string // Linked project ID (env: PROJECT_ID)
@@ -324,6 +332,12 @@ func Load() (*Config, error) {
 		DefaultDevcontainerConfigPath: getEnv("DEFAULT_DEVCONTAINER_CONFIG_PATH", DefaultDevcontainerConfigPath),
 		DefaultDevcontainerRemoteUser: getEnv("DEFAULT_DEVCONTAINER_REMOTE_USER", ""), // Empty = omit, use image default
 
+		// Provider awareness (set via cloud-init)
+		Provider: getEnv("PROVIDER", ""),
+
+		// Devcontainer build timeout — configurable per constitution principle XI
+		DevcontainerBuildTimeout: getEnvDuration("DEVCONTAINER_BUILD_TIMEOUT", 15*time.Minute),
+
 		// Project linkage (set via cloud-init)
 		ProjectID:     getEnv("PROJECT_ID", ""),
 		ChatSessionID: getEnv("CHAT_SESSION_ID", ""),
diff --git a/packages/vm-agent/internal/config/config_test.go b/packages/vm-agent/internal/config/config_test.go
index 392ef65a8..60564c2c1 100644
--- a/packages/vm-agent/internal/config/config_test.go
+++ b/packages/vm-agent/internal/config/config_test.go
@@ -866,3 +866,37 @@ func TestCloseIdleControlPlaneConnectionsFlushesPool(t *testing.T) {
 		t.Fatalf("expected 2 total connections after flush (pool was purged), got %d", afterFlush)
 	}
 }
+
+func TestProviderAndDevcontainerBuildTimeoutConfig(t *testing.T) {
+	t.Run("defaults", func(t *testing.T) {
+		t.Setenv("CONTROL_PLANE_URL", "https://api.example.com")
+		t.Setenv("NODE_ID", "test-node")
+		cfg, err := Load()
+		if err != nil {
+			t.Fatalf("Load() error: %v", err)
+		}
+		if cfg.Provider != "" {
+			t.Errorf("expected empty Provider, got %q", cfg.Provider)
+		}
+		if cfg.DevcontainerBuildTimeout != 15*time.Minute {
+			t.Errorf("expected DevcontainerBuildTimeout 15m, got %v", cfg.DevcontainerBuildTimeout)
+		}
+	})
+
+	t.Run("env overrides", func(t *testing.T) {
+		t.Setenv("CONTROL_PLANE_URL", "https://api.example.com")
+		t.Setenv("NODE_ID", "test-node")
+		t.Setenv("PROVIDER", "hetzner")
+		t.Setenv("DEVCONTAINER_BUILD_TIMEOUT", "5m")
+		cfg, err := Load()
+		if err != nil {
+			t.Fatalf("Load() error: %v", err)
+		}
+		if cfg.Provider != "hetzner" {
+			t.Errorf("expected Provider hetzner, got %q", cfg.Provider)
+		}
+		if cfg.DevcontainerBuildTimeout != 5*time.Minute {
+			t.Errorf("expected DevcontainerBuildTimeout 5m, got %v", cfg.DevcontainerBuildTimeout)
+		}
+	})
+}