From 4b3bb4ef9c7ab959ffafe55909eec8a94050a7bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Titsworth-Morin?= Date: Tue, 5 May 2026 09:38:01 +0000 Subject: [PATCH 1/6] task: move devcontainer-network-resilience to active Co-Authored-By: Claude Opus 4.6 --- .../2026-05-05-devcontainer-network-resilience.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tasks/{backlog => active}/2026-05-05-devcontainer-network-resilience.md (100%) diff --git a/tasks/backlog/2026-05-05-devcontainer-network-resilience.md b/tasks/active/2026-05-05-devcontainer-network-resilience.md similarity index 100% rename from tasks/backlog/2026-05-05-devcontainer-network-resilience.md rename to tasks/active/2026-05-05-devcontainer-network-resilience.md From 8a47da82860a7531405ab87654a957dd6303d77a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Titsworth-Morin?= Date: Tue, 5 May 2026 09:49:45 +0000 Subject: [PATCH 2/6] fix: add devcontainer network resilience (apt mirrors, build timeout, retries) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Thread cloud provider through cloud-init to VM agent via PROVIDER env var - Add provider-specific apt mirror script (/etc/sam/apt-mirror-config.sh) that maps Hetzner → mirror.hetzner.com for container apt operations - Add apt retry config (3 retries, 30s timeout) on host via cloud-init - Add configurable devcontainer build timeout (default: 15min) to prevent indefinite hangs when apt/network is degraded - VM agent injects apt mirror config into containers before package installs - Validate provider field in cloud-init variable validation Fixes intermittent devcontainer build failures caused by containers using archive.ubuntu.com instead of Hetzner's local mirror, combined with no build timeout causing 30min+ hangs during Ubuntu repo outages. Co-Authored-By: Claude Opus 4.6 --- apps/api/src/services/nodes.ts | 1 + packages/cloud-init/src/generate.ts | 12 +++ packages/cloud-init/src/index.ts | 4 +- packages/cloud-init/src/template.ts | 26 ++++++ packages/cloud-init/tests/generate.test.ts | 87 +++++++++++++++++++ .../vm-agent/internal/bootstrap/bootstrap.go | 80 ++++++++++++++++- .../internal/bootstrap/bootstrap_test.go | 41 +++++++++ packages/vm-agent/internal/config/config.go | 13 +++ .../vm-agent/internal/config/config_test.go | 54 ++++++++++++ 9 files changed, 313 insertions(+), 5 deletions(-) diff --git a/apps/api/src/services/nodes.ts b/apps/api/src/services/nodes.ts index ff12eb958..6d94ffe31 100644 --- a/apps/api/src/services/nodes.ts +++ b/apps/api/src/services/nodes.ts @@ -153,6 +153,7 @@ export async function provisionNode( controlPlaneUrl: `https://api.${env.BASE_DOMAIN}`, jwksUrl: `https://api.${env.BASE_DOMAIN}/.well-known/jwks.json`, callbackToken, + provider: targetProvider, logJournalMaxUse: env.LOG_JOURNAL_MAX_USE, logJournalKeepFree: env.LOG_JOURNAL_KEEP_FREE, logJournalMaxRetention: env.LOG_JOURNAL_MAX_RETENTION, diff --git a/packages/cloud-init/src/generate.ts b/packages/cloud-init/src/generate.ts index 1171db868..e8c74d3c6 100644 --- a/packages/cloud-init/src/generate.ts +++ b/packages/cloud-init/src/generate.ts @@ -93,6 +93,11 @@ export function validateCloudInitVariables(variables: CloudInitVariables): void errors.push(`taskMode: must be 'task' or 'conversation' (got ${JSON.stringify(variables.taskMode)})`); } } + if (variables.provider !== undefined && variables.provider !== '') { + if (!VALID_CLOUD_PROVIDERS.includes(variables.provider as CloudProvider)) { + errors.push(`provider: must be one of ${VALID_CLOUD_PROVIDERS.join(', ')} (got ${JSON.stringify(variables.provider)})`); + } + } if (variables.logJournalMaxUse !== undefined && variables.logJournalMaxUse !== '') { if (!JOURNALD_SIZE_RE.test(variables.logJournalMaxUse)) { errors.push(`logJournalMaxUse: must match ${JOURNALD_SIZE_RE} (got ${JSON.stringify(variables.logJournalMaxUse)})`); @@ -129,6 +134,10 @@ export function validateCloudInitVariables(variables: CloudInitVariables): void } } +/** Valid cloud provider values for cloud-init. */ +export const VALID_CLOUD_PROVIDERS = ['hetzner', 'scaleway', 'gcp'] as const; +export type CloudProvider = (typeof VALID_CLOUD_PROVIDERS)[number]; + /** * Variables for cloud-init generation. */ @@ -138,6 +147,8 @@ export interface CloudInitVariables { controlPlaneUrl: string; jwksUrl: string; callbackToken: string; + /** Cloud provider (hetzner, scaleway, gcp). Used for provider-specific apt mirrors. */ + provider?: string; /** journald SystemMaxUse (default: 500M) */ logJournalMaxUse?: string; /** journald SystemKeepFree (default: 1G) */ @@ -204,6 +215,7 @@ export function generateCloudInit( '{{ tls_cert_path }}': variables.originCaCert ? '/etc/sam/tls/origin-ca.pem' : '', '{{ tls_key_path }}': variables.originCaCert ? '/etc/sam/tls/origin-ca-key.pem' : '', '{{ cf_ip_fetch_timeout }}': variables.cfIpFetchTimeout ?? '10', + '{{ provider }}': variables.provider ?? '', }; // Use function replacement to prevent $-pattern interpretation in values. diff --git a/packages/cloud-init/src/index.ts b/packages/cloud-init/src/index.ts index c26ea5b98..6324a03fe 100644 --- a/packages/cloud-init/src/index.ts +++ b/packages/cloud-init/src/index.ts @@ -1,3 +1,3 @@ -export { generateCloudInit, HETZNER_USER_DATA_MAX_BYTES, validateCloudInitSize, validateCloudInitVariables } from './generate'; -export type { CloudInitVariables, GenerateCloudInitOptions } from './generate'; +export { generateCloudInit, HETZNER_USER_DATA_MAX_BYTES, VALID_CLOUD_PROVIDERS, validateCloudInitSize, validateCloudInitVariables } from './generate'; +export type { CloudInitVariables, CloudProvider, GenerateCloudInitOptions } from './generate'; export { CLOUD_INIT_TEMPLATE } from './template'; diff --git a/packages/cloud-init/src/template.ts b/packages/cloud-init/src/template.ts index 751a2d6fd..212af8240 100644 --- a/packages/cloud-init/src/template.ts +++ b/packages/cloud-init/src/template.ts @@ -75,6 +75,7 @@ write_files: Environment=VM_AGENT_PORT={{ vm_agent_port }} Environment=TLS_CERT_PATH={{ tls_cert_path }} Environment=TLS_KEY_PATH={{ tls_key_path }} + Environment=PROVIDER={{ provider }} ExecStart=/usr/local/bin/vm-agent Restart=always RestartSec=5 @@ -280,5 +281,30 @@ write_files: {{ origin_ca_key }} permissions: '0600' + - path: /etc/apt/apt.conf.d/80-retries + content: | + Acquire::Retries "3"; + Acquire::http::Timeout "30"; + Acquire::https::Timeout "30"; + permissions: '0644' + + - path: /etc/sam/apt-mirror-config.sh + permissions: '0755' + content: | + #!/bin/bash + # Provider-specific apt mirror configuration for Docker containers. + # Sourced by the VM agent bootstrap to inject fast mirrors into containers. + # Only overrides for providers with known fast local mirrors. + PROVIDER="{{ provider }}" + case "$PROVIDER" in + hetzner) + APT_MIRROR="mirror.hetzner.com" + ;; + *) + APT_MIRROR="" + ;; + esac + export APT_MIRROR + final_message: "Simple Agent Manager node {{ node_id }} provisioning started!" `; diff --git a/packages/cloud-init/tests/generate.test.ts b/packages/cloud-init/tests/generate.test.ts index 2a9b8679c..3f96213f2 100644 --- a/packages/cloud-init/tests/generate.test.ts +++ b/packages/cloud-init/tests/generate.test.ts @@ -1299,6 +1299,93 @@ describe('regex injection prevention ($-pattern in replacement values)', () => { }); }); +describe('provider field and apt mirror configuration', () => { + it('substitutes PROVIDER env var in systemd service when provider is set', () => { + const config = generateCloudInit(baseVariables({ provider: 'hetzner' })); + expect(config).toContain('Environment=PROVIDER=hetzner'); + }); + + it('produces empty PROVIDER env var when provider is undefined', () => { + const config = generateCloudInit(baseVariables()); + expect(config).toContain('Environment=PROVIDER='); + expect(config).not.toContain('PROVIDER=undefined'); + }); + + it('includes apt retry configuration in write_files', () => { + const config = generateCloudInit(baseVariables()); + const parsed = YAML.parse(config); + + const aptRetry = parsed.write_files.find( + (f: { path: string }) => f.path === '/etc/apt/apt.conf.d/80-retries' + ); + expect(aptRetry).toBeDefined(); + expect(aptRetry.content).toContain('Acquire::Retries "3"'); + expect(aptRetry.content).toContain('Acquire::http::Timeout "30"'); + expect(aptRetry.content).toContain('Acquire::https::Timeout "30"'); + }); + + it('includes provider-specific apt mirror script in write_files', () => { + const config = generateCloudInit(baseVariables({ provider: 'hetzner' })); + const parsed = YAML.parse(config); + + const mirrorScript = parsed.write_files.find( + (f: { path: string }) => f.path === '/etc/sam/apt-mirror-config.sh' + ); + expect(mirrorScript).toBeDefined(); + expect(mirrorScript.permissions).toBe('0755'); + expect(mirrorScript.content).toContain('PROVIDER="hetzner"'); + expect(mirrorScript.content).toContain('APT_MIRROR="mirror.hetzner.com"'); + }); + + it('apt mirror script sets empty APT_MIRROR for non-hetzner providers', () => { + const config = generateCloudInit(baseVariables({ provider: 'scaleway' })); + const parsed = YAML.parse(config); + + const mirrorScript = parsed.write_files.find( + (f: { path: string }) => f.path === '/etc/sam/apt-mirror-config.sh' + ); + expect(mirrorScript).toBeDefined(); + expect(mirrorScript.content).toContain('PROVIDER="scaleway"'); + // The default case sets APT_MIRROR="" + expect(mirrorScript.content).toContain('APT_MIRROR=""'); + }); + + it('apt mirror script sets empty PROVIDER when provider is omitted', () => { + const config = generateCloudInit(baseVariables()); + const parsed = YAML.parse(config); + + const mirrorScript = parsed.write_files.find( + (f: { path: string }) => f.path === '/etc/sam/apt-mirror-config.sh' + ); + expect(mirrorScript).toBeDefined(); + expect(mirrorScript.content).toContain('PROVIDER=""'); + }); +}); + +describe('validateCloudInitVariables — provider field', () => { + it('accepts valid provider values', () => { + for (const provider of ['hetzner', 'scaleway', 'gcp']) { + expect(() => validateCloudInitVariables(baseVariables({ provider }))).not.toThrow(); + } + }); + + it('accepts empty string for provider', () => { + expect(() => validateCloudInitVariables(baseVariables({ provider: '' }))).not.toThrow(); + }); + + it('accepts undefined provider', () => { + expect(() => validateCloudInitVariables(baseVariables({ provider: undefined }))).not.toThrow(); + }); + + it('rejects invalid provider', () => { + expect(() => validateCloudInitVariables(baseVariables({ provider: 'aws' }))).toThrow('provider'); + }); + + it('rejects provider with shell metacharacters', () => { + expect(() => validateCloudInitVariables(baseVariables({ provider: 'hetzner; rm -rf /' }))).toThrow('provider'); + }); +}); + describe('integrated size validation in generateCloudInit', () => { it('throws when output exceeds 32KB (default behavior)', () => { // Create variables that will produce a config exceeding 32KB diff --git a/packages/vm-agent/internal/bootstrap/bootstrap.go b/packages/vm-agent/internal/bootstrap/bootstrap.go index d330e3331..7515b9847 100644 --- a/packages/vm-agent/internal/bootstrap/bootstrap.go +++ b/packages/vm-agent/internal/bootstrap/bootstrap.go @@ -208,6 +208,12 @@ func Run(ctx context.Context, cfg *config.Config, reporter *bootlog.Reporter) er reporter.Log("devcontainer_up", "completed", "Devcontainer ready") } + // Inject provider-specific apt mirror config into the container before any package installs. + // Non-fatal: if injection fails, apt will fall back to default archive.ubuntu.com. + if containerID, findErr := findDevcontainerID(ctx, cfg); findErr == nil { + injectAptMirrorConfig(ctx, cfg, containerID) + } + // Ensure gh CLI is available (install if missing from custom devcontainers). // Non-fatal: workspace still works without gh, just can't create PRs. reporter.Log("gh_cli", "started", "Checking GitHub CLI availability") @@ -362,6 +368,11 @@ func PrepareWorkspace(ctx context.Context, cfg *config.Config, state ProvisionSt } } + // Inject provider-specific apt mirror config into the container before any package installs. + if containerID, findErr := findDevcontainerID(ctx, cfg); findErr == nil { + injectAptMirrorConfig(ctx, cfg, containerID) + } + // Ensure gh CLI is available (install if missing from custom devcontainers). reporter.Log("gh_cli", "started", "Checking GitHub CLI availability") if err := ensureGitHubCLI(ctx, cfg); err != nil { @@ -857,11 +868,13 @@ func ensureDevcontainerReady(ctx context.Context, cfg *config.Config, volumeName slog.Info("Repo has its own devcontainer config, skipping additional-features injection") } - cmd := exec.CommandContext(ctx, "devcontainer", args...) + buildCtx, buildCancel := devcontainerBuildContext(ctx, cfg) + defer buildCancel() + cmd := exec.CommandContext(buildCtx, "devcontainer", args...) output, err := cmd.CombinedOutput() if err != nil { // Repo config failed — log the error and fall back to default image. - slog.Warn("Devcontainer build failed with repo config, falling back to default image", "error", err, "output", strings.TrimSpace(string(output))) + slog.Warn("Devcontainer build failed with repo config, falling back to default image", "error", err, "output", strings.TrimSpace(string(output)), "timedOut", buildCtx.Err() == context.DeadlineExceeded) var fallbackErr error usedFallback, fallbackErr = fallbackToDefaultDevcontainer(ctx, cfg, volumeName, credHelperHostPath, err, output) if fallbackErr != nil { @@ -871,7 +884,9 @@ func ensureDevcontainerReady(ctx context.Context, cfg *config.Config, volumeName } } else { // No config — use default. - _, err := runDevcontainerWithDefault(ctx, cfg, volumeName, credHelperHostPath) + buildCtx, buildCancel := devcontainerBuildContext(ctx, cfg) + defer buildCancel() + _, err := runDevcontainerWithDefault(buildCtx, cfg, volumeName, credHelperHostPath) if err != nil { return false, err } @@ -887,6 +902,65 @@ func ensureDevcontainerReady(ctx context.Context, cfg *config.Config, volumeName return usedFallback, nil } +// devcontainerBuildContext wraps the parent context with a DevcontainerBuildTimeout deadline. +// This prevents devcontainer up from hanging indefinitely when network/apt operations fail. +func devcontainerBuildContext(parent context.Context, cfg *config.Config) (context.Context, context.CancelFunc) { + if cfg.DevcontainerBuildTimeout > 0 { + slog.Info("Applying devcontainer build timeout", "timeout", cfg.DevcontainerBuildTimeout) + return context.WithTimeout(parent, cfg.DevcontainerBuildTimeout) + } + return context.WithCancel(parent) +} + +// injectAptMirrorConfig injects provider-specific apt mirror configuration into a running container. +// This ensures containers on Hetzner use mirror.hetzner.com instead of archive.ubuntu.com, +// which is slow/unreachable through Docker bridge NAT on Hetzner networks. +func injectAptMirrorConfig(ctx context.Context, cfg *config.Config, containerID string) { + if cfg.Provider == "" { + return + } + + // Read the provider-specific mirror config from the host + mirrorScript := fmt.Sprintf(`set -e +# Source provider mirror config written by cloud-init +if [ -f /etc/sam/apt-mirror-config.sh ]; then + . /etc/sam/apt-mirror-config.sh +fi + +if [ -z "$APT_MIRROR" ]; then + exit 0 +fi + +# Write apt mirror sources list and retry config into the container +docker exec -u root %s sh -c " + # Add retry config + mkdir -p /etc/apt/apt.conf.d + echo 'Acquire::Retries \"3\";' > /etc/apt/apt.conf.d/80-retries + echo 'Acquire::http::Timeout \"30\";' >> /etc/apt/apt.conf.d/80-retries + echo 'Acquire::https::Timeout \"30\";' >> /etc/apt/apt.conf.d/80-retries + + # Override sources to use provider mirror + if [ -f /etc/apt/sources.list ]; then + sed -i \"s|http://archive.ubuntu.com|http://$APT_MIRROR|g\" /etc/apt/sources.list + sed -i \"s|http://security.ubuntu.com|http://$APT_MIRROR|g\" /etc/apt/sources.list + fi + # Handle DEB822 format (Ubuntu 24.04+) + if [ -f /etc/apt/sources.list.d/ubuntu.sources ]; then + sed -i \"s|http://archive.ubuntu.com|http://$APT_MIRROR|g\" /etc/apt/sources.list.d/ubuntu.sources + sed -i \"s|http://security.ubuntu.com|http://$APT_MIRROR|g\" /etc/apt/sources.list.d/ubuntu.sources + fi +" +`, containerID) + + cmd := exec.CommandContext(ctx, "sh", "-c", mirrorScript) + output, err := cmd.CombinedOutput() + if err != nil { + slog.Warn("Failed to inject apt mirror config into container (non-fatal)", "error", err, "output", strings.TrimSpace(string(output)), "provider", cfg.Provider) + return + } + slog.Info("Injected apt mirror config into container", "provider", cfg.Provider, "containerID", containerID) +} + func fallbackToDefaultDevcontainer( ctx context.Context, cfg *config.Config, diff --git a/packages/vm-agent/internal/bootstrap/bootstrap_test.go b/packages/vm-agent/internal/bootstrap/bootstrap_test.go index a5e9dc9a6..470cc8979 100644 --- a/packages/vm-agent/internal/bootstrap/bootstrap_test.go +++ b/packages/vm-agent/internal/bootstrap/bootstrap_test.go @@ -2990,3 +2990,44 @@ func TestWriteDefaultDevcontainerConfigWithVolumeAndCredentialHelper(t *testing. t.Fatal("expected containerEnv for credential helper") } } + +func TestDevcontainerBuildContext_WithTimeout(t *testing.T) { + cfg := &config.Config{ + DevcontainerBuildTimeout: 5 * time.Second, + } + parent := context.Background() + ctx, cancel := devcontainerBuildContext(parent, cfg) + defer cancel() + + deadline, ok := ctx.Deadline() + if !ok { + t.Fatal("expected context to have a deadline") + } + // Deadline should be approximately 5s from now + remaining := time.Until(deadline) + if remaining < 4*time.Second || remaining > 6*time.Second { + t.Fatalf("expected ~5s remaining, got %v", remaining) + } +} + +func TestDevcontainerBuildContext_ZeroTimeout(t *testing.T) { + cfg := &config.Config{ + DevcontainerBuildTimeout: 0, + } + parent := context.Background() + ctx, cancel := devcontainerBuildContext(parent, cfg) + defer cancel() + + _, ok := ctx.Deadline() + if ok { + t.Fatal("expected context to have no deadline when timeout is 0") + } +} + +func TestInjectAptMirrorConfig_EmptyProvider(t *testing.T) { + cfg := &config.Config{ + Provider: "", + } + // Should not panic or error with empty provider — it's a no-op + injectAptMirrorConfig(context.Background(), cfg, "nonexistent-container-id") +} diff --git a/packages/vm-agent/internal/config/config.go b/packages/vm-agent/internal/config/config.go index 31dbcd2b9..522b04817 100644 --- a/packages/vm-agent/internal/config/config.go +++ b/packages/vm-agent/internal/config/config.go @@ -133,6 +133,13 @@ type Config struct { DefaultDevcontainerConfigPath string // Path to write the generated default config DefaultDevcontainerRemoteUser string // remoteUser for the default config (empty = omit, let image default) + // Devcontainer build timeout — prevents indefinite hangs when apt/network fails. + // Configurable per constitution principle XI. + DevcontainerBuildTimeout time.Duration // Max time for a single devcontainer up call (env: DEVCONTAINER_BUILD_TIMEOUT, default: 15m) + + // Cloud provider — used for provider-specific optimizations (apt mirrors, etc.) + Provider string // Cloud provider name (env: PROVIDER, e.g. "hetzner", "scaleway", "gcp") + // Project linkage — set via cloud-init when the workspace belongs to a project. // If ProjectID is empty, the message reporter is disabled (no-op). ProjectID string // Linked project ID (env: PROJECT_ID) @@ -324,6 +331,12 @@ func Load() (*Config, error) { DefaultDevcontainerConfigPath: getEnv("DEFAULT_DEVCONTAINER_CONFIG_PATH", DefaultDevcontainerConfigPath), DefaultDevcontainerRemoteUser: getEnv("DEFAULT_DEVCONTAINER_REMOTE_USER", ""), // Empty = omit, use image default + // Devcontainer build timeout — prevents indefinite hangs on network failures. + DevcontainerBuildTimeout: getEnvDuration("DEVCONTAINER_BUILD_TIMEOUT", 15*time.Minute), + + // Cloud provider (set via cloud-init) + Provider: getEnv("PROVIDER", ""), + // Project linkage (set via cloud-init) ProjectID: getEnv("PROJECT_ID", ""), ChatSessionID: getEnv("CHAT_SESSION_ID", ""), diff --git a/packages/vm-agent/internal/config/config_test.go b/packages/vm-agent/internal/config/config_test.go index 392ef65a8..acb14dd4d 100644 --- a/packages/vm-agent/internal/config/config_test.go +++ b/packages/vm-agent/internal/config/config_test.go @@ -866,3 +866,57 @@ func TestCloseIdleControlPlaneConnectionsFlushesPool(t *testing.T) { t.Fatalf("expected 2 total connections after flush (pool was purged), got %d", afterFlush) } } + +func TestDevcontainerBuildTimeoutDefault(t *testing.T) { + t.Setenv("CONTROL_PLANE_URL", "https://api.example.com") + t.Setenv("WORKSPACE_ID", "ws-123") + + cfg, err := Load() + if err != nil { + t.Fatalf("Load returned error: %v", err) + } + if cfg.DevcontainerBuildTimeout != 15*time.Minute { + t.Fatalf("DevcontainerBuildTimeout=%v, want %v", cfg.DevcontainerBuildTimeout, 15*time.Minute) + } +} + +func TestDevcontainerBuildTimeoutOverride(t *testing.T) { + t.Setenv("CONTROL_PLANE_URL", "https://api.example.com") + t.Setenv("WORKSPACE_ID", "ws-123") + t.Setenv("DEVCONTAINER_BUILD_TIMEOUT", "25m") + + cfg, err := Load() + if err != nil { + t.Fatalf("Load returned error: %v", err) + } + if cfg.DevcontainerBuildTimeout != 25*time.Minute { + t.Fatalf("DevcontainerBuildTimeout=%v, want %v", cfg.DevcontainerBuildTimeout, 25*time.Minute) + } +} + +func TestProviderDefault(t *testing.T) { + t.Setenv("CONTROL_PLANE_URL", "https://api.example.com") + t.Setenv("WORKSPACE_ID", "ws-123") + + cfg, err := Load() + if err != nil { + t.Fatalf("Load returned error: %v", err) + } + if cfg.Provider != "" { + t.Fatalf("Provider=%q, want empty string", cfg.Provider) + } +} + +func TestProviderOverride(t *testing.T) { + t.Setenv("CONTROL_PLANE_URL", "https://api.example.com") + t.Setenv("WORKSPACE_ID", "ws-123") + t.Setenv("PROVIDER", "hetzner") + + cfg, err := Load() + if err != nil { + t.Fatalf("Load returned error: %v", err) + } + if cfg.Provider != "hetzner" { + t.Fatalf("Provider=%q, want %q", cfg.Provider, "hetzner") + } +} From e42cfb27e9a5058143a6420c688d3bb2cf72a541 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Titsworth-Morin?= Date: Tue, 5 May 2026 10:00:38 +0000 Subject: [PATCH 3/6] fix: address go-specialist review findings (shell injection, context leak) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - CRITICAL: Eliminate sh -c outer shell in injectAptMirrorConfig — use exec.Command("docker","exec",...) with containerID as a direct argument to prevent shell injection. Mirror hostname is resolved in Go and validated with a hostname regex before use. - HIGH: Call buildCancel() immediately after cmd.CombinedOutput() returns instead of relying solely on defer, releasing the timer goroutine before the potentially long fallback build runs. - MEDIUM: Log findDevcontainerID failure at slog.Debug level at both call sites so silent skips are diagnosable. - MEDIUM: Add // Non-fatal comment to second call site for consistency. - LOW: Downgrade devcontainerBuildContext log from Info to Debug. - LOW: Document zero-means-disabled contract in devcontainerBuildContext. Co-Authored-By: Claude Opus 4.6 --- .../vm-agent/internal/bootstrap/bootstrap.go | 83 +++++++++++-------- 1 file changed, 50 insertions(+), 33 deletions(-) diff --git a/packages/vm-agent/internal/bootstrap/bootstrap.go b/packages/vm-agent/internal/bootstrap/bootstrap.go index 7515b9847..7b78e5904 100644 --- a/packages/vm-agent/internal/bootstrap/bootstrap.go +++ b/packages/vm-agent/internal/bootstrap/bootstrap.go @@ -212,6 +212,8 @@ func Run(ctx context.Context, cfg *config.Config, reporter *bootlog.Reporter) er // Non-fatal: if injection fails, apt will fall back to default archive.ubuntu.com. if containerID, findErr := findDevcontainerID(ctx, cfg); findErr == nil { injectAptMirrorConfig(ctx, cfg, containerID) + } else { + slog.Debug("Could not find devcontainer for apt mirror injection (non-fatal)", "error", findErr) } // Ensure gh CLI is available (install if missing from custom devcontainers). @@ -369,8 +371,11 @@ func PrepareWorkspace(ctx context.Context, cfg *config.Config, state ProvisionSt } // Inject provider-specific apt mirror config into the container before any package installs. + // Non-fatal: if injection fails, apt will fall back to default archive.ubuntu.com. if containerID, findErr := findDevcontainerID(ctx, cfg); findErr == nil { injectAptMirrorConfig(ctx, cfg, containerID) + } else { + slog.Debug("Could not find devcontainer for apt mirror injection (non-fatal)", "error", findErr) } // Ensure gh CLI is available (install if missing from custom devcontainers). @@ -869,9 +874,9 @@ func ensureDevcontainerReady(ctx context.Context, cfg *config.Config, volumeName } buildCtx, buildCancel := devcontainerBuildContext(ctx, cfg) - defer buildCancel() cmd := exec.CommandContext(buildCtx, "devcontainer", args...) output, err := cmd.CombinedOutput() + buildCancel() // Release timer immediately; fallback uses parent ctx. if err != nil { // Repo config failed — log the error and fall back to default image. slog.Warn("Devcontainer build failed with repo config, falling back to default image", "error", err, "output", strings.TrimSpace(string(output)), "timedOut", buildCtx.Err() == context.DeadlineExceeded) @@ -904,9 +909,11 @@ func ensureDevcontainerReady(ctx context.Context, cfg *config.Config, volumeName // devcontainerBuildContext wraps the parent context with a DevcontainerBuildTimeout deadline. // This prevents devcontainer up from hanging indefinitely when network/apt operations fail. +// If DevcontainerBuildTimeout is zero (e.g. DEVCONTAINER_BUILD_TIMEOUT=0), no deadline is +// applied and only parent cancellation is forwarded. func devcontainerBuildContext(parent context.Context, cfg *config.Config) (context.Context, context.CancelFunc) { if cfg.DevcontainerBuildTimeout > 0 { - slog.Info("Applying devcontainer build timeout", "timeout", cfg.DevcontainerBuildTimeout) + slog.Debug("Applying devcontainer build timeout", "timeout", cfg.DevcontainerBuildTimeout) return context.WithTimeout(parent, cfg.DevcontainerBuildTimeout) } return context.WithCancel(parent) @@ -915,50 +922,60 @@ func devcontainerBuildContext(parent context.Context, cfg *config.Config) (conte // injectAptMirrorConfig injects provider-specific apt mirror configuration into a running container. // This ensures containers on Hetzner use mirror.hetzner.com instead of archive.ubuntu.com, // which is slow/unreachable through Docker bridge NAT on Hetzner networks. +// Non-fatal: if injection fails, apt will fall back to default archive.ubuntu.com. func injectAptMirrorConfig(ctx context.Context, cfg *config.Config, containerID string) { if cfg.Provider == "" { return } - // Read the provider-specific mirror config from the host - mirrorScript := fmt.Sprintf(`set -e -# Source provider mirror config written by cloud-init -if [ -f /etc/sam/apt-mirror-config.sh ]; then - . /etc/sam/apt-mirror-config.sh -fi + // Resolve the apt mirror from the host-side config written by cloud-init. + mirror := resolveAptMirror(cfg.Provider) + if mirror == "" { + return + } -if [ -z "$APT_MIRROR" ]; then - exit 0 -fi + // Validate mirror is a safe hostname (alphanumeric, dots, hyphens only). + if !isValidMirrorHostname(mirror) { + slog.Warn("APT_MIRROR value looks unsafe, skipping injection", "mirror", mirror, "provider", cfg.Provider) + return + } -# Write apt mirror sources list and retry config into the container -docker exec -u root %s sh -c " - # Add retry config - mkdir -p /etc/apt/apt.conf.d - echo 'Acquire::Retries \"3\";' > /etc/apt/apt.conf.d/80-retries - echo 'Acquire::http::Timeout \"30\";' >> /etc/apt/apt.conf.d/80-retries - echo 'Acquire::https::Timeout \"30\";' >> /etc/apt/apt.conf.d/80-retries - - # Override sources to use provider mirror - if [ -f /etc/apt/sources.list ]; then - sed -i \"s|http://archive.ubuntu.com|http://$APT_MIRROR|g\" /etc/apt/sources.list - sed -i \"s|http://security.ubuntu.com|http://$APT_MIRROR|g\" /etc/apt/sources.list - fi - # Handle DEB822 format (Ubuntu 24.04+) - if [ -f /etc/apt/sources.list.d/ubuntu.sources ]; then - sed -i \"s|http://archive.ubuntu.com|http://$APT_MIRROR|g\" /etc/apt/sources.list.d/ubuntu.sources - sed -i \"s|http://security.ubuntu.com|http://$APT_MIRROR|g\" /etc/apt/sources.list.d/ubuntu.sources - fi -" -`, containerID) + // Build the inner script that runs inside the container. + // Uses exec.Command with containerID as a direct argument (not shell-interpolated) + // to prevent any injection via containerID. + innerScript := fmt.Sprintf( + `mkdir -p /etc/apt/apt.conf.d && `+ + `printf 'Acquire::Retries "3";\nAcquire::http::Timeout "30";\nAcquire::https::Timeout "30";\n' > /etc/apt/apt.conf.d/80-retries && `+ + `{ [ -f /etc/apt/sources.list ] && sed -i 's|http://archive.ubuntu.com|http://%[1]s|g; s|http://security.ubuntu.com|http://%[1]s|g' /etc/apt/sources.list || true; } && `+ + `{ [ -f /etc/apt/sources.list.d/ubuntu.sources ] && sed -i 's|http://archive.ubuntu.com|http://%[1]s|g; s|http://security.ubuntu.com|http://%[1]s|g' /etc/apt/sources.list.d/ubuntu.sources || true; }`, + mirror, + ) - cmd := exec.CommandContext(ctx, "sh", "-c", mirrorScript) + cmd := exec.CommandContext(ctx, "docker", "exec", "-u", "root", containerID, "sh", "-c", innerScript) output, err := cmd.CombinedOutput() if err != nil { slog.Warn("Failed to inject apt mirror config into container (non-fatal)", "error", err, "output", strings.TrimSpace(string(output)), "provider", cfg.Provider) return } - slog.Info("Injected apt mirror config into container", "provider", cfg.Provider, "containerID", containerID) + slog.Info("Injected apt mirror config into container", "provider", cfg.Provider, "containerID", containerID, "mirror", mirror) +} + +// resolveAptMirror returns the apt mirror hostname for the given cloud provider. +// Returns empty string if no specific mirror is configured for the provider. +func resolveAptMirror(provider string) string { + switch provider { + case "hetzner": + return "mirror.hetzner.com" + default: + return "" + } +} + +// isValidMirrorHostname validates that a mirror value contains only safe hostname characters. +var validMirrorRe = regexp.MustCompile(`^[a-zA-Z0-9]([a-zA-Z0-9.\-]*[a-zA-Z0-9])?$`) + +func isValidMirrorHostname(mirror string) bool { + return validMirrorRe.MatchString(mirror) } func fallbackToDefaultDevcontainer( From b59c4a27107fcc29455cf990cab5640f219b537f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Titsworth-Morin?= Date: Tue, 5 May 2026 10:04:41 +0000 Subject: [PATCH 4/6] fix: separate apt retry config from mirror injection for universal resilience Apt retry config (Acquire::Retries, timeouts) now injects into ALL containers regardless of provider, not just Hetzner. Mirror replacement remains provider-specific. Adds TestInjectAptRetryConfig test. Co-Authored-By: Claude Opus 4.6 --- .../vm-agent/internal/bootstrap/bootstrap.go | 35 +++++++---- .../internal/bootstrap/bootstrap_test.go | 58 +++++++++++++++++++ 2 files changed, 81 insertions(+), 12 deletions(-) diff --git a/packages/vm-agent/internal/bootstrap/bootstrap.go b/packages/vm-agent/internal/bootstrap/bootstrap.go index 7b78e5904..8f446d62a 100644 --- a/packages/vm-agent/internal/bootstrap/bootstrap.go +++ b/packages/vm-agent/internal/bootstrap/bootstrap.go @@ -208,12 +208,13 @@ func Run(ctx context.Context, cfg *config.Config, reporter *bootlog.Reporter) er reporter.Log("devcontainer_up", "completed", "Devcontainer ready") } - // Inject provider-specific apt mirror config into the container before any package installs. - // Non-fatal: if injection fails, apt will fall back to default archive.ubuntu.com. + // Inject apt retry config (all providers) and mirror config (provider-specific) before package installs. + // Non-fatal: if injection fails, apt will use default settings. if containerID, findErr := findDevcontainerID(ctx, cfg); findErr == nil { + injectAptRetryConfig(ctx, containerID) injectAptMirrorConfig(ctx, cfg, containerID) } else { - slog.Debug("Could not find devcontainer for apt mirror injection (non-fatal)", "error", findErr) + slog.Debug("Could not find devcontainer for apt config injection (non-fatal)", "error", findErr) } // Ensure gh CLI is available (install if missing from custom devcontainers). @@ -370,12 +371,13 @@ func PrepareWorkspace(ctx context.Context, cfg *config.Config, state ProvisionSt } } - // Inject provider-specific apt mirror config into the container before any package installs. - // Non-fatal: if injection fails, apt will fall back to default archive.ubuntu.com. + // Inject apt retry config (all providers) and mirror config (provider-specific) before package installs. + // Non-fatal: if injection fails, apt will use default settings. if containerID, findErr := findDevcontainerID(ctx, cfg); findErr == nil { + injectAptRetryConfig(ctx, containerID) injectAptMirrorConfig(ctx, cfg, containerID) } else { - slog.Debug("Could not find devcontainer for apt mirror injection (non-fatal)", "error", findErr) + slog.Debug("Could not find devcontainer for apt config injection (non-fatal)", "error", findErr) } // Ensure gh CLI is available (install if missing from custom devcontainers). @@ -919,6 +921,20 @@ func devcontainerBuildContext(parent context.Context, cfg *config.Config) (conte return context.WithCancel(parent) } +// injectAptRetryConfig injects apt retry and timeout configuration into a running container. +// This makes apt operations resilient to transient network failures regardless of cloud provider. +// Non-fatal: if injection fails, apt will use default settings (no retries). +func injectAptRetryConfig(ctx context.Context, containerID string) { + retryScript := `mkdir -p /etc/apt/apt.conf.d && printf 'Acquire::Retries "3";\nAcquire::http::Timeout "30";\nAcquire::https::Timeout "30";\n' > /etc/apt/apt.conf.d/80-retries` + cmd := exec.CommandContext(ctx, "docker", "exec", "-u", "root", containerID, "sh", "-c", retryScript) + output, err := cmd.CombinedOutput() + if err != nil { + slog.Warn("Failed to inject apt retry config into container (non-fatal)", "error", err, "output", strings.TrimSpace(string(output))) + return + } + slog.Info("Injected apt retry config into container", "containerID", containerID) +} + // injectAptMirrorConfig injects provider-specific apt mirror configuration into a running container. // This ensures containers on Hetzner use mirror.hetzner.com instead of archive.ubuntu.com, // which is slow/unreachable through Docker bridge NAT on Hetzner networks. @@ -928,25 +944,20 @@ func injectAptMirrorConfig(ctx context.Context, cfg *config.Config, containerID return } - // Resolve the apt mirror from the host-side config written by cloud-init. mirror := resolveAptMirror(cfg.Provider) if mirror == "" { return } - // Validate mirror is a safe hostname (alphanumeric, dots, hyphens only). if !isValidMirrorHostname(mirror) { slog.Warn("APT_MIRROR value looks unsafe, skipping injection", "mirror", mirror, "provider", cfg.Provider) return } - // Build the inner script that runs inside the container. // Uses exec.Command with containerID as a direct argument (not shell-interpolated) // to prevent any injection via containerID. innerScript := fmt.Sprintf( - `mkdir -p /etc/apt/apt.conf.d && `+ - `printf 'Acquire::Retries "3";\nAcquire::http::Timeout "30";\nAcquire::https::Timeout "30";\n' > /etc/apt/apt.conf.d/80-retries && `+ - `{ [ -f /etc/apt/sources.list ] && sed -i 's|http://archive.ubuntu.com|http://%[1]s|g; s|http://security.ubuntu.com|http://%[1]s|g' /etc/apt/sources.list || true; } && `+ + `{ [ -f /etc/apt/sources.list ] && sed -i 's|http://archive.ubuntu.com|http://%[1]s|g; s|http://security.ubuntu.com|http://%[1]s|g' /etc/apt/sources.list || true; } && `+ `{ [ -f /etc/apt/sources.list.d/ubuntu.sources ] && sed -i 's|http://archive.ubuntu.com|http://%[1]s|g; s|http://security.ubuntu.com|http://%[1]s|g' /etc/apt/sources.list.d/ubuntu.sources || true; }`, mirror, ) diff --git a/packages/vm-agent/internal/bootstrap/bootstrap_test.go b/packages/vm-agent/internal/bootstrap/bootstrap_test.go index 470cc8979..30c633e79 100644 --- a/packages/vm-agent/internal/bootstrap/bootstrap_test.go +++ b/packages/vm-agent/internal/bootstrap/bootstrap_test.go @@ -3024,6 +3024,11 @@ func TestDevcontainerBuildContext_ZeroTimeout(t *testing.T) { } } +func TestInjectAptRetryConfig(t *testing.T) { + // Should not panic when docker is unavailable — non-fatal by design. + injectAptRetryConfig(context.Background(), "nonexistent-container-id") +} + func TestInjectAptMirrorConfig_EmptyProvider(t *testing.T) { cfg := &config.Config{ Provider: "", @@ -3031,3 +3036,56 @@ func TestInjectAptMirrorConfig_EmptyProvider(t *testing.T) { // Should not panic or error with empty provider — it's a no-op injectAptMirrorConfig(context.Background(), cfg, "nonexistent-container-id") } + +func TestInjectAptMirrorConfig_HetznerProvider(t *testing.T) { + cfg := &config.Config{ + Provider: "hetzner", + } + // With hetzner provider, the function will attempt docker exec with mirror.hetzner.com. + // It will fail (no Docker) but should not panic. The key assertion is that it proceeds + // past the guards and attempts the docker exec (logged as a warning, not a crash). + injectAptMirrorConfig(context.Background(), cfg, "test-container-abc123") +} + +func TestResolveAptMirror(t *testing.T) { + t.Parallel() + + tests := []struct { + provider string + want string + }{ + {"hetzner", "mirror.hetzner.com"}, + {"scaleway", ""}, + {"gcp", ""}, + {"", ""}, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.provider, func(t *testing.T) { + t.Parallel() + got := resolveAptMirror(tc.provider) + if got != tc.want { + t.Fatalf("resolveAptMirror(%q) = %q, want %q", tc.provider, got, tc.want) + } + }) + } +} + +func TestIsValidMirrorHostname(t *testing.T) { + t.Parallel() + + valid := []string{"mirror.hetzner.com", "apt.example.org", "mirror123.test"} + for _, h := range valid { + if !isValidMirrorHostname(h) { + t.Errorf("isValidMirrorHostname(%q) = false, want true", h) + } + } + + invalid := []string{"mirror;rm -rf /", "$(cmd)", "mirror hetzner.com", "", "mirror\n.com"} + for _, h := range invalid { + if isValidMirrorHostname(h) { + t.Errorf("isValidMirrorHostname(%q) = true, want false", h) + } + } +} From 8c951f5e18072b97285feef6f1b864aac83c1fc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Titsworth-Morin?= Date: Tue, 5 May 2026 10:19:27 +0000 Subject: [PATCH 5/6] task: archive devcontainer-network-resilience task Co-Authored-By: Claude Opus 4.6 --- .../2026-05-05-devcontainer-network-resilience.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tasks/{active => archive}/2026-05-05-devcontainer-network-resilience.md (100%) diff --git a/tasks/active/2026-05-05-devcontainer-network-resilience.md b/tasks/archive/2026-05-05-devcontainer-network-resilience.md similarity index 100% rename from tasks/active/2026-05-05-devcontainer-network-resilience.md rename to tasks/archive/2026-05-05-devcontainer-network-resilience.md From 614fd7041b4152fc5af10848a9e7afe736933959 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Titsworth-Morin?= Date: Tue, 5 May 2026 10:25:31 +0000 Subject: [PATCH 6/6] ci: retrigger checks after PR body update Co-Authored-By: Claude Opus 4.6