From 2ffcef28f025b3bff07c1363028117d6f7c82f8b Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Sat, 9 May 2026 16:21:13 -0500 Subject: [PATCH 01/14] fix(gastown): unblock /agents/start during boot hydration; preserve mayor tools on prewarm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three independent fixes for the startAgentInContainer timeout regression introduced by #2974, plus a tighter container-instance cap. 1. Hydration gate (control-server.ts, process-manager.ts) The control server starts accepting requests immediately at boot, while bootHydration runs concurrently and serialises every registry agent + the mayor prewarm through the global sdkServerLock. Fresh /agents/start, /refresh-token, and PATCH /agents/:id/model requests queued behind that work and the DO-side AbortSignal.timeout(60s) fired before they ever got the lock — surfacing as "TimeoutError: aborted due to timeout" and "timeout after 6000ms: ensureSDKServer for ". A new awaitHydration() promise is awaited at the top of those handlers (before any process.env mutation in the model PATCH path) so they don't compound the queue. 2. Prewarm config matches /agents/start (Town.do.ts, gastown.worker.ts, process-manager.ts) buildPrewarmEnv was constructing KILO_CONFIG_CONTENT from hardcoded defaults (anthropic/claude-sonnet-4.6 / claude-haiku-4.5), so the real /agents/start with the user's actual model triggered ensureSDKServer's "config mismatch, evicting prewarmed server" path on every warm restart — doubling lock-holding time on the critical path the prewarm was supposed to speed up. The /api/towns/:id/mayor-id endpoint now returns the full prewarm context (model, smallModel, kilocodeToken, organizationId) resolved the same way _ensureMayor resolves it, and the container builds the prewarm KILO_CONFIG_CONTENT to match. Falls back gracefully to a skip when the worker hasn't deployed the richer endpoint yet. 3. 
Mayor workdir + plugin env (agent-runner.ts, process-manager.ts) prewarmMayorSDK called mayorWorkdirForTown (which only returns a string) and went straight to ensureSDKServer's process.chdir, throwing ENOENT on cold containers because createMayorWorkspace only ran from runAgent. Exported ensureMayorWorkspaceForTown so prewarm materialises the workspace first. More critically, buildPrewarmEnv was missing GASTOWN_AGENT_ROLE, GASTOWN_AGENT_ID, and GASTOWN_TOWN_ID — env vars the kilo serve plugin (plugin/index.ts) reads at spawn to decide whether to register mayor tools. Without them the prewarmed server booted with NO mayor tools, and the cache hit on the next /agents/start handed that defective instance back to the user. Now mirrors the mayor-shaped subset of buildAgentEnv. Added an end-to-end test that intercepts createKilo and asserts the env at spawn time. 4. wrangler.jsonc: lower TownContainerDO max_instances from 800 to 500. Verified with pnpm --filter gastown-container test (67/67 pass), pnpm --filter cloudflare-gastown typecheck, oxlint, and pnpm format. 
--- .../gastown/container/src/agent-runner.ts | 13 ++ .../gastown/container/src/control-server.ts | 25 +++ .../container/src/process-manager.test.ts | 165 +++++++++++++++- .../gastown/container/src/process-manager.ts | 180 ++++++++++++++---- services/gastown/src/dos/Town.do.ts | 37 ++++ services/gastown/src/gastown.worker.ts | 13 ++ services/gastown/wrangler.jsonc | 2 +- 7 files changed, 391 insertions(+), 44 deletions(-) diff --git a/services/gastown/container/src/agent-runner.ts b/services/gastown/container/src/agent-runner.ts index d31511a68c..dfbf868495 100644 --- a/services/gastown/container/src/agent-runner.ts +++ b/services/gastown/container/src/agent-runner.ts @@ -413,6 +413,19 @@ async function createMayorWorkspace(rigId: string): Promise { return createLightweightWorkspace('mayor', rigId); } +/** + * Ensure the mayor workdir exists on disk for a given town, creating + * a lightweight git-initialized workspace if needed. + * + * Used by `prewarmMayorSDK`, which runs before `runAgent` and so cannot + * rely on `createMayorWorkspace` having been called yet — without this, + * `ensureSDKServer` would throw `ENOENT` from `process.chdir(workdir)` + * and the prewarm benefit would never materialize on cold containers. + */ +export async function ensureMayorWorkspaceForTown(townId: string): Promise { + return createMayorWorkspace(`mayor-${townId}`); +} + /** * Write the mayor's system prompt to AGENTS.md in the workspace. 
* diff --git a/services/gastown/container/src/control-server.ts b/services/gastown/container/src/control-server.ts index eddf78c026..36196f1b41 100644 --- a/services/gastown/container/src/control-server.ts +++ b/services/gastown/container/src/control-server.ts @@ -18,6 +18,7 @@ import { registerEventSink, refreshTokenForAllAgents, listAgents, + awaitHydration, } from './process-manager'; import { log } from './logger'; import { startHeartbeat, stopHeartbeat, notifyContainerReady } from './heartbeat'; @@ -264,6 +265,12 @@ app.post('/refresh-token', async c => { } process.env.GASTOWN_CONTAINER_TOKEN = body.token; + // Wait for boot hydration to release the global sdkServerLock before + // we serialise N agent restarts through it. Without this, agent k + // queues behind every in-flight hydration ensureSDKServer and easily + // blows past REFRESH_AGENT_TIMEOUT_MS (6s) on warm-restart containers. + await awaitHydration(); + const activeAgents = listAgents().filter(a => a.status === 'running' || a.status === 'starting'); log.info('refresh_token.received', { agentCount: activeAgents.length, @@ -312,6 +319,15 @@ app.post('/agents/start', async c => { return c.json({ error: 'Invalid request body', issues: parsed.error.issues }, 400); } + // Wait for boot hydration to release the global sdkServerLock. The + // control server starts accepting requests immediately at boot, before + // bootHydration finishes resuming registry agents and prewarming the + // mayor — without this gate, fresh dispatches queue behind every + // serialised SDK spawn and the DO-side AbortSignal.timeout(60s) fires + // before they ever get the lock, surfacing as the + // "startAgentInContainer EXCEPTION TimeoutError" pattern. + await awaitHydration(); + // Persist the organization ID as a standalone env var so it survives // config rebuilds (e.g. model hot-swap). The env var is the primary // source of truth; KILO_CONFIG_CONTENT extraction is the fallback. 
@@ -386,6 +402,15 @@ app.patch('/agents/:agentId/model', async c => { return c.json({ error: 'Invalid request body', issues: parsed.error.issues }, 400); } + // Model hot-swap restarts the SDK server (see updateAgentModel) and + // contends for the same global sdkServerLock that boot hydration is + // holding. Wait for hydration to drain BEFORE the env mutations + // below: concurrent PATCH requests landing during hydration would + // otherwise race on process.env writes before any of them holds the + // SDK lock, and the env visible to the eventual `kilo serve` spawn + // would be non-deterministic. + await awaitHydration(); + // Update org billing context from the request body if provided. if (parsed.data.organizationId) { process.env.GASTOWN_ORGANIZATION_ID = parsed.data.organizationId; diff --git a/services/gastown/container/src/process-manager.test.ts b/services/gastown/container/src/process-manager.test.ts index d2c8906c2a..1afd68112c 100644 --- a/services/gastown/container/src/process-manager.test.ts +++ b/services/gastown/container/src/process-manager.test.ts @@ -5,11 +5,19 @@ import { describe, it, expect, vi } from 'vitest'; vi.mock('@kilocode/sdk', () => ({ createKilo: vi.fn(), })); +// Mock workspace helpers to return a path that actually exists on the +// test runner so ensureSDKServer's process.chdir doesn't ENOENT. 
+const TEST_WORKSPACE = process.cwd(); vi.mock('./agent-runner', () => ({ runAgent: vi.fn(), - buildKiloConfigContent: vi.fn(), + buildKiloConfigContent: vi.fn( + (kilocodeToken: string, model: string, smallModel: string, organizationId?: string) => + JSON.stringify({ kilocodeToken, model, smallModel, organizationId }) + ), resolveGitCredentials: vi.fn(), writeMayorSystemPromptToAgentsMd: vi.fn(), + ensureMayorWorkspaceForTown: vi.fn(async (_townId: string) => TEST_WORKSPACE), + mayorWorkdirForTown: vi.fn((_townId: string) => TEST_WORKSPACE), })); vi.mock('./control-server', () => ({ getCurrentTownConfig: vi.fn(() => ({})), @@ -24,7 +32,8 @@ vi.mock('./token-refresh', () => ({ refreshTokenIfNearExpiry: vi.fn(), })); -const { applyModelToSession, withStartAgentLock } = await import('./process-manager'); +const { applyModelToSession, withStartAgentLock, awaitHydration, bootHydration } = + await import('./process-manager'); type PromptCall = { path: { id: string }; @@ -178,3 +187,155 @@ describe('withStartAgentLock', () => { expect(result).toBe('ok'); }); }); + +describe('awaitHydration', () => { + it('resolves immediately before any bootHydration call', async () => { + // Module-init state must not block /agents/start in test/dev contexts + // where bootHydration never runs. + let resolved = false; + void awaitHydration().then(() => { + resolved = true; + }); + await new Promise(r => setTimeout(r, 0)); + expect(resolved).toBe(true); + }); + + it('prewarms mayor SDK with env that mirrors buildAgentEnv (mayor tools require GASTOWN_AGENT_ROLE/AGENT_ID/TOWN_ID)', async () => { + // Without these env vars in the snapshot kilo serve takes at spawn, + // GastownPlugin (plugin/index.ts) treats the prewarmed mayor as a + // rig agent (or fails the createMayorClientFromEnv guard) and the + // server boots with NO mayor tools. ensureSDKServer's cache hit on + // the next /agents/start hands back that defective server. 
+ const { createKilo } = (await import('@kilocode/sdk')) as unknown as { + createKilo: ReturnType; + }; + + const prev = { + apiUrl: process.env.GASTOWN_API_URL, + townId: process.env.GASTOWN_TOWN_ID, + token: process.env.GASTOWN_CONTAINER_TOKEN, + }; + process.env.GASTOWN_API_URL = 'http://test.invalid'; + process.env.GASTOWN_TOWN_ID = 'town-prewarm'; + process.env.GASTOWN_CONTAINER_TOKEN = 'tok-prewarm'; + + let capturedEnv: Record | null = null; + createKilo.mockImplementationOnce(() => { + // Snapshot the keys plugin/index.ts and plugin/client.ts read. + capturedEnv = { + GASTOWN_AGENT_ID: process.env.GASTOWN_AGENT_ID, + GASTOWN_AGENT_ROLE: process.env.GASTOWN_AGENT_ROLE, + GASTOWN_TOWN_ID: process.env.GASTOWN_TOWN_ID, + GASTOWN_API_URL: process.env.GASTOWN_API_URL, + GASTOWN_CONTAINER_TOKEN: process.env.GASTOWN_CONTAINER_TOKEN, + KILO_CONFIG_CONTENT: process.env.KILO_CONFIG_CONTENT, + }; + return Promise.resolve({ + client: {} as unknown, + server: { url: 'http://127.0.0.1:9999/', close: () => {} }, + }); + }); + + const originalFetch = globalThis.fetch; + globalThis.fetch = vi.fn(async (input: Parameters[0]) => { + const url = typeof input === 'string' ? input : input instanceof URL ? 
input.href : input.url; + if (url.includes('/container-registry')) { + return new Response(JSON.stringify({ data: [] }), { status: 200 }); + } + if (url.includes('/mayor-id')) { + return new Response( + JSON.stringify({ + success: true, + agentId: 'mayor-agent-1', + model: 'anthropic/claude-sonnet-4.6', + smallModel: 'anthropic/claude-haiku-4.5', + kilocodeToken: 'kc-tok', + organizationId: null, + }), + { status: 200 } + ); + } + // db-snapshot etc: 404 -> fresh start + return new Response('not found', { status: 404 }); + }) as unknown as typeof fetch; + + try { + await bootHydration(); + const env = capturedEnv as Record | null; + expect(env).not.toBeNull(); + expect(env).toMatchObject({ + GASTOWN_AGENT_ID: 'mayor-agent-1', + GASTOWN_AGENT_ROLE: 'mayor', + GASTOWN_TOWN_ID: 'town-prewarm', + GASTOWN_CONTAINER_TOKEN: 'tok-prewarm', + }); + expect(env?.KILO_CONFIG_CONTENT).toBeTruthy(); + } finally { + globalThis.fetch = originalFetch; + if (prev.apiUrl !== undefined) process.env.GASTOWN_API_URL = prev.apiUrl; + else delete process.env.GASTOWN_API_URL; + if (prev.townId !== undefined) process.env.GASTOWN_TOWN_ID = prev.townId; + else delete process.env.GASTOWN_TOWN_ID; + if (prev.token !== undefined) process.env.GASTOWN_CONTAINER_TOKEN = prev.token; + else delete process.env.GASTOWN_CONTAINER_TOKEN; + } + }); + + it('blocks awaiters while bootHydration is in flight and releases them when it returns', async () => { + // Drive bootHydration into its registry-fetch path with a fetch + // stub that we can hold open from the test, so we can observe a + // real "in flight" window for the gate. 
+ const prev = { + apiUrl: process.env.GASTOWN_API_URL, + townId: process.env.GASTOWN_TOWN_ID, + token: process.env.GASTOWN_CONTAINER_TOKEN, + }; + process.env.GASTOWN_API_URL = 'http://test.invalid'; + process.env.GASTOWN_TOWN_ID = 'town-test'; + process.env.GASTOWN_CONTAINER_TOKEN = 'tok-test'; + + // Use a single barrier the fetch stub awaits so every call (registry + // fetch + prewarm endpoints) holds the gate until we release it, + // and each call gets its own Response (avoids "body already read"). + let releaseFetch!: () => void; + const fetchBarrier = new Promise(resolve => { + releaseFetch = resolve; + }); + const originalFetch = globalThis.fetch; + globalThis.fetch = vi.fn(async (input: Parameters[0]) => { + await fetchBarrier; + const url = typeof input === 'string' ? input : input instanceof URL ? input.href : input.url; + if (url.includes('/container-registry')) { + return new Response(JSON.stringify({ data: [] }), { status: 200 }); + } + return new Response(JSON.stringify({ success: true, agentId: null }), { status: 200 }); + }) as unknown as typeof fetch; + + try { + const hydrationPromise = bootHydration(); + let awaiterResolved = false; + void awaitHydration().then(() => { + awaiterResolved = true; + }); + + // Yield to let the registry fetch start. The gate is now held + // until the fetch resolves. + await new Promise(r => setTimeout(r, 10)); + expect(awaiterResolved).toBe(false); + + releaseFetch(); + await hydrationPromise; + // After bootHydration returns, the gate must release any awaiters. 
+ await new Promise(r => setTimeout(r, 0)); + expect(awaiterResolved).toBe(true); + } finally { + globalThis.fetch = originalFetch; + if (prev.apiUrl !== undefined) process.env.GASTOWN_API_URL = prev.apiUrl; + else delete process.env.GASTOWN_API_URL; + if (prev.townId !== undefined) process.env.GASTOWN_TOWN_ID = prev.townId; + else delete process.env.GASTOWN_TOWN_ID; + if (prev.token !== undefined) process.env.GASTOWN_CONTAINER_TOKEN = prev.token; + else delete process.env.GASTOWN_CONTAINER_TOKEN; + } + }); +}); diff --git a/services/gastown/container/src/process-manager.ts b/services/gastown/container/src/process-manager.ts index cd626d9a04..440a24d318 100644 --- a/services/gastown/container/src/process-manager.ts +++ b/services/gastown/container/src/process-manager.ts @@ -11,7 +11,11 @@ import { z } from 'zod'; import * as fs from 'node:fs/promises'; import type { ManagedAgent, StartAgentRequest } from './types'; import { reportAgentCompleted, reportMayorWaiting } from './completion-reporter'; -import { buildKiloConfigContent, mayorWorkdirForTown } from './agent-runner'; +import { + buildKiloConfigContent, + ensureMayorWorkspaceForTown, + mayorWorkdirForTown, +} from './agent-runner'; import { getCurrentTownConfig, getLastAppliedEnvVarKeys, @@ -66,6 +70,20 @@ export function isDraining(): boolean { return _draining; } +// Resolved when bootHydration() returns. /agents/start and /refresh-token +// must await this before contending for the global sdkServerLock — without +// this gate, fresh dispatches arriving during boot queue behind every +// in-flight registry agent + the mayor prewarm and the DO-side 60s +// AbortSignal.timeout fires before they ever get the lock. We resolve +// the promise immediately so non-hydrating containers (tests, dev) +// don't block; bootHydration replaces it on entry and resolves it on exit. 
+let _hydrationComplete: Promise = Promise.resolve(); +let _resolveHydration: () => void = () => {}; + +export function awaitHydration(): Promise { + return _hydrationComplete; +} + // Mutex for ensureSDKServer — createKilo() reads process.cwd() and // process.env during startup, so concurrent calls with different workdirs // would corrupt each other's globals. This serializes server creation only; @@ -2609,47 +2627,75 @@ function postEventToWorker(event: string, data: Record): void { }); } -async function fetchMayorAgentId( +type MayorPrewarmContext = { + agentId: string; + model?: string; + smallModel?: string; + kilocodeToken?: string; + organizationId?: string | null; +}; + +// Mirrors the response contract documented at +// `/api/towns/:townId/mayor-id` in gastown.worker.ts. agentId is nullable +// because the worker returns `{ agentId: null }` when no mayor exists. +const MayorPrewarmResponse = z + .object({ + success: z.boolean().optional(), + agentId: z.string().nullable().optional(), + model: z.string().optional(), + smallModel: z.string().optional(), + kilocodeToken: z.string().optional(), + organizationId: z.string().nullable().optional(), + }) + .passthrough(); + +async function fetchMayorPrewarmContext( townId: string, apiUrl: string, token: string -): Promise { +): Promise { try { const resp = await fetch(`${apiUrl}/api/towns/${townId}/mayor-id`, { headers: { Authorization: `Bearer ${token}` }, signal: AbortSignal.timeout(10_000), }); if (!resp.ok) { - console.log(`${MANAGER_LOG} fetchMayorAgentId: ${resp.status} for town ${townId}`); + console.log(`${MANAGER_LOG} fetchMayorPrewarmContext: ${resp.status} for town ${townId}`); return null; } const json: unknown = await resp.json(); - if ( - typeof json === 'object' && - json !== null && - 'agentId' in json && - typeof (json as { agentId: unknown }).agentId === 'string' - ) { - return (json as { agentId: string }).agentId; - } - return null; + const parsed = MayorPrewarmResponse.safeParse(json); + if 
(!parsed.success) return null; + const { agentId, model, smallModel, kilocodeToken, organizationId } = parsed.data; + if (!agentId) return null; + return { agentId, model, smallModel, kilocodeToken, organizationId }; } catch (err) { - console.warn(`${MANAGER_LOG} fetchMayorAgentId failed:`, err); + console.warn(`${MANAGER_LOG} fetchMayorPrewarmContext failed:`, err); return null; } } -function buildPrewarmEnv(mayorAgentId: string): Record { +function buildPrewarmEnv(ctx: MayorPrewarmContext, townId: string): Record | null { + // Must mirror the mayor-shaped subset of buildAgentEnv (agent-runner.ts): + // the kilo serve child snapshots process.env at spawn and loads + // GastownPlugin (plugin/index.ts), which gates mayor-tool registration + // on GASTOWN_AGENT_ROLE === 'mayor' and createMayorClientFromEnv() + // requires GASTOWN_AGENT_ID + GASTOWN_TOWN_ID. If we omit them, the + // prewarmed mayor boots with NO tools, and ensureSDKServer's cache + // hit on the next /agents/start hands back that defective server + // (KILO_CONFIG_CONTENT matches, so the eviction path doesn't fire). 
const env: Record = { - KILO_TEST_HOME: `/tmp/agent-home-${mayorAgentId}`, - XDG_DATA_HOME: `/tmp/agent-home-${mayorAgentId}/.local/share`, + GASTOWN_AGENT_ID: ctx.agentId, + GASTOWN_TOWN_ID: townId, + GASTOWN_AGENT_ROLE: 'mayor', + KILOCODE_FEATURE: 'gastown', + KILO_TEST_HOME: `/tmp/agent-home-${ctx.agentId}`, + XDG_DATA_HOME: `/tmp/agent-home-${ctx.agentId}/.local/share`, }; const keys = [ 'GASTOWN_API_URL', 'GASTOWN_CONTAINER_TOKEN', - 'GASTOWN_TOWN_ID', - 'KILOCODE_TOKEN', - 'GASTOWN_ORGANIZATION_ID', + 'GASTOWN_SESSION_TOKEN', 'KILO_API_URL', 'KILO_OPENROUTER_BASE', ]; @@ -2658,18 +2704,37 @@ function buildPrewarmEnv(mayorAgentId: string): Record { if (value) env[key] = value; } - const kilocodeToken = env.KILOCODE_TOKEN; - if (kilocodeToken) { - const organizationId = env.GASTOWN_ORGANIZATION_ID || undefined; - const configJson = buildKiloConfigContent( - kilocodeToken, - 'anthropic/claude-sonnet-4.6', - 'anthropic/claude-haiku-4.5', - organizationId - ); - env.KILO_CONFIG_CONTENT = configJson; - env.OPENCODE_CONFIG_CONTENT = configJson; - } + // Prefer the worker-supplied token/org so KILO_CONFIG_CONTENT matches + // what /agents/start will send. Fall back to process.env for back- + // compat with workers that haven't deployed the richer endpoint yet. + const kilocodeToken = ctx.kilocodeToken ?? process.env.KILOCODE_TOKEN; + if (!kilocodeToken) return null; + env.KILOCODE_TOKEN = kilocodeToken; + + // When the worker explicitly returned organizationId (including null + // for "this town has no org"), trust it. Only fall back to process.env + // when the field was omitted entirely (older worker version that + // didn't yet include the prewarm context). + const organizationId = + ctx.organizationId !== undefined + ? ctx.organizationId + : (process.env.GASTOWN_ORGANIZATION_ID ?? 
null); + if (organizationId) env.GASTOWN_ORGANIZATION_ID = organizationId; + + // Without the worker-resolved model, skip prewarm: any guess we make + // here will almost certainly differ from /agents/start's resolved + // model and trigger ensureSDKServer's eviction-and-respawn path, + // making the prewarm a net negative on the critical path. + if (!ctx.model || !ctx.smallModel) return null; + + const configJson = buildKiloConfigContent( + kilocodeToken, + ctx.model, + ctx.smallModel, + organizationId ?? undefined + ); + env.KILO_CONFIG_CONTENT = configJson; + env.OPENCODE_CONFIG_CONTENT = configJson; return env; } @@ -2677,30 +2742,47 @@ function buildPrewarmEnv(mayorAgentId: string): Record { async function prewarmMayorSDK(townId: string, apiUrl: string, token: string): Promise { const t0 = Date.now(); - const mayorAgentId = await fetchMayorAgentId(townId, apiUrl, token); - if (!mayorAgentId) { + const ctx = await fetchMayorPrewarmContext(townId, apiUrl, token); + if (!ctx) { console.log(`${MANAGER_LOG} prewarmMayorSDK: no mayor agent for town ${townId}`); return; } - const workdir = mayorWorkdirForTown(townId); + const env = buildPrewarmEnv(ctx, townId); + if (!env) { + console.log( + `${MANAGER_LOG} prewarmMayorSDK: skipping for town ${townId} — missing model/token (would cause eviction churn)` + ); + return; + } - await hydrateDbFromSnapshot(mayorAgentId, apiUrl, token, `mayor-${townId}`, townId); + // Materialize the mayor workdir before ensureSDKServer's process.chdir. + // Without this, prewarm on a cold container throws ENOENT because + // createMayorWorkspace runs from runAgent (i.e. /agents/start) only. + const workdir = await ensureMayorWorkspaceForTown(townId); + if (workdir !== mayorWorkdirForTown(townId)) { + // Defensive: if the workspace helper ever changes its layout, the + // sdkInstances key (workdir) and the path /agents/start uses must + // stay aligned or the cache hit won't fire. 
+ console.warn( + `${MANAGER_LOG} prewarmMayorSDK: workdir mismatch (got=${workdir}, expected=${mayorWorkdirForTown(townId)})` + ); + } - const env = buildPrewarmEnv(mayorAgentId); + await hydrateDbFromSnapshot(ctx.agentId, apiUrl, token, `mayor-${townId}`, townId); const existing = sdkInstances.get(workdir); if (existing) { const durationMs = Date.now() - t0; log.info('mayor.prewarm_complete', { - agentId: mayorAgentId, + agentId: ctx.agentId, townId, port: parseInt(new URL(existing.server.url).port), durationMs, alreadyRunning: true, }); postEventToWorker('mayor.prewarm_complete', { - agentId: mayorAgentId, + agentId: ctx.agentId, role: 'mayor', durationMs, }); @@ -2711,14 +2793,14 @@ async function prewarmMayorSDK(townId: string, apiUrl: string, token: string): P const durationMs = Date.now() - t0; log.info('mayor.prewarm_complete', { - agentId: mayorAgentId, + agentId: ctx.agentId, townId, port, durationMs, alreadyRunning: false, }); postEventToWorker('mayor.prewarm_complete', { - agentId: mayorAgentId, + agentId: ctx.agentId, role: 'mayor', durationMs, }); @@ -2729,9 +2811,25 @@ async function prewarmMayorSDK(townId: string, apiUrl: string, token: string): P * Gastown worker and resumes all registered agents. * * Called from main.ts when GASTOWN_TOWN_ID and GASTOWN_API_URL are set. + * + * Installs a hydration gate (see `awaitHydration`) for the duration of + * the call so /agents/start and /refresh-token wait for the registry + * loop and mayor prewarm to release the global sdkServerLock before + * contending for it themselves. 
*/ export async function bootHydration(): Promise { const LOG = '[boot-hydration]'; + _hydrationComplete = new Promise(resolve => { + _resolveHydration = resolve; + }); + try { + await bootHydrationImpl(LOG); + } finally { + _resolveHydration(); + } +} + +async function bootHydrationImpl(LOG: string): Promise { const apiUrl = process.env.GASTOWN_API_URL; const townId = process.env.GASTOWN_TOWN_ID; const initialToken = process.env.GASTOWN_CONTAINER_TOKEN; diff --git a/services/gastown/src/dos/Town.do.ts b/services/gastown/src/dos/Town.do.ts index 700a774342..b9a69f76dc 100644 --- a/services/gastown/src/dos/Town.do.ts +++ b/services/gastown/src/dos/Town.do.ts @@ -2659,6 +2659,43 @@ export class TownDO extends DurableObject { return mayor?.id ?? null; } + /** + * Returns everything the container needs to prewarm the mayor SDK + * server with a config that matches what the next /agents/start will + * use — so the prewarm cache hit is real instead of triggering the + * "config mismatch, evicting prewarmed server" eviction path. + * + * Returns null when there's no mayor yet, or when the kilocode token + * isn't available (in which case prewarm should skip — the next + * /agents/start will defer too). + */ + async getMayorPrewarmContext(): Promise<{ + agentId: string; + model: string; + smallModel: string; + kilocodeToken: string; + organizationId: string | null; + } | null> { + const mayor = agents.listAgents(this.sql, { role: 'mayor' })[0] ?? null; + if (!mayor) return null; + + const kilocodeToken = await this.resolveKilocodeToken(); + if (!kilocodeToken) return null; + + const townConfig = await this.getTownConfig(); + // _ensureMayor dispatches the mayor without a per-rig override + // (Town.do.ts:2766-2790). Match that resolution here so the prewarm + // KILO_CONFIG_CONTENT is byte-identical to what /agents/start will + // build, and ensureSDKServer's config-mismatch eviction never fires. 
+ return { + agentId: mayor.id, + model: config.resolveModel(townConfig, null, 'mayor'), + smallModel: config.resolveSmallModel(townConfig), + kilocodeToken, + organizationId: townConfig.organization_id ?? null, + }; + } + async ensureMayor(): Promise<{ agentId: string; sessionStatus: 'idle' | 'active' | 'starting'; diff --git a/services/gastown/src/gastown.worker.ts b/services/gastown/src/gastown.worker.ts index 9048a95a14..81d9c6ce10 100644 --- a/services/gastown/src/gastown.worker.ts +++ b/services/gastown/src/gastown.worker.ts @@ -672,6 +672,19 @@ app.use('/api/towns/:townId/mayor-id', async (c: Context, ne app.get('/api/towns/:townId/mayor-id', async c => { const townId = c.req.param('townId'); const town = getTownDOStub(c.env, townId); + // Response contract (consumed by fetchMayorPrewarmContext in the + // container's process-manager.ts): + // - When the town has a mayor AND a kilocode token, return the full + // prewarm context so KILO_CONFIG_CONTENT matches what /agents/start + // will send (no eviction churn in ensureSDKServer). + // - When the mayor agent exists but no kilocode token is available, + // return { agentId } only — the container will skip prewarm. + // - When there is no mayor at all, return { agentId: null } — the + // container treats missing/null agentId as "no mayor, skip prewarm". 
+ const ctx = await town.getMayorPrewarmContext(); + if (ctx) { + return c.json({ success: true, ...ctx }); + } const agentId = await town.getMayorAgentId(); return c.json({ success: true, agentId }); }); diff --git a/services/gastown/wrangler.jsonc b/services/gastown/wrangler.jsonc index e613dc666a..ecf79d7894 100644 --- a/services/gastown/wrangler.jsonc +++ b/services/gastown/wrangler.jsonc @@ -37,7 +37,7 @@ "class_name": "TownContainerDO", "image": "./container/Dockerfile", "instance_type": "standard-4", - "max_instances": 800, + "max_instances": 500, }, ], From a6cf1029bb095e794abdb97c02e7c5d208663dec Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Sun, 10 May 2026 18:00:01 -0500 Subject: [PATCH 02/14] chore(gastown): remove manual request logging middleware (#3158) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore(gastown): remove manual request logging middleware * fix(gastown): unblock /agents/start during boot hydration; preserve mayor tools on prewarm Three independent fixes for the startAgentInContainer timeout regression introduced by #2974, plus a tighter container-instance cap. 1. Hydration gate (control-server.ts, process-manager.ts) The control server starts accepting requests immediately at boot, while bootHydration runs concurrently and serialises every registry agent + the mayor prewarm through the global sdkServerLock. Fresh /agents/start, /refresh-token, and PATCH /agents/:id/model requests queued behind that work and the DO-side AbortSignal.timeout(60s) fired before they ever got the lock — surfacing as "TimeoutError: aborted due to timeout" and "timeout after 6000ms: ensureSDKServer for ". A new awaitHydration() promise is awaited at the top of those handlers (before any process.env mutation in the model PATCH path) so they don't compound the queue. 2. 
Prewarm config matches /agents/start (Town.do.ts, gastown.worker.ts, process-manager.ts) buildPrewarmEnv was constructing KILO_CONFIG_CONTENT from hardcoded defaults (anthropic/claude-sonnet-4.6 / claude-haiku-4.5), so the real /agents/start with the user's actual model triggered ensureSDKServer's "config mismatch, evicting prewarmed server" path on every warm restart — doubling lock-holding time on the critical path the prewarm was supposed to speed up. The /api/towns/:id/mayor-id endpoint now returns the full prewarm context (model, smallModel, kilocodeToken, organizationId) resolved the same way _ensureMayor resolves it, and the container builds the prewarm KILO_CONFIG_CONTENT to match. Falls back gracefully to a skip when the worker hasn't deployed the richer endpoint yet. 3. Mayor workdir + plugin env (agent-runner.ts, process-manager.ts) prewarmMayorSDK called mayorWorkdirForTown (which only returns a string) and went straight to ensureSDKServer's process.chdir, throwing ENOENT on cold containers because createMayorWorkspace only ran from runAgent. Exported ensureMayorWorkspaceForTown so prewarm materialises the workspace first. More critically, buildPrewarmEnv was missing GASTOWN_AGENT_ROLE, GASTOWN_AGENT_ID, and GASTOWN_TOWN_ID — env vars the kilo serve plugin (plugin/index.ts) reads at spawn to decide whether to register mayor tools. Without them the prewarmed server booted with NO mayor tools, and the cache hit on the next /agents/start handed that defective instance back to the user. Now mirrors the mayor-shaped subset of buildAgentEnv. Added an end-to-end test that intercepts createKilo and asserts the env at spawn time. 4. wrangler.jsonc: lower TownContainerDO max_instances from 800 to 500. Verified with pnpm --filter gastown-container test (67/67 pass), pnpm --filter cloudflare-gastown typecheck, oxlint, and pnpm format. 
* feat(gastown): per-route logger tagging via Hono params (review on #3158) --------- Co-authored-by: John Fawcett --- services/gastown/src/gastown.worker.ts | 81 +++++++++++++++++--------- 1 file changed, 54 insertions(+), 27 deletions(-) diff --git a/services/gastown/src/gastown.worker.ts b/services/gastown/src/gastown.worker.ts index 81d9c6ce10..6289def7c4 100644 --- a/services/gastown/src/gastown.worker.ts +++ b/services/gastown/src/gastown.worker.ts @@ -9,6 +9,7 @@ import { getTownDOStub } from './dos/Town.do'; import { TownConfigUpdateSchema } from './types'; import { resError } from './util/res.util'; import { writeEvent } from './util/analytics.util'; +import { logger } from './util/log.util'; import { authMiddleware, agentOnlyMiddleware, @@ -128,7 +129,6 @@ import { townAuthMiddleware } from './middleware/town-auth.middleware'; import { orgAuthMiddleware } from './middleware/org-auth.middleware'; import { adminAuditMiddleware } from './middleware/admin-audit.middleware'; import { timingMiddleware, instrumented } from './middleware/analytics.middleware'; -import { logger } from './util/log.util'; import { useWorkersLogger } from 'workers-tagged-logger'; import type { MiddlewareHandler } from 'hono'; import { handleGetTownConfig, handleUpdateTownConfig } from './handlers/town-config.handler'; @@ -173,33 +173,60 @@ app.use('*', timingMiddleware); // Cast needed: workers-tagged-logger@1.0.0 was built against an older Hono. app.use('*', useWorkersLogger('gastown-worker') as unknown as MiddlewareHandler); -// ── Request logging ───────────────────────────────────────────────────── -// Extract IDs from the URL path directly — c.req.param() only works -// after Hono has matched a route, which hasn't happened yet in a -// wildcard middleware. -// Matches /orgs/:orgId, /towns/:townId, /rigs/:rigId, /agents/:agentId -// in any combination that appears in our route patterns. 
-const RE_ORG = /\/orgs\/(?<orgId>[^/]+)/; -const RE_TOWN = /\/towns\/(?<townId>[^/]+)/; -const RE_RIG = /\/rigs\/(?<rigId>[^/]+)/; -const RE_AGENT = /\/agents\/(?<agentId>[^/]+)/; - -app.use('*', async (c, next) => { - const method = c.req.method; - const path = c.req.path; - // Tag with route params immediately so all downstream logs (auth, - // handlers, DO calls) inherit them. Auth-derived tags (userId, orgId) - // are set by kiloAuthMiddleware and orgAuthMiddleware when they run. - logger.setTags({ - orgId: RE_ORG.exec(path)?.groups?.orgId, - townId: RE_TOWN.exec(path)?.groups?.townId, - rigId: RE_RIG.exec(path)?.groups?.rigId, - agentId: RE_AGENT.exec(path)?.groups?.agentId, - }); - logger.info(`--> ${method} ${path}`); +// ── Per-route logger tagging ──────────────────────────────────────── +// Use Hono path matching (not regex) so tags are sourced from +// c.req.param() once the route is matched. Each handler runs only +// when its prefix matches; if a request hits /api/towns/:townId/rigs/:rigId, +// both town and rig handlers run in order. 
+app.use('/api/orgs/:orgId/*', async (c, next) => { + const orgId = c.req.param('orgId'); + if (orgId) logger.setTags({ orgId }); + await next(); +}); +app.use('/api/towns/:townId/*', async (c, next) => { + const townId = c.req.param('townId'); + if (townId) logger.setTags({ townId }); + await next(); +}); +app.use('/api/mayor/:townId/*', async (c, next) => { + const townId = c.req.param('townId'); + if (townId) logger.setTags({ townId }); + await next(); +}); +app.use('/api/orgs/:orgId/towns/:townId/*', async (c, next) => { + const townId = c.req.param('townId'); + if (townId) logger.setTags({ townId }); + await next(); +}); +app.use('/api/users/:userId/towns/:townId/*', async (c, next) => { + const townId = c.req.param('townId'); + if (townId) logger.setTags({ townId }); + await next(); +}); +app.use('/api/towns/:townId/rigs/:rigId/*', async (c, next) => { + const rigId = c.req.param('rigId'); + if (rigId) logger.setTags({ rigId }); + await next(); +}); +app.use('/api/orgs/:orgId/rigs/:rigId/*', async (c, next) => { + const rigId = c.req.param('rigId'); + if (rigId) logger.setTags({ rigId }); + await next(); +}); +app.use('/api/mayor/:townId/tools/rigs/:rigId/*', async (c, next) => { + const rigId = c.req.param('rigId'); + if (rigId) logger.setTags({ rigId }); + await next(); +}); +app.use('/api/towns/:townId/rigs/:rigId/agents/:agentId/*', async (c, next) => { + const agentId = c.req.param('agentId'); + if (agentId) logger.setTags({ agentId }); + await next(); +}); +app.use('/api/mayor/:townId/tools/rigs/:rigId/agents/:agentId/*', async (c, next) => { + const agentId = c.req.param('agentId'); + if (agentId) logger.setTags({ agentId }); await next(); - const elapsed = Math.round(performance.now() - (c.get('requestStartTime') ?? 
0)); - logger.info(`<-- ${method} ${path} ${c.res.status}`, { durationMs: elapsed }); }); // ── CORS ──────────────────────────────────────────────────────────────── From 7f9121ffab78941f9e19a3fc93d5842fbec433af Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Sun, 10 May 2026 19:37:48 -0500 Subject: [PATCH 03/14] feat(gastown): add convoy debug endpoints and review-then-land E2E test procedure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds three dev-only debug endpoints for autonomous convoy testing without going through the mayor LLM: - GET /debug/towns/:townId/rigs — list rigs in a town - POST /debug/towns/:townId/sling-convoy — call Town.slingConvoy() directly - GET /debug/towns/:townId/convoys — list active convoys with progress Documents the new endpoints and adds a Test C section to e2e-pr-feedback-testing.md with a deterministic procedure for verifying review-then-land convoys end-to-end (sub-bead PRs into the convoy feature branch, then a landing PR into main). Also captures known issues observed during verification: container MTU/TLS handshake failures with github.com, 'failed' blockers not gating dependents, and intermittent polecat skipping of sub-PR creation. --- .../gastown/docs/e2e-pr-feedback-testing.md | 197 ++++++++++++++++++ services/gastown/src/gastown.worker.ts | 43 ++++ 2 files changed, 240 insertions(+) diff --git a/services/gastown/docs/e2e-pr-feedback-testing.md b/services/gastown/docs/e2e-pr-feedback-testing.md index 0712cedcee..febd3be03a 100644 --- a/services/gastown/docs/e2e-pr-feedback-testing.md +++ b/services/gastown/docs/e2e-pr-feedback-testing.md @@ -11,6 +11,13 @@ When `merge_strategy: 'pr'` is configured: 3. **Auto-resolve detects comments** — `poll_pr` checks for unresolved review threads, dispatches polecat to fix 4. 
**Auto-merge** — once all comments resolved and CI passes, grace period timer starts, then PR is merged via API +### Convoy Merge Modes + +Convoys (multi-bead jobs) come in two flavors, controlled by `convoy_merge_mode` in town config: + +- **`review-and-merge`** — each bead opens a PR directly to `main`. Independent landings. +- **`review-then-land`** (default) — each bead's PR targets a shared **convoy feature branch**, and a single **landing PR** is opened from that feature branch to `main` after all beads close. Test this with [Test C](#test-c-review-then-land-convoy-via-direct-sling). + ## Prerequisites - Wrangler dev server running for gastown (`pnpm dev` in `cloudflare-gastown/`, port 8803) @@ -759,6 +766,193 @@ for pr in prs: --- +## Test C: review-then-land Convoy via Direct Sling + +This tests the **review-then-land** convoy mode where each sub-bead gets its own PR into the convoy's feature branch, and a final landing PR is created from the feature branch into the default branch (`main`). + +Unlike Test B (which goes through the mayor's `gt_sling_batch` tool via a chat message), this test directly slings a convoy through a debug endpoint, eliminating mayor LLM variability. Use this when you want a deterministic E2E test of the convoy plumbing itself. + +### Prereqs and Debug Endpoints + +These dev-only endpoints are used by this test (all `application/json`, no auth in dev): + +| Method | Path | Description | +| ------ | ----------------------------------- | ---------------------------------------------------------------- | +| GET | `/debug/towns/:townId/rigs` | List rigs registered with the town (returns `{ rigs: [...] 
}`) | +| POST | `/debug/towns/:townId/sling-convoy` | Directly call `Town.slingConvoy()` — bypasses the mayor | +| GET | `/debug/towns/:townId/convoys` | List active convoys with progress (`closed_beads`/`total_beads`) | + +The `sling-convoy` body matches `Town.slingConvoy()` input: + +```json +{ + "rigId": "", + "convoyTitle": "Bogus convoy E2E test 175337", + "merge_mode": "review-then-land", + "staged": false, + "tasks": [ + { "title": "Add src/utils/foo.ts with function bar" }, + { "title": "Add src/utils/baz.ts ...", "depends_on": [0] } + ] +} +``` + +### C.1. Look Up the Rig + +```bash +RIG_ID=$(curl -s $BASE/debug/towns/$TOWN_ID/rigs | python3 -c "import sys, json; print(json.load(sys.stdin)['rigs'][0]['id'])") +echo "RIG_ID=$RIG_ID" +``` + +### C.2. Confirm Town Configured for review-then-land + +```bash +curl -s $BASE/debug/towns/$TOWN_ID/config | python3 -c " +import sys, json +d = json.load(sys.stdin) +print('merge_strategy:', d.get('merge_strategy')) +print('convoy_merge_mode:', d.get('convoy_merge_mode')) +print('refinery.auto_merge:', d.get('refinery',{}).get('auto_merge')) +" +``` + +If `convoy_merge_mode` is not `review-then-land`, set it: + +```bash +curl -s -X PATCH $BASE/debug/towns/$TOWN_ID/config \ + -H "Content-Type: application/json" \ + -d '{"convoy_merge_mode":"review-then-land","merge_strategy":"pr","refinery":{"auto_merge":true,"auto_merge_delay_minutes":2}}' +``` + +### C.3. 
Sling a 3-Bead Convoy + +Use a unique title (timestamp suffix) so subsequent runs don't collide and the feature branch name is easy to grep for: + +```bash +TIMESTAMP=$(date +%H%M%S) +TITLE="Bogus convoy E2E test $TIMESTAMP" +RESPONSE=$(curl -s -X POST $BASE/debug/towns/$TOWN_ID/sling-convoy \ + -H "Content-Type: application/json" \ + -d "{ + \"rigId\": \"$RIG_ID\", + \"convoyTitle\": \"$TITLE\", + \"merge_mode\": \"review-then-land\", + \"staged\": false, + \"tasks\": [ + {\"title\": \"Add src/utils/bogus-step1.ts with a single function bogusGreet that returns 'hello bogus'. Include JSDoc. Commit and push.\"}, + {\"title\": \"Add src/utils/bogus-step2.ts with a single function bogusFarewell that returns 'goodbye bogus'. Include JSDoc. Commit and push.\", \"depends_on\": [0]}, + {\"title\": \"Add src/utils/bogus-step3.ts with a single function bogusEcho that takes a string and returns it prefixed with 'echo: '. Include JSDoc. Commit and push.\", \"depends_on\": [1]} + ] + }") +echo "$RESPONSE" | python3 -c " +import sys, json +r = json.load(sys.stdin) +c = r['convoy'] +print(f'CONVOY_ID={c[\"id\"]}') +print(f'FEATURE_BRANCH={c[\"feature_branch\"]}') +for i, b in enumerate(r['beads']): + print(f'BEAD{i+1}={b[\"bead\"][\"bead_id\"]}') +" +``` + +Capture the printed env vars (`CONVOY_ID`, `FEATURE_BRANCH`, `BEAD1..3`) into a sourceable file like `/tmp/convoy-test.env` for the rest of the test. + +### C.4. Monitor Through the Lifecycle + +In **review-then-land** mode you should observe (linear chain of 3 beads, each `depends_on` the previous): + +1. Bead 1 → `in_progress` → polecat creates a sub-PR targeting `` → MR bead → refinery merges sub-PR → bead 1 `closed` +2. Bead 2 unblocks → same cycle, sub-PR into feature branch +3. Bead 3 same +4. After last sub-bead closes: a **landing MR bead** is created with `feature_branch` → `main` PR +5. 
Refinery reviews landing PR, auto-merge fires, convoy → `closed` + +Monitor with this loop: + +```bash +SLUG=$(echo "$TITLE" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]\{1,\}/-/g; s/^-\|-$//g' | head -c 40) +for i in $(seq 1 80); do + echo "=== $(date +%H:%M:%S) ===" + curl -s $BASE/debug/towns/$TOWN_ID/status | python3 -c " +import sys, json +d = json.load(sys.stdin) +alarm = d.get('alarmStatus', {}) +print(f'agents={json.dumps(alarm.get(\"agents\",{}))} beads={json.dumps(alarm.get(\"beads\",{}))}') +for b in d.get('beadSummary', []): + title = (b.get('title','') or '')[:60] + print(f' {b.get(\"type\",\"?\"):16s} {b.get(\"status\",\"?\"):12s} {title}') +" + curl -s $BASE/debug/towns/$TOWN_ID/convoys | python3 -c " +import sys, json +for c in json.load(sys.stdin).get('convoys', []): + if c['id'] == '$CONVOY_ID': + print(f' convoy {c[\"status\"]} closed={c[\"closed_beads\"]}/{c[\"total_beads\"]} landed={c[\"landed_at\"]}') +" 2>/dev/null + gh pr list --repo $REPO --state all --search "head:convoy/$SLUG" --limit 10 \ + --json number,title,headRefName,baseRefName,state,mergedAt 2>/dev/null | python3 -c " +import sys, json +prs = json.load(sys.stdin) +for pr in prs: + print(f' PR #{pr[\"number\"]:3d} {pr[\"state\"]:8s} {pr[\"headRefName\"][:40]} -> {pr[\"baseRefName\"][:40]}') +" 2>/dev/null + # Stop when convoy is gone from active list (indicates closed) + ACTIVE=$(curl -s $BASE/debug/towns/$TOWN_ID/convoys | python3 -c " +import sys, json +ids = [c['id'] for c in json.load(sys.stdin).get('convoys', [])] +print('YES' if '$CONVOY_ID' in ids else 'NO') +" 2>/dev/null) + if [ "$ACTIVE" = "NO" ]; then echo '=== CONVOY CLOSED ==='; break; fi + sleep 30 +done +``` + +### C.5. Verify Final State + +After the convoy closes, check: + +1. 
**Sub-bead PRs** — one per non-failed bead, each targeting the convoy feature branch, all merged: + + ```bash + gh pr list --repo $REPO --state all --search "head:$FEATURE_BRANCH" --limit 10 \ + --json number,title,baseRefName,headRefName,state,mergedAt | python3 -m json.tool + ``` + + Expect: each sub-bead → `baseRefName: `, `state: MERGED`. The landing PR → `baseRefName: main`, `state: MERGED`. + +2. **Landing PR** — base=`main`, head=``, merged: + + ```bash + gh pr view --repo $REPO --json state,mergedAt,additions,deletions,changedFiles + ``` + +3. **Files actually landed on main**: + + ```bash + gh api "repos/$REPO/contents/src/utils?ref=main" --jq '.[] | select(.name | startswith("bogus")) | .name' + ``` + +4. **Convoy progress**: `closed_beads == total_beads`, `landed_at` set, convoy bead `status=closed`. + +### Expected Timeline (review-then-land, 3 beads) + +| Step | Duration | +| --------------------------------------------------- | ------------- | +| Sling-convoy creates 3 issue beads + convoy bead | ~1s | +| Bead 1 polecat work + sub-PR + refinery merge | 3-5 min | +| Bead 2 polecat work + sub-PR + refinery merge | 3-5 min | +| Bead 3 polecat work + sub-PR + refinery merge | 3-5 min | +| Landing MR created, refinery reviews PR into `main` | 2-3 min | +| Auto-merge grace period | 2 min | +| **Total** | **15-25 min** | + +### Known Issues Observed in This Flow + +- **Polecat occasionally pushes directly to feature branch instead of opening a sub-PR.** The MR bead is still created (via `review_submitted`) and the refinery still merges the work into the feature branch, but `reviewMetadata.pr_url` is `null` and there is no GitHub PR for that sub-bead. This is an LLM compliance issue in the polecat prompt, not a code bug. The convoy still lands successfully. 
+- **A failed sub-bead does not block the convoy from landing.** The reconciler treats `failed` blockers the same as `closed` (see `reconciler.ts:960`), so dependents will dispatch and the convoy will land whatever did succeed. If sub-bead 1 of 3 fails, the landing PR will only contain commits from beads 2 and 3. This is by design but worth knowing when interpreting "successful" landings. +- **Container TLS handshake failures with `github.com`.** A wrangler-managed container may start with a 65535 MTU that breaks outgoing TLS to GitHub (`GnuTLS, handshake failed: The TLS connection was non-properly terminated`). Symptom: every `/agents/start` returns "FAILED for X: git fetch --all --prune failed". Fix: `docker kill $(docker ps -q --filter ancestor=cloudflare-dev/towncontainerdo:*)` — wrangler will spin up a replacement that usually has a working network. The `cloneRepoInner` retry path treats this as non-auth so doesn't retry; the bead burns through `MAX_DISPATCH_ATTEMPTS=5` and ends up `failed` if the container isn't restarted. + +--- + ## Expected Timelines ### Scenario 1 (No Review + Auto-Merge) @@ -875,6 +1069,9 @@ All `/debug/` endpoints are unauthenticated in development. 
In production, they' | `POST` | `/debug/towns/:townId/send-message` | Send message to mayor (dev only) | | `GET` | `/debug/towns/:townId/beads/:beadId` | Full bead details + review metadata + dependencies | | `POST` | `/debug/towns/:townId/graceful-stop` | Trigger SIGTERM on container (dev only) | +| `GET` | `/debug/towns/:townId/rigs` | List rigs registered with the town (dev only) | +| `POST` | `/debug/towns/:townId/sling-convoy` | Directly call `Town.slingConvoy()` (dev only) | +| `GET` | `/debug/towns/:townId/convoys` | List active convoys with progress counts (dev only) | ### Inspect a Bead diff --git a/services/gastown/src/gastown.worker.ts b/services/gastown/src/gastown.worker.ts index 6289def7c4..b5a6cd715d 100644 --- a/services/gastown/src/gastown.worker.ts +++ b/services/gastown/src/gastown.worker.ts @@ -366,6 +366,49 @@ app.patch('/debug/towns/:townId/config', async c => { return c.json(result); }); +app.get('/debug/towns/:townId/rigs', async c => { + if (c.env.ENVIRONMENT !== 'development') return c.json({ error: 'dev only' }, 403); + const townId = c.req.param('townId'); + const town = getTownDOStub(c.env, townId); + // eslint-disable-next-line @typescript-eslint/await-thenable -- DO RPC returns promise at runtime + const rigs = await town.listRigs(); + return c.json({ rigs }); +}); + +app.post('/debug/towns/:townId/sling-convoy', async c => { + if (c.env.ENVIRONMENT !== 'development') return c.json({ error: 'dev only' }, 403); + const townId = c.req.param('townId'); + const body: { + rigId: string; + convoyTitle: string; + tasks: Array<{ title: string; body?: string; depends_on?: number[] }>; + merge_mode?: 'review-then-land' | 'review-and-merge'; + staged?: boolean; + } = await c.req.json(); + if (!body.rigId || !body.convoyTitle || !Array.isArray(body.tasks)) { + return c.json({ error: 'Missing required fields: rigId, convoyTitle, tasks' }, 400); + } + const town = getTownDOStub(c.env, townId); + // eslint-disable-next-line 
@typescript-eslint/await-thenable -- DO RPC returns promise at runtime + const result = await town.slingConvoy({ + rigId: body.rigId, + convoyTitle: body.convoyTitle, + tasks: body.tasks, + merge_mode: body.merge_mode, + staged: body.staged, + }); + return c.json(result); +}); + +app.get('/debug/towns/:townId/convoys', async c => { + if (c.env.ENVIRONMENT !== 'development') return c.json({ error: 'dev only' }, 403); + const townId = c.req.param('townId'); + const town = getTownDOStub(c.env, townId); + // eslint-disable-next-line @typescript-eslint/await-thenable -- DO RPC returns promise at runtime + const convoys = await town.listConvoys(); + return c.json({ convoys }); +}); + // ── Town ID + Auth ────────────────────────────────────────────────────── // All rig routes live under /api/towns/:townId/rigs/:rigId so the townId // is always available from the URL path. From ce15a6fe756934da5e565cbb69b4ae6988843d03 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Mon, 11 May 2026 10:22:09 -0500 Subject: [PATCH 04/14] fix(gastown): use fresh integration tokens for GitHub auth instead of stale stored value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit resolveGitHubToken previously preferred git_auth.github_token over the platform integration. Since GitHub App installation tokens have a 1h TTL but git_auth.github_token is only written at rig creation (or rare manual refresh), every long-lived town with an integration was handing out an expired token to: - Polecat/refinery 'gh' CLI (via GH_TOKEN derived from GIT_TOKEN in the container), surfacing as 'Failed to log in to github.com using token (GH_TOKEN). The token in GH_TOKEN is invalid.' - The worker-side PR poller (checkPRStatus, checkPRFeedback, mergePR, areThreadsBlocking) — 401 from api.github.com. - The /refresh-git-token endpoint the container falls back to on auth failure — it returned the same expired token, so the retry just re-failed. 
Verified by hitting api.github.com with a local town's stored token: 401 even though the integration service mints fresh ones fine. Fix: - Flip resolveGitHubToken's priority to github_cli_pat -> live integration -> stored github_token (last-resort fallback for towns with no integration). Empty-string responses from the integration service now warn and fall back instead of silently failing. - Resolve a fresh token at agent dispatch (startAgentInContainer), merge dispatch (startMergeInContainer), and rig setup (setupRigRepoInContainer) before stuffing GIT_TOKEN into envVars. - buildContainerConfig now resolves a fresh token before serializing git_auth.github_token into the X-Town-Config header — the container's syncTownConfigToProcessEnv path reads this on every request to update process.env.GIT_TOKEN, which buildLiveHotSwapEnv then derives GH_TOKEN from on token-refresh hot-swaps. townId is required (not optional) so a forgotten arg can't silently regress to the stale-token shape. - syncConfigToContainer resolves a fresh token before persisting GIT_TOKEN to DO storage for next boot. Adds 6 unit tests covering the priority chain (cli_pat preferred, fresh integration over stale stored, fallback on lookup failure, rig-level integration ID, no-config returns null). 
--- services/gastown/src/dos/Town.do.ts | 54 +++++++-- services/gastown/src/dos/town/config.ts | 35 +++++- .../src/dos/town/container-dispatch.ts | 34 ++++-- services/gastown/src/dos/town/town-scm.ts | 55 ++++++--- services/gastown/test/unit/town-scm.test.ts | 111 ++++++++++++++++++ 5 files changed, 259 insertions(+), 30 deletions(-) create mode 100644 services/gastown/test/unit/town-scm.test.ts diff --git a/services/gastown/src/dos/Town.do.ts b/services/gastown/src/dos/Town.do.ts index b9a69f76dc..d46e5960b4 100644 --- a/services/gastown/src/dos/Town.do.ts +++ b/services/gastown/src/dos/Town.do.ts @@ -786,9 +786,22 @@ export class TownDO extends DurableObject { const townConfig = await this.getTownConfig(); const container = getTownContainerStub(this.env, townId); + // Resolve a fresh GitHub token here too — this method runs both at + // initial config push and on every config change, so the persisted + // GIT_TOKEN must be live rather than the stale value stored in + // git_auth.github_token from rig creation. The container's + // syncTownConfigToProcessEnv path reads `git_auth.github_token` + // from the X-Town-Config header on every request, so the in-process + // GIT_TOKEN follows the same source-of-truth as the persisted one. + const githubToken = await scm.resolveGitHubToken({ + env: this.env, + townId, + getTownConfig: () => Promise.resolve(townConfig), + }); + // Phase 1: Persist to DO storage for next boot. const envMapping: Array<[string, string | undefined]> = [ - ['GIT_TOKEN', townConfig.git_auth?.github_token], + ['GIT_TOKEN', githubToken ?? undefined], ['GITLAB_TOKEN', townConfig.git_auth?.gitlab_token], ['GITLAB_INSTANCE_URL', townConfig.git_auth?.gitlab_instance_url], ['GITHUB_CLI_PAT', townConfig.github_cli_pat], @@ -861,7 +874,11 @@ export class TownDO extends DurableObject { // /sync-config endpoint. The X-Town-Config header delivers the // full config; the endpoint applies CONFIG_ENV_MAP to process.env. 
try { - const containerConfig = await config.buildContainerConfig(this.ctx.storage, this.env); + const containerConfig = await config.buildContainerConfig( + this.ctx.storage, + this.env, + this.townId + ); await container.fetch('http://container/sync-config', { method: 'POST', headers: { @@ -985,8 +1002,19 @@ export class TownDO extends DurableObject { logger.setTags({ rigId: rigConfig.rigId }); const townConfig = await this.getTownConfig(); const envVars: Record<string, string> = {}; - if (townConfig.git_auth?.github_token) { - envVars.GIT_TOKEN = townConfig.git_auth.github_token; + // Resolve GitHub token through scm.resolveGitHubToken so the rig + // setup uses a fresh installation token when a platform integration + // is configured. The rig's own integration ID takes precedence over + // the town-level one (this rig may be wired to a different repo + // installation than the rest of the town). + const githubToken = await scm.resolveGitHubToken({ + env: this.env, + townId: this.townId, + getTownConfig: () => Promise.resolve(townConfig), + platformIntegrationId: rigConfig.platformIntegrationId, + }); + if (githubToken) { + envVars.GIT_TOKEN = githubToken; } if (townConfig.git_auth?.gitlab_token) { envVars.GITLAB_TOKEN = townConfig.git_auth.gitlab_token; @@ -1001,7 +1029,11 @@ export class TownDO extends DurableObject { envVars.KILOCODE_TOKEN = kilocodeToken; } - const containerConfig = await config.buildContainerConfig(this.ctx.storage, this.env); + const containerConfig = await config.buildContainerConfig( + this.ctx.storage, + this.env, + this.townId + ); const container = getTownContainerStub(this.env, this.townId); const response = await container.fetch('http://container/repos/setup', { method: 'POST', @@ -2851,7 +2883,11 @@ export class TownDO extends DurableObject { if (isAlive) { // Attach fresh town config so the container can update process.env // before restarting the SDK server (tokens, git identity, etc.). 
- const containerConfig = await config.buildContainerConfig(this.ctx.storage, this.env); + const containerConfig = await config.buildContainerConfig( + this.ctx.storage, + this.env, + this.townId + ); // Resolve townConfig to thread the organization_id into the request body // (belt-and-suspenders: ensures org billing survives even if X-Town-Config @@ -4677,7 +4713,11 @@ export class TownDO extends DurableObject { // This ensures org context and credentials are available immediately // after a container restart when the first request is a model update // (PATCH /model) rather than a new agent start. - const containerConfig = await config.buildContainerConfig(this.ctx.storage, this.env); + const containerConfig = await config.buildContainerConfig( + this.ctx.storage, + this.env, + this.townId + ); const headers: Record<string, string> = { 'X-Town-Config': JSON.stringify(containerConfig), }; diff --git a/services/gastown/src/dos/town/config.ts b/services/gastown/src/dos/town/config.ts index e02404a745..041b999736 100644 --- a/services/gastown/src/dos/town/config.ts +++ b/services/gastown/src/dos/town/config.ts @@ -9,6 +9,7 @@ import { type MergeStrategy, type RigOverrideConfig, } from '../../types'; +import { resolveGitHubToken } from './town-scm'; const CONFIG_KEY = 'town:config'; const NEW_TOWN_DEFAULTS_SEEDED_KEY = 'town:config:newDefaultsSeeded'; @@ -295,17 +296,47 @@ export function resolveRigConfig( /** * Build the ContainerConfig payload for X-Town-Config header. * Sent with every fetch() to the container. + * + * The container's `syncTownConfigToProcessEnv` reads `git_auth.github_token` + * from this payload on every request and writes it to `process.env.GIT_TOKEN`, + * which the SDK server's `gh` CLI inherits via `GH_TOKEN`. To prevent serving + * an expired installation token (TTL ~1h) we resolve through `resolveGitHubToken` + * so a configured platform integration always returns a fresh value. + * + * `townId` is required so we can always perform the integration lookup. 
+ * Making it optional was a foot-gun — a forgotten arg silently re-introduces + * the stale-token bug this function exists to prevent. */ export async function buildContainerConfig( storage: DurableObjectStorage, - env: Env + env: Env, + townId: string ): Promise<Record<string, unknown>> { const config = await getTownConfig(storage); + + let resolvedGithubToken = config.git_auth?.github_token; + try { + const fresh = await resolveGitHubToken({ + env, + townId, + getTownConfig: () => Promise.resolve(config), + }); + if (fresh) resolvedGithubToken = fresh; + } catch (err) { + console.warn( + `${TOWN_LOG} buildContainerConfig: resolveGitHubToken failed; falling back to stored token`, + err + ); + } + return { env_vars: config.env_vars, default_model: resolveModel(config, null, ''), small_model: resolveSmallModel(config), - git_auth: config.git_auth, + git_auth: { + ...config.git_auth, + github_token: resolvedGithubToken, + }, kilocode_token: config.kilocode_token, github_cli_pat: config.github_cli_pat, git_author_name: config.git_author_name, diff --git a/services/gastown/src/dos/town/container-dispatch.ts b/services/gastown/src/dos/town/container-dispatch.ts index 4c37825376..d84d59d9f9 100644 --- a/services/gastown/src/dos/town/container-dispatch.ts +++ b/services/gastown/src/dos/town/container-dispatch.ts @@ -10,6 +10,7 @@ import { buildMayorSystemPrompt } from '../../prompts/mayor-system.prompt'; import type { TownConfig, RigOverrideConfig } from '../../types'; import { buildContainerConfig, resolveModel, resolveSmallModel, resolveRigConfig } from './config'; import { writeEvent } from '../../util/analytics.util'; +import { resolveGitHubToken } from './town-scm'; const TOWN_LOG = '[Town.do]'; @@ -409,9 +410,20 @@ export async function startAgentInContainer( // Build env vars from town config const envVars: Record<string, string> = { ...(params.townConfig.env_vars ?? 
{}) }; - // Map git_auth tokens - if (params.townConfig.git_auth?.github_token) { - envVars.GIT_TOKEN = params.townConfig.git_auth.github_token; + // Map git_auth tokens. Resolve GitHub token through resolveGitHubToken so + // we mint a fresh installation token when a platform integration is + // configured; otherwise we'd hand the agent a `git_auth.github_token` + // value that may have been written hours ago and is well past its 1h + // installation-token TTL. The resolved value is also what the agent's + // `gh` CLI sees as `GH_TOKEN`. + const githubToken = await resolveGitHubToken({ + env, + townId: params.townId, + getTownConfig: () => Promise.resolve(params.townConfig), + platformIntegrationId: params.platformIntegrationId, + }); + if (githubToken) { + envVars.GIT_TOKEN = githubToken; } if (params.townConfig.git_auth?.gitlab_token) { envVars.GITLAB_TOKEN = params.townConfig.git_auth.gitlab_token; @@ -449,7 +461,7 @@ export async function startAgentInContainer( `${TOWN_LOG} startAgentInContainer: envVars built: keys=[${Object.keys(envVars).join(',')}] hasGitToken=${!!envVars.GIT_TOKEN} hasGitlabToken=${!!envVars.GITLAB_TOKEN} hasContainerToken=${!!containerToken} hasAgentJwt=${!!agentToken} hasKilocodeToken=${!!kilocodeToken} git_auth_keys=[${Object.keys(params.townConfig.git_auth ?? {}).join(',')}]` ); - const containerConfig = await buildContainerConfig(storage, env); + const containerConfig = await buildContainerConfig(storage, env, params.townId); const container = getTownContainerStub(env, params.townId); const rigOverride = params.rigOverride ?? null; @@ -613,8 +625,16 @@ export async function startMergeInContainer( } const envVars: Record<string, string> = { ...(params.townConfig.env_vars ?? {}) }; - if (params.townConfig.git_auth?.github_token) { - envVars.GIT_TOKEN = params.townConfig.git_auth.github_token; + // Resolve GitHub token through resolveGitHubToken so a configured + // platform integration mints a fresh installation token for the + // merge process. 
See startAgentInContainer for the rationale. + const mergeGithubToken = await resolveGitHubToken({ + env, + townId: params.townId, + getTownConfig: () => Promise.resolve(params.townConfig), + }); + if (mergeGithubToken) { + envVars.GIT_TOKEN = mergeGithubToken; } if (params.townConfig.git_auth?.gitlab_token) { envVars.GITLAB_TOKEN = params.townConfig.git_auth.gitlab_token; @@ -628,7 +648,7 @@ export async function startMergeInContainer( const mergeKilocodeToken = params.kilocodeToken ?? params.townConfig.kilocode_token; if (mergeKilocodeToken) envVars.KILOCODE_TOKEN = mergeKilocodeToken; - const containerConfig = await buildContainerConfig(storage, env); + const containerConfig = await buildContainerConfig(storage, env, params.townId); const container = getTownContainerStub(env, params.townId); const response = await container.fetch('http://container/git/merge', { diff --git a/services/gastown/src/dos/town/town-scm.ts b/services/gastown/src/dos/town/town-scm.ts index 315d0abba3..3b9c84a431 100644 --- a/services/gastown/src/dos/town/town-scm.ts +++ b/services/gastown/src/dos/town/town-scm.ts @@ -24,25 +24,52 @@ export type SCMContext = { /** * Resolve a GitHub API token from the town config. - * Fallback chain: github_token → github_cli_pat → town platform integration → rig platform integration. + * + * Priority chain (most to least preferred): + * 1. `github_cli_pat` — user-supplied long-lived PAT, never expires. + * 2. Platform integration (GitHub App installation) — minted fresh by + * git-token-service with KV-backed caching that auto-invalidates + * before each token's 1h TTL elapses. This is the authoritative + * live source whenever an integration is configured. + * 3. `git_auth.github_token` — stored installation token from a past + * `refreshGitCredentials()` write. Kept as a fallback for towns that + * never had an integration wired up. 
NOT used when an integration is + * configured because the stored value is typically stale (1h TTL, + * never updated by anything in the request path). + * + * Historically this preferred the stored token over the integration, + * which made every consumer (PR poller, /refresh-git-token, agent + * dispatch's `GIT_TOKEN`) hand out an expired token whenever the rig + * had been registered more than ~1 hour ago. That broke the polecat's + * `gh` CLI ("Failed to log in to github.com using token (GH_TOKEN)"), + * the worker-side PR poller (401 from api.github.com), and the + * container's git refresh fallback (refreshed itself to the same + * expired token). */ export async function resolveGitHubToken(ctx: SCMContext): Promise<string | null> { const townConfig = await ctx.getTownConfig(); - let token = townConfig.git_auth?.github_token ?? townConfig.github_cli_pat; - if (!token) { - const integrationId = townConfig.git_auth?.platform_integration_id ?? ctx.platformIntegrationId; - if (integrationId && ctx.env.GIT_TOKEN_SERVICE) { - try { - token = await ctx.env.GIT_TOKEN_SERVICE.getToken(integrationId); - } catch (err) { - console.warn( - `${TOWN_LOG} resolveGitHubToken: platform integration token lookup failed for ${integrationId}`, - err - ); - } + + if (townConfig.github_cli_pat) { + return townConfig.github_cli_pat; + } + + const integrationId = townConfig.git_auth?.platform_integration_id ?? ctx.platformIntegrationId; + if (integrationId && ctx.env.GIT_TOKEN_SERVICE) { + try { + const fresh = await ctx.env.GIT_TOKEN_SERVICE.getToken(integrationId); + if (typeof fresh === 'string' && fresh.length > 0) return fresh; + console.warn( + `${TOWN_LOG} resolveGitHubToken: platform integration ${integrationId} returned empty token; falling back to stored github_token` + ); + } catch (err) { + console.warn( + `${TOWN_LOG} resolveGitHubToken: platform integration token lookup failed for ${integrationId}; falling back to stored github_token`, + err + ); } } - return token ?? 
null; + + return townConfig.git_auth?.github_token ?? null; } export type PRStatusResult = { diff --git a/services/gastown/test/unit/town-scm.test.ts b/services/gastown/test/unit/town-scm.test.ts new file mode 100644 index 0000000000..3b9bc8ffdc --- /dev/null +++ b/services/gastown/test/unit/town-scm.test.ts @@ -0,0 +1,111 @@ +import { describe, it, expect } from 'vitest'; +import { resolveGitHubToken } from '../../src/dos/town/town-scm'; +import { TownConfigSchema, type TownConfig } from '../../src/types'; + +const STORED_GITHUB_TOKEN = 'ghs_stored_stale_token'; +const FRESH_INSTALLATION_TOKEN = 'ghs_fresh_from_integration'; +const USER_PAT = 'ghp_user_long_lived_pat'; +const INTEGRATION_ID = '119277743'; + +function buildConfig(overrides: { + github_token?: string; + github_cli_pat?: string; + platform_integration_id?: string; +}): TownConfig { + return TownConfigSchema.parse({ + git_auth: { + github_token: overrides.github_token, + platform_integration_id: overrides.platform_integration_id, + }, + github_cli_pat: overrides.github_cli_pat, + }); +} + +function fakeEnv(opts: { + tokenServiceResponse?: string | null; + tokenServiceShouldThrow?: boolean; +}): Env { + return { + GIT_TOKEN_SERVICE: { + getToken: async (_id: string) => { + if (opts.tokenServiceShouldThrow) { + throw new Error('integration lookup failed'); + } + return opts.tokenServiceResponse ?? 
FRESH_INSTALLATION_TOKEN; + }, + }, + } as unknown as Env; +} + +describe('resolveGitHubToken priority chain', () => { + it('prefers github_cli_pat over everything else', async () => { + const cfg = buildConfig({ + github_cli_pat: USER_PAT, + github_token: STORED_GITHUB_TOKEN, + platform_integration_id: INTEGRATION_ID, + }); + const token = await resolveGitHubToken({ + env: fakeEnv({}), + townId: 'town-1', + getTownConfig: () => Promise.resolve(cfg), + }); + expect(token).toBe(USER_PAT); + }); + + it('returns a fresh integration token when an integration is configured, ignoring the stale stored token', async () => { + const cfg = buildConfig({ + github_token: STORED_GITHUB_TOKEN, + platform_integration_id: INTEGRATION_ID, + }); + const token = await resolveGitHubToken({ + env: fakeEnv({ tokenServiceResponse: FRESH_INSTALLATION_TOKEN }), + townId: 'town-1', + getTownConfig: () => Promise.resolve(cfg), + }); + expect(token).toBe(FRESH_INSTALLATION_TOKEN); + }); + + it('falls back to stored github_token when no integration is configured', async () => { + const cfg = buildConfig({ github_token: STORED_GITHUB_TOKEN }); + const token = await resolveGitHubToken({ + env: fakeEnv({}), + townId: 'town-1', + getTownConfig: () => Promise.resolve(cfg), + }); + expect(token).toBe(STORED_GITHUB_TOKEN); + }); + + it('falls back to stored github_token when integration lookup throws', async () => { + const cfg = buildConfig({ + github_token: STORED_GITHUB_TOKEN, + platform_integration_id: INTEGRATION_ID, + }); + const token = await resolveGitHubToken({ + env: fakeEnv({ tokenServiceShouldThrow: true }), + townId: 'town-1', + getTownConfig: () => Promise.resolve(cfg), + }); + expect(token).toBe(STORED_GITHUB_TOKEN); + }); + + it('uses the rig-level platformIntegrationId when town config does not carry one', async () => { + const cfg = buildConfig({}); + const token = await resolveGitHubToken({ + env: fakeEnv({ tokenServiceResponse: FRESH_INSTALLATION_TOKEN }), + townId: 'town-1', + 
getTownConfig: () => Promise.resolve(cfg), + platformIntegrationId: INTEGRATION_ID, + }); + expect(token).toBe(FRESH_INSTALLATION_TOKEN); + }); + + it('returns null when nothing is configured', async () => { + const cfg = buildConfig({}); + const token = await resolveGitHubToken({ + env: fakeEnv({}), + townId: 'town-1', + getTownConfig: () => Promise.resolve(cfg), + }); + expect(token).toBeNull(); + }); +}); From 63873e425975173928f4fe6f990d91dd10604fa5 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Mon, 11 May 2026 11:01:17 -0500 Subject: [PATCH 05/14] fix(gastown): distinguish null causes in PR status polling (#3149) (#3160) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(gastown): distinguish null causes in PR status polling (#3149) Replace PRStatusResult | null return type with discriminated PRStatusOutcome union in checkPRStatus. Each null cause (no token, HTTP error, invalid response, unrecognized URL, host mismatch) now surfaces a structured PRStatusError with actionable failure messages. - resolveGitHubToken returns GitHubTokenResolution with resolution chain - no_token and non-transient HTTP errors (401/403/404) fail immediately - invalid_response/unrecognized_url/host_mismatch fail after 3 strikes - Transient HTTP errors (5xx/429) keep existing 10-strike behavior - poll_null_count resets to 0 on successful poll at both call sites - failureKind persisted to bead metadata for analytics - AE event pr.poll_failed emitted on terminal failure - Unit tests for checkPRStatus, resolveGitHubToken, failureMessageFor, and threshold logic - Integration test for no_token immediate-fail path * style: apply oxfmt formatting * fix(gastown): track integration source when GIT_TOKEN_SERVICE unbound (review on town-scm.ts:66) When integrationId is set but GIT_TOKEN_SERVICE binding is missing, the configured integration source was silently omitted from the tried array. 
Add an else branch that pushes the source label with a '(GIT_TOKEN_SERVICE not bound)' annotation so the no_token error message lists all attempted sources. * fix(gastown): fail immediately for unrecognized_url and host_mismatch (review on actions.ts:374) Both are deterministic configuration errors that cannot self-resolve on retry. Move them from the 3-strike bucket to the fail-immediately bucket alongside no_token and non-transient http_error. Only invalid_response remains in the 3-strike category. * fix(gastown): use separate counters for transient vs non-transient poll errors (review on actions.ts:1350) Replace the shared poll_null_count with poll_transient_count and poll_non_transient_count. Each error category increments only its own counter and resets the other, preventing cross-contamination where 9 transient errors followed by 1 non-transient error would incorrectly fail the bead. Legacy poll_null_count is migrated on first read: the transient branch falls back to poll_null_count when poll_transient_count is absent. This ensures in-flight beads at deploy time retain their existing counter value. The non-transient branch does not read the legacy field since these counters reset on every success anyway — at worst an in-flight bead gets one extra retry for invalid_response. 
* fix(gastown): resolve merge conflict in resolveGitHubToken - merge staging priority with PR #3160 structured return type - resolveGitHubToken now uses staging's priority: cli_pat → integration → stored token - Returns GitHubTokenResolution discriminated union (from PR #3160) - Includes unbound-service else branch (GIT_TOKEN_SERVICE not bound) - Adds resolveGitHubTokenString helper for non-error-aware callers - Updates Town.do.ts, container-dispatch.ts, config.ts to use helper - Updates town-scm.test.ts for GitHubTokenResolution return shape - Updates pr-poll-errors.test.ts for new priority order --------- Co-authored-by: John Fawcett --- services/gastown/src/dos/Town.do.ts | 6 +- services/gastown/src/dos/town/actions.ts | 269 ++++++++++++-- services/gastown/src/dos/town/config.ts | 6 +- .../src/dos/town/container-dispatch.ts | 10 +- services/gastown/src/dos/town/town-scm.ts | 212 ++++++++--- .../src/handlers/refresh-git-token.handler.ts | 4 +- .../test/integration/pr-poll-errors.test.ts | 101 ++++++ .../gastown/test/unit/pr-poll-errors.test.ts | 317 +++++++++++++++++ .../test/unit/pr-poll-thresholds.test.ts | 330 ++++++++++++++++++ services/gastown/test/unit/town-scm.test.ts | 42 ++- 10 files changed, 1187 insertions(+), 110 deletions(-) create mode 100644 services/gastown/test/integration/pr-poll-errors.test.ts create mode 100644 services/gastown/test/unit/pr-poll-errors.test.ts create mode 100644 services/gastown/test/unit/pr-poll-thresholds.test.ts diff --git a/services/gastown/src/dos/Town.do.ts b/services/gastown/src/dos/Town.do.ts index d46e5960b4..dfb1f62aff 100644 --- a/services/gastown/src/dos/Town.do.ts +++ b/services/gastown/src/dos/Town.do.ts @@ -793,7 +793,7 @@ export class TownDO extends DurableObject { // syncTownConfigToProcessEnv path reads `git_auth.github_token` // from the X-Town-Config header on every request, so the in-process // GIT_TOKEN follows the same source-of-truth as the persisted one. 
- const githubToken = await scm.resolveGitHubToken({ + const githubToken = await scm.resolveGitHubTokenString({ env: this.env, townId, getTownConfig: () => Promise.resolve(townConfig), @@ -1002,12 +1002,12 @@ export class TownDO extends DurableObject { logger.setTags({ rigId: rigConfig.rigId }); const townConfig = await this.getTownConfig(); const envVars: Record = {}; - // Resolve GitHub token through scm.resolveGitHubToken so the rig + // Resolve GitHub token through scm.resolveGitHubTokenString so the rig // setup uses a fresh installation token when a platform integration // is configured. The rig's own integration ID takes precedence over // the town-level one (this rig may be wired to a different repo // installation than the rest of the town). - const githubToken = await scm.resolveGitHubToken({ + const githubToken = await scm.resolveGitHubTokenString({ env: this.env, townId: this.townId, getTownConfig: () => Promise.resolve(townConfig), diff --git a/services/gastown/src/dos/town/actions.ts b/services/gastown/src/dos/town/actions.ts index 89da498fae..977151abd8 100644 --- a/services/gastown/src/dos/town/actions.ts +++ b/services/gastown/src/dos/town/actions.ts @@ -22,7 +22,7 @@ import * as reviewQueue from './review-queue'; import * as patrol from './patrol'; import { getRig } from './rigs'; import { parseGitUrl } from '../../util/platform-pr.util'; -import type { PRStatusResult } from './town-scm'; +import type { PRStatusOutcome, PRStatusError } from './town-scm'; // ── Bead mutations ────────────────────────────────────────────────── @@ -300,8 +300,8 @@ export type ApplyActionContext = { dispatchAgent: (agentId: string, beadId: string, rigId: string) => Promise; /** Stop an agent's container process. */ stopAgent: (agentId: string) => Promise; - /** Check a PR's status via GitHub/GitLab API. Returns PRStatusResult or null. */ - checkPRStatus: (prUrl: string) => Promise; + /** Check a PR's status via GitHub/GitLab API. Returns PRStatusOutcome. 
*/ + checkPRStatus: (prUrl: string) => Promise; /** Check PR for unresolved review comments and failing CI checks. */ checkPRFeedback: (prUrl: string) => Promise; /** Merge a PR via GitHub/GitLab API. */ @@ -330,9 +330,66 @@ const LOG = '[actions]'; /** Fail MR bead after this many consecutive null poll results (#1632). */ const PR_POLL_NULL_THRESHOLD = 10; +/** Fail MR bead after this many consecutive non-transient errors (invalid_response). */ +const PR_POLL_NON_TRANSIENT_THRESHOLD = 3; + /** Minimum interval between PR polls per MR bead (ms) (#1632). */ export const PR_POLL_INTERVAL_MS = 60_000; // 1 minute +function failureMessageFor(error: PRStatusError): string { + switch (error.kind) { + case 'no_token': + return ( + `No GitHub token resolved for this town. Tried (in order): ` + + error.resolutionChain.map(s => `\`${s}\``).join(', ') + + `. Configure one of these in town or rig settings. ` + + `Note: polecat agents use their own container credentials and ` + + `may have created the PR successfully — that does not imply the ` + + `town worker can poll PR status.` + ); + case 'http_error': + if (error.status === 401) { + return `Town's GitHub token is invalid or expired (HTTP 401). Refresh the token in town settings.`; + } + if (error.status === 403) { + return `Town's GitHub token lacks permission for this PR (HTTP 403). Ensure the token has \`pull-requests: read\` scope on the repo, or check for a secondary rate limit.`; + } + if (error.status === 404) { + return `PR not found (HTTP 404). Was the branch deleted before the PR could be polled, or is the PR URL wrong?`; + } + return `${error.provider} API returned HTTP ${error.status} ${error.statusText}. ${error.transient ? 'Retrying.' : 'Not retryable.'}`; + case 'invalid_response': + return ( + `${error.provider} API returned an unexpected response shape ` + + `(${error.reason})${error.sampleKeys ? `; top-level keys: ${error.sampleKeys.join(', ')}` : ''}. 
` + + `Please file a bug — the API contract may have drifted.` + ); + case 'unrecognized_url': + return `PR URL format not recognized: ${error.url}. Expected GitHub PR or GitLab MR URL.`; + case 'host_mismatch': + return `Refusing to send GitLab token to unexpected host \`${error.got}\` (configured: \`${error.expected}\`).`; + } +} + +function shouldFailImmediately(error: PRStatusError): boolean { + switch (error.kind) { + case 'no_token': + return true; + case 'unrecognized_url': + return true; + case 'host_mismatch': + return true; + case 'http_error': + return !error.transient; + case 'invalid_response': + return false; + } +} + +function shouldCountAsTransient(error: PRStatusError): boolean { + return error.kind === 'http_error' && error.transient; +} + function now(): string { return new Date().toISOString(); } @@ -745,22 +802,24 @@ export function applyAction(ctx: ApplyActionContext, action: Action): (() => Pro return async () => { try { - const prStatusResult = await ctx.checkPRStatus(action.pr_url); - if (prStatusResult !== null) { - // Any non-null result resets the consecutive null counter + const outcome = await ctx.checkPRStatus(action.pr_url); + if (outcome.ok) { + // Successful poll — reset both consecutive error counters query( sql, /* sql */ ` UPDATE ${beads} SET ${beads.columns.metadata} = json_set( COALESCE(${beads.columns.metadata}, '{}'), - '$.poll_null_count', 0 + '$.poll_transient_count', 0, + '$.poll_non_transient_count', 0, + '$.poll_error_kind', NULL ) WHERE ${beads.bead_id} = ? 
`, [action.bead_id] ); - const { status, mergeable_state } = prStatusResult; + const { status, mergeable_state } = outcome.result; if (status !== 'open') { ctx.insertEvent('pr_status_changed', { bead_id: action.bead_id, @@ -1048,10 +1107,10 @@ export function applyAction(ctx: ApplyActionContext, action: Action): (() => Pro // If the PR was merged externally during that window, inserting // pr_feedback_detected would create a feedback bead for a merged // PR — leading to a duplicate PR on an already-merged branch. - const freshStatusResult = await ctx.checkPRStatus(action.pr_url); - if (freshStatusResult?.status !== 'open') { + const freshOutcome = await ctx.checkPRStatus(action.pr_url); + if (!freshOutcome.ok || freshOutcome.result.status !== 'open') { console.log( - `${LOG} poll_pr: PR status changed to '${freshStatusResult?.status ?? 'null'}' during feedback check, skipping feedback for bead=${action.bead_id}` + `${LOG} poll_pr: PR status changed to '${freshOutcome.ok ? freshOutcome.result.status : 'error'}' during feedback check, skipping feedback for bead=${action.bead_id}` ); } else { const existingFeedback = hasExistingFeedbackBead(sql, action.bead_id); @@ -1179,57 +1238,190 @@ export function applyAction(ctx: ApplyActionContext, action: Action): (() => Pro } } } else { - // Null result — GitHub API unreachable (token missing, expired, rate-limited, or 5xx). - // Increment consecutive null counter; fail the bead after PR_POLL_NULL_THRESHOLD. + const error = outcome.error; + + // Store the latest error kind for analytics query( sql, /* sql */ ` UPDATE ${beads} SET ${beads.columns.metadata} = json_set( COALESCE(${beads.columns.metadata}, '{}'), - '$.poll_null_count', - COALESCE( - json_extract(${beads.columns.metadata}, '$.poll_null_count'), - 0 - ) + 1 + '$.poll_error_kind', ? ) WHERE ${beads.bead_id} = ? 
`, - [action.bead_id] + [error.kind, action.bead_id] ); - const rows = [ - ...query( + + const failRigRows = z + .object({ rig_id: z.string().nullable() }) + .array() + .parse([ + ...query( + sql, + /* sql */ ` + SELECT ${beads.columns.rig_id} + FROM ${beads} + WHERE ${beads.bead_id} = ? + `, + [action.bead_id] + ), + ]); + const failRigId = failRigRows[0]?.rig_id ?? ''; + + if (shouldFailImmediately(error)) { + console.warn( + `${LOG} poll_pr: immediate-fail error kind=${error.kind} for bead=${action.bead_id}, failing` + ); + beadOps.updateBeadStatus(sql, action.bead_id, 'failed', 'system'); + query( + sql, + /* sql */ ` + UPDATE ${beads} + SET ${beads.columns.metadata} = json_set( + COALESCE(${beads.columns.metadata}, '{}'), + '$.failureReason', 'pr_poll_failed', + '$.failureKind', ?, + '$.failureMessage', ? + ) + WHERE ${beads.bead_id} = ? + `, + [error.kind, failureMessageFor(error), action.bead_id] + ); + ctx.emitEvent({ + event: 'pr.poll_failed', + townId, + beadId: action.bead_id, + rigId: failRigId, + reason: error.kind, + label: 'provider' in error ? error.provider : '', + statusCode: error.kind === 'http_error' ? error.status : undefined, + }); + } else if (shouldCountAsTransient(error)) { + // Transient HTTP errors (5xx, 429) count toward the 10-strike threshold. + // Migrate legacy poll_null_count into poll_transient_count on first read. + query( sql, /* sql */ ` - SELECT json_extract(${beads.columns.metadata}, '$.poll_null_count') AS null_count - FROM ${beads} + UPDATE ${beads} + SET ${beads.columns.metadata} = json_set( + COALESCE(${beads.columns.metadata}, '{}'), + '$.poll_transient_count', + COALESCE( + json_extract(${beads.columns.metadata}, '$.poll_transient_count'), + json_extract(${beads.columns.metadata}, '$.poll_null_count'), + 0 + ) + 1, + '$.poll_non_transient_count', 0 + ) WHERE ${beads.bead_id} = ? `, [action.bead_id] - ), - ]; - const nullCount = Number(rows[0]?.null_count ?? 
0); - if (nullCount >= PR_POLL_NULL_THRESHOLD) { - console.warn( - `${LOG} poll_pr: ${nullCount} consecutive null results for bead=${action.bead_id}, failing` ); - beadOps.updateBeadStatus(sql, action.bead_id, 'failed', 'system'); + const rows = [ + ...query( + sql, + /* sql */ ` + SELECT json_extract(${beads.columns.metadata}, '$.poll_transient_count') AS transient_count + FROM ${beads} + WHERE ${beads.bead_id} = ? + `, + [action.bead_id] + ), + ]; + const transientCount = Number(rows[0]?.transient_count ?? 0); + if (transientCount >= PR_POLL_NULL_THRESHOLD) { + console.warn( + `${LOG} poll_pr: ${transientCount} consecutive transient errors for bead=${action.bead_id}, failing` + ); + beadOps.updateBeadStatus(sql, action.bead_id, 'failed', 'system'); + query( + sql, + /* sql */ ` + UPDATE ${beads} + SET ${beads.columns.metadata} = json_set( + COALESCE(${beads.columns.metadata}, '{}'), + '$.failureReason', 'pr_poll_failed', + '$.failureKind', ?, + '$.failureMessage', ? + ) + WHERE ${beads.bead_id} = ? + `, + [error.kind, failureMessageFor(error), action.bead_id] + ); + ctx.emitEvent({ + event: 'pr.poll_failed', + townId, + beadId: action.bead_id, + rigId: failRigId, + reason: error.kind, + label: 'provider' in error ? error.provider : '', + statusCode: error.kind === 'http_error' ? error.status : undefined, + }); + } + } else { + // Non-transient, non-immediate errors (invalid_response only) + // count toward a lower 3-strike threshold. + // Migrate legacy poll_null_count into poll_non_transient_count on first read. query( sql, /* sql */ ` UPDATE ${beads} SET ${beads.columns.metadata} = json_set( COALESCE(${beads.columns.metadata}, '{}'), - '$.failureReason', 'pr_poll_failed', - '$.failureMessage', ? + '$.poll_non_transient_count', + COALESCE( + json_extract(${beads.columns.metadata}, '$.poll_non_transient_count'), + 0 + ) + 1, + '$.poll_transient_count', 0 ) WHERE ${beads.bead_id} = ? 
`, - [ - `Cannot poll PR status — GitHub API returned null ${nullCount} consecutive times. Check that a valid GitHub token is configured in town settings and that the GitHub API is reachable.`, - action.bead_id, - ] + [action.bead_id] ); + const rows = [ + ...query( + sql, + /* sql */ ` + SELECT json_extract(${beads.columns.metadata}, '$.poll_non_transient_count') AS non_transient_count + FROM ${beads} + WHERE ${beads.bead_id} = ? + `, + [action.bead_id] + ), + ]; + const nonTransientCount = Number(rows[0]?.non_transient_count ?? 0); + if (nonTransientCount >= PR_POLL_NON_TRANSIENT_THRESHOLD) { + console.warn( + `${LOG} poll_pr: ${nonTransientCount} consecutive non-transient errors kind=${error.kind} for bead=${action.bead_id}, failing` + ); + beadOps.updateBeadStatus(sql, action.bead_id, 'failed', 'system'); + query( + sql, + /* sql */ ` + UPDATE ${beads} + SET ${beads.columns.metadata} = json_set( + COALESCE(${beads.columns.metadata}, '{}'), + '$.failureReason', 'pr_poll_failed', + '$.failureKind', ?, + '$.failureMessage', ? + ) + WHERE ${beads.bead_id} = ? + `, + [error.kind, failureMessageFor(error), action.bead_id] + ); + ctx.emitEvent({ + event: 'pr.poll_failed', + townId, + beadId: action.bead_id, + rigId: failRigId, + reason: error.kind, + label: 'provider' in error ? error.provider : '', + statusCode: error.kind === 'http_error' ? 
error.status : undefined, + }); + } } } // status === 'open' — no action needed, poll again next tick @@ -1483,3 +1675,8 @@ function parsePrUrl(prUrl: string): { repo: string; prNumber: number } | null { // Exported for testing export { hasExistingFeedbackBead as _hasExistingFeedbackBead, parsePrUrl as _parsePrUrl }; +export { + failureMessageFor as _failureMessageFor, + shouldFailImmediately as _shouldFailImmediately, + shouldCountAsTransient as _shouldCountAsTransient, +}; diff --git a/services/gastown/src/dos/town/config.ts b/services/gastown/src/dos/town/config.ts index 041b999736..85b3ca4715 100644 --- a/services/gastown/src/dos/town/config.ts +++ b/services/gastown/src/dos/town/config.ts @@ -9,7 +9,7 @@ import { type MergeStrategy, type RigOverrideConfig, } from '../../types'; -import { resolveGitHubToken } from './town-scm'; +import { resolveGitHubTokenString } from './town-scm'; const CONFIG_KEY = 'town:config'; const NEW_TOWN_DEFAULTS_SEEDED_KEY = 'town:config:newDefaultsSeeded'; @@ -316,7 +316,7 @@ export async function buildContainerConfig( let resolvedGithubToken = config.git_auth?.github_token; try { - const fresh = await resolveGitHubToken({ + const fresh = await resolveGitHubTokenString({ env, townId, getTownConfig: () => Promise.resolve(config), @@ -324,7 +324,7 @@ export async function buildContainerConfig( if (fresh) resolvedGithubToken = fresh; } catch (err) { console.warn( - `${TOWN_LOG} buildContainerConfig: resolveGitHubToken failed; falling back to stored token`, + `${TOWN_LOG} buildContainerConfig: resolveGitHubTokenString failed; falling back to stored token`, err ); } diff --git a/services/gastown/src/dos/town/container-dispatch.ts b/services/gastown/src/dos/town/container-dispatch.ts index d84d59d9f9..b5deab4f62 100644 --- a/services/gastown/src/dos/town/container-dispatch.ts +++ b/services/gastown/src/dos/town/container-dispatch.ts @@ -10,7 +10,7 @@ import { buildMayorSystemPrompt } from '../../prompts/mayor-system.prompt'; import type { 
TownConfig, RigOverrideConfig } from '../../types'; import { buildContainerConfig, resolveModel, resolveSmallModel, resolveRigConfig } from './config'; import { writeEvent } from '../../util/analytics.util'; -import { resolveGitHubToken } from './town-scm'; +import { resolveGitHubTokenString } from './town-scm'; const TOWN_LOG = '[Town.do]'; @@ -410,13 +410,13 @@ export async function startAgentInContainer( // Build env vars from town config const envVars: Record = { ...(params.townConfig.env_vars ?? {}) }; - // Map git_auth tokens. Resolve GitHub token through resolveGitHubToken so + // Map git_auth tokens. Resolve GitHub token through resolveGitHubTokenString so // we mint a fresh installation token when a platform integration is // configured; otherwise we'd hand the agent a `git_auth.github_token` // value that may have been written hours ago and is well past its 1h // installation-token TTL. The resolved value is also what the agent's // `gh` CLI sees as `GH_TOKEN`. - const githubToken = await resolveGitHubToken({ + const githubToken = await resolveGitHubTokenString({ env, townId: params.townId, getTownConfig: () => Promise.resolve(params.townConfig), @@ -625,10 +625,10 @@ export async function startMergeInContainer( } const envVars: Record = { ...(params.townConfig.env_vars ?? {}) }; - // Resolve GitHub token through resolveGitHubToken so a configured + // Resolve GitHub token through resolveGitHubTokenString so a configured // platform integration mints a fresh installation token for the // merge process. See startAgentInContainer for the rationale. 
- const mergeGithubToken = await resolveGitHubToken({ + const mergeGithubToken = await resolveGitHubTokenString({ env, townId: params.townId, getTownConfig: () => Promise.resolve(params.townConfig), diff --git a/services/gastown/src/dos/town/town-scm.ts b/services/gastown/src/dos/town/town-scm.ts index 3b9c84a431..c83d42d601 100644 --- a/services/gastown/src/dos/town/town-scm.ts +++ b/services/gastown/src/dos/town/town-scm.ts @@ -22,6 +22,10 @@ export type SCMContext = { platformIntegrationId?: string; }; +export type GitHubTokenResolution = + | { ok: true; token: string; source: string } + | { ok: false; tried: string[] }; + /** * Resolve a GitHub API token from the town config. * @@ -33,31 +37,41 @@ export type SCMContext = { * live source whenever an integration is configured. * 3. `git_auth.github_token` — stored installation token from a past * `refreshGitCredentials()` write. Kept as a fallback for towns that - * never had an integration wired up. NOT used when an integration is - * configured because the stored value is typically stale (1h TTL, - * never updated by anything in the request path). + * never had an integration wired up. NOT preferred over the integration + * because the stored value is typically stale (1h TTL, never updated + * by anything in the request path). * * Historically this preferred the stored token over the integration, * which made every consumer (PR poller, /refresh-git-token, agent * dispatch's `GIT_TOKEN`) hand out an expired token whenever the rig - * had been registered more than ~1 hour ago. That broke the polecat's - * `gh` CLI ("Failed to log in to github.com using token (GH_TOKEN)"), - * the worker-side PR poller (401 from api.github.com), and the - * container's git refresh fallback (refreshed itself to the same - * expired token). + * had been registered more than ~1 hour ago. See ce15a6fe7 for the fix. 
+ * + * Returns a `GitHubTokenResolution`: `ok: true` with the token + source on + * success, or `ok: false` with the `tried` chain on failure. Used by + * `checkPRStatus`'s `no_token` failure path to surface a specific message. */ -export async function resolveGitHubToken(ctx: SCMContext): Promise { +export async function resolveGitHubToken(ctx: SCMContext): Promise { + const tried: string[] = []; const townConfig = await ctx.getTownConfig(); + // 1. github_cli_pat — long-lived user PAT if (townConfig.github_cli_pat) { - return townConfig.github_cli_pat; + return { ok: true, token: townConfig.github_cli_pat, source: 'town.github_cli_pat' }; } + tried.push('town.github_cli_pat'); + // 2. Platform integration — fresh App installation token const integrationId = townConfig.git_auth?.platform_integration_id ?? ctx.platformIntegrationId; + const sourceLabel = townConfig.git_auth?.platform_integration_id + ? 'town platform integration' + : 'rig platform integration'; if (integrationId && ctx.env.GIT_TOKEN_SERVICE) { + tried.push(sourceLabel); try { const fresh = await ctx.env.GIT_TOKEN_SERVICE.getToken(integrationId); - if (typeof fresh === 'string' && fresh.length > 0) return fresh; + if (typeof fresh === 'string' && fresh.length > 0) { + return { ok: true, token: fresh, source: sourceLabel }; + } console.warn( `${TOWN_LOG} resolveGitHubToken: platform integration ${integrationId} returned empty token; falling back to stored github_token` ); @@ -67,9 +81,24 @@ export async function resolveGitHubToken(ctx: SCMContext): Promise { + const r = await resolveGitHubToken(ctx); + return r.ok ? 
r.token : null; } export type PRStatusResult = { @@ -77,32 +106,58 @@ export type PRStatusResult = { mergeable_state?: string; }; +export type PRStatusError = + | { kind: 'no_token'; provider: 'github' | 'gitlab'; resolutionChain: string[] } + | { kind: 'unrecognized_url'; url: string } + | { + kind: 'http_error'; + provider: 'github' | 'gitlab'; + status: number; + statusText: string; + transient: boolean; + } + | { + kind: 'invalid_response'; + provider: 'github' | 'gitlab'; + reason: 'json_parse' | 'schema_mismatch'; + sampleKeys?: string[]; + } + | { kind: 'host_mismatch'; provider: 'gitlab'; expected: string; got: string }; + +export type PRStatusOutcome = + | { ok: true; result: PRStatusResult } + | { ok: false; error: PRStatusError }; + +function isTransientHttpStatus(status: number): boolean { + return status >= 500 || status === 429; +} + /** * Check the status of a PR/MR via its URL. - * Returns a PRStatusResult with status and optional mergeable_state (GitHub only), - * or null if the status cannot be determined. + * Returns a PRStatusOutcome discriminated union — { ok: true, result } on success, + * or { ok: false, error } with a structured PRStatusError describing why. 
*/ -export async function checkPRStatus( - ctx: SCMContext, - prUrl: string -): Promise { +export async function checkPRStatus(ctx: SCMContext, prUrl: string): Promise { const townConfig = await ctx.getTownConfig(); // GitHub PR URL format: https://github.com/{owner}/{repo}/pull/{number} const ghMatch = prUrl.match(/^https:\/\/github\.com\/([^/]+)\/([^/]+)\/pull\/(\d+)/); if (ghMatch) { const [, owner, repo, numberStr] = ghMatch; - const token = await resolveGitHubToken(ctx); - if (!token) { + const resolution = await resolveGitHubToken(ctx); + if (!resolution.ok) { console.warn(`${TOWN_LOG} checkPRStatus: no GitHub token available, cannot poll ${prUrl}`); - return null; + return { + ok: false, + error: { kind: 'no_token', provider: 'github', resolutionChain: resolution.tried }, + }; } const response = await fetch( `https://api.github.com/repos/${owner}/${repo}/pulls/${numberStr}`, { headers: { - Authorization: `token ${token}`, + Authorization: `token ${resolution.token}`, Accept: 'application/vnd.github.v3+json', 'User-Agent': 'Gastown-Refinery/1.0', }, @@ -112,17 +167,41 @@ export async function checkPRStatus( console.warn( `${TOWN_LOG} checkPRStatus: GitHub API returned ${response.status} for ${prUrl}` ); - return null; + return { + ok: false, + error: { + kind: 'http_error', + provider: 'github', + status: response.status, + statusText: response.statusText, + transient: isTransientHttpStatus(response.status), + }, + }; } const json = await response.json().catch(() => null); - if (!json) return null; + if (!json) { + return { + ok: false, + error: { kind: 'invalid_response', provider: 'github', reason: 'json_parse' }, + }; + } const data = GitHubPRStatusSchema.safeParse(json); - if (!data.success) return null; + if (!data.success) { + return { + ok: false, + error: { + kind: 'invalid_response', + provider: 'github', + reason: 'schema_mismatch', + sampleKeys: Object.keys(json).slice(0, 8), + }, + }; + } - if (data.data.merged) return { status: 'merged' }; - if 
(data.data.state === 'closed') return { status: 'closed' }; - return { status: 'open', mergeable_state: data.data.mergeable_state }; + if (data.data.merged) return { ok: true, result: { status: 'merged' } }; + if (data.data.state === 'closed') return { ok: true, result: { status: 'closed' } }; + return { ok: true, result: { status: 'open', mergeable_state: data.data.mergeable_state } }; } // GitLab MR URL format: https://{host}/{path}/-/merge_requests/{iid} @@ -132,7 +211,14 @@ export async function checkPRStatus( const token = townConfig.git_auth?.gitlab_token; if (!token) { console.warn(`${TOWN_LOG} checkPRStatus: no gitlab_token configured, cannot poll ${prUrl}`); - return null; + return { + ok: false, + error: { + kind: 'no_token', + provider: 'gitlab', + resolutionChain: ['town.git_auth.gitlab_token'], + }, + }; } // Validate the host against known GitLab hosts to prevent SSRF/token leak. @@ -144,7 +230,15 @@ export async function checkPRStatus( console.warn( `${TOWN_LOG} checkPRStatus: refusing to send gitlab_token to unknown host: ${prHost}` ); - return null; + return { + ok: false, + error: { + kind: 'host_mismatch', + provider: 'gitlab', + expected: configuredHost ?? 
'gitlab.com', + got: prHost, + }, + }; } const encodedPath = encodeURIComponent(projectPath); @@ -158,21 +252,45 @@ export async function checkPRStatus( console.warn( `${TOWN_LOG} checkPRStatus: GitLab API returned ${response.status} for ${prUrl}` ); - return null; + return { + ok: false, + error: { + kind: 'http_error', + provider: 'gitlab', + status: response.status, + statusText: response.statusText, + transient: isTransientHttpStatus(response.status), + }, + }; } const glJson = await response.json().catch(() => null); - if (!glJson) return null; + if (!glJson) { + return { + ok: false, + error: { kind: 'invalid_response', provider: 'gitlab', reason: 'json_parse' }, + }; + } const data = GitLabMRStatusSchema.safeParse(glJson); - if (!data.success) return null; + if (!data.success) { + return { + ok: false, + error: { + kind: 'invalid_response', + provider: 'gitlab', + reason: 'schema_mismatch', + sampleKeys: Object.keys(glJson).slice(0, 8), + }, + }; + } - if (data.data.state === 'merged') return { status: 'merged' }; - if (data.data.state === 'closed') return { status: 'closed' }; - return { status: 'open' }; + if (data.data.state === 'merged') return { ok: true, result: { status: 'merged' } }; + if (data.data.state === 'closed') return { ok: true, result: { status: 'closed' } }; + return { ok: true, result: { status: 'open' } }; } console.warn(`${TOWN_LOG} checkPRStatus: unrecognized PR URL format: ${prUrl}`); - return null; + return { ok: false, error: { kind: 'unrecognized_url', url: prUrl } }; } /** @@ -288,11 +406,11 @@ export async function checkPRFeedback( } const [, owner, repo, numberStr] = ghMatch; - const token = await resolveGitHubToken(ctx); - if (!token) return null; + const resolution = await resolveGitHubToken(ctx); + if (!resolution.ok) return null; const headers = { - Authorization: `token ${token}`, + Authorization: `token ${resolution.token}`, Accept: 'application/vnd.github.v3+json', 'User-Agent': 'Gastown-Refinery/1.0', }; @@ -508,15 
+626,15 @@ export async function mergePR(ctx: SCMContext, prUrl: string): Promise } const [, owner, repo, numberStr] = ghMatch; - const token = await resolveGitHubToken(ctx); - if (!token) { + const resolution = await resolveGitHubToken(ctx); + if (!resolution.ok) { console.warn(`${TOWN_LOG} mergePR: no GitHub token available`); return false; } const mergeUrl = `https://api.github.com/repos/${owner}/${repo}/pulls/${numberStr}/merge`; const mergeHeaders = { - Authorization: `token ${token}`, + Authorization: `token ${resolution.token}`, Accept: 'application/vnd.github.v3+json', 'Content-Type': 'application/json', 'User-Agent': 'Gastown-Refinery/1.0', @@ -565,8 +683,8 @@ export async function createConvoyBranch( featureBranch: string; } ): Promise { - const token = await resolveGitHubToken(ctx); - if (!token) { + const resolution = await resolveGitHubToken(ctx); + if (!resolution.ok) { console.warn( `${TOWN_LOG} createConvoyBranch: no GitHub token available — skipping branch creation for ${opts.featureBranch}` ); @@ -575,15 +693,13 @@ export async function createConvoyBranch( const coords = parseGitUrl(opts.gitUrl); if (!coords || coords.platform !== 'github') { - // Non-GitHub repos or unparseable URLs: skip silently (GitLab uses a - // different flow; nothing breaks if the branch doesn't pre-exist there). 
return; } const { owner, repo } = coords; const apiBase = `https://api.github.com/repos/${owner}/${repo}`; const headers = { - Authorization: `token ${token}`, + Authorization: `token ${resolution.token}`, Accept: 'application/vnd.github.v3+json', 'User-Agent': 'Gastown/1.0', 'Content-Type': 'application/json', diff --git a/services/gastown/src/handlers/refresh-git-token.handler.ts b/services/gastown/src/handlers/refresh-git-token.handler.ts index 69afa61f28..c11d200016 100644 --- a/services/gastown/src/handlers/refresh-git-token.handler.ts +++ b/services/gastown/src/handlers/refresh-git-token.handler.ts @@ -52,7 +52,7 @@ export async function handleRefreshGitToken( platformIntegrationId: rigConfig?.platformIntegrationId, }); - if (!freshToken) { + if (!freshToken.ok) { console.warn( `[refresh-git-token] no token available for town=${params.townId} rig=${params.rigId}` ); @@ -62,5 +62,5 @@ export async function handleRefreshGitToken( console.log( `[refresh-git-token] refreshed git token for town=${params.townId} rig=${params.rigId}` ); - return c.json(resSuccess({ token: freshToken }), 200); + return c.json(resSuccess({ token: freshToken.token }), 200); } diff --git a/services/gastown/test/integration/pr-poll-errors.test.ts b/services/gastown/test/integration/pr-poll-errors.test.ts new file mode 100644 index 0000000000..bbc7dfb291 --- /dev/null +++ b/services/gastown/test/integration/pr-poll-errors.test.ts @@ -0,0 +1,101 @@ +/** + * Integration tests for PR poll error discrimination (#3149). + * + * Tests verify that the poll_pr action handler correctly handles the + * discriminated PRStatusOutcome: immediate-fail for no_token / non-transient + * HTTP errors, 3-strike threshold for invalid_response / unrecognized_url / + * host_mismatch, and 10-strike threshold for transient HTTP errors (5xx, 429). + * + * The no-token test runs fully end-to-end through the DO alarm. 
HTTP error + * scenarios are covered by unit tests (test/unit/pr-poll-*.test.ts) since + * mocking fetch is not practical in the Cloudflare Workers test runtime. + */ + +import { env, runDurableObjectAlarm } from 'cloudflare:test'; +import { describe, it, expect, beforeEach } from 'vitest'; + +function getTownStub(name = 'test-town') { + const id = env.TOWN.idFromName(name); + return env.TOWN.get(id); +} + +describe('PR poll error discrimination (#3149)', () => { + let town: ReturnType; + let townName: string; + + beforeEach(async () => { + townName = `pr-poll-${crypto.randomUUID()}`; + town = getTownStub(townName); + await town.setTownId(townName); + }); + + async function setupMrBeadWithPrUrl(prUrl: string) { + await town.addRig({ + rigId: 'rig-1', + name: 'main-rig', + gitUrl: 'https://github.com/test/repo.git', + defaultBranch: 'main', + }); + + const result = await town.slingConvoy({ + rigId: 'rig-1', + convoyTitle: 'PR Poll Test', + tasks: [{ title: 'Task 1' }], + }); + + await runDurableObjectAlarm(town); + + const beadId = result.beads[0].bead.bead_id; + const bead = await town.getBeadAsync(beadId); + const agentId = bead!.assignee_agent_bead_id!; + expect(agentId).toBeTruthy(); + + await town.agentDone(agentId, { + branch: 'gt/polecat/test-branch', + summary: 'Completed task', + }); + + await runDurableObjectAlarm(town); + + const allMrs = await town.listBeads({ type: 'merge_request' }); + const mrBead = allMrs.find(b => b.metadata?.source_bead_id === beadId); + expect(mrBead).toBeTruthy(); + + // Set the PR URL and put the MR bead back to in_progress so the + // reconciler will schedule a poll_pr action on the next alarm tick. + await town.updateBead( + mrBead!.bead_id, + { + metadata: { + ...(mrBead!.metadata ?? 
{}), + pr_url: prUrl, + }, + }, + 'system' + ); + await town.updateBeadStatus(mrBead!.bead_id, 'in_progress', 'system'); + + return { beadId, mrBeadId: mrBead!.bead_id, agentId, convoyId: result.convoy.id }; + } + + describe('no_token: town with no GitHub token', () => { + it('should fail the MR bead immediately (1 strike) with failureKind "no_token"', async () => { + // Town is set up with no git_auth, so resolveGitHubToken will return + // { ok: false, tried: [...] } and checkPRStatus will return a no_token error. + const { mrBeadId } = await setupMrBeadWithPrUrl('https://github.com/test/repo/pull/1'); + + // Run alarm — the reconciler should generate a poll_pr action, which + // calls checkPRStatus and gets a no_token error. Since no_token is an + // immediate-fail error, the bead should fail on the first poll. + await runDurableObjectAlarm(town); + + const mrBead = await town.getBeadAsync(mrBeadId); + expect(mrBead?.status).toBe('failed'); + expect(mrBead?.metadata?.failureReason).toBe('pr_poll_failed'); + expect(mrBead?.metadata?.failureKind).toBe('no_token'); + expect(mrBead?.metadata?.failureMessage).toContain('No GitHub token resolved'); + expect(mrBead?.metadata?.failureMessage).toContain('town.git_auth.github_token'); + expect(mrBead?.metadata?.failureMessage).toContain('town.github_cli_pat'); + }); + }); +}); diff --git a/services/gastown/test/unit/pr-poll-errors.test.ts b/services/gastown/test/unit/pr-poll-errors.test.ts new file mode 100644 index 0000000000..e37fcc66b6 --- /dev/null +++ b/services/gastown/test/unit/pr-poll-errors.test.ts @@ -0,0 +1,317 @@ +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { + checkPRStatus, + resolveGitHubToken, + type SCMContext, + type PRStatusOutcome, +} from '../../src/dos/town/town-scm'; +import type { TownConfig } from '../../src/types'; + +function mockSCMContext(overrides: Partial = {}): SCMContext { + return { + env: {} as SCMContext['env'], + townId: 'test-town', + getTownConfig: async () 
=> ({}) as TownConfig, + ...overrides, + }; +} + +describe('resolveGitHubToken', () => { + it('returns ok:true with source when github_token is configured', async () => { + const ctx = mockSCMContext({ + getTownConfig: async () => ({ git_auth: { github_token: 'ghp_abc123' } }) as TownConfig, + }); + const result = await resolveGitHubToken(ctx); + expect(result.ok).toBe(true); + if (result.ok) { + expect(result.token).toBe('ghp_abc123'); + expect(result.source).toBe('town.git_auth.github_token'); + } + }); + + it('prefers github_cli_pat over stored github_token', async () => { + const ctx = mockSCMContext({ + getTownConfig: async () => ({ git_auth: { github_token: 'ghp_stored' }, github_cli_pat: 'ghp_pat123' }) as TownConfig, + }); + const result = await resolveGitHubToken(ctx); + expect(result.ok).toBe(true); + if (result.ok) { + expect(result.token).toBe('ghp_pat123'); + expect(result.source).toBe('town.github_cli_pat'); + } + }); + + it('returns ok:false with resolution chain when no token is configured', async () => { + const ctx = mockSCMContext({ + getTownConfig: async () => ({}) as TownConfig, + }); + const result = await resolveGitHubToken(ctx); + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.tried).toContain('town.git_auth.github_token'); + expect(result.tried).toContain('town.github_cli_pat'); + expect(result.tried).toContain('platform integration (none configured)'); + } + }); + + it('includes platform integration source label in resolution chain', async () => { + const ctx = mockSCMContext({ + platformIntegrationId: 'integration-123', + env: { GIT_TOKEN_SERVICE: { getToken: async () => null } } as unknown as SCMContext['env'], + getTownConfig: async () => + ({ git_auth: { platform_integration_id: 'integration-123' } }) as TownConfig, + }); + const result = await resolveGitHubToken(ctx); + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.tried).toContain('town platform integration'); + } + }); + + it('uses rig 
platform integration label when town has no platform_integration_id', async () => { + const ctx = mockSCMContext({ + platformIntegrationId: 'rig-integration-456', + env: { GIT_TOKEN_SERVICE: { getToken: async () => null } } as unknown as SCMContext['env'], + getTownConfig: async () => ({}) as TownConfig, + }); + const result = await resolveGitHubToken(ctx); + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.tried).toContain('rig platform integration'); + } + }); + + it('includes GIT_TOKEN_SERVICE not bound annotation when integrationId set but service missing', async () => { + const ctx = mockSCMContext({ + platformIntegrationId: 'integration-789', + env: {} as SCMContext['env'], + getTownConfig: async () => + ({ git_auth: { platform_integration_id: 'integration-789' } }) as TownConfig, + }); + const result = await resolveGitHubToken(ctx); + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.tried).toContain('town platform integration (GIT_TOKEN_SERVICE not bound)'); + } + }); +}); + +describe('checkPRStatus', () => { + beforeEach(() => { + vi.restoreAllMocks(); + }); + + it('returns no_token error when no GitHub token is available', async () => { + const ctx = mockSCMContext({ + getTownConfig: async () => ({}) as TownConfig, + }); + const outcome = await checkPRStatus(ctx, 'https://github.com/owner/repo/pull/1'); + expect(outcome.ok).toBe(false); + if (!outcome.ok) { + expect(outcome.error.kind).toBe('no_token'); + expect(outcome.error.provider).toBe('github'); + if (outcome.error.kind === 'no_token') { + expect(outcome.error.resolutionChain).toContain('town.git_auth.github_token'); + expect(outcome.error.resolutionChain).toContain('town.github_cli_pat'); + } + } + }); + + it('returns http_error with transient:true for 5xx status', async () => { + vi.spyOn(globalThis, 'fetch').mockResolvedValue( + new Response('Server Error', { status: 503, statusText: 'Service Unavailable' }) + ); + const ctx = mockSCMContext({ + getTownConfig: 
async () => ({ git_auth: { github_token: 'ghp_test' } }) as TownConfig, + }); + const outcome = await checkPRStatus(ctx, 'https://github.com/owner/repo/pull/1'); + expect(outcome.ok).toBe(false); + if (!outcome.ok && outcome.error.kind === 'http_error') { + expect(outcome.error.status).toBe(503); + expect(outcome.error.transient).toBe(true); + expect(outcome.error.provider).toBe('github'); + } + }); + + it('returns http_error with transient:true for 429 status', async () => { + vi.spyOn(globalThis, 'fetch').mockResolvedValue( + new Response('Rate Limited', { status: 429, statusText: 'Too Many Requests' }) + ); + const ctx = mockSCMContext({ + getTownConfig: async () => ({ git_auth: { github_token: 'ghp_test' } }) as TownConfig, + }); + const outcome = await checkPRStatus(ctx, 'https://github.com/owner/repo/pull/1'); + expect(outcome.ok).toBe(false); + if (!outcome.ok && outcome.error.kind === 'http_error') { + expect(outcome.error.status).toBe(429); + expect(outcome.error.transient).toBe(true); + } + }); + + it('returns http_error with transient:false for 401 status', async () => { + vi.spyOn(globalThis, 'fetch').mockResolvedValue( + new Response('Unauthorized', { status: 401, statusText: 'Unauthorized' }) + ); + const ctx = mockSCMContext({ + getTownConfig: async () => ({ git_auth: { github_token: 'ghp_bad' } }) as TownConfig, + }); + const outcome = await checkPRStatus(ctx, 'https://github.com/owner/repo/pull/1'); + expect(outcome.ok).toBe(false); + if (!outcome.ok && outcome.error.kind === 'http_error') { + expect(outcome.error.status).toBe(401); + expect(outcome.error.transient).toBe(false); + } + }); + + it('returns http_error with transient:false for 403 status', async () => { + vi.spyOn(globalThis, 'fetch').mockResolvedValue( + new Response('Forbidden', { status: 403, statusText: 'Forbidden' }) + ); + const ctx = mockSCMContext({ + getTownConfig: async () => ({ git_auth: { github_token: 'ghp_limited' } }) as TownConfig, + }); + const outcome = await 
checkPRStatus(ctx, 'https://github.com/owner/repo/pull/1'); + expect(outcome.ok).toBe(false); + if (!outcome.ok && outcome.error.kind === 'http_error') { + expect(outcome.error.status).toBe(403); + expect(outcome.error.transient).toBe(false); + } + }); + + it('returns http_error with transient:false for 404 status', async () => { + vi.spyOn(globalThis, 'fetch').mockResolvedValue( + new Response('Not Found', { status: 404, statusText: 'Not Found' }) + ); + const ctx = mockSCMContext({ + getTownConfig: async () => ({ git_auth: { github_token: 'ghp_test' } }) as TownConfig, + }); + const outcome = await checkPRStatus(ctx, 'https://github.com/owner/repo/pull/1'); + expect(outcome.ok).toBe(false); + if (!outcome.ok && outcome.error.kind === 'http_error') { + expect(outcome.error.status).toBe(404); + expect(outcome.error.transient).toBe(false); + } + }); + + it('returns invalid_response with json_parse when response body is not JSON', async () => { + vi.spyOn(globalThis, 'fetch').mockResolvedValue( + new Response('not json', { + status: 200, + headers: { 'Content-Type': 'text/plain' }, + }) + ); + const ctx = mockSCMContext({ + getTownConfig: async () => ({ git_auth: { github_token: 'ghp_test' } }) as TownConfig, + }); + const outcome = await checkPRStatus(ctx, 'https://github.com/owner/repo/pull/1'); + expect(outcome.ok).toBe(false); + if (!outcome.ok && outcome.error.kind === 'invalid_response') { + expect(outcome.error.reason).toBe('json_parse'); + expect(outcome.error.provider).toBe('github'); + } + }); + + it('returns invalid_response with schema_mismatch and sampleKeys when response shape is wrong', async () => { + vi.spyOn(globalThis, 'fetch').mockResolvedValue( + new Response(JSON.stringify({ id: 42, title: 'not a PR', random_field: true }), { + status: 200, + headers: { 'Content-Type': 'application/json' }, + }) + ); + const ctx = mockSCMContext({ + getTownConfig: async () => ({ git_auth: { github_token: 'ghp_test' } }) as TownConfig, + }); + const outcome = 
await checkPRStatus(ctx, 'https://github.com/owner/repo/pull/1'); + expect(outcome.ok).toBe(false); + if (!outcome.ok && outcome.error.kind === 'invalid_response') { + expect(outcome.error.reason).toBe('schema_mismatch'); + expect(outcome.error.sampleKeys).toBeDefined(); + expect(outcome.error.sampleKeys!.length).toBeGreaterThan(0); + expect(outcome.error.provider).toBe('github'); + } + }); + + it('returns unrecognized_url for non-GitHub/GitLab URLs', async () => { + const ctx = mockSCMContext({ + getTownConfig: async () => ({}) as TownConfig, + }); + const outcome = await checkPRStatus(ctx, 'https://example.com/something'); + expect(outcome.ok).toBe(false); + if (!outcome.ok && outcome.error.kind === 'unrecognized_url') { + expect(outcome.error.url).toBe('https://example.com/something'); + } + }); + + it('returns ok:true with merged status for merged PR', async () => { + vi.spyOn(globalThis, 'fetch').mockResolvedValue( + new Response(JSON.stringify({ state: 'closed', merged: true, mergeable_state: 'clean' }), { + status: 200, + headers: { 'Content-Type': 'application/json' }, + }) + ); + const ctx = mockSCMContext({ + getTownConfig: async () => ({ git_auth: { github_token: 'ghp_test' } }) as TownConfig, + }); + const outcome = await checkPRStatus(ctx, 'https://github.com/owner/repo/pull/1'); + expect(outcome.ok).toBe(true); + if (outcome.ok) { + expect(outcome.result.status).toBe('merged'); + } + }); + + it('returns ok:true with open status for open PR', async () => { + vi.spyOn(globalThis, 'fetch').mockResolvedValue( + new Response(JSON.stringify({ state: 'open', merged: false, mergeable_state: 'clean' }), { + status: 200, + headers: { 'Content-Type': 'application/json' }, + }) + ); + const ctx = mockSCMContext({ + getTownConfig: async () => ({ git_auth: { github_token: 'ghp_test' } }) as TownConfig, + }); + const outcome = await checkPRStatus(ctx, 'https://github.com/owner/repo/pull/1'); + expect(outcome.ok).toBe(true); + if (outcome.ok) { + 
expect(outcome.result.status).toBe('open'); + expect(outcome.result.mergeable_state).toBe('clean'); + } + }); + + it('returns host_mismatch for GitLab URL with unknown host', async () => { + const ctx = mockSCMContext({ + getTownConfig: async () => + ({ + git_auth: { + gitlab_token: 'glpat_test', + gitlab_instance_url: 'https://gitlab.mycompany.com', + }, + }) as TownConfig, + }); + const outcome = await checkPRStatus( + ctx, + 'https://gitlab.evil.com/group/project/-/merge_requests/5' + ); + expect(outcome.ok).toBe(false); + if (!outcome.ok && outcome.error.kind === 'host_mismatch') { + expect(outcome.error.got).toBe('gitlab.evil.com'); + expect(outcome.error.expected).toBe('gitlab.mycompany.com'); + } + }); + + it('allows gitlab.com without host validation', async () => { + vi.spyOn(globalThis, 'fetch').mockResolvedValue( + new Response(JSON.stringify({ state: 'merged' }), { + status: 200, + headers: { 'Content-Type': 'application/json' }, + }) + ); + const ctx = mockSCMContext({ + getTownConfig: async () => ({ git_auth: { gitlab_token: 'glpat_test' } }) as TownConfig, + }); + const outcome = await checkPRStatus(ctx, 'https://gitlab.com/group/project/-/merge_requests/5'); + expect(outcome.ok).toBe(true); + if (outcome.ok) { + expect(outcome.result.status).toBe('merged'); + } + }); +}); diff --git a/services/gastown/test/unit/pr-poll-thresholds.test.ts b/services/gastown/test/unit/pr-poll-thresholds.test.ts new file mode 100644 index 0000000000..a29e136d3a --- /dev/null +++ b/services/gastown/test/unit/pr-poll-thresholds.test.ts @@ -0,0 +1,330 @@ +import { describe, it, expect } from 'vitest'; +import { + _failureMessageFor, + _shouldFailImmediately, + _shouldCountAsTransient, +} from '../../src/dos/town/actions'; +import type { PRStatusError } from '../../src/dos/town/town-scm'; + +describe('failureMessageFor', () => { + it('produces actionable message for no_token with resolution chain', () => { + const error: PRStatusError = { + kind: 'no_token', + provider: 
'github', + resolutionChain: [ + 'town.git_auth.github_token', + 'town.github_cli_pat', + 'town platform integration', + 'rig platform integration', + ], + }; + const msg = _failureMessageFor(error); + expect(msg).toContain('No GitHub token resolved'); + expect(msg).toContain('`town.git_auth.github_token`'); + expect(msg).toContain('`town.github_cli_pat`'); + expect(msg).toContain('`town platform integration`'); + expect(msg).toContain('`rig platform integration`'); + expect(msg).toContain('polecat agents use their own container credentials'); + }); + + it('produces specific message for HTTP 401', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 401, + statusText: 'Unauthorized', + transient: false, + }; + const msg = _failureMessageFor(error); + expect(msg).toContain('invalid or expired'); + expect(msg).toContain('HTTP 401'); + }); + + it('produces specific message for HTTP 403', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 403, + statusText: 'Forbidden', + transient: false, + }; + const msg = _failureMessageFor(error); + expect(msg).toContain('lacks permission'); + expect(msg).toContain('HTTP 403'); + }); + + it('produces specific message for HTTP 404', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 404, + statusText: 'Not Found', + transient: false, + }; + const msg = _failureMessageFor(error); + expect(msg).toContain('not found'); + expect(msg).toContain('HTTP 404'); + }); + + it('produces generic HTTP message for other status codes', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 422, + statusText: 'Unprocessable Entity', + transient: false, + }; + const msg = _failureMessageFor(error); + expect(msg).toContain('HTTP 422'); + expect(msg).toContain('Not retryable'); + }); + + it('indicates Retrying for transient HTTP errors', () => { + const error: PRStatusError = { + kind: 
'http_error', + provider: 'github', + status: 503, + statusText: 'Service Unavailable', + transient: true, + }; + const msg = _failureMessageFor(error); + expect(msg).toContain('Retrying'); + }); + + it('produces message with sampleKeys for schema_mismatch', () => { + const error: PRStatusError = { + kind: 'invalid_response', + provider: 'github', + reason: 'schema_mismatch', + sampleKeys: ['id', 'title', 'random_field'], + }; + const msg = _failureMessageFor(error); + expect(msg).toContain('unexpected response shape'); + expect(msg).toContain('schema_mismatch'); + expect(msg).toContain('id, title, random_field'); + expect(msg).toContain('file a bug'); + }); + + it('produces message without sampleKeys for json_parse', () => { + const error: PRStatusError = { + kind: 'invalid_response', + provider: 'github', + reason: 'json_parse', + }; + const msg = _failureMessageFor(error); + expect(msg).toContain('json_parse'); + expect(msg).not.toContain('top-level keys'); + }); + + it('produces message for unrecognized_url', () => { + const error: PRStatusError = { + kind: 'unrecognized_url', + url: 'https://bitbucket.org/repo/pull/1', + }; + const msg = _failureMessageFor(error); + expect(msg).toContain('not recognized'); + expect(msg).toContain('https://bitbucket.org/repo/pull/1'); + }); + + it('produces message for host_mismatch', () => { + const error: PRStatusError = { + kind: 'host_mismatch', + provider: 'gitlab', + expected: 'gitlab.mycompany.com', + got: 'gitlab.evil.com', + }; + const msg = _failureMessageFor(error); + expect(msg).toContain('Refusing to send GitLab token'); + expect(msg).toContain('gitlab.evil.com'); + expect(msg).toContain('gitlab.mycompany.com'); + }); +}); + +describe('shouldFailImmediately', () => { + it('returns true for no_token', () => { + const error: PRStatusError = { kind: 'no_token', provider: 'github', resolutionChain: [] }; + expect(_shouldFailImmediately(error)).toBe(true); + }); + + it('returns true for http_error with transient:false 
(401)', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 401, + statusText: 'Unauthorized', + transient: false, + }; + expect(_shouldFailImmediately(error)).toBe(true); + }); + + it('returns true for http_error with transient:false (403)', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 403, + statusText: 'Forbidden', + transient: false, + }; + expect(_shouldFailImmediately(error)).toBe(true); + }); + + it('returns true for http_error with transient:false (404)', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 404, + statusText: 'Not Found', + transient: false, + }; + expect(_shouldFailImmediately(error)).toBe(true); + }); + + it('returns false for http_error with transient:true (5xx)', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 503, + statusText: 'Service Unavailable', + transient: true, + }; + expect(_shouldFailImmediately(error)).toBe(false); + }); + + it('returns false for invalid_response', () => { + const error: PRStatusError = { + kind: 'invalid_response', + provider: 'github', + reason: 'schema_mismatch', + }; + expect(_shouldFailImmediately(error)).toBe(false); + }); + + it('returns true for unrecognized_url', () => { + const error: PRStatusError = { kind: 'unrecognized_url', url: 'https://example.com' }; + expect(_shouldFailImmediately(error)).toBe(true); + }); + + it('returns true for host_mismatch', () => { + const error: PRStatusError = { + kind: 'host_mismatch', + provider: 'gitlab', + expected: 'gitlab.com', + got: 'evil.com', + }; + expect(_shouldFailImmediately(error)).toBe(true); + }); +}); + +describe('shouldCountAsTransient', () => { + it('returns true for http_error with transient:true (5xx)', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 503, + statusText: 'Service Unavailable', + transient: true, + }; + 
expect(_shouldCountAsTransient(error)).toBe(true); + }); + + it('returns true for http_error with transient:true (429)', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 429, + statusText: 'Too Many Requests', + transient: true, + }; + expect(_shouldCountAsTransient(error)).toBe(true); + }); + + it('returns false for http_error with transient:false', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 401, + statusText: 'Unauthorized', + transient: false, + }; + expect(_shouldCountAsTransient(error)).toBe(false); + }); + + it('returns false for non-http_error kinds', () => { + const error: PRStatusError = { kind: 'no_token', provider: 'github', resolutionChain: [] }; + expect(_shouldCountAsTransient(error)).toBe(false); + }); +}); + +describe('error categorization is mutually exclusive', () => { + const allKinds: PRStatusError[] = [ + { kind: 'no_token', provider: 'github', resolutionChain: [] }, + { kind: 'http_error', provider: 'github', status: 401, statusText: 'Unauthorized', transient: false }, + { kind: 'http_error', provider: 'github', status: 503, statusText: 'Service Unavailable', transient: true }, + { kind: 'invalid_response', provider: 'github', reason: 'schema_mismatch' }, + { kind: 'unrecognized_url', url: 'https://example.com' }, + { kind: 'host_mismatch', provider: 'gitlab', expected: 'gitlab.com', got: 'evil.com' }, + ]; + + it('each error kind falls into exactly one bucket', () => { + for (const error of allKinds) { + const buckets = [ + _shouldFailImmediately(error), + _shouldCountAsTransient(error), + ].filter(Boolean); + expect(buckets.length, `${error.kind} should match exactly one bucket`).toBeLessThanOrEqual(1); + } + }); + + it('invalid_response is the only kind in the 3-strike (non-transient) bucket', () => { + for (const error of allKinds) { + const isNonTransient = !_shouldFailImmediately(error) && !_shouldCountAsTransient(error); + if (error.kind 
=== 'invalid_response') { + expect(isNonTransient, `invalid_response should be in 3-strike bucket`).toBe(true); + } else { + expect(isNonTransient, `${error.kind} should NOT be in 3-strike bucket`).toBe(false); + } + } + }); +}); + +describe('counter cross-contamination', () => { + it('9 transient errors then 1 non-transient should not fail (separate counters)', () => { + const transientError: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 503, + statusText: 'Service Unavailable', + transient: true, + }; + const nonTransientError: PRStatusError = { + kind: 'invalid_response', + provider: 'github', + reason: 'schema_mismatch', + }; + + expect(_shouldFailImmediately(transientError)).toBe(false); + expect(_shouldCountAsTransient(transientError)).toBe(true); + + expect(_shouldFailImmediately(nonTransientError)).toBe(false); + expect(_shouldCountAsTransient(nonTransientError)).toBe(false); + + // After 9 transient errors, poll_transient_count=9, poll_non_transient_count=0. + // Then 1 non-transient error resets poll_transient_count=0, increments poll_non_transient_count=1. + // 1 < 3, so bead should NOT fail. This test documents the expected behavior; + // the actual counter logic lives in the SQL within applyAction. + }); + + it('3 consecutive non-transient errors should fail (non-transient counter reaches threshold)', () => { + const nonTransientError: PRStatusError = { + kind: 'invalid_response', + provider: 'github', + reason: 'schema_mismatch', + }; + + expect(_shouldFailImmediately(nonTransientError)).toBe(false); + expect(_shouldCountAsTransient(nonTransientError)).toBe(false); + + // 3 consecutive non-transient errors: poll_non_transient_count reaches 3. + // 3 >= PR_POLL_NON_TRANSIENT_THRESHOLD (3), so bead SHOULD fail. 
+ }); +}); diff --git a/services/gastown/test/unit/town-scm.test.ts b/services/gastown/test/unit/town-scm.test.ts index 3b9bc8ffdc..1d0cb8a935 100644 --- a/services/gastown/test/unit/town-scm.test.ts +++ b/services/gastown/test/unit/town-scm.test.ts @@ -44,12 +44,12 @@ describe('resolveGitHubToken priority chain', () => { github_token: STORED_GITHUB_TOKEN, platform_integration_id: INTEGRATION_ID, }); - const token = await resolveGitHubToken({ + const result = await resolveGitHubToken({ env: fakeEnv({}), townId: 'town-1', getTownConfig: () => Promise.resolve(cfg), }); - expect(token).toBe(USER_PAT); + expect(result).toEqual({ ok: true, token: USER_PAT, source: 'town.github_cli_pat' }); }); it('returns a fresh integration token when an integration is configured, ignoring the stale stored token', async () => { @@ -57,22 +57,22 @@ describe('resolveGitHubToken priority chain', () => { github_token: STORED_GITHUB_TOKEN, platform_integration_id: INTEGRATION_ID, }); - const token = await resolveGitHubToken({ + const result = await resolveGitHubToken({ env: fakeEnv({ tokenServiceResponse: FRESH_INSTALLATION_TOKEN }), townId: 'town-1', getTownConfig: () => Promise.resolve(cfg), }); - expect(token).toBe(FRESH_INSTALLATION_TOKEN); + expect(result).toEqual({ ok: true, token: FRESH_INSTALLATION_TOKEN, source: 'town platform integration' }); }); it('falls back to stored github_token when no integration is configured', async () => { const cfg = buildConfig({ github_token: STORED_GITHUB_TOKEN }); - const token = await resolveGitHubToken({ + const result = await resolveGitHubToken({ env: fakeEnv({}), townId: 'town-1', getTownConfig: () => Promise.resolve(cfg), }); - expect(token).toBe(STORED_GITHUB_TOKEN); + expect(result).toEqual({ ok: true, token: STORED_GITHUB_TOKEN, source: 'town.git_auth.github_token' }); }); it('falls back to stored github_token when integration lookup throws', async () => { @@ -80,32 +80,48 @@ describe('resolveGitHubToken priority chain', () => { 
github_token: STORED_GITHUB_TOKEN, platform_integration_id: INTEGRATION_ID, }); - const token = await resolveGitHubToken({ + const result = await resolveGitHubToken({ env: fakeEnv({ tokenServiceShouldThrow: true }), townId: 'town-1', getTownConfig: () => Promise.resolve(cfg), }); - expect(token).toBe(STORED_GITHUB_TOKEN); + expect(result).toEqual({ ok: true, token: STORED_GITHUB_TOKEN, source: 'town.git_auth.github_token' }); }); it('uses the rig-level platformIntegrationId when town config does not carry one', async () => { const cfg = buildConfig({}); - const token = await resolveGitHubToken({ + const result = await resolveGitHubToken({ env: fakeEnv({ tokenServiceResponse: FRESH_INSTALLATION_TOKEN }), townId: 'town-1', getTownConfig: () => Promise.resolve(cfg), platformIntegrationId: INTEGRATION_ID, }); - expect(token).toBe(FRESH_INSTALLATION_TOKEN); + expect(result).toEqual({ ok: true, token: FRESH_INSTALLATION_TOKEN, source: 'rig platform integration' }); }); - it('returns null when nothing is configured', async () => { + it('returns ok:false with tried chain when nothing is configured', async () => { const cfg = buildConfig({}); - const token = await resolveGitHubToken({ + const result = await resolveGitHubToken({ env: fakeEnv({}), townId: 'town-1', getTownConfig: () => Promise.resolve(cfg), }); - expect(token).toBeNull(); + expect(result).toEqual({ + ok: false, + tried: ['town.github_cli_pat', 'platform integration (none configured)', 'town.git_auth.github_token'], + }); + }); + + it('falls back to stored github_token when integration returns empty string', async () => { + const cfg = buildConfig({ + github_token: STORED_GITHUB_TOKEN, + platform_integration_id: INTEGRATION_ID, + }); + const result = await resolveGitHubToken({ + env: fakeEnv({ tokenServiceResponse: '' }), + townId: 'town-1', + getTownConfig: () => Promise.resolve(cfg), + }); + expect(result).toEqual({ ok: true, token: STORED_GITHUB_TOKEN, source: 'town.git_auth.github_token' }); }); }); From 
ff992a3832ea581338bc9be6dca5df3cf1698b6f Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Mon, 11 May 2026 16:21:45 +0000 Subject: [PATCH 06/14] fix(gastown/container): move GASTOWN_CONTAINER_TOKEN assignment after awaitHydration in /refresh-token The /refresh-token handler assigned process.env.GASTOWN_CONTAINER_TOKEN before awaiting hydration, inconsistent with PATCH /agents/:id/model which gates first. Mid-hydration token refresh could cause buildPrewarmEnv to pick up a different token than the one hydration captured locally. --- services/gastown/container/src/control-server.ts | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/services/gastown/container/src/control-server.ts b/services/gastown/container/src/control-server.ts index 36196f1b41..761feefbaa 100644 --- a/services/gastown/container/src/control-server.ts +++ b/services/gastown/container/src/control-server.ts @@ -263,14 +263,20 @@ app.post('/refresh-token', async c => { if (!body || typeof body !== 'object' || !('token' in body) || typeof body.token !== 'string') { return c.json({ error: 'Missing or invalid token field' }, 400); } - process.env.GASTOWN_CONTAINER_TOKEN = body.token; + // Capture the new token into a local so it survives the await below. + const newToken = body.token; // Wait for boot hydration to release the global sdkServerLock before - // we serialise N agent restarts through it. Without this, agent k - // queues behind every in-flight hydration ensureSDKServer and easily - // blows past REFRESH_AGENT_TIMEOUT_MS (6s) on warm-restart containers. + // we mutate process.env or serialise N agent restarts through it. + // Without this gate, a mid-hydration token refresh can cause + // buildPrewarmEnv to pick up a different token than the one hydration + // captured locally — matching the PATCH /agents/:id/model handler + // which also gates first. await awaitHydration(); + // Now safe to assign: hydration is done, no concurrent env readers. 
+ process.env.GASTOWN_CONTAINER_TOKEN = newToken; + const activeAgents = listAgents().filter(a => a.status === 'running' || a.status === 'starting'); log.info('refresh_token.received', { agentCount: activeAgents.length, From acdec102827c18e477393081776672ecdd2bf346 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Mon, 11 May 2026 16:22:44 +0000 Subject: [PATCH 07/14] refactor(gastown/container): capture bootHydration resolve locally instead of module global The _resolveHydration module-global stale-capture pattern would orphan the first promise's resolver if bootHydration() were ever called concurrently. Capturing resolve as a local inside bootHydration() itself eliminates the risk and removes the module-global. --- services/gastown/container/src/process-manager.ts | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/services/gastown/container/src/process-manager.ts b/services/gastown/container/src/process-manager.ts index 440a24d318..bdc39ac795 100644 --- a/services/gastown/container/src/process-manager.ts +++ b/services/gastown/container/src/process-manager.ts @@ -78,7 +78,6 @@ export function isDraining(): boolean { // the promise immediately so non-hydrating containers (tests, dev) // don't block; bootHydration replaces it on entry and resolves it on exit. let _hydrationComplete: Promise = Promise.resolve(); -let _resolveHydration: () => void = () => {}; export function awaitHydration(): Promise { return _hydrationComplete; @@ -2818,14 +2817,12 @@ async function prewarmMayorSDK(townId: string, apiUrl: string, token: string): P * contending for it themselves. 
*/ export async function bootHydration(): Promise { - const LOG = '[boot-hydration]'; - _hydrationComplete = new Promise(resolve => { - _resolveHydration = resolve; - }); + let resolve!: () => void; + _hydrationComplete = new Promise(r => { resolve = r; }); try { - await bootHydrationImpl(LOG); + await bootHydrationImpl('[boot-hydration]'); } finally { - _resolveHydration(); + resolve(); } } From c0c17ee67fc7dfcd42fed5965cd5be7b1cc125ff Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Mon, 11 May 2026 16:24:11 +0000 Subject: [PATCH 08/14] refactor(gastown): eliminate double RPC in /mayor-id by consolidating getMayorPrewarmContext getMayorPrewarmContext now returns { agentId } even when the kilocode token is unavailable (instead of null), so the worker route no longer needs to fall through to getMayorAgentId. This eliminates the redundant agents.listAgents SQL query over a second RPC hop. --- services/gastown/src/dos/Town.do.ts | 19 +++++++++++-------- services/gastown/src/gastown.worker.ts | 7 +++---- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/services/gastown/src/dos/Town.do.ts b/services/gastown/src/dos/Town.do.ts index dfb1f62aff..a9f689e1de 100644 --- a/services/gastown/src/dos/Town.do.ts +++ b/services/gastown/src/dos/Town.do.ts @@ -2697,22 +2697,25 @@ export class TownDO extends DurableObject { * use — so the prewarm cache hit is real instead of triggering the * "config mismatch, evicting prewarmed server" eviction path. * - * Returns null when there's no mayor yet, or when the kilocode token - * isn't available (in which case prewarm should skip — the next - * /agents/start will defer too). + * Returns null only when there's no mayor at all. When the mayor + * exists but the kilocode token isn't available, returns a partial + * shape with just { agentId } so callers can derive the fallback + * agentId without a second RPC hop. 
*/ async getMayorPrewarmContext(): Promise<{ agentId: string; - model: string; - smallModel: string; - kilocodeToken: string; - organizationId: string | null; + model?: string; + smallModel?: string; + kilocodeToken?: string; + organizationId?: string | null; } | null> { const mayor = agents.listAgents(this.sql, { role: 'mayor' })[0] ?? null; if (!mayor) return null; const kilocodeToken = await this.resolveKilocodeToken(); - if (!kilocodeToken) return null; + if (!kilocodeToken) { + return { agentId: mayor.id }; + } const townConfig = await this.getTownConfig(); // _ensureMayor dispatches the mayor without a per-rig override diff --git a/services/gastown/src/gastown.worker.ts b/services/gastown/src/gastown.worker.ts index b5a6cd715d..0c7d4cabb0 100644 --- a/services/gastown/src/gastown.worker.ts +++ b/services/gastown/src/gastown.worker.ts @@ -752,11 +752,10 @@ app.get('/api/towns/:townId/mayor-id', async c => { // - When there is no mayor at all, return { agentId: null } — the // container treats missing/null agentId as "no mayor, skip prewarm". const ctx = await town.getMayorPrewarmContext(); - if (ctx) { - return c.json({ success: true, ...ctx }); + if (!ctx) { + return c.json({ success: true, agentId: null }); } - const agentId = await town.getMayorAgentId(); - return c.json({ success: true, agentId }); + return c.json({ success: true, ...ctx }); }); // ── Container Events ───────────────────────────────────────────────────── From eb0525ffc5889f57453f341c2c7e4b37179dc1ff Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Mon, 11 May 2026 16:25:02 +0000 Subject: [PATCH 09/14] fix(gastown): add rigId tag middleware for /api/users/:userId/rigs/:rigId routes The per-route tagging middleware registered prefixes under /api/orgs/:orgId/... but missed the parallel /api/users/:userId/rigs/:rigId family. Without this, requests to those routes lack rigId in structured log tags. 
--- services/gastown/src/gastown.worker.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/services/gastown/src/gastown.worker.ts b/services/gastown/src/gastown.worker.ts index 0c7d4cabb0..8a117eb598 100644 --- a/services/gastown/src/gastown.worker.ts +++ b/services/gastown/src/gastown.worker.ts @@ -203,6 +203,11 @@ app.use('/api/users/:userId/towns/:townId/*', async (c, next) => { if (townId) logger.setTags({ townId }); await next(); }); +app.use('/api/users/:userId/rigs/:rigId/*', async (c, next) => { + const rigId = c.req.param('rigId'); + if (rigId) logger.setTags({ rigId }); + await next(); +}); app.use('/api/towns/:townId/rigs/:rigId/*', async (c, next) => { const rigId = c.req.param('rigId'); if (rigId) logger.setTags({ rigId }); From c65fbc22665e6437ed3c09f444d20ae8df6ec7fa Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Mon, 11 May 2026 16:40:07 +0000 Subject: [PATCH 10/14] fix(gastown): correct misleading migration comment in non-transient poll counter --- services/gastown/src/dos/town/actions.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/services/gastown/src/dos/town/actions.ts b/services/gastown/src/dos/town/actions.ts index 977151abd8..02564a95f3 100644 --- a/services/gastown/src/dos/town/actions.ts +++ b/services/gastown/src/dos/town/actions.ts @@ -1363,7 +1363,9 @@ export function applyAction(ctx: ApplyActionContext, action: Action): (() => Pro } else { // Non-transient, non-immediate errors (invalid_response only) // count toward a lower 3-strike threshold. - // Migrate legacy poll_null_count into poll_non_transient_count on first read. + // No legacy poll_null_count migration needed: invalid_response + // is a new error kind that couldn't have accumulated under the + // old transient/null classification. 
query( sql, /* sql */ ` From 60224676dc3227c525ff36b68aa06b6f515ab49a Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Mon, 11 May 2026 18:37:35 +0000 Subject: [PATCH 11/14] chore(gastown): apply oxfmt formatting --- .../gastown/container/src/process-manager.ts | 4 ++- services/gastown/src/dos/town/town-scm.ts | 6 +++- .../gastown/test/unit/pr-poll-errors.test.ts | 3 +- .../test/unit/pr-poll-thresholds.test.ts | 27 ++++++++++---- services/gastown/test/unit/town-scm.test.ts | 36 +++++++++++++++---- 5 files changed, 60 insertions(+), 16 deletions(-) diff --git a/services/gastown/container/src/process-manager.ts b/services/gastown/container/src/process-manager.ts index bdc39ac795..9914776f3e 100644 --- a/services/gastown/container/src/process-manager.ts +++ b/services/gastown/container/src/process-manager.ts @@ -2818,7 +2818,9 @@ async function prewarmMayorSDK(townId: string, apiUrl: string, token: string): P */ export async function bootHydration(): Promise { let resolve!: () => void; - _hydrationComplete = new Promise(r => { resolve = r; }); + _hydrationComplete = new Promise(r => { + resolve = r; + }); try { await bootHydrationImpl('[boot-hydration]'); } finally { diff --git a/services/gastown/src/dos/town/town-scm.ts b/services/gastown/src/dos/town/town-scm.ts index c83d42d601..a077d713d3 100644 --- a/services/gastown/src/dos/town/town-scm.ts +++ b/services/gastown/src/dos/town/town-scm.ts @@ -89,7 +89,11 @@ export async function resolveGitHubToken(ctx: SCMContext): Promise { it('prefers github_cli_pat over stored github_token', async () => { const ctx = mockSCMContext({ - getTownConfig: async () => ({ git_auth: { github_token: 'ghp_stored' }, github_cli_pat: 'ghp_pat123' }) as TownConfig, + getTownConfig: async () => + ({ git_auth: { github_token: 'ghp_stored' }, github_cli_pat: 'ghp_pat123' }) as TownConfig, }); const result = await resolveGitHubToken(ctx); expect(result.ok).toBe(true); diff --git a/services/gastown/test/unit/pr-poll-thresholds.test.ts 
b/services/gastown/test/unit/pr-poll-thresholds.test.ts index a29e136d3a..678e33cadb 100644 --- a/services/gastown/test/unit/pr-poll-thresholds.test.ts +++ b/services/gastown/test/unit/pr-poll-thresholds.test.ts @@ -258,8 +258,20 @@ describe('shouldCountAsTransient', () => { describe('error categorization is mutually exclusive', () => { const allKinds: PRStatusError[] = [ { kind: 'no_token', provider: 'github', resolutionChain: [] }, - { kind: 'http_error', provider: 'github', status: 401, statusText: 'Unauthorized', transient: false }, - { kind: 'http_error', provider: 'github', status: 503, statusText: 'Service Unavailable', transient: true }, + { + kind: 'http_error', + provider: 'github', + status: 401, + statusText: 'Unauthorized', + transient: false, + }, + { + kind: 'http_error', + provider: 'github', + status: 503, + statusText: 'Service Unavailable', + transient: true, + }, { kind: 'invalid_response', provider: 'github', reason: 'schema_mismatch' }, { kind: 'unrecognized_url', url: 'https://example.com' }, { kind: 'host_mismatch', provider: 'gitlab', expected: 'gitlab.com', got: 'evil.com' }, @@ -267,11 +279,12 @@ describe('error categorization is mutually exclusive', () => { it('each error kind falls into exactly one bucket', () => { for (const error of allKinds) { - const buckets = [ - _shouldFailImmediately(error), - _shouldCountAsTransient(error), - ].filter(Boolean); - expect(buckets.length, `${error.kind} should match exactly one bucket`).toBeLessThanOrEqual(1); + const buckets = [_shouldFailImmediately(error), _shouldCountAsTransient(error)].filter( + Boolean + ); + expect(buckets.length, `${error.kind} should match exactly one bucket`).toBeLessThanOrEqual( + 1 + ); } }); diff --git a/services/gastown/test/unit/town-scm.test.ts b/services/gastown/test/unit/town-scm.test.ts index 1d0cb8a935..7f1d07cc18 100644 --- a/services/gastown/test/unit/town-scm.test.ts +++ b/services/gastown/test/unit/town-scm.test.ts @@ -62,7 +62,11 @@ 
describe('resolveGitHubToken priority chain', () => { townId: 'town-1', getTownConfig: () => Promise.resolve(cfg), }); - expect(result).toEqual({ ok: true, token: FRESH_INSTALLATION_TOKEN, source: 'town platform integration' }); + expect(result).toEqual({ + ok: true, + token: FRESH_INSTALLATION_TOKEN, + source: 'town platform integration', + }); }); it('falls back to stored github_token when no integration is configured', async () => { @@ -72,7 +76,11 @@ describe('resolveGitHubToken priority chain', () => { townId: 'town-1', getTownConfig: () => Promise.resolve(cfg), }); - expect(result).toEqual({ ok: true, token: STORED_GITHUB_TOKEN, source: 'town.git_auth.github_token' }); + expect(result).toEqual({ + ok: true, + token: STORED_GITHUB_TOKEN, + source: 'town.git_auth.github_token', + }); }); it('falls back to stored github_token when integration lookup throws', async () => { @@ -85,7 +93,11 @@ describe('resolveGitHubToken priority chain', () => { townId: 'town-1', getTownConfig: () => Promise.resolve(cfg), }); - expect(result).toEqual({ ok: true, token: STORED_GITHUB_TOKEN, source: 'town.git_auth.github_token' }); + expect(result).toEqual({ + ok: true, + token: STORED_GITHUB_TOKEN, + source: 'town.git_auth.github_token', + }); }); it('uses the rig-level platformIntegrationId when town config does not carry one', async () => { @@ -96,7 +108,11 @@ describe('resolveGitHubToken priority chain', () => { getTownConfig: () => Promise.resolve(cfg), platformIntegrationId: INTEGRATION_ID, }); - expect(result).toEqual({ ok: true, token: FRESH_INSTALLATION_TOKEN, source: 'rig platform integration' }); + expect(result).toEqual({ + ok: true, + token: FRESH_INSTALLATION_TOKEN, + source: 'rig platform integration', + }); }); it('returns ok:false with tried chain when nothing is configured', async () => { @@ -108,7 +124,11 @@ describe('resolveGitHubToken priority chain', () => { }); expect(result).toEqual({ ok: false, - tried: ['town.github_cli_pat', 'platform integration (none 
configured)', 'town.git_auth.github_token'], + tried: [ + 'town.github_cli_pat', + 'platform integration (none configured)', + 'town.git_auth.github_token', + ], }); }); @@ -122,6 +142,10 @@ describe('resolveGitHubToken priority chain', () => { townId: 'town-1', getTownConfig: () => Promise.resolve(cfg), }); - expect(result).toEqual({ ok: true, token: STORED_GITHUB_TOKEN, source: 'town.git_auth.github_token' }); + expect(result).toEqual({ + ok: true, + token: STORED_GITHUB_TOKEN, + source: 'town.git_auth.github_token', + }); }); }); From 3f19681b9737eafa6bded459989b6c4803641b49 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Mon, 11 May 2026 18:07:40 -0500 Subject: [PATCH 12/14] chore(gastown): Lower trace sample rate --- services/gastown/wrangler.jsonc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/gastown/wrangler.jsonc b/services/gastown/wrangler.jsonc index ecf79d7894..8a92afc46b 100644 --- a/services/gastown/wrangler.jsonc +++ b/services/gastown/wrangler.jsonc @@ -19,7 +19,7 @@ "traces": { "enabled": true, "persist": true, - "head_sampling_rate": 1, + "head_sampling_rate": 0.1, }, }, "upload_source_maps": true, From cbbd120cf143c4048cd23b01b485dc9e29ed7cc8 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Tue, 12 May 2026 12:05:59 -0500 Subject: [PATCH 13/14] fix(gastown): close hook-leak race + plumb GH_TOKEN into mayor prewarm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three independent gastown reliability fixes that came out of debugging a production town stuck in a dispatch loop with the mayor reporting "GH_TOKEN is NOT set" while the bead it was working on cycled in_progress -> open every few minutes: 1. dispatchAgent's catch path no longer transitions the agent to 'idle'. The exception path can fire after the container has already accepted /agents/start (e.g. /refresh-token raced a token rotation and threw late), in which case the SDK session is alive and heartbeating. 
Marking the agent idle was tripping reconcileAgents' "idle agent hooked to live bead" rule, which tore the hook out from under the running session and made every subsequent gt_request_changes / gt_triage_resolve call fail with "is not hooked to a bead" until the session exited. Now we leave the agent as 'working' and let the heartbeat-staleness check do the cleanup if the agent really did die. Also surface getLastStartError() in the dispatch_failed analytics event so future debugging doesn't have to guess at the root cause. 2. reconcileAgents' "idle agent hooked to live bead" rule now skips agents whose last_activity_at is fresh (<90s). That 90s window matches reconcileAgents' own stale-heartbeat threshold, so a truly dead agent still gets reaped on a later tick — but a live SDK session caught mid-action by a phantom dispatch_failed keeps its hook. 3. review-queue's unhooked-refinery gt_done fallback no longer silently marks the MR as 'merged' when there's no pr_url. The same hook-leak race that fires while the refinery is mid-review would silently land a PR the refinery was actively trying to reject (after gt_request_changes hit "not hooked"). The fallback now fails the review (returning the source bead to 'open') and raises an escalation so a human can decide whether the refinery meant to approve or request changes. Also fixes the "mayor has no GH_TOKEN" symptom by plumbing GitHub auth into the prewarm path: - getMayorPrewarmContext now resolves the GitHub token via the standard chain (github_cli_pat / platform integration / stored fallback) and returns it alongside the kilocode token. - buildPrewarmEnv (container) sets GH_TOKEN, GIT_TOKEN, GITHUB_TOKEN, and GITHUB_CLI_PAT on the prewarmed SDK's process.env so the mayor's bash tool sees credentials from boot. Previously these were only set by buildAgentEnv on /agents/start, which never runs while ensureMayor short-circuits on a warm session. 
- PERSIST_ENV_KEYS now includes GH_TOKEN/GIT_TOKEN/GITHUB_TOKEN/GITHUB_CLI_PAT so a token rotation arriving via /agents/start cache-hit reaches process.env. Includes prior staged work: hasActiveWork refactor (short-circuit SQL reads via thunks) and healthCheck alarm-cadence selection so idle towns don't all wake up on the 5s cadence after a deploy. Verification: pnpm --filter cloudflare-gastown {typecheck,lint,test} all pass (251 tests). Behavior verified by code inspection against the production AE event timeline for town 0532b9ef-04c8-488a-ac34-6eb1a3d47228 — every dispatch_failed in that town hit the catch path, every cycling unhook hit the idle+hooked+live-bead rule, and every "GH_TOKEN is NOT set" mayor session was a prewarm cache-hit. --- .../gastown/container/src/process-manager.ts | 47 +++++- services/gastown/src/dos/Town.do.ts | 34 +++++- services/gastown/src/dos/town/reconciler.ts | 22 ++- services/gastown/src/dos/town/review-queue.ts | 50 +++++- services/gastown/src/dos/town/scheduling.ts | 150 +++++++++++------- 5 files changed, 228 insertions(+), 75 deletions(-) diff --git a/services/gastown/container/src/process-manager.ts b/services/gastown/container/src/process-manager.ts index 9914776f3e..7996ff07db 100644 --- a/services/gastown/container/src/process-manager.ts +++ b/services/gastown/container/src/process-manager.ts @@ -603,6 +603,15 @@ const PERSIST_ENV_KEYS = new Set([ 'KILO_CONFIG_CONTENT', 'OPENCODE_CONFIG_CONTENT', 'GASTOWN_ORGANIZATION_ID', + // Git auth: token rotations need to land in process.env so that + // bash subprocesses spawned by the SDK's bash tool see the fresh + // value. Without these here, a cache-hit on /agents/start returns + // the prewarmed SDK with whatever GH_TOKEN/GIT_TOKEN was set at + // prewarm time, even after the worker rotated them.
+ 'GH_TOKEN', + 'GIT_TOKEN', + 'GITHUB_TOKEN', + 'GITHUB_CLI_PAT', ]); async function ensureSDKServer( @@ -2632,6 +2641,8 @@ type MayorPrewarmContext = { smallModel?: string; kilocodeToken?: string; organizationId?: string | null; + githubToken?: string; + githubCliPat?: string; }; // Mirrors the response contract documented at @@ -2645,6 +2656,8 @@ const MayorPrewarmResponse = z smallModel: z.string().optional(), kilocodeToken: z.string().optional(), organizationId: z.string().nullable().optional(), + githubToken: z.string().optional(), + githubCliPat: z.string().optional(), }) .passthrough(); @@ -2665,9 +2678,18 @@ async function fetchMayorPrewarmContext( const json: unknown = await resp.json(); const parsed = MayorPrewarmResponse.safeParse(json); if (!parsed.success) return null; - const { agentId, model, smallModel, kilocodeToken, organizationId } = parsed.data; + const { agentId, model, smallModel, kilocodeToken, organizationId, githubToken, githubCliPat } = + parsed.data; if (!agentId) return null; - return { agentId, model, smallModel, kilocodeToken, organizationId }; + return { + agentId, + model, + smallModel, + kilocodeToken, + organizationId, + githubToken, + githubCliPat, + }; } catch (err) { console.warn(`${MANAGER_LOG} fetchMayorPrewarmContext failed:`, err); return null; @@ -2720,6 +2742,27 @@ function buildPrewarmEnv(ctx: MayorPrewarmContext, townId: string): Record { smallModel?: string; kilocodeToken?: string; organizationId?: string | null; + githubToken?: string; + githubCliPat?: string; } | null> { const mayor = agents.listAgents(this.sql, { role: 'mayor' })[0] ?? null; if (!mayor) return null; @@ -2718,6 +2720,19 @@ export class TownDO extends DurableObject { } const townConfig = await this.getTownConfig(); + + // Resolve the GitHub token using the same chain as startAgentInContainer + // so the prewarmed mayor SDK boots with `gh` CLI auth (`GH_TOKEN`) + // already populated. 
Without this, the mayor's bash tool sees an + // empty environment for git/gh until the SDK is torn down and the + // /agents/start path's buildAgentEnv runs — which never happens + // while ensureMayor short-circuits on a warm session. + const githubToken = await scm.resolveGitHubTokenString({ + env: this.env, + townId: this.townId, + getTownConfig: () => Promise.resolve(townConfig), + }); + // _ensureMayor dispatches the mayor without a per-rig override // (Town.do.ts:2766-2790). Match that resolution here so the prewarm // KILO_CONFIG_CONTENT is byte-identical to what /agents/start will @@ -2728,6 +2743,8 @@ export class TownDO extends DurableObject { smallModel: config.resolveSmallModel(townConfig), kilocodeToken, organizationId: townConfig.organization_id ?? null, + ...(githubToken ? { githubToken } : {}), + ...(townConfig.github_cli_pat ? { githubCliPat: townConfig.github_cli_pat } : {}), }; } @@ -4856,7 +4873,9 @@ export class TownDO extends DurableObject { /** * Health check: verify the alarm is set and return basic town status. * Called by the GastownUserDO watchdog alarm to ensure each town's - * alarm loop is firing. Re-arms the alarm if it's missing. + * alarm loop is firing. Re-arms the alarm if it's missing, picking the + * cadence based on `hasActiveWork` so idle towns don't all wake up + * on the fast 5s cadence after a deploy. */ async healthCheck(): Promise<{ townId: string; @@ -4870,10 +4889,17 @@ export class TownDO extends DurableObject { const currentAlarm = await this.ctx.storage.getAlarm(); const alarmSet = currentAlarm !== null && currentAlarm > Date.now(); - // Re-arm if missing — this is the whole point of the watchdog + // Re-arm if missing — this is the whole point of the watchdog. Pick + // the cadence to match observed activity: active towns recover fast, + // idle towns don't pay the cost of a 5s wake-up storm across the fleet. 
if (!alarmSet) { - console.warn(`${TOWN_LOG} healthCheck: alarm not set for town=${townId}, re-arming`); - await this.ctx.storage.setAlarm(Date.now() + ACTIVE_ALARM_INTERVAL_MS); + const interval = scheduling.hasActiveWork(this.sql) + ? ACTIVE_ALARM_INTERVAL_MS + : IDLE_ALARM_INTERVAL_MS; + console.warn( + `${TOWN_LOG} healthCheck: alarm not set for town=${townId}, re-arming with ${interval}ms` + ); + await this.ctx.storage.setAlarm(Date.now() + interval); } const activeAgents = Number( diff --git a/services/gastown/src/dos/town/reconciler.ts b/services/gastown/src/dos/town/reconciler.ts index c7715a05ee..a1a9c6dbb9 100644 --- a/services/gastown/src/dos/town/reconciler.ts +++ b/services/gastown/src/dos/town/reconciler.ts @@ -883,10 +883,24 @@ export function reconcileAgents(sql: SqlStorage, opts?: { draining?: boolean }): agent_id: agent.bead_id, }); } else if (hookedStatus === 'in_progress' || hookedStatus === 'open') { - // Idle agent hooked to a live bead — the dispatch started but the - // agent died (container failed to start, OOM, etc.) and agentCompleted - // set it to idle without unhooking. Reset the bead to open and unhook - // so the scheduling rules can re-dispatch. + // Idle agent hooked to a live bead — usually means the dispatch + // started but the agent died (container failed to start, OOM, + // etc.) and agentCompleted set it to idle without unhooking. + // + // Guard against the phantom-failed-dispatch case: dispatchAgent + // can return started=false even when the container actually + // accepted the agent (e.g. /refresh-token raced a token rotation), + // and the SDK session keeps heartbeating happily. Tearing the + // hook out from under a live session causes tools that need a + // hooked bead (gt_request_changes, gt_triage_resolve) to fail + // with "is not hooked to a bead" until the session exits. + // + // If we've seen a heartbeat in the last 90s, treat the agent as + // alive and leave the hook in place. 
The 90s window matches + // reconcileAgents' stale-heartbeat threshold, so a truly dead + // agent still gets reaped by the heartbeat path on a later tick. + if (!staleMs(agent.last_activity_at, 90_000)) continue; + actions.push({ type: 'unhook_agent', agent_id: agent.bead_id, diff --git a/services/gastown/src/dos/town/review-queue.ts b/services/gastown/src/dos/town/review-queue.ts index 80fd1b6b1a..3246db07da 100644 --- a/services/gastown/src/dos/town/review-queue.ts +++ b/services/gastown/src/dos/town/review-queue.ts @@ -453,10 +453,10 @@ export function agentDone(sql: SqlStorage, agentId: string, input: AgentDoneInpu // The agent was unhooked by a recovery path between when the agent // finished work and when it called gt_done. // - // For refineries, this is critical: the refinery successfully merged - // but the hook was cleared by zombie detection. We MUST still complete - // the review — otherwise the source bead stays open forever. Find the - // most recent non-closed MR bead assigned to this agent and complete it. + // For refineries, this is critical: the refinery may have actually + // completed work (merged a PR, posted a review) but its hook was + // cleared by zombie detection. We need to make progress without + // landing PRs the refinery did NOT actually approve. if (agent.role === 'refinery') { const recentMrRows = [ ...query( @@ -479,6 +479,8 @@ export function agentDone(sql: SqlStorage, agentId: string, input: AgentDoneInpu `[review-queue] agentDone: unhooked refinery ${agentId} — recovering MR bead ${mrBeadId}` ); if (input.pr_url) { + // Refinery created/landed a PR. Trust the URL: storing it + // moves the MR to in_review where poll_pr decides the merge. 
const stored = setReviewPrUrl(sql, mrBeadId, input.pr_url); if (stored) { markReviewInReview(sql, mrBeadId); @@ -490,10 +492,46 @@ export function agentDone(sql: SqlStorage, agentId: string, input: AgentDoneInpu }); } } else { + // No pr_url and no hook: we can NOT prove the refinery + // actually merged. Previously we optimistically marked the + // MR as 'merged' — but the same race that cleared the hook + // (idle+hooked+live-bead reconciler rule firing on a phantom + // dispatch_failed) ALSO fires when the refinery is mid-review + // and decided to call gt_request_changes. In that case the + // refinery wanted rework, gt_request_changes failed with + // "not hooked", and the refinery's fallback gt_done() call + // would silently land the PR it was trying to reject. + // + // Fail the review instead: this returns the source bead to + // 'open' and surfaces the problem so the next dispatch can + // re-review properly. Also raise an escalation so a human + // sees what happened. + console.warn( + `[review-queue] agentDone: unhooked refinery ${agentId} called gt_done without pr_url ` + + `for MR ${mrBeadId} — failing review (cannot confirm merge)` + ); completeReviewWithResult(sql, { entry_id: mrBeadId, - status: 'merged', - message: input.summary ?? 'Merged by refinery agent (recovered from unhook)', + status: 'failed', + message: + input.summary ?? + 'Refinery called gt_done without a pr_url after losing its hook — cannot confirm merge', + }); + createBead(sql, { + type: 'escalation', + title: 'Refinery gt_done without pr_url after hook loss', + body: + `Refinery ${agentId} called gt_done with no pr_url while unhooked from MR ${mrBeadId}. ` + + `The hook was likely cleared by the reconciler after a phantom dispatch failure ` + + `while the SDK session was still alive. The MR has been failed (not merged) so a human ` + + `can verify whether the refinery's intended outcome was 'approve and merge' or ` + + `'request changes'. Summary from refinery: ${input.summary ?? 
'(none)'}`, + priority: 'high', + metadata: { + source_bead_id: mrBeadId, + source_agent_id: agentId, + kind: 'refinery_unhooked_done', + }, }); } return; diff --git a/services/gastown/src/dos/town/scheduling.ts b/services/gastown/src/dos/town/scheduling.ts index 22eda157ee..2f9e47ff60 100644 --- a/services/gastown/src/dos/town/scheduling.ts +++ b/services/gastown/src/dos/town/scheduling.ts @@ -185,6 +185,7 @@ export async function dispatchAgent( // If the agent truly didn't start: reconcileAgents catches it // after 90s of missing heartbeats and transitions to 'idle'. // If the agent actually started: heartbeats keep it alive. (#1358) + const startError = dispatch.getLastStartError(); ctx.emitEvent({ event: 'agent.dispatch_failed', townId: ctx.townId, @@ -192,7 +193,7 @@ export async function dispatchAgent( agentId: agent.id, beadId: bead.bead_id, role: agent.role, - reason: 'container returned false', + reason: startError ?? 'container returned false', }); } return started; @@ -201,21 +202,18 @@ export async function dispatchAgent( Sentry.captureException(err, { extra: { agentId: agent.id, beadId: bead.bead_id }, }); - try { - query( - ctx.sql, - /* sql */ ` - UPDATE ${agent_metadata} - SET ${agent_metadata.columns.status} = 'idle', - ${agent_metadata.columns.last_activity_at} = ? - WHERE ${agent_metadata.bead_id} = ? - `, - [now(), agent.id] - ); - // Don't roll back bead to open — same timeout race rationale - } catch (rollbackErr) { - console.error(`${LOG} dispatchAgent: rollback also failed:`, rollbackErr); - } + // Do NOT transition the agent to 'idle' here. The container may + // already have accepted /agents/start (e.g. /refresh-token failed + // late, after the agent process spawned), in which case the SDK + // session is alive and heartbeating. 
Marking it idle would trip + // reconcileAgents Rule 3 (idle agent + hooked + live bead → + // unhook + reset bead), tearing the hook out from under the + // running session and causing tools like gt_request_changes to + // fail with "is not hooked to a bead" (#1358 follow-up). + // + // Instead, leave the agent as 'working'. If the container truly + // didn't start, reconcileAgents catches it after 90s of missing + // heartbeats and transitions to 'idle' through the normal path. ctx.emitEvent({ event: 'agent.dispatch_failed', townId: ctx.townId, @@ -264,6 +262,12 @@ export function dispatchUnblockedBeads(ctx: SchedulingContext, closedBeadId: str /** * Returns true if the town has work that requires the fast (5s) alarm * interval. Used to decide between active and idle alarm cadence. + * + * Each signal is wrapped in a thunk so `||` short-circuits at the SQL + * layer: as soon as one signal returns true we skip the remaining + * queries. On a hot town with working agents this avoids 4 extra reads + * per check; on a cold idle town it costs the full 5 reads (same as + * before). */ export function hasActiveWork(sql: SqlStorage): boolean { // Stalled agents older than 30min no longer count as active work: they @@ -271,53 +275,81 @@ export function hasActiveWork(sql: SqlStorage): boolean { // returning running/unknown). Keeping them in the active set would pin // the alarm at its 5s fast cadence indefinitely. The stalled->idle // auto-transition in reconcileAgents cleans them up after 2h 30min. 
- const activeAgentRows = [ - ...query( - sql, - /* sql */ `SELECT COUNT(*) as cnt FROM ${agent_metadata} - WHERE ${agent_metadata.status} = 'working' - OR (${agent_metadata.status} = 'stalled' - AND ${agent_metadata.last_activity_at} > strftime('%Y-%m-%dT%H:%M:%fZ', 'now', '-30 minutes'))`, - [] - ), - ]; - const pendingBeadRows = [ - ...query( - sql, - /* sql */ `SELECT COUNT(*) as cnt FROM ${agent_metadata} WHERE ${agent_metadata.status} = 'idle' AND ${agent_metadata.current_hook_bead_id} IS NOT NULL`, - [] - ), - ]; - const pendingReviewRows = [ - ...query( - sql, - /* sql */ `SELECT COUNT(*) as cnt FROM ${beads} WHERE ${beads.type} = 'merge_request' AND ${beads.status} IN ('open', 'in_progress')`, - [] - ), - ]; - const pendingTriageRows = [ - ...query( - sql, - /* sql */ `SELECT COUNT(*) as cnt FROM ${beads} WHERE ${beads.type} = 'issue' AND ${beads.labels} LIKE ? AND ${beads.status} = 'open'`, - [patrol.TRIAGE_LABEL_LIKE] - ), - ]; + const hasActiveAgents = (): boolean => + countOf([ + ...query( + sql, + /* sql */ `SELECT COUNT(*) as cnt FROM ${agent_metadata} + WHERE ${agent_metadata.status} = 'working' + OR (${agent_metadata.status} = 'stalled' + AND ${agent_metadata.last_activity_at} > strftime('%Y-%m-%dT%H:%M:%fZ', 'now', '-30 minutes'))`, + [] + ), + ]) > 0; + + // Idle agents that already hold a hook — the reconciler should dispatch + // them on the next tick. 
+ const hasHookedIdleAgents = (): boolean => + countOf([ + ...query( + sql, + /* sql */ `SELECT COUNT(*) as cnt FROM ${agent_metadata} + WHERE ${agent_metadata.status} = 'idle' + AND ${agent_metadata.current_hook_bead_id} IS NOT NULL`, + [] + ), + ]) > 0; + + const hasOpenMergeRequests = (): boolean => + countOf([ + ...query( + sql, + /* sql */ `SELECT COUNT(*) as cnt FROM ${beads} + WHERE ${beads.type} = 'merge_request' + AND ${beads.status} IN ('open', 'in_progress')`, + [] + ), + ]) > 0; + + const hasOpenTriageBeads = (): boolean => + countOf([ + ...query( + sql, + /* sql */ `SELECT COUNT(*) as cnt FROM ${beads} + WHERE ${beads.type} = 'issue' + AND ${beads.labels} LIKE ? + AND ${beads.status} = 'open'`, + [patrol.TRIAGE_LABEL_LIKE] + ), + ]) > 0; + // Open issue beads with a rig (eligible for dispatch by reconcileBeads Rule 1) // but not yet assigned to any agent. Without this check, the alarm drops to // idle cadence after a container restart when agents lose their hooks and // beads revert to open+unassigned, delaying dispatch by up to 5 minutes. - const unassignedIssueRows = [ - ...query( - sql, - /* sql */ `SELECT COUNT(*) as cnt FROM ${beads} WHERE ${beads.type} = 'issue' AND ${beads.status} = 'open' AND ${beads.rig_id} IS NOT NULL AND ${beads.assignee_agent_bead_id} IS NULL`, - [] - ), - ]; + const hasUnassignedIssues = (): boolean => + countOf([ + ...query( + sql, + /* sql */ `SELECT COUNT(*) as cnt FROM ${beads} + WHERE ${beads.type} = 'issue' + AND ${beads.status} = 'open' + AND ${beads.rig_id} IS NOT NULL + AND ${beads.assignee_agent_bead_id} IS NULL`, + [] + ), + ]) > 0; + return ( - Number(activeAgentRows[0]?.cnt ?? 0) > 0 || - Number(pendingBeadRows[0]?.cnt ?? 0) > 0 || - Number(pendingReviewRows[0]?.cnt ?? 0) > 0 || - Number(pendingTriageRows[0]?.cnt ?? 0) > 0 || - Number(unassignedIssueRows[0]?.cnt ?? 
0) > 0 + hasActiveAgents() || + hasHookedIdleAgents() || + hasOpenMergeRequests() || + hasOpenTriageBeads() || + hasUnassignedIssues() ); } + +/** Read the `cnt` column off the first row of a `SELECT COUNT(*) as cnt` query. */ +function countOf(rows: ReadonlyArray<Record<string, unknown>>): number { + return Number(rows[0]?.cnt ?? 0); +} From e16b8b643597d32a8aa4da12d1d8d2821b5baf2e Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Tue, 12 May 2026 15:33:19 -0500 Subject: [PATCH 14/14] fix(gastown): address PR review feedback --- .../gastown/container/src/process-manager.ts | 31 +++++----- services/gastown/src/dos/town/actions.ts | 41 +++++++++++-- .../test/unit/pr-poll-thresholds.test.ts | 58 +++++++++++++++---- 3 files changed, 101 insertions(+), 29 deletions(-) diff --git a/services/gastown/container/src/process-manager.ts b/services/gastown/container/src/process-manager.ts index 7996ff07db..4d35c350c0 100644 --- a/services/gastown/container/src/process-manager.ts +++ b/services/gastown/container/src/process-manager.ts @@ -603,17 +603,27 @@ const PERSIST_ENV_KEYS = new Set([ 'KILO_CONFIG_CONTENT', 'OPENCODE_CONFIG_CONTENT', 'GASTOWN_ORGANIZATION_ID', - // Git auth: token rotations need to land in process.env so that - // bash subprocesses spawned by the SDK's bash tool see the fresh - // value. Without these here, a cache-hit on /agents/start returns - // the prewarmed SDK with whatever GH_TOKEN/GIT_TOKEN was set at - // prewarm time, even after the worker rotated them. 
+]); + +const CACHE_HIT_ENV_KEYS = new Set([ + ...PERSIST_ENV_KEYS, 'GH_TOKEN', 'GIT_TOKEN', 'GITHUB_TOKEN', 'GITHUB_CLI_PAT', ]); +function applyCacheHitEnv(env: Record<string, string>): void { + for (const key of CACHE_HIT_ENV_KEYS) { + const value = env[key]; + if (value) { + process.env[key] = value; + } else if (!PERSIST_ENV_KEYS.has(key)) { + delete process.env[key]; + } + } +} + async function ensureSDKServer( workdir: string, env: Record<string, string> @@ -629,10 +639,7 @@ async function ensureSDKServer( existing.server.close(); sdkInstances.delete(workdir); } else { - for (const key of PERSIST_ENV_KEYS) { - const value = env[key]; - if (value) process.env[key] = value; - } + applyCacheHitEnv(env); return { client: existing.client, port: parseInt(new URL(existing.server.url).port), @@ -665,10 +672,7 @@ async function ensureSDKServer( cached.server.close(); sdkInstances.delete(workdir); } else { - for (const key of PERSIST_ENV_KEYS) { - const value = env[key]; - if (value) process.env[key] = value; - } + applyCacheHitEnv(env); return { client: cached.client, port: parseInt(new URL(cached.server.url).port), @@ -2900,6 +2904,7 @@ async function bootHydrationImpl(LOG: string): Promise<void> { try { const resp = await fetch(`${apiUrl}/api/towns/${townId}/container-registry`, { headers: { Authorization: `Bearer ${token}` }, + signal: AbortSignal.timeout(10_000), }); if (!resp.ok) { console.warn(`${LOG} Failed to fetch registry: ${resp.status}`); diff --git a/services/gastown/src/dos/town/actions.ts b/services/gastown/src/dos/town/actions.ts index 02564a95f3..8a02c0e879 100644 --- a/services/gastown/src/dos/town/actions.ts +++ b/services/gastown/src/dos/town/actions.ts @@ -336,11 +336,15 @@ const PR_POLL_NON_TRANSIENT_THRESHOLD = 3; /** Minimum interval between PR polls per MR bead (ms) (#1632). */ export const PR_POLL_INTERVAL_MS = 60_000; // 1 minute +function providerLabel(provider: 'github' | 'gitlab'): string { + return provider === 'github' ? 
'GitHub' : 'GitLab'; +} + function failureMessageFor(error: PRStatusError): string { switch (error.kind) { case 'no_token': return ( - `No GitHub token resolved for this town. Tried (in order): ` + + `No ${providerLabel(error.provider)} token resolved for this town. Tried (in order): ` + error.resolutionChain.map(s => `\`${s}\``).join(', ') + `. Configure one of these in town or rig settings. ` + `Note: polecat agents use their own container credentials and ` + @@ -349,13 +353,17 @@ function failureMessageFor(error: PRStatusError): string { ); case 'http_error': if (error.status === 401) { - return `Town's GitHub token is invalid or expired (HTTP 401). Refresh the token in town settings.`; + return `Town's ${providerLabel(error.provider)} token is invalid or expired (HTTP 401). Refresh the token in town settings.`; } if (error.status === 403) { - return `Town's GitHub token lacks permission for this PR (HTTP 403). Ensure the token has \`pull-requests: read\` scope on the repo, or check for a secondary rate limit.`; + const scopeHint = + error.provider === 'github' + ? 'Ensure the token has `pull-requests: read` scope on the repo, or check for a secondary rate limit.' + : 'Ensure the token has permission to read merge requests in the project.'; + return `Town's ${providerLabel(error.provider)} token lacks permission for this PR (HTTP 403). ${scopeHint}`; } if (error.status === 404) { - return `PR not found (HTTP 404). Was the branch deleted before the PR could be polled, or is the PR URL wrong?`; + return `${error.provider === 'github' ? 'PR' : 'MR'} not found (HTTP 404). Was the branch deleted before it could be polled, or is the URL wrong?`; } return `${error.provider} API returned HTTP ${error.status} ${error.statusText}. ${error.transient ? 'Retrying.' 
: 'Not retryable.'}`; case 'invalid_response': @@ -390,6 +398,30 @@ function shouldCountAsTransient(error: PRStatusError): boolean { return error.kind === 'http_error' && error.transient; } +type PollCounterState = { + pollTransientCount: number; + pollNonTransientCount: number; + shouldFail: boolean; +}; + +function nextPollCounterState(error: PRStatusError, current: PollCounterState): PollCounterState { + if (shouldCountAsTransient(error)) { + const pollTransientCount = current.pollTransientCount + 1; + return { + pollTransientCount, + pollNonTransientCount: 0, + shouldFail: pollTransientCount >= PR_POLL_NULL_THRESHOLD, + }; + } + + const pollNonTransientCount = current.pollNonTransientCount + 1; + return { + pollTransientCount: 0, + pollNonTransientCount, + shouldFail: pollNonTransientCount >= PR_POLL_NON_TRANSIENT_THRESHOLD, + }; +} + function now(): string { return new Date().toISOString(); } @@ -1679,6 +1711,7 @@ function parsePrUrl(prUrl: string): { repo: string; prNumber: number } | null { export { hasExistingFeedbackBead as _hasExistingFeedbackBead, parsePrUrl as _parsePrUrl }; export { failureMessageFor as _failureMessageFor, + nextPollCounterState as _nextPollCounterState, shouldFailImmediately as _shouldFailImmediately, shouldCountAsTransient as _shouldCountAsTransient, }; diff --git a/services/gastown/test/unit/pr-poll-thresholds.test.ts b/services/gastown/test/unit/pr-poll-thresholds.test.ts index 678e33cadb..d522fc0b6c 100644 --- a/services/gastown/test/unit/pr-poll-thresholds.test.ts +++ b/services/gastown/test/unit/pr-poll-thresholds.test.ts @@ -1,6 +1,7 @@ import { describe, it, expect } from 'vitest'; import { _failureMessageFor, + _nextPollCounterState, _shouldFailImmediately, _shouldCountAsTransient, } from '../../src/dos/town/actions'; @@ -66,6 +67,26 @@ describe('failureMessageFor', () => { expect(msg).toContain('HTTP 404'); }); + it('produces GitLab-specific messages', () => { + const noTokenMessage = _failureMessageFor({ + kind: 'no_token', 
+ provider: 'gitlab', + resolutionChain: ['town.git_auth.gitlab_token'], + }); + expect(noTokenMessage).toContain('No GitLab token resolved'); + + const forbiddenMessage = _failureMessageFor({ + kind: 'http_error', + provider: 'gitlab', + status: 403, + statusText: 'Forbidden', + transient: false, + }); + expect(forbiddenMessage).toContain("Town's GitLab token lacks permission"); + expect(forbiddenMessage).toContain('merge requests'); + expect(forbiddenMessage).not.toContain('pull-requests: read'); + }); + it('produces generic HTTP message for other status codes', () => { const error: PRStatusError = { kind: 'http_error', @@ -315,16 +336,24 @@ describe('counter cross-contamination', () => { reason: 'schema_mismatch', }; - expect(_shouldFailImmediately(transientError)).toBe(false); - expect(_shouldCountAsTransient(transientError)).toBe(true); + let state = { pollTransientCount: 0, pollNonTransientCount: 0, shouldFail: false }; + for (let i = 0; i < 9; i++) { + state = _nextPollCounterState(transientError, state); + } + + expect(state).toEqual({ + pollTransientCount: 9, + pollNonTransientCount: 0, + shouldFail: false, + }); - expect(_shouldFailImmediately(nonTransientError)).toBe(false); - expect(_shouldCountAsTransient(nonTransientError)).toBe(false); + state = _nextPollCounterState(nonTransientError, state); - // After 9 transient errors, poll_transient_count=9, poll_non_transient_count=0. - // Then 1 non-transient error resets poll_transient_count=0, increments poll_non_transient_count=1. - // 1 < 3, so bead should NOT fail. This test documents the expected behavior; - // the actual counter logic lives in the SQL within applyAction. 
+ expect(state).toEqual({ + pollTransientCount: 0, + pollNonTransientCount: 1, + shouldFail: false, + }); }); it('3 consecutive non-transient errors should fail (non-transient counter reaches threshold)', () => { @@ -334,10 +363,15 @@ describe('counter cross-contamination', () => { reason: 'schema_mismatch', }; - expect(_shouldFailImmediately(nonTransientError)).toBe(false); - expect(_shouldCountAsTransient(nonTransientError)).toBe(false); + let state = { pollTransientCount: 0, pollNonTransientCount: 0, shouldFail: false }; + for (let i = 0; i < 3; i++) { + state = _nextPollCounterState(nonTransientError, state); + } - // 3 consecutive non-transient errors: poll_non_transient_count reaches 3. - // 3 >= PR_POLL_NON_TRANSIENT_THRESHOLD (3), so bead SHOULD fail. + expect(state).toEqual({ + pollTransientCount: 0, + pollNonTransientCount: 3, + shouldFail: true, + }); }); });