diff --git a/services/gastown/container/src/agent-runner.ts b/services/gastown/container/src/agent-runner.ts index d31511a68c..dfbf868495 100644 --- a/services/gastown/container/src/agent-runner.ts +++ b/services/gastown/container/src/agent-runner.ts @@ -413,6 +413,19 @@ async function createMayorWorkspace(rigId: string): Promise { return createLightweightWorkspace('mayor', rigId); } +/** + * Ensure the mayor workdir exists on disk for a given town, creating + * a lightweight git-initialized workspace if needed. + * + * Used by `prewarmMayorSDK`, which runs before `runAgent` and so cannot + * rely on `createMayorWorkspace` having been called yet — without this, + * `ensureSDKServer` would throw `ENOENT` from `process.chdir(workdir)` + * and the prewarm benefit would never materialize on cold containers. + */ +export async function ensureMayorWorkspaceForTown(townId: string): Promise { + return createMayorWorkspace(`mayor-${townId}`); +} + /** * Write the mayor's system prompt to AGENTS.md in the workspace. * diff --git a/services/gastown/container/src/control-server.ts b/services/gastown/container/src/control-server.ts index eddf78c026..761feefbaa 100644 --- a/services/gastown/container/src/control-server.ts +++ b/services/gastown/container/src/control-server.ts @@ -18,6 +18,7 @@ import { registerEventSink, refreshTokenForAllAgents, listAgents, + awaitHydration, } from './process-manager'; import { log } from './logger'; import { startHeartbeat, stopHeartbeat, notifyContainerReady } from './heartbeat'; @@ -262,7 +263,19 @@ app.post('/refresh-token', async c => { if (!body || typeof body !== 'object' || !('token' in body) || typeof body.token !== 'string') { return c.json({ error: 'Missing or invalid token field' }, 400); } - process.env.GASTOWN_CONTAINER_TOKEN = body.token; + // Capture the new token into a local so it survives the await below. + const newToken = body.token; + + // Wait for boot hydration to release the global sdkServerLock before + // we mutate process.env or serialise N agent restarts through it. + // Without this gate, a mid-hydration token refresh can cause + // buildPrewarmEnv to pick up a different token than the one hydration + // captured locally — matching the PATCH /agents/:id/model handler + // which also gates first. + await awaitHydration(); + + // Now safe to assign: hydration is done, no concurrent env readers. + process.env.GASTOWN_CONTAINER_TOKEN = newToken; const activeAgents = listAgents().filter(a => a.status === 'running' || a.status === 'starting'); log.info('refresh_token.received', { @@ -312,6 +325,15 @@ app.post('/agents/start', async c => { return c.json({ error: 'Invalid request body', issues: parsed.error.issues }, 400); } + // Wait for boot hydration to release the global sdkServerLock. The + // control server starts accepting requests immediately at boot, before + // bootHydration finishes resuming registry agents and prewarming the + // mayor — without this gate, fresh dispatches queue behind every + // serialised SDK spawn and the DO-side AbortSignal.timeout(60s) fires + // before they ever get the lock, surfacing as the + // "startAgentInContainer EXCEPTION TimeoutError" pattern. + await awaitHydration(); + // Persist the organization ID as a standalone env var so it survives // config rebuilds (e.g. model hot-swap). The env var is the primary // source of truth; KILO_CONFIG_CONTENT extraction is the fallback. @@ -386,6 +408,15 @@ app.patch('/agents/:agentId/model', async c => { return c.json({ error: 'Invalid request body', issues: parsed.error.issues }, 400); } + // Model hot-swap restarts the SDK server (see updateAgentModel) and + // contends for the same global sdkServerLock that boot hydration is + // holding. Wait for hydration to drain BEFORE the env mutations + // below: concurrent PATCH requests landing during hydration would + // otherwise race on process.env writes before any of them holds the + // SDK lock, and the env visible to the eventual `kilo serve` spawn + // would be non-deterministic. + await awaitHydration(); + // Update org billing context from the request body if provided. if (parsed.data.organizationId) { process.env.GASTOWN_ORGANIZATION_ID = parsed.data.organizationId; diff --git a/services/gastown/container/src/process-manager.test.ts b/services/gastown/container/src/process-manager.test.ts index d2c8906c2a..1afd68112c 100644 --- a/services/gastown/container/src/process-manager.test.ts +++ b/services/gastown/container/src/process-manager.test.ts @@ -5,11 +5,19 @@ import { describe, it, expect, vi } from 'vitest'; vi.mock('@kilocode/sdk', () => ({ createKilo: vi.fn(), })); +// Mock workspace helpers to return a path that actually exists on the +// test runner so ensureSDKServer's process.chdir doesn't ENOENT. +const TEST_WORKSPACE = process.cwd(); vi.mock('./agent-runner', () => ({ runAgent: vi.fn(), - buildKiloConfigContent: vi.fn(), + buildKiloConfigContent: vi.fn( + (kilocodeToken: string, model: string, smallModel: string, organizationId?: string) => + JSON.stringify({ kilocodeToken, model, smallModel, organizationId }) + ), resolveGitCredentials: vi.fn(), writeMayorSystemPromptToAgentsMd: vi.fn(), + ensureMayorWorkspaceForTown: vi.fn(async (_townId: string) => TEST_WORKSPACE), + mayorWorkdirForTown: vi.fn((_townId: string) => TEST_WORKSPACE), })); vi.mock('./control-server', () => ({ getCurrentTownConfig: vi.fn(() => ({})), @@ -24,7 +32,8 @@ vi.mock('./token-refresh', () => ({ refreshTokenIfNearExpiry: vi.fn(), })); -const { applyModelToSession, withStartAgentLock } = await import('./process-manager'); +const { applyModelToSession, withStartAgentLock, awaitHydration, bootHydration } = + await import('./process-manager'); type PromptCall = { path: { id: string }; @@ -178,3 +187,155 @@ describe('withStartAgentLock', () => { expect(result).toBe('ok'); }); }); + +describe('awaitHydration', () => { + it('resolves immediately before any bootHydration call', async () => { + // Module-init state must not block /agents/start in test/dev contexts + // where bootHydration never runs. + let resolved = false; + void awaitHydration().then(() => { + resolved = true; + }); + await new Promise(r => setTimeout(r, 0)); + expect(resolved).toBe(true); + }); + + it('prewarms mayor SDK with env that mirrors buildAgentEnv (mayor tools require GASTOWN_AGENT_ROLE/AGENT_ID/TOWN_ID)', async () => { + // Without these env vars in the snapshot kilo serve takes at spawn, + // GastownPlugin (plugin/index.ts) treats the prewarmed mayor as a + // rig agent (or fails the createMayorClientFromEnv guard) and the + // server boots with NO mayor tools. ensureSDKServer's cache hit on + // the next /agents/start hands back that defective server. + const { createKilo } = (await import('@kilocode/sdk')) as unknown as { + createKilo: ReturnType; + }; + + const prev = { + apiUrl: process.env.GASTOWN_API_URL, + townId: process.env.GASTOWN_TOWN_ID, + token: process.env.GASTOWN_CONTAINER_TOKEN, + }; + process.env.GASTOWN_API_URL = 'http://test.invalid'; + process.env.GASTOWN_TOWN_ID = 'town-prewarm'; + process.env.GASTOWN_CONTAINER_TOKEN = 'tok-prewarm'; + + let capturedEnv: Record | null = null; + createKilo.mockImplementationOnce(() => { + // Snapshot the keys plugin/index.ts and plugin/client.ts read. + capturedEnv = { + GASTOWN_AGENT_ID: process.env.GASTOWN_AGENT_ID, + GASTOWN_AGENT_ROLE: process.env.GASTOWN_AGENT_ROLE, + GASTOWN_TOWN_ID: process.env.GASTOWN_TOWN_ID, + GASTOWN_API_URL: process.env.GASTOWN_API_URL, + GASTOWN_CONTAINER_TOKEN: process.env.GASTOWN_CONTAINER_TOKEN, + KILO_CONFIG_CONTENT: process.env.KILO_CONFIG_CONTENT, + }; + return Promise.resolve({ + client: {} as unknown, + server: { url: 'http://127.0.0.1:9999/', close: () => {} }, + }); + }); + + const originalFetch = globalThis.fetch; + globalThis.fetch = vi.fn(async (input: Parameters[0]) => { + const url = typeof input === 'string' ? input : input instanceof URL ? input.href : input.url; + if (url.includes('/container-registry')) { + return new Response(JSON.stringify({ data: [] }), { status: 200 }); + } + if (url.includes('/mayor-id')) { + return new Response( + JSON.stringify({ + success: true, + agentId: 'mayor-agent-1', + model: 'anthropic/claude-sonnet-4.6', + smallModel: 'anthropic/claude-haiku-4.5', + kilocodeToken: 'kc-tok', + organizationId: null, + }), + { status: 200 } + ); + } + // db-snapshot etc: 404 -> fresh start + return new Response('not found', { status: 404 }); + }) as unknown as typeof fetch; + + try { + await bootHydration(); + const env = capturedEnv as Record | null; + expect(env).not.toBeNull(); + expect(env).toMatchObject({ + GASTOWN_AGENT_ID: 'mayor-agent-1', + GASTOWN_AGENT_ROLE: 'mayor', + GASTOWN_TOWN_ID: 'town-prewarm', + GASTOWN_CONTAINER_TOKEN: 'tok-prewarm', + }); + expect(env?.KILO_CONFIG_CONTENT).toBeTruthy(); + } finally { + globalThis.fetch = originalFetch; + if (prev.apiUrl !== undefined) process.env.GASTOWN_API_URL = prev.apiUrl; + else delete process.env.GASTOWN_API_URL; + if (prev.townId !== undefined) process.env.GASTOWN_TOWN_ID = prev.townId; + else delete process.env.GASTOWN_TOWN_ID; + if (prev.token !== undefined) process.env.GASTOWN_CONTAINER_TOKEN = prev.token; + else delete process.env.GASTOWN_CONTAINER_TOKEN; + } + }); + + it('blocks awaiters while bootHydration is in flight and releases them when it returns', async () => { + // Drive bootHydration into its registry-fetch path with a fetch + // stub that we can hold open from the test, so we can observe a + // real "in flight" window for the gate. + const prev = { + apiUrl: process.env.GASTOWN_API_URL, + townId: process.env.GASTOWN_TOWN_ID, + token: process.env.GASTOWN_CONTAINER_TOKEN, + }; + process.env.GASTOWN_API_URL = 'http://test.invalid'; + process.env.GASTOWN_TOWN_ID = 'town-test'; + process.env.GASTOWN_CONTAINER_TOKEN = 'tok-test'; + + // Use a single barrier the fetch stub awaits so every call (registry + // fetch + prewarm endpoints) holds the gate until we release it, + // and each call gets its own Response (avoids "body already read"). + let releaseFetch!: () => void; + const fetchBarrier = new Promise(resolve => { + releaseFetch = resolve; + }); + const originalFetch = globalThis.fetch; + globalThis.fetch = vi.fn(async (input: Parameters[0]) => { + await fetchBarrier; + const url = typeof input === 'string' ? input : input instanceof URL ? input.href : input.url; + if (url.includes('/container-registry')) { + return new Response(JSON.stringify({ data: [] }), { status: 200 }); + } + return new Response(JSON.stringify({ success: true, agentId: null }), { status: 200 }); + }) as unknown as typeof fetch; + + try { + const hydrationPromise = bootHydration(); + let awaiterResolved = false; + void awaitHydration().then(() => { + awaiterResolved = true; + }); + + // Yield to let the registry fetch start. The gate is now held + // until the fetch resolves. + await new Promise(r => setTimeout(r, 10)); + expect(awaiterResolved).toBe(false); + + releaseFetch(); + await hydrationPromise; + // After bootHydration returns, the gate must release any awaiters. + await new Promise(r => setTimeout(r, 0)); + expect(awaiterResolved).toBe(true); + } finally { + globalThis.fetch = originalFetch; + if (prev.apiUrl !== undefined) process.env.GASTOWN_API_URL = prev.apiUrl; + else delete process.env.GASTOWN_API_URL; + if (prev.townId !== undefined) process.env.GASTOWN_TOWN_ID = prev.townId; + else delete process.env.GASTOWN_TOWN_ID; + if (prev.token !== undefined) process.env.GASTOWN_CONTAINER_TOKEN = prev.token; + else delete process.env.GASTOWN_CONTAINER_TOKEN; + } + }); +}); diff --git a/services/gastown/container/src/process-manager.ts b/services/gastown/container/src/process-manager.ts index cd626d9a04..4d35c350c0 100644 --- a/services/gastown/container/src/process-manager.ts +++ b/services/gastown/container/src/process-manager.ts @@ -11,7 +11,11 @@ import { z } from 'zod'; import * as fs from 'node:fs/promises'; import type { ManagedAgent, StartAgentRequest } from './types'; import { reportAgentCompleted, reportMayorWaiting } from './completion-reporter'; -import { buildKiloConfigContent, mayorWorkdirForTown } from './agent-runner'; +import { + buildKiloConfigContent, + ensureMayorWorkspaceForTown, + mayorWorkdirForTown, +} from './agent-runner'; import { getCurrentTownConfig, getLastAppliedEnvVarKeys, @@ -66,6 +70,19 @@ export function isDraining(): boolean { return _draining; } +// Resolved when bootHydration() returns. /agents/start and /refresh-token +// must await this before contending for the global sdkServerLock — without +// this gate, fresh dispatches arriving during boot queue behind every +// in-flight registry agent + the mayor prewarm and the DO-side 60s +// AbortSignal.timeout fires before they ever get the lock. We resolve +// the promise immediately so non-hydrating containers (tests, dev) +// don't block; bootHydration replaces it on entry and resolves it on exit. +let _hydrationComplete: Promise = Promise.resolve(); + +export function awaitHydration(): Promise { + return _hydrationComplete; +} + // Mutex for ensureSDKServer — createKilo() reads process.cwd() and // process.env during startup, so concurrent calls with different workdirs // would corrupt each other's globals. This serializes server creation only; @@ -588,6 +605,25 @@ const PERSIST_ENV_KEYS = new Set([ 'GASTOWN_ORGANIZATION_ID', ]); +const CACHE_HIT_ENV_KEYS = new Set([ + ...PERSIST_ENV_KEYS, + 'GH_TOKEN', + 'GIT_TOKEN', + 'GITHUB_TOKEN', + 'GITHUB_CLI_PAT', +]); + +function applyCacheHitEnv(env: Record): void { + for (const key of CACHE_HIT_ENV_KEYS) { + const value = env[key]; + if (value) { + process.env[key] = value; + } else if (!PERSIST_ENV_KEYS.has(key)) { + delete process.env[key]; + } + } +} + async function ensureSDKServer( workdir: string, env: Record @@ -603,10 +639,7 @@ async function ensureSDKServer( existing.server.close(); sdkInstances.delete(workdir); } else { - for (const key of PERSIST_ENV_KEYS) { - const value = env[key]; - if (value) process.env[key] = value; - } + applyCacheHitEnv(env); return { client: existing.client, port: parseInt(new URL(existing.server.url).port), @@ -639,10 +672,7 @@ async function ensureSDKServer( cached.server.close(); sdkInstances.delete(workdir); } else { - for (const key of PERSIST_ENV_KEYS) { - const value = env[key]; - if (value) process.env[key] = value; - } + applyCacheHitEnv(env); return { client: cached.client, port: parseInt(new URL(cached.server.url).port), @@ -2609,47 +2639,88 @@ function postEventToWorker(event: string, data: Record): void { }); } -async function fetchMayorAgentId( +type MayorPrewarmContext = { + agentId: string; + model?: string; + smallModel?: string; + kilocodeToken?: string; + organizationId?: string | null; + githubToken?: string; + githubCliPat?: string; +}; + +// Mirrors the response contract documented at +// `/api/towns/:townId/mayor-id` in gastown.worker.ts. agentId is nullable +// because the worker returns `{ agentId: null }` when no mayor exists. +const MayorPrewarmResponse = z + .object({ + success: z.boolean().optional(), + agentId: z.string().nullable().optional(), + model: z.string().optional(), + smallModel: z.string().optional(), + kilocodeToken: z.string().optional(), + organizationId: z.string().nullable().optional(), + githubToken: z.string().optional(), + githubCliPat: z.string().optional(), + }) + .passthrough(); + +async function fetchMayorPrewarmContext( townId: string, apiUrl: string, token: string -): Promise { +): Promise { try { const resp = await fetch(`${apiUrl}/api/towns/${townId}/mayor-id`, { headers: { Authorization: `Bearer ${token}` }, signal: AbortSignal.timeout(10_000), }); if (!resp.ok) { - console.log(`${MANAGER_LOG} fetchMayorAgentId: ${resp.status} for town ${townId}`); + console.log(`${MANAGER_LOG} fetchMayorPrewarmContext: ${resp.status} for town ${townId}`); return null; } const json: unknown = await resp.json(); - if ( - typeof json === 'object' && - json !== null && - 'agentId' in json && - typeof (json as { agentId: unknown }).agentId === 'string' - ) { - return (json as { agentId: string }).agentId; - } - return null; + const parsed = MayorPrewarmResponse.safeParse(json); + if (!parsed.success) return null; + const { agentId, model, smallModel, kilocodeToken, organizationId, githubToken, githubCliPat } = + parsed.data; + if (!agentId) return null; + return { + agentId, + model, + smallModel, + kilocodeToken, + organizationId, + githubToken, + githubCliPat, + }; } catch (err) { - console.warn(`${MANAGER_LOG} fetchMayorAgentId failed:`, err); + console.warn(`${MANAGER_LOG} fetchMayorPrewarmContext failed:`, err); return null; } } -function buildPrewarmEnv(mayorAgentId: string): Record { +function buildPrewarmEnv(ctx: MayorPrewarmContext, townId: string): Record | null { + // Must mirror the mayor-shaped subset of buildAgentEnv (agent-runner.ts): + // the kilo serve child snapshots process.env at spawn and loads + // GastownPlugin (plugin/index.ts), which gates mayor-tool registration + // on GASTOWN_AGENT_ROLE === 'mayor' and createMayorClientFromEnv() + // requires GASTOWN_AGENT_ID + GASTOWN_TOWN_ID. If we omit them, the + // prewarmed mayor boots with NO tools, and ensureSDKServer's cache + // hit on the next /agents/start hands back that defective server + // (KILO_CONFIG_CONTENT matches, so the eviction path doesn't fire). const env: Record = { - KILO_TEST_HOME: `/tmp/agent-home-${mayorAgentId}`, - XDG_DATA_HOME: `/tmp/agent-home-${mayorAgentId}/.local/share`, + GASTOWN_AGENT_ID: ctx.agentId, + GASTOWN_TOWN_ID: townId, + GASTOWN_AGENT_ROLE: 'mayor', + KILOCODE_FEATURE: 'gastown', + KILO_TEST_HOME: `/tmp/agent-home-${ctx.agentId}`, + XDG_DATA_HOME: `/tmp/agent-home-${ctx.agentId}/.local/share`, }; const keys = [ 'GASTOWN_API_URL', 'GASTOWN_CONTAINER_TOKEN', - 'GASTOWN_TOWN_ID', - 'KILOCODE_TOKEN', - 'GASTOWN_ORGANIZATION_ID', + 'GASTOWN_SESSION_TOKEN', 'KILO_API_URL', 'KILO_OPENROUTER_BASE', ]; @@ -2658,18 +2729,58 @@ function buildPrewarmEnv(mayorAgentId: string): Record { if (value) env[key] = value; } - const kilocodeToken = env.KILOCODE_TOKEN; - if (kilocodeToken) { - const organizationId = env.GASTOWN_ORGANIZATION_ID || undefined; - const configJson = buildKiloConfigContent( - kilocodeToken, - 'anthropic/claude-sonnet-4.6', - 'anthropic/claude-haiku-4.5', - organizationId - ); - env.KILO_CONFIG_CONTENT = configJson; - env.OPENCODE_CONFIG_CONTENT = configJson; - } + // Prefer the worker-supplied token/org so KILO_CONFIG_CONTENT matches + // what /agents/start will send. Fall back to process.env for back- + // compat with workers that haven't deployed the richer endpoint yet. + const kilocodeToken = ctx.kilocodeToken ?? process.env.KILOCODE_TOKEN; + if (!kilocodeToken) return null; + env.KILOCODE_TOKEN = kilocodeToken; + + // When the worker explicitly returned organizationId (including null + // for "this town has no org"), trust it. Only fall back to process.env + // when the field was omitted entirely (older worker version that + // didn't yet include the prewarm context). + const organizationId = + ctx.organizationId !== undefined + ? ctx.organizationId + : (process.env.GASTOWN_ORGANIZATION_ID ?? null); + if (organizationId) env.GASTOWN_ORGANIZATION_ID = organizationId; + + // Plumb GitHub auth into the prewarmed SDK env so `gh` CLI and `git` + // subprocesses spawned from the mayor's bash tool see credentials. + // Mirror buildAgentEnv (agent-runner.ts:180-188): GITHUB_CLI_PAT wins + // for `gh` (PRs/issues appear under the user's identity), else fall + // back to the integration-resolved GIT_TOKEN. + // + // Without this, ensureMayor's short-circuit path returns a prewarmed + // SDK whose process.env is missing GH_TOKEN entirely — `gh auth status` + // reports "not logged in" until the SDK is torn down and rebuilt. + if (ctx.githubToken) { + env.GIT_TOKEN = ctx.githubToken; + env.GITHUB_TOKEN = ctx.githubToken; + } + if (ctx.githubCliPat) { + env.GITHUB_CLI_PAT = ctx.githubCliPat; + } + const ghToken = ctx.githubCliPat ?? ctx.githubToken; + if (ghToken) { + env.GH_TOKEN = ghToken; + } + + // Without the worker-resolved model, skip prewarm: any guess we make + // here will almost certainly differ from /agents/start's resolved + // model and trigger ensureSDKServer's eviction-and-respawn path, + // making the prewarm a net negative on the critical path. + if (!ctx.model || !ctx.smallModel) return null; + + const configJson = buildKiloConfigContent( + kilocodeToken, + ctx.model, + ctx.smallModel, + organizationId ?? undefined + ); + env.KILO_CONFIG_CONTENT = configJson; + env.OPENCODE_CONFIG_CONTENT = configJson; return env; } @@ -2677,30 +2788,47 @@ function buildPrewarmEnv(mayorAgentId: string): Record { async function prewarmMayorSDK(townId: string, apiUrl: string, token: string): Promise { const t0 = Date.now(); - const mayorAgentId = await fetchMayorAgentId(townId, apiUrl, token); - if (!mayorAgentId) { + const ctx = await fetchMayorPrewarmContext(townId, apiUrl, token); + if (!ctx) { console.log(`${MANAGER_LOG} prewarmMayorSDK: no mayor agent for town ${townId}`); return; } - const workdir = mayorWorkdirForTown(townId); + const env = buildPrewarmEnv(ctx, townId); + if (!env) { + console.log( + `${MANAGER_LOG} prewarmMayorSDK: skipping for town ${townId} — missing model/token (would cause eviction churn)` + ); + return; + } - await hydrateDbFromSnapshot(mayorAgentId, apiUrl, token, `mayor-${townId}`, townId); + // Materialize the mayor workdir before ensureSDKServer's process.chdir. + // Without this, prewarm on a cold container throws ENOENT because + // createMayorWorkspace runs from runAgent (i.e. /agents/start) only. + const workdir = await ensureMayorWorkspaceForTown(townId); + if (workdir !== mayorWorkdirForTown(townId)) { + // Defensive: if the workspace helper ever changes its layout, the + // sdkInstances key (workdir) and the path /agents/start uses must + // stay aligned or the cache hit won't fire. + console.warn( + `${MANAGER_LOG} prewarmMayorSDK: workdir mismatch (got=${workdir}, expected=${mayorWorkdirForTown(townId)})` + ); + } - const env = buildPrewarmEnv(mayorAgentId); + await hydrateDbFromSnapshot(ctx.agentId, apiUrl, token, `mayor-${townId}`, townId); const existing = sdkInstances.get(workdir); if (existing) { const durationMs = Date.now() - t0; log.info('mayor.prewarm_complete', { - agentId: mayorAgentId, + agentId: ctx.agentId, townId, port: parseInt(new URL(existing.server.url).port), durationMs, alreadyRunning: true, }); postEventToWorker('mayor.prewarm_complete', { - agentId: mayorAgentId, + agentId: ctx.agentId, role: 'mayor', durationMs, }); @@ -2711,14 +2839,14 @@ async function prewarmMayorSDK(townId: string, apiUrl: string, token: string): P const durationMs = Date.now() - t0; log.info('mayor.prewarm_complete', { - agentId: mayorAgentId, + agentId: ctx.agentId, townId, port, durationMs, alreadyRunning: false, }); postEventToWorker('mayor.prewarm_complete', { - agentId: mayorAgentId, + agentId: ctx.agentId, role: 'mayor', durationMs, }); @@ -2729,9 +2857,25 @@ async function prewarmMayorSDK(townId: string, apiUrl: string, token: string): P * Gastown worker and resumes all registered agents. * * Called from main.ts when GASTOWN_TOWN_ID and GASTOWN_API_URL are set. + * + * Installs a hydration gate (see `awaitHydration`) for the duration of + * the call so /agents/start and /refresh-token wait for the registry + * loop and mayor prewarm to release the global sdkServerLock before + * contending for it themselves. */ export async function bootHydration(): Promise { - const LOG = '[boot-hydration]'; + let resolve!: () => void; + _hydrationComplete = new Promise(r => { + resolve = r; + }); + try { + await bootHydrationImpl('[boot-hydration]'); + } finally { + resolve(); + } +} + +async function bootHydrationImpl(LOG: string): Promise { const apiUrl = process.env.GASTOWN_API_URL; const townId = process.env.GASTOWN_TOWN_ID; const initialToken = process.env.GASTOWN_CONTAINER_TOKEN; @@ -2760,6 +2904,7 @@ export async function bootHydration(): Promise { try { const resp = await fetch(`${apiUrl}/api/towns/${townId}/container-registry`, { headers: { Authorization: `Bearer ${token}` }, + signal: AbortSignal.timeout(10_000), }); if (!resp.ok) { console.warn(`${LOG} Failed to fetch registry: ${resp.status}`); diff --git a/services/gastown/docs/e2e-pr-feedback-testing.md b/services/gastown/docs/e2e-pr-feedback-testing.md index 0712cedcee..febd3be03a 100644 --- a/services/gastown/docs/e2e-pr-feedback-testing.md +++ b/services/gastown/docs/e2e-pr-feedback-testing.md @@ -11,6 +11,13 @@ When `merge_strategy: 'pr'` is configured: 3. **Auto-resolve detects comments** — `poll_pr` checks for unresolved review threads, dispatches polecat to fix 4. **Auto-merge** — once all comments resolved and CI passes, grace period timer starts, then PR is merged via API +### Convoy Merge Modes + +Convoys (multi-bead jobs) come in two flavors, controlled by `convoy_merge_mode` in town config: + +- **`review-and-merge`** — each bead opens a PR directly to `main`. Independent landings. +- **`review-then-land`** (default) — each bead's PR targets a shared **convoy feature branch**, and a single **landing PR** is opened from that feature branch to `main` after all beads close. Test this with [Test C](#test-c-review-then-land-convoy-via-direct-sling). + ## Prerequisites - Wrangler dev server running for gastown (`pnpm dev` in `cloudflare-gastown/`, port 8803) @@ -759,6 +766,193 @@ for pr in prs: --- +## Test C: review-then-land Convoy via Direct Sling + +This tests the **review-then-land** convoy mode where each sub-bead gets its own PR into the convoy's feature branch, and a final landing PR is created from the feature branch into the default branch (`main`). + +Unlike Test B (which goes through the mayor's `gt_sling_batch` tool via a chat message), this test directly slings a convoy through a debug endpoint, eliminating mayor LLM variability. Use this when you want a deterministic E2E test of the convoy plumbing itself. + +### Prereqs and Debug Endpoints + +These dev-only endpoints are used by this test (all `application/json`, no auth in dev): + +| Method | Path | Description | +| ------ | ----------------------------------- | ---------------------------------------------------------------- | +| GET | `/debug/towns/:townId/rigs` | List rigs registered with the town (returns `{ rigs: [...] }`) | +| POST | `/debug/towns/:townId/sling-convoy` | Directly call `Town.slingConvoy()` — bypasses the mayor | +| GET | `/debug/towns/:townId/convoys` | List active convoys with progress (`closed_beads`/`total_beads`) | + +The `sling-convoy` body matches `Town.slingConvoy()` input: + +```json +{ + "rigId": "", + "convoyTitle": "Bogus convoy E2E test 175337", + "merge_mode": "review-then-land", + "staged": false, + "tasks": [ + { "title": "Add src/utils/foo.ts with function bar" }, + { "title": "Add src/utils/baz.ts ...", "depends_on": [0] } + ] +} +``` + +### C.1. Look Up the Rig + +```bash +RIG_ID=$(curl -s $BASE/debug/towns/$TOWN_ID/rigs | python3 -c "import sys, json; print(json.load(sys.stdin)['rigs'][0]['id'])") +echo "RIG_ID=$RIG_ID" +``` + +### C.2. Confirm Town Configured for review-then-land + +```bash +curl -s $BASE/debug/towns/$TOWN_ID/config | python3 -c " +import sys, json +d = json.load(sys.stdin) +print('merge_strategy:', d.get('merge_strategy')) +print('convoy_merge_mode:', d.get('convoy_merge_mode')) +print('refinery.auto_merge:', d.get('refinery',{}).get('auto_merge')) +" +``` + +If `convoy_merge_mode` is not `review-then-land`, set it: + +```bash +curl -s -X PATCH $BASE/debug/towns/$TOWN_ID/config \ + -H "Content-Type: application/json" \ + -d '{"convoy_merge_mode":"review-then-land","merge_strategy":"pr","refinery":{"auto_merge":true,"auto_merge_delay_minutes":2}}' +``` + +### C.3. Sling a 3-Bead Convoy + +Use a unique title (timestamp suffix) so subsequent runs don't collide and the feature branch name is easy to grep for: + +```bash +TIMESTAMP=$(date +%H%M%S) +TITLE="Bogus convoy E2E test $TIMESTAMP" +RESPONSE=$(curl -s -X POST $BASE/debug/towns/$TOWN_ID/sling-convoy \ + -H "Content-Type: application/json" \ + -d "{ + \"rigId\": \"$RIG_ID\", + \"convoyTitle\": \"$TITLE\", + \"merge_mode\": \"review-then-land\", + \"staged\": false, + \"tasks\": [ + {\"title\": \"Add src/utils/bogus-step1.ts with a single function bogusGreet that returns 'hello bogus'. Include JSDoc. Commit and push.\"}, + {\"title\": \"Add src/utils/bogus-step2.ts with a single function bogusFarewell that returns 'goodbye bogus'. Include JSDoc. Commit and push.\", \"depends_on\": [0]}, + {\"title\": \"Add src/utils/bogus-step3.ts with a single function bogusEcho that takes a string and returns it prefixed with 'echo: '. Include JSDoc. Commit and push.\", \"depends_on\": [1]} + ] + }") +echo "$RESPONSE" | python3 -c " +import sys, json +r = json.load(sys.stdin) +c = r['convoy'] +print(f'CONVOY_ID={c[\"id\"]}') +print(f'FEATURE_BRANCH={c[\"feature_branch\"]}') +for i, b in enumerate(r['beads']): + print(f'BEAD{i+1}={b[\"bead\"][\"bead_id\"]}') +" +``` + +Capture the printed env vars (`CONVOY_ID`, `FEATURE_BRANCH`, `BEAD1..3`) into a sourceable file like `/tmp/convoy-test.env` for the rest of the test. + +### C.4. Monitor Through the Lifecycle + +In **review-then-land** mode you should observe (linear chain of 3 beads, each `depends_on` the previous): + +1. Bead 1 → `in_progress` → polecat creates a sub-PR targeting `` → MR bead → refinery merges sub-PR → bead 1 `closed` +2. Bead 2 unblocks → same cycle, sub-PR into feature branch +3. Bead 3 same +4. After last sub-bead closes: a **landing MR bead** is created with `feature_branch` → `main` PR +5. Refinery reviews landing PR, auto-merge fires, convoy → `closed` + +Monitor with this loop: + +```bash +SLUG=$(echo "$TITLE" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]\{1,\}/-/g; s/^-\|-$//g' | head -c 40) +for i in $(seq 1 80); do + echo "=== $(date +%H:%M:%S) ===" + curl -s $BASE/debug/towns/$TOWN_ID/status | python3 -c " +import sys, json +d = json.load(sys.stdin) +alarm = d.get('alarmStatus', {}) +print(f'agents={json.dumps(alarm.get(\"agents\",{}))} beads={json.dumps(alarm.get(\"beads\",{}))}') +for b in d.get('beadSummary', []): + title = (b.get('title','') or '')[:60] + print(f' {b.get(\"type\",\"?\"):16s} {b.get(\"status\",\"?\"):12s} {title}') +" + curl -s $BASE/debug/towns/$TOWN_ID/convoys | python3 -c " +import sys, json +for c in json.load(sys.stdin).get('convoys', []): + if c['id'] == '$CONVOY_ID': + print(f' convoy {c[\"status\"]} closed={c[\"closed_beads\"]}/{c[\"total_beads\"]} landed={c[\"landed_at\"]}') +" 2>/dev/null + gh pr list --repo $REPO --state all --search "head:convoy/$SLUG" --limit 10 \ + --json number,title,headRefName,baseRefName,state,mergedAt 2>/dev/null | python3 -c " +import sys, json +prs = json.load(sys.stdin) +for pr in prs: + print(f' PR #{pr[\"number\"]:3d} {pr[\"state\"]:8s} {pr[\"headRefName\"][:40]} -> {pr[\"baseRefName\"][:40]}') +" 2>/dev/null + # Stop when convoy is gone from active list (indicates closed) + ACTIVE=$(curl -s $BASE/debug/towns/$TOWN_ID/convoys | python3 -c " +import sys, json +ids = [c['id'] for c in json.load(sys.stdin).get('convoys', [])] +print('YES' if '$CONVOY_ID' in ids else 'NO') +" 2>/dev/null) + if [ "$ACTIVE" = "NO" ]; then echo '=== CONVOY CLOSED ==='; break; fi + sleep 30 +done +``` + +### C.5. Verify Final State + +After the convoy closes, check: + +1. **Sub-bead PRs** — one per non-failed bead, each targeting the convoy feature branch, all merged: + + ```bash + gh pr list --repo $REPO --state all --search "head:$FEATURE_BRANCH" --limit 10 \ + --json number,title,baseRefName,headRefName,state,mergedAt | python3 -m json.tool + ``` + + Expect: each sub-bead → `baseRefName: `, `state: MERGED`. The landing PR → `baseRefName: main`, `state: MERGED`. + +2. **Landing PR** — base=`main`, head=``, merged: + + ```bash + gh pr view --repo $REPO --json state,mergedAt,additions,deletions,changedFiles + ``` + +3. **Files actually landed on main**: + + ```bash + gh api "repos/$REPO/contents/src/utils?ref=main" --jq '.[] | select(.name | startswith("bogus")) | .name' + ``` + +4. **Convoy progress**: `closed_beads == total_beads`, `landed_at` set, convoy bead `status=closed`. + +### Expected Timeline (review-then-land, 3 beads) + +| Step | Duration | +| --------------------------------------------------- | ------------- | +| Sling-convoy creates 3 issue beads + convoy bead | ~1s | +| Bead 1 polecat work + sub-PR + refinery merge | 3-5 min | +| Bead 2 polecat work + sub-PR + refinery merge | 3-5 min | +| Bead 3 polecat work + sub-PR + refinery merge | 3-5 min | +| Landing MR created, refinery reviews PR into `main` | 2-3 min | +| Auto-merge grace period | 2 min | +| **Total** | **15-25 min** | + +### Known Issues Observed in This Flow + +- **Polecat occasionally pushes directly to feature branch instead of opening a sub-PR.** The MR bead is still created (via `review_submitted`) and the refinery still merges the work into the feature branch, but `reviewMetadata.pr_url` is `null` and there is no GitHub PR for that sub-bead. This is an LLM compliance issue in the polecat prompt, not a code bug. The convoy still lands successfully. +- **A failed sub-bead does not block the convoy from landing.** The reconciler treats `failed` blockers the same as `closed` (see `reconciler.ts:960`), so dependents will dispatch and the convoy will land whatever did succeed. If sub-bead 1 of 3 fails, the landing PR will only contain commits from beads 2 and 3. This is by design but worth knowing when interpreting "successful" landings. +- **Container TLS handshake failures with `github.com`.** A wrangler-managed container may start with a 65535 MTU that breaks outgoing TLS to GitHub (`GnuTLS, handshake failed: The TLS connection was non-properly terminated`). Symptom: every `/agents/start` returns "FAILED for X: git fetch --all --prune failed". Fix: `docker kill $(docker ps -q --filter ancestor=cloudflare-dev/towncontainerdo:*)` — wrangler will spin up a replacement that usually has a working network. The `cloneRepoInner` retry path treats this as non-auth so doesn't retry; the bead burns through `MAX_DISPATCH_ATTEMPTS=5` and ends up `failed` if the container isn't restarted. + +--- + ## Expected Timelines ### Scenario 1 (No Review + Auto-Merge) @@ -875,6 +1069,9 @@ All `/debug/` endpoints are unauthenticated in development. In production, they' | `POST` | `/debug/towns/:townId/send-message` | Send message to mayor (dev only) | | `GET` | `/debug/towns/:townId/beads/:beadId` | Full bead details + review metadata + dependencies | | `POST` | `/debug/towns/:townId/graceful-stop` | Trigger SIGTERM on container (dev only) | +| `GET` | `/debug/towns/:townId/rigs` | List rigs registered with the town (dev only) | +| `POST` | `/debug/towns/:townId/sling-convoy` | Directly call `Town.slingConvoy()` (dev only) | +| `GET` | `/debug/towns/:townId/convoys` | List active convoys with progress counts (dev only) | ### Inspect a Bead diff --git a/services/gastown/src/dos/Town.do.ts b/services/gastown/src/dos/Town.do.ts index 700a774342..98841c0ef5 100644 --- a/services/gastown/src/dos/Town.do.ts +++ b/services/gastown/src/dos/Town.do.ts @@ -786,9 +786,22 @@ export class TownDO extends DurableObject { const townConfig = await this.getTownConfig(); const container = getTownContainerStub(this.env, townId); + // Resolve a fresh GitHub token here too — this method runs both at + // initial config push and on every config change, so the persisted + // GIT_TOKEN must be live rather than the stale value stored in + // git_auth.github_token from rig creation. The container's + // syncTownConfigToProcessEnv path reads `git_auth.github_token` + // from the X-Town-Config header on every request, so the in-process + // GIT_TOKEN follows the same source-of-truth as the persisted one. + const githubToken = await scm.resolveGitHubTokenString({ + env: this.env, + townId, + getTownConfig: () => Promise.resolve(townConfig), + }); + // Phase 1: Persist to DO storage for next boot. const envMapping: Array<[string, string | undefined]> = [ - ['GIT_TOKEN', townConfig.git_auth?.github_token], + ['GIT_TOKEN', githubToken ?? undefined], ['GITLAB_TOKEN', townConfig.git_auth?.gitlab_token], ['GITLAB_INSTANCE_URL', townConfig.git_auth?.gitlab_instance_url], ['GITHUB_CLI_PAT', townConfig.github_cli_pat], @@ -861,7 +874,11 @@ export class TownDO extends DurableObject { // /sync-config endpoint. The X-Town-Config header delivers the // full config; the endpoint applies CONFIG_ENV_MAP to process.env. try { - const containerConfig = await config.buildContainerConfig(this.ctx.storage, this.env); + const containerConfig = await config.buildContainerConfig( + this.ctx.storage, + this.env, + this.townId + ); await container.fetch('http://container/sync-config', { method: 'POST', headers: { @@ -985,8 +1002,19 @@ export class TownDO extends DurableObject { logger.setTags({ rigId: rigConfig.rigId }); const townConfig = await this.getTownConfig(); const envVars: Record = {}; - if (townConfig.git_auth?.github_token) { - envVars.GIT_TOKEN = townConfig.git_auth.github_token; + // Resolve GitHub token through scm.resolveGitHubTokenString so the rig + // setup uses a fresh installation token when a platform integration + // is configured. The rig's own integration ID takes precedence over + // the town-level one (this rig may be wired to a different repo + // installation than the rest of the town). + const githubToken = await scm.resolveGitHubTokenString({ + env: this.env, + townId: this.townId, + getTownConfig: () => Promise.resolve(townConfig), + platformIntegrationId: rigConfig.platformIntegrationId, + }); + if (githubToken) { + envVars.GIT_TOKEN = githubToken; } if (townConfig.git_auth?.gitlab_token) { envVars.GITLAB_TOKEN = townConfig.git_auth.gitlab_token; @@ -1001,7 +1029,11 @@ export class TownDO extends DurableObject { envVars.KILOCODE_TOKEN = kilocodeToken; } - const containerConfig = await config.buildContainerConfig(this.ctx.storage, this.env); + const containerConfig = await config.buildContainerConfig( + this.ctx.storage, + this.env, + this.townId + ); const container = getTownContainerStub(this.env, this.townId); const response = await container.fetch('http://container/repos/setup', { method: 'POST', @@ -2659,6 +2691,63 @@ export class TownDO extends DurableObject { return mayor?.id ?? null; } + /** + * Returns everything the container needs to prewarm the mayor SDK + * server with a config that matches what the next /agents/start will + * use — so the prewarm cache hit is real instead of triggering the + * "config mismatch, evicting prewarmed server" eviction path. + * + * Returns null only when there's no mayor at all. When the mayor + * exists but the kilocode token isn't available, returns a partial + * shape with just { agentId } so callers can derive the fallback + * agentId without a second RPC hop. + */ + async getMayorPrewarmContext(): Promise<{ + agentId: string; + model?: string; + smallModel?: string; + kilocodeToken?: string; + organizationId?: string | null; + githubToken?: string; + githubCliPat?: string; + } | null> { + const mayor = agents.listAgents(this.sql, { role: 'mayor' })[0] ?? null; + if (!mayor) return null; + + const kilocodeToken = await this.resolveKilocodeToken(); + if (!kilocodeToken) { + return { agentId: mayor.id }; + } + + const townConfig = await this.getTownConfig(); + + // Resolve the GitHub token using the same chain as startAgentInContainer + // so the prewarmed mayor SDK boots with `gh` CLI auth (`GH_TOKEN`) + // already populated. Without this, the mayor's bash tool sees an + // empty environment for git/gh until the SDK is torn down and the + // /agents/start path's buildAgentEnv runs — which never happens + // while ensureMayor short-circuits on a warm session. + const githubToken = await scm.resolveGitHubTokenString({ + env: this.env, + townId: this.townId, + getTownConfig: () => Promise.resolve(townConfig), + }); + + // _ensureMayor dispatches the mayor without a per-rig override + // (Town.do.ts:2766-2790). Match that resolution here so the prewarm + // KILO_CONFIG_CONTENT is byte-identical to what /agents/start will + // build, and ensureSDKServer's config-mismatch eviction never fires. + return { + agentId: mayor.id, + model: config.resolveModel(townConfig, null, 'mayor'), + smallModel: config.resolveSmallModel(townConfig), + kilocodeToken, + organizationId: townConfig.organization_id ?? null, + ...(githubToken ? { githubToken } : {}), + ...(townConfig.github_cli_pat ? { githubCliPat: townConfig.github_cli_pat } : {}), + }; + } + async ensureMayor(): Promise<{ agentId: string; sessionStatus: 'idle' | 'active' | 'starting'; @@ -2814,7 +2903,11 @@ export class TownDO extends DurableObject { if (isAlive) { // Attach fresh town config so the container can update process.env // before restarting the SDK server (tokens, git identity, etc.). - const containerConfig = await config.buildContainerConfig(this.ctx.storage, this.env); + const containerConfig = await config.buildContainerConfig( + this.ctx.storage, + this.env, + this.townId + ); // Resolve townConfig to thread the organization_id into the request body // (belt-and-suspenders: ensures org billing survives even if X-Town-Config @@ -4640,7 +4733,11 @@ export class TownDO extends DurableObject { // This ensures org context and credentials are available immediately // after a container restart when the first request is a model update // (PATCH /model) rather than a new agent start. - const containerConfig = await config.buildContainerConfig(this.ctx.storage, this.env); + const containerConfig = await config.buildContainerConfig( + this.ctx.storage, + this.env, + this.townId + ); const headers: Record = { 'X-Town-Config': JSON.stringify(containerConfig), }; @@ -4776,7 +4873,9 @@ export class TownDO extends DurableObject { /** * Health check: verify the alarm is set and return basic town status. * Called by the GastownUserDO watchdog alarm to ensure each town's - * alarm loop is firing. Re-arms the alarm if it's missing. + * alarm loop is firing. Re-arms the alarm if it's missing, picking the + * cadence based on `hasActiveWork` so idle towns don't all wake up + * on the fast 5s cadence after a deploy. */ async healthCheck(): Promise<{ townId: string; @@ -4790,10 +4889,17 @@ export class TownDO extends DurableObject { const currentAlarm = await this.ctx.storage.getAlarm(); const alarmSet = currentAlarm !== null && currentAlarm > Date.now(); - // Re-arm if missing — this is the whole point of the watchdog + // Re-arm if missing — this is the whole point of the watchdog. Pick + // the cadence to match observed activity: active towns recover fast, + // idle towns don't pay the cost of a 5s wake-up storm across the fleet. if (!alarmSet) { - console.warn(`${TOWN_LOG} healthCheck: alarm not set for town=${townId}, re-arming`); - await this.ctx.storage.setAlarm(Date.now() + ACTIVE_ALARM_INTERVAL_MS); + const interval = scheduling.hasActiveWork(this.sql) + ? ACTIVE_ALARM_INTERVAL_MS + : IDLE_ALARM_INTERVAL_MS; + console.warn( + `${TOWN_LOG} healthCheck: alarm not set for town=${townId}, re-arming with ${interval}ms` + ); + await this.ctx.storage.setAlarm(Date.now() + interval); } const activeAgents = Number( diff --git a/services/gastown/src/dos/town/actions.ts b/services/gastown/src/dos/town/actions.ts index 89da498fae..8a02c0e879 100644 --- a/services/gastown/src/dos/town/actions.ts +++ b/services/gastown/src/dos/town/actions.ts @@ -22,7 +22,7 @@ import * as reviewQueue from './review-queue'; import * as patrol from './patrol'; import { getRig } from './rigs'; import { parseGitUrl } from '../../util/platform-pr.util'; -import type { PRStatusResult } from './town-scm'; +import type { PRStatusOutcome, PRStatusError } from './town-scm'; // ── Bead mutations ────────────────────────────────────────────────── @@ -300,8 +300,8 @@ export type ApplyActionContext = { dispatchAgent: (agentId: string, beadId: string, rigId: string) => Promise; /** Stop an agent's container process. */ stopAgent: (agentId: string) => Promise; - /** Check a PR's status via GitHub/GitLab API. Returns PRStatusResult or null. */ - checkPRStatus: (prUrl: string) => Promise; + /** Check a PR's status via GitHub/GitLab API. Returns PRStatusOutcome. */ + checkPRStatus: (prUrl: string) => Promise; /** Check PR for unresolved review comments and failing CI checks. */ checkPRFeedback: (prUrl: string) => Promise; /** Merge a PR via GitHub/GitLab API. */ @@ -330,9 +330,98 @@ const LOG = '[actions]'; /** Fail MR bead after this many consecutive null poll results (#1632). */ const PR_POLL_NULL_THRESHOLD = 10; +/** Fail MR bead after this many consecutive non-transient errors (invalid_response). */ +const PR_POLL_NON_TRANSIENT_THRESHOLD = 3; + /** Minimum interval between PR polls per MR bead (ms) (#1632). */ export const PR_POLL_INTERVAL_MS = 60_000; // 1 minute +function providerLabel(provider: 'github' | 'gitlab'): string { + return provider === 'github' ? 'GitHub' : 'GitLab'; +} + +function failureMessageFor(error: PRStatusError): string { + switch (error.kind) { + case 'no_token': + return ( + `No ${providerLabel(error.provider)} token resolved for this town. Tried (in order): ` + + error.resolutionChain.map(s => `\`${s}\``).join(', ') + + `. Configure one of these in town or rig settings. ` + + `Note: polecat agents use their own container credentials and ` + + `may have created the PR successfully — that does not imply the ` + + `town worker can poll PR status.` + ); + case 'http_error': + if (error.status === 401) { + return `Town's ${providerLabel(error.provider)} token is invalid or expired (HTTP 401). Refresh the token in town settings.`; + } + if (error.status === 403) { + const scopeHint = + error.provider === 'github' + ? 'Ensure the token has `pull-requests: read` scope on the repo, or check for a secondary rate limit.' + : 'Ensure the token has permission to read merge requests in the project.'; + return `Town's ${providerLabel(error.provider)} token lacks permission for this PR (HTTP 403). ${scopeHint}`; + } + if (error.status === 404) { + return `${error.provider === 'github' ? 'PR' : 'MR'} not found (HTTP 404). Was the branch deleted before it could be polled, or is the URL wrong?`; + } + return `${error.provider} API returned HTTP ${error.status} ${error.statusText}. ${error.transient ? 'Retrying.' : 'Not retryable.'}`; + case 'invalid_response': + return ( + `${error.provider} API returned an unexpected response shape ` + + `(${error.reason})${error.sampleKeys ? `; top-level keys: ${error.sampleKeys.join(', ')}` : ''}. ` + + `Please file a bug — the API contract may have drifted.` + ); + case 'unrecognized_url': + return `PR URL format not recognized: ${error.url}. Expected GitHub PR or GitLab MR URL.`; + case 'host_mismatch': + return `Refusing to send GitLab token to unexpected host \`${error.got}\` (configured: \`${error.expected}\`).`; + } +} + +function shouldFailImmediately(error: PRStatusError): boolean { + switch (error.kind) { + case 'no_token': + return true; + case 'unrecognized_url': + return true; + case 'host_mismatch': + return true; + case 'http_error': + return !error.transient; + case 'invalid_response': + return false; + } +} + +function shouldCountAsTransient(error: PRStatusError): boolean { + return error.kind === 'http_error' && error.transient; +} + +type PollCounterState = { + pollTransientCount: number; + pollNonTransientCount: number; + shouldFail: boolean; +}; + +function nextPollCounterState(error: PRStatusError, current: PollCounterState): PollCounterState { + if (shouldCountAsTransient(error)) { + const pollTransientCount = current.pollTransientCount + 1; + return { + pollTransientCount, + pollNonTransientCount: 0, + shouldFail: pollTransientCount >= PR_POLL_NULL_THRESHOLD, + }; + } + + const pollNonTransientCount = current.pollNonTransientCount + 1; + return { + pollTransientCount: 0, + pollNonTransientCount, + shouldFail: pollNonTransientCount >= PR_POLL_NON_TRANSIENT_THRESHOLD, + }; +} + function now(): string { return new Date().toISOString(); } @@ -745,22 +834,24 @@ export function applyAction(ctx: ApplyActionContext, action: Action): (() => Pro return async () => { try { - const prStatusResult = await ctx.checkPRStatus(action.pr_url); - if (prStatusResult !== null) { - // Any non-null result resets the consecutive null counter + const outcome = await ctx.checkPRStatus(action.pr_url); + if (outcome.ok) { + // Successful poll — reset both consecutive error counters query( sql, /* sql */ ` UPDATE ${beads} SET ${beads.columns.metadata} = json_set( COALESCE(${beads.columns.metadata}, '{}'), - '$.poll_null_count', 0 + '$.poll_transient_count', 0, + '$.poll_non_transient_count', 0, + '$.poll_error_kind', NULL ) WHERE ${beads.bead_id} = ? `, [action.bead_id] ); - const { status, mergeable_state } = prStatusResult; + const { status, mergeable_state } = outcome.result; if (status !== 'open') { ctx.insertEvent('pr_status_changed', { bead_id: action.bead_id, @@ -1048,10 +1139,10 @@ export function applyAction(ctx: ApplyActionContext, action: Action): (() => Pro // If the PR was merged externally during that window, inserting // pr_feedback_detected would create a feedback bead for a merged // PR — leading to a duplicate PR on an already-merged branch. - const freshStatusResult = await ctx.checkPRStatus(action.pr_url); - if (freshStatusResult?.status !== 'open') { + const freshOutcome = await ctx.checkPRStatus(action.pr_url); + if (!freshOutcome.ok || freshOutcome.result.status !== 'open') { console.log( - `${LOG} poll_pr: PR status changed to '${freshStatusResult?.status ?? 'null'}' during feedback check, skipping feedback for bead=${action.bead_id}` + `${LOG} poll_pr: PR status changed to '${freshOutcome.ok ? freshOutcome.result.status : 'error'}' during feedback check, skipping feedback for bead=${action.bead_id}` ); } else { const existingFeedback = hasExistingFeedbackBead(sql, action.bead_id); @@ -1179,57 +1270,192 @@ export function applyAction(ctx: ApplyActionContext, action: Action): (() => Pro } } } else { - // Null result — GitHub API unreachable (token missing, expired, rate-limited, or 5xx). - // Increment consecutive null counter; fail the bead after PR_POLL_NULL_THRESHOLD. + const error = outcome.error; + + // Store the latest error kind for analytics query( sql, /* sql */ ` UPDATE ${beads} SET ${beads.columns.metadata} = json_set( COALESCE(${beads.columns.metadata}, '{}'), - '$.poll_null_count', - COALESCE( - json_extract(${beads.columns.metadata}, '$.poll_null_count'), - 0 - ) + 1 + '$.poll_error_kind', ? ) WHERE ${beads.bead_id} = ? `, - [action.bead_id] + [error.kind, action.bead_id] ); - const rows = [ - ...query( + + const failRigRows = z + .object({ rig_id: z.string().nullable() }) + .array() + .parse([ + ...query( + sql, + /* sql */ ` + SELECT ${beads.columns.rig_id} + FROM ${beads} + WHERE ${beads.bead_id} = ? + `, + [action.bead_id] + ), + ]); + const failRigId = failRigRows[0]?.rig_id ?? ''; + + if (shouldFailImmediately(error)) { + console.warn( + `${LOG} poll_pr: immediate-fail error kind=${error.kind} for bead=${action.bead_id}, failing` + ); + beadOps.updateBeadStatus(sql, action.bead_id, 'failed', 'system'); + query( + sql, + /* sql */ ` + UPDATE ${beads} + SET ${beads.columns.metadata} = json_set( + COALESCE(${beads.columns.metadata}, '{}'), + '$.failureReason', 'pr_poll_failed', + '$.failureKind', ?, + '$.failureMessage', ? + ) + WHERE ${beads.bead_id} = ? + `, + [error.kind, failureMessageFor(error), action.bead_id] + ); + ctx.emitEvent({ + event: 'pr.poll_failed', + townId, + beadId: action.bead_id, + rigId: failRigId, + reason: error.kind, + label: 'provider' in error ? error.provider : '', + statusCode: error.kind === 'http_error' ? error.status : undefined, + }); + } else if (shouldCountAsTransient(error)) { + // Transient HTTP errors (5xx, 429) count toward the 10-strike threshold. + // Migrate legacy poll_null_count into poll_transient_count on first read. + query( sql, /* sql */ ` - SELECT json_extract(${beads.columns.metadata}, '$.poll_null_count') AS null_count - FROM ${beads} + UPDATE ${beads} + SET ${beads.columns.metadata} = json_set( + COALESCE(${beads.columns.metadata}, '{}'), + '$.poll_transient_count', + COALESCE( + json_extract(${beads.columns.metadata}, '$.poll_transient_count'), + json_extract(${beads.columns.metadata}, '$.poll_null_count'), + 0 + ) + 1, + '$.poll_non_transient_count', 0 + ) WHERE ${beads.bead_id} = ? `, [action.bead_id] - ), - ]; - const nullCount = Number(rows[0]?.null_count ?? 0); - if (nullCount >= PR_POLL_NULL_THRESHOLD) { - console.warn( - `${LOG} poll_pr: ${nullCount} consecutive null results for bead=${action.bead_id}, failing` ); - beadOps.updateBeadStatus(sql, action.bead_id, 'failed', 'system'); + const rows = [ + ...query( + sql, + /* sql */ ` + SELECT json_extract(${beads.columns.metadata}, '$.poll_transient_count') AS transient_count + FROM ${beads} + WHERE ${beads.bead_id} = ? + `, + [action.bead_id] + ), + ]; + const transientCount = Number(rows[0]?.transient_count ?? 0); + if (transientCount >= PR_POLL_NULL_THRESHOLD) { + console.warn( + `${LOG} poll_pr: ${transientCount} consecutive transient errors for bead=${action.bead_id}, failing` + ); + beadOps.updateBeadStatus(sql, action.bead_id, 'failed', 'system'); + query( + sql, + /* sql */ ` + UPDATE ${beads} + SET ${beads.columns.metadata} = json_set( + COALESCE(${beads.columns.metadata}, '{}'), + '$.failureReason', 'pr_poll_failed', + '$.failureKind', ?, + '$.failureMessage', ? + ) + WHERE ${beads.bead_id} = ? + `, + [error.kind, failureMessageFor(error), action.bead_id] + ); + ctx.emitEvent({ + event: 'pr.poll_failed', + townId, + beadId: action.bead_id, + rigId: failRigId, + reason: error.kind, + label: 'provider' in error ? error.provider : '', + statusCode: error.kind === 'http_error' ? error.status : undefined, + }); + } + } else { + // Non-transient, non-immediate errors (invalid_response only) + // count toward a lower 3-strike threshold. + // No legacy poll_null_count migration needed: invalid_response + // is a new error kind that couldn't have accumulated under the + // old transient/null classification. query( sql, /* sql */ ` UPDATE ${beads} SET ${beads.columns.metadata} = json_set( COALESCE(${beads.columns.metadata}, '{}'), - '$.failureReason', 'pr_poll_failed', - '$.failureMessage', ? + '$.poll_non_transient_count', + COALESCE( + json_extract(${beads.columns.metadata}, '$.poll_non_transient_count'), + 0 + ) + 1, + '$.poll_transient_count', 0 ) WHERE ${beads.bead_id} = ? `, - [ - `Cannot poll PR status — GitHub API returned null ${nullCount} consecutive times. Check that a valid GitHub token is configured in town settings and that the GitHub API is reachable.`, - action.bead_id, - ] + [action.bead_id] ); + const rows = [ + ...query( + sql, + /* sql */ ` + SELECT json_extract(${beads.columns.metadata}, '$.poll_non_transient_count') AS non_transient_count + FROM ${beads} + WHERE ${beads.bead_id} = ? + `, + [action.bead_id] + ), + ]; + const nonTransientCount = Number(rows[0]?.non_transient_count ?? 0); + if (nonTransientCount >= PR_POLL_NON_TRANSIENT_THRESHOLD) { + console.warn( + `${LOG} poll_pr: ${nonTransientCount} consecutive non-transient errors kind=${error.kind} for bead=${action.bead_id}, failing` + ); + beadOps.updateBeadStatus(sql, action.bead_id, 'failed', 'system'); + query( + sql, + /* sql */ ` + UPDATE ${beads} + SET ${beads.columns.metadata} = json_set( + COALESCE(${beads.columns.metadata}, '{}'), + '$.failureReason', 'pr_poll_failed', + '$.failureKind', ?, + '$.failureMessage', ? + ) + WHERE ${beads.bead_id} = ? + `, + [error.kind, failureMessageFor(error), action.bead_id] + ); + ctx.emitEvent({ + event: 'pr.poll_failed', + townId, + beadId: action.bead_id, + rigId: failRigId, + reason: error.kind, + label: 'provider' in error ? error.provider : '', + statusCode: error.kind === 'http_error' ? error.status : undefined, + }); + } } } // status === 'open' — no action needed, poll again next tick @@ -1483,3 +1709,9 @@ function parsePrUrl(prUrl: string): { repo: string; prNumber: number } | null { // Exported for testing export { hasExistingFeedbackBead as _hasExistingFeedbackBead, parsePrUrl as _parsePrUrl }; +export { + failureMessageFor as _failureMessageFor, + nextPollCounterState as _nextPollCounterState, + shouldFailImmediately as _shouldFailImmediately, + shouldCountAsTransient as _shouldCountAsTransient, +}; diff --git a/services/gastown/src/dos/town/config.ts b/services/gastown/src/dos/town/config.ts index e02404a745..85b3ca4715 100644 --- a/services/gastown/src/dos/town/config.ts +++ b/services/gastown/src/dos/town/config.ts @@ -9,6 +9,7 @@ import { type MergeStrategy, type RigOverrideConfig, } from '../../types'; +import { resolveGitHubTokenString } from './town-scm'; const CONFIG_KEY = 'town:config'; const NEW_TOWN_DEFAULTS_SEEDED_KEY = 'town:config:newDefaultsSeeded'; @@ -295,17 +296,47 @@ export function resolveRigConfig( /** * Build the ContainerConfig payload for X-Town-Config header. * Sent with every fetch() to the container. + * + * The container's `syncTownConfigToProcessEnv` reads `git_auth.github_token` + * from this payload on every request and writes it to `process.env.GIT_TOKEN`, + * which the SDK server's `gh` CLI inherits via `GH_TOKEN`. To prevent serving + * an expired installation token (TTL ~1h) we resolve through `resolveGitHubToken` + * so a configured platform integration always returns a fresh value. + * + * `townId` is required so we can always perform the integration lookup. + * Making it optional was a foot-gun — a forgotten arg silently re-introduces + * the stale-token bug this function exists to prevent. */ export async function buildContainerConfig( storage: DurableObjectStorage, - env: Env + env: Env, + townId: string ): Promise> { const config = await getTownConfig(storage); + + let resolvedGithubToken = config.git_auth?.github_token; + try { + const fresh = await resolveGitHubTokenString({ + env, + townId, + getTownConfig: () => Promise.resolve(config), + }); + if (fresh) resolvedGithubToken = fresh; + } catch (err) { + console.warn( + `${TOWN_LOG} buildContainerConfig: resolveGitHubTokenString failed; falling back to stored token`, + err + ); + } + return { env_vars: config.env_vars, default_model: resolveModel(config, null, ''), small_model: resolveSmallModel(config), - git_auth: config.git_auth, + git_auth: { + ...config.git_auth, + github_token: resolvedGithubToken, + }, kilocode_token: config.kilocode_token, github_cli_pat: config.github_cli_pat, git_author_name: config.git_author_name, diff --git a/services/gastown/src/dos/town/container-dispatch.ts b/services/gastown/src/dos/town/container-dispatch.ts index 4c37825376..b5deab4f62 100644 --- a/services/gastown/src/dos/town/container-dispatch.ts +++ b/services/gastown/src/dos/town/container-dispatch.ts @@ -10,6 +10,7 @@ import { buildMayorSystemPrompt } from '../../prompts/mayor-system.prompt'; import type { TownConfig, RigOverrideConfig } from '../../types'; import { buildContainerConfig, resolveModel, resolveSmallModel, resolveRigConfig } from './config'; import { writeEvent } from '../../util/analytics.util'; +import { resolveGitHubTokenString } from './town-scm'; const TOWN_LOG = '[Town.do]'; @@ -409,9 +410,20 @@ export async function startAgentInContainer( // Build env vars from town config const envVars: Record = { ...(params.townConfig.env_vars ?? {}) }; - // Map git_auth tokens - if (params.townConfig.git_auth?.github_token) { - envVars.GIT_TOKEN = params.townConfig.git_auth.github_token; + // Map git_auth tokens. Resolve GitHub token through resolveGitHubTokenString so + // we mint a fresh installation token when a platform integration is + // configured; otherwise we'd hand the agent a `git_auth.github_token` + // value that may have been written hours ago and is well past its 1h + // installation-token TTL. The resolved value is also what the agent's + // `gh` CLI sees as `GH_TOKEN`. + const githubToken = await resolveGitHubTokenString({ + env, + townId: params.townId, + getTownConfig: () => Promise.resolve(params.townConfig), + platformIntegrationId: params.platformIntegrationId, + }); + if (githubToken) { + envVars.GIT_TOKEN = githubToken; } if (params.townConfig.git_auth?.gitlab_token) { envVars.GITLAB_TOKEN = params.townConfig.git_auth.gitlab_token; @@ -449,7 +461,7 @@ export async function startAgentInContainer( `${TOWN_LOG} startAgentInContainer: envVars built: keys=[${Object.keys(envVars).join(',')}] hasGitToken=${!!envVars.GIT_TOKEN} hasGitlabToken=${!!envVars.GITLAB_TOKEN} hasContainerToken=${!!containerToken} hasAgentJwt=${!!agentToken} hasKilocodeToken=${!!kilocodeToken} git_auth_keys=[${Object.keys(params.townConfig.git_auth ?? {}).join(',')}]` ); - const containerConfig = await buildContainerConfig(storage, env); + const containerConfig = await buildContainerConfig(storage, env, params.townId); const container = getTownContainerStub(env, params.townId); const rigOverride = params.rigOverride ?? null; @@ -613,8 +625,16 @@ export async function startMergeInContainer( } const envVars: Record = { ...(params.townConfig.env_vars ?? {}) }; - if (params.townConfig.git_auth?.github_token) { - envVars.GIT_TOKEN = params.townConfig.git_auth.github_token; + // Resolve GitHub token through resolveGitHubTokenString so a configured + // platform integration mints a fresh installation token for the + // merge process. See startAgentInContainer for the rationale. + const mergeGithubToken = await resolveGitHubTokenString({ + env, + townId: params.townId, + getTownConfig: () => Promise.resolve(params.townConfig), + }); + if (mergeGithubToken) { + envVars.GIT_TOKEN = mergeGithubToken; } if (params.townConfig.git_auth?.gitlab_token) { envVars.GITLAB_TOKEN = params.townConfig.git_auth.gitlab_token; @@ -628,7 +648,7 @@ export async function startMergeInContainer( const mergeKilocodeToken = params.kilocodeToken ?? params.townConfig.kilocode_token; if (mergeKilocodeToken) envVars.KILOCODE_TOKEN = mergeKilocodeToken; - const containerConfig = await buildContainerConfig(storage, env); + const containerConfig = await buildContainerConfig(storage, env, params.townId); const container = getTownContainerStub(env, params.townId); const response = await container.fetch('http://container/git/merge', { diff --git a/services/gastown/src/dos/town/reconciler.ts b/services/gastown/src/dos/town/reconciler.ts index c7715a05ee..a1a9c6dbb9 100644 --- a/services/gastown/src/dos/town/reconciler.ts +++ b/services/gastown/src/dos/town/reconciler.ts @@ -883,10 +883,24 @@ export function reconcileAgents(sql: SqlStorage, opts?: { draining?: boolean }): agent_id: agent.bead_id, }); } else if (hookedStatus === 'in_progress' || hookedStatus === 'open') { - // Idle agent hooked to a live bead — the dispatch started but the - // agent died (container failed to start, OOM, etc.) and agentCompleted - // set it to idle without unhooking. Reset the bead to open and unhook - // so the scheduling rules can re-dispatch. + // Idle agent hooked to a live bead — usually means the dispatch + // started but the agent died (container failed to start, OOM, + // etc.) and agentCompleted set it to idle without unhooking. + // + // Guard against the phantom-failed-dispatch case: dispatchAgent + // can return started=false even when the container actually + // accepted the agent (e.g. /refresh-token raced a token rotation), + // and the SDK session keeps heartbeating happily. Tearing the + // hook out from under a live session causes tools that need a + // hooked bead (gt_request_changes, gt_triage_resolve) to fail + // with "is not hooked to a bead" until the session exits. + // + // If we've seen a heartbeat in the last 90s, treat the agent as + // alive and leave the hook in place. The 90s window matches + // reconcileAgents' stale-heartbeat threshold, so a truly dead + // agent still gets reaped by the heartbeat path on a later tick. + if (!staleMs(agent.last_activity_at, 90_000)) continue; + actions.push({ type: 'unhook_agent', agent_id: agent.bead_id, diff --git a/services/gastown/src/dos/town/review-queue.ts b/services/gastown/src/dos/town/review-queue.ts index 80fd1b6b1a..3246db07da 100644 --- a/services/gastown/src/dos/town/review-queue.ts +++ b/services/gastown/src/dos/town/review-queue.ts @@ -453,10 +453,10 @@ export function agentDone(sql: SqlStorage, agentId: string, input: AgentDoneInpu // The agent was unhooked by a recovery path between when the agent // finished work and when it called gt_done. // - // For refineries, this is critical: the refinery successfully merged - // but the hook was cleared by zombie detection. We MUST still complete - // the review — otherwise the source bead stays open forever. Find the - // most recent non-closed MR bead assigned to this agent and complete it. + // For refineries, this is critical: the refinery may have actually + // completed work (merged a PR, posted a review) but its hook was + // cleared by zombie detection. We need to make progress without + // landing PRs the refinery did NOT actually approve. if (agent.role === 'refinery') { const recentMrRows = [ ...query( @@ -479,6 +479,8 @@ export function agentDone(sql: SqlStorage, agentId: string, input: AgentDoneInpu `[review-queue] agentDone: unhooked refinery ${agentId} — recovering MR bead ${mrBeadId}` ); if (input.pr_url) { + // Refinery created/landed a PR. Trust the URL: storing it + // moves the MR to in_review where poll_pr decides the merge. const stored = setReviewPrUrl(sql, mrBeadId, input.pr_url); if (stored) { markReviewInReview(sql, mrBeadId); @@ -490,10 +492,46 @@ export function agentDone(sql: SqlStorage, agentId: string, input: AgentDoneInpu }); } } else { + // No pr_url and no hook: we can NOT prove the refinery + // actually merged. Previously we optimistically marked the + // MR as 'merged' — but the same race that cleared the hook + // (idle+hooked+live-bead reconciler rule firing on a phantom + // dispatch_failed) ALSO fires when the refinery is mid-review + // and decided to call gt_request_changes. In that case the + // refinery wanted rework, gt_request_changes failed with + // "not hooked", and the refinery's fallback gt_done() call + // would silently land the PR it was trying to reject. + // + // Fail the review instead: this returns the source bead to + // 'open' and surfaces the problem so the next dispatch can + // re-review properly. Also raise an escalation so a human + // sees what happened. + console.warn( + `[review-queue] agentDone: unhooked refinery ${agentId} called gt_done without pr_url ` + + `for MR ${mrBeadId} — failing review (cannot confirm merge)` + ); completeReviewWithResult(sql, { entry_id: mrBeadId, - status: 'merged', - message: input.summary ?? 'Merged by refinery agent (recovered from unhook)', + status: 'failed', + message: + input.summary ?? + 'Refinery called gt_done without a pr_url after losing its hook — cannot confirm merge', + }); + createBead(sql, { + type: 'escalation', + title: 'Refinery gt_done without pr_url after hook loss', + body: + `Refinery ${agentId} called gt_done with no pr_url while unhooked from MR ${mrBeadId}. ` + + `The hook was likely cleared by the reconciler after a phantom dispatch failure ` + + `while the SDK session was still alive. The MR has been failed (not merged) so a human ` + + `can verify whether the refinery's intended outcome was 'approve and merge' or ` + + `'request changes'. Summary from refinery: ${input.summary ?? '(none)'}`, + priority: 'high', + metadata: { + source_bead_id: mrBeadId, + source_agent_id: agentId, + kind: 'refinery_unhooked_done', + }, }); } return; diff --git a/services/gastown/src/dos/town/scheduling.ts b/services/gastown/src/dos/town/scheduling.ts index 22eda157ee..2f9e47ff60 100644 --- a/services/gastown/src/dos/town/scheduling.ts +++ b/services/gastown/src/dos/town/scheduling.ts @@ -185,6 +185,7 @@ export async function dispatchAgent( // If the agent truly didn't start: reconcileAgents catches it // after 90s of missing heartbeats and transitions to 'idle'. // If the agent actually started: heartbeats keep it alive. (#1358) + const startError = dispatch.getLastStartError(); ctx.emitEvent({ event: 'agent.dispatch_failed', townId: ctx.townId, @@ -192,7 +193,7 @@ export async function dispatchAgent( agentId: agent.id, beadId: bead.bead_id, role: agent.role, - reason: 'container returned false', + reason: startError ?? 'container returned false', }); } return started; @@ -201,21 +202,18 @@ export async function dispatchAgent( Sentry.captureException(err, { extra: { agentId: agent.id, beadId: bead.bead_id }, }); - try { - query( - ctx.sql, - /* sql */ ` - UPDATE ${agent_metadata} - SET ${agent_metadata.columns.status} = 'idle', - ${agent_metadata.columns.last_activity_at} = ? - WHERE ${agent_metadata.bead_id} = ? - `, - [now(), agent.id] - ); - // Don't roll back bead to open — same timeout race rationale - } catch (rollbackErr) { - console.error(`${LOG} dispatchAgent: rollback also failed:`, rollbackErr); - } + // Do NOT transition the agent to 'idle' here. The container may + // already have accepted /agents/start (e.g. /refresh-token failed + // late, after the agent process spawned), in which case the SDK + // session is alive and heartbeating. Marking it idle would trip + // reconcileAgents Rule 3 (idle agent + hooked + live bead → + // unhook + reset bead), tearing the hook out from under the + // running session and causing tools like gt_request_changes to + // fail with "is not hooked to a bead" (#1358 follow-up). + // + // Instead, leave the agent as 'working'. If the container truly + // didn't start, reconcileAgents catches it after 90s of missing + // heartbeats and transitions to 'idle' through the normal path. ctx.emitEvent({ event: 'agent.dispatch_failed', townId: ctx.townId, @@ -264,6 +262,12 @@ export function dispatchUnblockedBeads(ctx: SchedulingContext, closedBeadId: str /** * Returns true if the town has work that requires the fast (5s) alarm * interval. Used to decide between active and idle alarm cadence. + * + * Each signal is wrapped in a thunk so `||` short-circuits at the SQL + * layer: as soon as one signal returns true we skip the remaining + * queries. On a hot town with working agents this avoids 4 extra reads + * per check; on a cold idle town it costs the full 5 reads (same as + * before). */ export function hasActiveWork(sql: SqlStorage): boolean { // Stalled agents older than 30min no longer count as active work: they @@ -271,53 +275,81 @@ export function hasActiveWork(sql: SqlStorage): boolean { // returning running/unknown). Keeping them in the active set would pin // the alarm at its 5s fast cadence indefinitely. The stalled->idle // auto-transition in reconcileAgents cleans them up after 2h 30min. - const activeAgentRows = [ - ...query( - sql, - /* sql */ `SELECT COUNT(*) as cnt FROM ${agent_metadata} - WHERE ${agent_metadata.status} = 'working' - OR (${agent_metadata.status} = 'stalled' - AND ${agent_metadata.last_activity_at} > strftime('%Y-%m-%dT%H:%M:%fZ', 'now', '-30 minutes'))`, - [] - ), - ]; - const pendingBeadRows = [ - ...query( - sql, - /* sql */ `SELECT COUNT(*) as cnt FROM ${agent_metadata} WHERE ${agent_metadata.status} = 'idle' AND ${agent_metadata.current_hook_bead_id} IS NOT NULL`, - [] - ), - ]; - const pendingReviewRows = [ - ...query( - sql, - /* sql */ `SELECT COUNT(*) as cnt FROM ${beads} WHERE ${beads.type} = 'merge_request' AND ${beads.status} IN ('open', 'in_progress')`, - [] - ), - ]; - const pendingTriageRows = [ - ...query( - sql, - /* sql */ `SELECT COUNT(*) as cnt FROM ${beads} WHERE ${beads.type} = 'issue' AND ${beads.labels} LIKE ? AND ${beads.status} = 'open'`, - [patrol.TRIAGE_LABEL_LIKE] - ), - ]; + const hasActiveAgents = (): boolean => + countOf([ + ...query( + sql, + /* sql */ `SELECT COUNT(*) as cnt FROM ${agent_metadata} + WHERE ${agent_metadata.status} = 'working' + OR (${agent_metadata.status} = 'stalled' + AND ${agent_metadata.last_activity_at} > strftime('%Y-%m-%dT%H:%M:%fZ', 'now', '-30 minutes'))`, + [] + ), + ]) > 0; + + // Idle agents that already hold a hook — the reconciler should dispatch + // them on the next tick. + const hasHookedIdleAgents = (): boolean => + countOf([ + ...query( + sql, + /* sql */ `SELECT COUNT(*) as cnt FROM ${agent_metadata} + WHERE ${agent_metadata.status} = 'idle' + AND ${agent_metadata.current_hook_bead_id} IS NOT NULL`, + [] + ), + ]) > 0; + + const hasOpenMergeRequests = (): boolean => + countOf([ + ...query( + sql, + /* sql */ `SELECT COUNT(*) as cnt FROM ${beads} + WHERE ${beads.type} = 'merge_request' + AND ${beads.status} IN ('open', 'in_progress')`, + [] + ), + ]) > 0; + + const hasOpenTriageBeads = (): boolean => + countOf([ + ...query( + sql, + /* sql */ `SELECT COUNT(*) as cnt FROM ${beads} + WHERE ${beads.type} = 'issue' + AND ${beads.labels} LIKE ? + AND ${beads.status} = 'open'`, + [patrol.TRIAGE_LABEL_LIKE] + ), + ]) > 0; + // Open issue beads with a rig (eligible for dispatch by reconcileBeads Rule 1) // but not yet assigned to any agent. Without this check, the alarm drops to // idle cadence after a container restart when agents lose their hooks and // beads revert to open+unassigned, delaying dispatch by up to 5 minutes. - const unassignedIssueRows = [ - ...query( - sql, - /* sql */ `SELECT COUNT(*) as cnt FROM ${beads} WHERE ${beads.type} = 'issue' AND ${beads.status} = 'open' AND ${beads.rig_id} IS NOT NULL AND ${beads.assignee_agent_bead_id} IS NULL`, - [] - ), - ]; + const hasUnassignedIssues = (): boolean => + countOf([ + ...query( + sql, + /* sql */ `SELECT COUNT(*) as cnt FROM ${beads} + WHERE ${beads.type} = 'issue' + AND ${beads.status} = 'open' + AND ${beads.rig_id} IS NOT NULL + AND ${beads.assignee_agent_bead_id} IS NULL`, + [] + ), + ]) > 0; + return ( - Number(activeAgentRows[0]?.cnt ?? 0) > 0 || - Number(pendingBeadRows[0]?.cnt ?? 0) > 0 || - Number(pendingReviewRows[0]?.cnt ?? 0) > 0 || - Number(pendingTriageRows[0]?.cnt ?? 0) > 0 || - Number(unassignedIssueRows[0]?.cnt ?? 0) > 0 + hasActiveAgents() || + hasHookedIdleAgents() || + hasOpenMergeRequests() || + hasOpenTriageBeads() || + hasUnassignedIssues() ); } + +/** Read the `cnt` column off the first row of a `SELECT COUNT(*) as cnt` query. */ +function countOf(rows: ReadonlyArray>): number { + return Number(rows[0]?.cnt ?? 0); +} diff --git a/services/gastown/src/dos/town/town-scm.ts b/services/gastown/src/dos/town/town-scm.ts index 315d0abba3..a077d713d3 100644 --- a/services/gastown/src/dos/town/town-scm.ts +++ b/services/gastown/src/dos/town/town-scm.ts @@ -22,27 +22,87 @@ export type SCMContext = { platformIntegrationId?: string; }; +export type GitHubTokenResolution = + | { ok: true; token: string; source: string } + | { ok: false; tried: string[] }; + /** * Resolve a GitHub API token from the town config. - * Fallback chain: github_token → github_cli_pat → town platform integration → rig platform integration. + * + * Priority chain (most to least preferred): + * 1. `github_cli_pat` — user-supplied long-lived PAT, never expires. + * 2. Platform integration (GitHub App installation) — minted fresh by + * git-token-service with KV-backed caching that auto-invalidates + * before each token's 1h TTL elapses. This is the authoritative + * live source whenever an integration is configured. + * 3. `git_auth.github_token` — stored installation token from a past + * `refreshGitCredentials()` write. Kept as a fallback for towns that + * never had an integration wired up. NOT preferred over the integration + * because the stored value is typically stale (1h TTL, never updated + * by anything in the request path). + * + * Historically this preferred the stored token over the integration, + * which made every consumer (PR poller, /refresh-git-token, agent + * dispatch's `GIT_TOKEN`) hand out an expired token whenever the rig + * had been registered more than ~1 hour ago. See ce15a6fe7 for the fix. + * + * Returns a `GitHubTokenResolution`: `ok: true` with the token + source on + * success, or `ok: false` with the `tried` chain on failure. Used by + * `checkPRStatus`'s `no_token` failure path to surface a specific message. */ -export async function resolveGitHubToken(ctx: SCMContext): Promise { +export async function resolveGitHubToken(ctx: SCMContext): Promise { + const tried: string[] = []; const townConfig = await ctx.getTownConfig(); - let token = townConfig.git_auth?.github_token ?? townConfig.github_cli_pat; - if (!token) { - const integrationId = townConfig.git_auth?.platform_integration_id ?? ctx.platformIntegrationId; - if (integrationId && ctx.env.GIT_TOKEN_SERVICE) { - try { - token = await ctx.env.GIT_TOKEN_SERVICE.getToken(integrationId); - } catch (err) { - console.warn( - `${TOWN_LOG} resolveGitHubToken: platform integration token lookup failed for ${integrationId}`, - err - ); + + // 1. github_cli_pat — long-lived user PAT + if (townConfig.github_cli_pat) { + return { ok: true, token: townConfig.github_cli_pat, source: 'town.github_cli_pat' }; + } + tried.push('town.github_cli_pat'); + + // 2. Platform integration — fresh App installation token + const integrationId = townConfig.git_auth?.platform_integration_id ?? ctx.platformIntegrationId; + const sourceLabel = townConfig.git_auth?.platform_integration_id + ? 'town platform integration' + : 'rig platform integration'; + if (integrationId && ctx.env.GIT_TOKEN_SERVICE) { + tried.push(sourceLabel); + try { + const fresh = await ctx.env.GIT_TOKEN_SERVICE.getToken(integrationId); + if (typeof fresh === 'string' && fresh.length > 0) { + return { ok: true, token: fresh, source: sourceLabel }; } + console.warn( + `${TOWN_LOG} resolveGitHubToken: platform integration ${integrationId} returned empty token; falling back to stored github_token` + ); + } catch (err) { + console.warn( + `${TOWN_LOG} resolveGitHubToken: platform integration token lookup failed for ${integrationId}; falling back to stored github_token`, + err + ); } + } else if (!integrationId) { + tried.push('platform integration (none configured)'); + } else { + tried.push(`${sourceLabel} (GIT_TOKEN_SERVICE not bound)`); } - return token ?? null; + + // 3. Stored git_auth.github_token — last-resort fallback + if (townConfig.git_auth?.github_token) { + return { + ok: true, + token: townConfig.git_auth.github_token, + source: 'town.git_auth.github_token', + }; + } + tried.push('town.git_auth.github_token'); + + return { ok: false, tried }; +} + +export async function resolveGitHubTokenString(ctx: SCMContext): Promise { + const r = await resolveGitHubToken(ctx); + return r.ok ? r.token : null; } export type PRStatusResult = { @@ -50,32 +110,58 @@ export type PRStatusResult = { mergeable_state?: string; }; +export type PRStatusError = + | { kind: 'no_token'; provider: 'github' | 'gitlab'; resolutionChain: string[] } + | { kind: 'unrecognized_url'; url: string } + | { + kind: 'http_error'; + provider: 'github' | 'gitlab'; + status: number; + statusText: string; + transient: boolean; + } + | { + kind: 'invalid_response'; + provider: 'github' | 'gitlab'; + reason: 'json_parse' | 'schema_mismatch'; + sampleKeys?: string[]; + } + | { kind: 'host_mismatch'; provider: 'gitlab'; expected: string; got: string }; + +export type PRStatusOutcome = + | { ok: true; result: PRStatusResult } + | { ok: false; error: PRStatusError }; + +function isTransientHttpStatus(status: number): boolean { + return status >= 500 || status === 429; +} + /** * Check the status of a PR/MR via its URL. - * Returns a PRStatusResult with status and optional mergeable_state (GitHub only), - * or null if the status cannot be determined. + * Returns a PRStatusOutcome discriminated union — { ok: true, result } on success, + * or { ok: false, error } with a structured PRStatusError describing why. */ -export async function checkPRStatus( - ctx: SCMContext, - prUrl: string -): Promise { +export async function checkPRStatus(ctx: SCMContext, prUrl: string): Promise { const townConfig = await ctx.getTownConfig(); // GitHub PR URL format: https://github.com/{owner}/{repo}/pull/{number} const ghMatch = prUrl.match(/^https:\/\/github\.com\/([^/]+)\/([^/]+)\/pull\/(\d+)/); if (ghMatch) { const [, owner, repo, numberStr] = ghMatch; - const token = await resolveGitHubToken(ctx); - if (!token) { + const resolution = await resolveGitHubToken(ctx); + if (!resolution.ok) { console.warn(`${TOWN_LOG} checkPRStatus: no GitHub token available, cannot poll ${prUrl}`); - return null; + return { + ok: false, + error: { kind: 'no_token', provider: 'github', resolutionChain: resolution.tried }, + }; } const response = await fetch( `https://api.github.com/repos/${owner}/${repo}/pulls/${numberStr}`, { headers: { - Authorization: `token ${token}`, + Authorization: `token ${resolution.token}`, Accept: 'application/vnd.github.v3+json', 'User-Agent': 'Gastown-Refinery/1.0', }, @@ -85,17 +171,41 @@ export async function checkPRStatus( console.warn( `${TOWN_LOG} checkPRStatus: GitHub API returned ${response.status} for ${prUrl}` ); - return null; + return { + ok: false, + error: { + kind: 'http_error', + provider: 'github', + status: response.status, + statusText: response.statusText, + transient: isTransientHttpStatus(response.status), + }, + }; } const json = await response.json().catch(() => null); - if (!json) return null; + if (!json) { + return { + ok: false, + error: { kind: 'invalid_response', provider: 'github', reason: 'json_parse' }, + }; + } const data = GitHubPRStatusSchema.safeParse(json); - if (!data.success) return null; + if (!data.success) { + return { + ok: false, + error: { + kind: 'invalid_response', + provider: 'github', + reason: 'schema_mismatch', + sampleKeys: Object.keys(json).slice(0, 8), + }, + }; + } - if (data.data.merged) return { status: 'merged' }; - if (data.data.state === 'closed') return { status: 'closed' }; - return { status: 'open', mergeable_state: data.data.mergeable_state }; + if (data.data.merged) return { ok: true, result: { status: 'merged' } }; + if (data.data.state === 'closed') return { ok: true, result: { status: 'closed' } }; + return { ok: true, result: { status: 'open', mergeable_state: data.data.mergeable_state } }; } // GitLab MR URL format: https://{host}/{path}/-/merge_requests/{iid} @@ -105,7 +215,14 @@ export async function checkPRStatus( const token = townConfig.git_auth?.gitlab_token; if (!token) { console.warn(`${TOWN_LOG} checkPRStatus: no gitlab_token configured, cannot poll ${prUrl}`); - return null; + return { + ok: false, + error: { + kind: 'no_token', + provider: 'gitlab', + resolutionChain: ['town.git_auth.gitlab_token'], + }, + }; } // Validate the host against known GitLab hosts to prevent SSRF/token leak. @@ -117,7 +234,15 @@ export async function checkPRStatus( console.warn( `${TOWN_LOG} checkPRStatus: refusing to send gitlab_token to unknown host: ${prHost}` ); - return null; + return { + ok: false, + error: { + kind: 'host_mismatch', + provider: 'gitlab', + expected: configuredHost ?? 'gitlab.com', + got: prHost, + }, + }; } const encodedPath = encodeURIComponent(projectPath); @@ -131,21 +256,45 @@ export async function checkPRStatus( console.warn( `${TOWN_LOG} checkPRStatus: GitLab API returned ${response.status} for ${prUrl}` ); - return null; + return { + ok: false, + error: { + kind: 'http_error', + provider: 'gitlab', + status: response.status, + statusText: response.statusText, + transient: isTransientHttpStatus(response.status), + }, + }; } const glJson = await response.json().catch(() => null); - if (!glJson) return null; + if (!glJson) { + return { + ok: false, + error: { kind: 'invalid_response', provider: 'gitlab', reason: 'json_parse' }, + }; + } const data = GitLabMRStatusSchema.safeParse(glJson); - if (!data.success) return null; + if (!data.success) { + return { + ok: false, + error: { + kind: 'invalid_response', + provider: 'gitlab', + reason: 'schema_mismatch', + sampleKeys: Object.keys(glJson).slice(0, 8), + }, + }; + } - if (data.data.state === 'merged') return { status: 'merged' }; - if (data.data.state === 'closed') return { status: 'closed' }; - return { status: 'open' }; + if (data.data.state === 'merged') return { ok: true, result: { status: 'merged' } }; + if (data.data.state === 'closed') return { ok: true, result: { status: 'closed' } }; + return { ok: true, result: { status: 'open' } }; } console.warn(`${TOWN_LOG} checkPRStatus: unrecognized PR URL format: ${prUrl}`); - return null; + return { ok: false, error: { kind: 'unrecognized_url', url: prUrl } }; } /** @@ -261,11 +410,11 @@ export async function checkPRFeedback( } const [, owner, repo, numberStr] = ghMatch; - const token = await resolveGitHubToken(ctx); - if (!token) return null; + const resolution = await resolveGitHubToken(ctx); + if (!resolution.ok) return null; const headers = { - Authorization: `token ${token}`, + Authorization: `token ${resolution.token}`, Accept: 'application/vnd.github.v3+json', 'User-Agent': 'Gastown-Refinery/1.0', }; @@ -481,15 +630,15 @@ export async function mergePR(ctx: SCMContext, prUrl: string): Promise } const [, owner, repo, numberStr] = ghMatch; - const token = await resolveGitHubToken(ctx); - if (!token) { + const resolution = await resolveGitHubToken(ctx); + if (!resolution.ok) { console.warn(`${TOWN_LOG} mergePR: no GitHub token available`); return false; } const mergeUrl = `https://api.github.com/repos/${owner}/${repo}/pulls/${numberStr}/merge`; const mergeHeaders = { - Authorization: `token ${token}`, + Authorization: `token ${resolution.token}`, Accept: 'application/vnd.github.v3+json', 'Content-Type': 'application/json', 'User-Agent': 'Gastown-Refinery/1.0', @@ -538,8 +687,8 @@ export async function createConvoyBranch( featureBranch: string; } ): Promise { - const token = await resolveGitHubToken(ctx); - if (!token) { + const resolution = await resolveGitHubToken(ctx); + if (!resolution.ok) { console.warn( `${TOWN_LOG} createConvoyBranch: no GitHub token available — skipping branch creation for ${opts.featureBranch}` ); @@ -548,15 +697,13 @@ export async function createConvoyBranch( const coords = parseGitUrl(opts.gitUrl); if (!coords || coords.platform !== 'github') { - // Non-GitHub repos or unparseable URLs: skip silently (GitLab uses a - // different flow; nothing breaks if the branch doesn't pre-exist there). return; } const { owner, repo } = coords; const apiBase = `https://api.github.com/repos/${owner}/${repo}`; const headers = { - Authorization: `token ${token}`, + Authorization: `token ${resolution.token}`, Accept: 'application/vnd.github.v3+json', 'User-Agent': 'Gastown/1.0', 'Content-Type': 'application/json', diff --git a/services/gastown/src/gastown.worker.ts b/services/gastown/src/gastown.worker.ts index 9048a95a14..8a117eb598 100644 --- a/services/gastown/src/gastown.worker.ts +++ b/services/gastown/src/gastown.worker.ts @@ -9,6 +9,7 @@ import { getTownDOStub } from './dos/Town.do'; import { TownConfigUpdateSchema } from './types'; import { resError } from './util/res.util'; import { writeEvent } from './util/analytics.util'; +import { logger } from './util/log.util'; import { authMiddleware, agentOnlyMiddleware, @@ -128,7 +129,6 @@ import { townAuthMiddleware } from './middleware/town-auth.middleware'; import { orgAuthMiddleware } from './middleware/org-auth.middleware'; import { adminAuditMiddleware } from './middleware/admin-audit.middleware'; import { timingMiddleware, instrumented } from './middleware/analytics.middleware'; -import { logger } from './util/log.util'; import { useWorkersLogger } from 'workers-tagged-logger'; import type { MiddlewareHandler } from 'hono'; import { handleGetTownConfig, handleUpdateTownConfig } from './handlers/town-config.handler'; @@ -173,33 +173,65 @@ app.use('*', timingMiddleware); // Cast needed: workers-tagged-logger@1.0.0 was built against an older Hono. app.use('*', useWorkersLogger('gastown-worker') as unknown as MiddlewareHandler); -// ── Request logging ───────────────────────────────────────────────────── -// Extract IDs from the URL path directly — c.req.param() only works -// after Hono has matched a route, which hasn't happened yet in a -// wildcard middleware. -// Matches /orgs/:orgId, /towns/:townId, /rigs/:rigId, /agents/:agentId -// in any combination that appears in our route patterns. -const RE_ORG = /\/orgs\/(?[^/]+)/; -const RE_TOWN = /\/towns\/(?[^/]+)/; -const RE_RIG = /\/rigs\/(?[^/]+)/; -const RE_AGENT = /\/agents\/(?[^/]+)/; - -app.use('*', async (c, next) => { - const method = c.req.method; - const path = c.req.path; - // Tag with route params immediately so all downstream logs (auth, - // handlers, DO calls) inherit them. Auth-derived tags (userId, orgId) - // are set by kiloAuthMiddleware and orgAuthMiddleware when they run. - logger.setTags({ - orgId: RE_ORG.exec(path)?.groups?.orgId, - townId: RE_TOWN.exec(path)?.groups?.townId, - rigId: RE_RIG.exec(path)?.groups?.rigId, - agentId: RE_AGENT.exec(path)?.groups?.agentId, - }); - logger.info(`--> ${method} ${path}`); +// ── Per-route logger tagging ──────────────────────────────────────── +// Use Hono path matching (not regex) so tags are sourced from +// c.req.param() once the route is matched. Each handler runs only +// when its prefix matches; if a request hits /api/towns/:townId/rigs/:rigId, +// both town and rig handlers run in order. +app.use('/api/orgs/:orgId/*', async (c, next) => { + const orgId = c.req.param('orgId'); + if (orgId) logger.setTags({ orgId }); + await next(); +}); +app.use('/api/towns/:townId/*', async (c, next) => { + const townId = c.req.param('townId'); + if (townId) logger.setTags({ townId }); + await next(); +}); +app.use('/api/mayor/:townId/*', async (c, next) => { + const townId = c.req.param('townId'); + if (townId) logger.setTags({ townId }); + await next(); +}); +app.use('/api/orgs/:orgId/towns/:townId/*', async (c, next) => { + const townId = c.req.param('townId'); + if (townId) logger.setTags({ townId }); + await next(); +}); +app.use('/api/users/:userId/towns/:townId/*', async (c, next) => { + const townId = c.req.param('townId'); + if (townId) logger.setTags({ townId }); + await next(); +}); +app.use('/api/users/:userId/rigs/:rigId/*', async (c, next) => { + const rigId = c.req.param('rigId'); + if (rigId) logger.setTags({ rigId }); + await next(); +}); +app.use('/api/towns/:townId/rigs/:rigId/*', async (c, next) => { + const rigId = c.req.param('rigId'); + if (rigId) logger.setTags({ rigId }); + await next(); +}); +app.use('/api/orgs/:orgId/rigs/:rigId/*', async (c, next) => { + const rigId = c.req.param('rigId'); + if (rigId) logger.setTags({ rigId }); + await next(); +}); +app.use('/api/mayor/:townId/tools/rigs/:rigId/*', async (c, next) => { + const rigId = c.req.param('rigId'); + if (rigId) logger.setTags({ rigId }); + await next(); +}); +app.use('/api/towns/:townId/rigs/:rigId/agents/:agentId/*', async (c, next) => { + const agentId = c.req.param('agentId'); + if (agentId) logger.setTags({ agentId }); + await next(); +}); +app.use('/api/mayor/:townId/tools/rigs/:rigId/agents/:agentId/*', async (c, next) => { + const agentId = c.req.param('agentId'); + if (agentId) logger.setTags({ agentId }); await next(); - const elapsed = Math.round(performance.now() - (c.get('requestStartTime') ?? 0)); - logger.info(`<-- ${method} ${path} ${c.res.status}`, { durationMs: elapsed }); }); // ── CORS ──────────────────────────────────────────────────────────────── @@ -339,6 +371,49 @@ app.patch('/debug/towns/:townId/config', async c => { return c.json(result); }); +app.get('/debug/towns/:townId/rigs', async c => { + if (c.env.ENVIRONMENT !== 'development') return c.json({ error: 'dev only' }, 403); + const townId = c.req.param('townId'); + const town = getTownDOStub(c.env, townId); + // eslint-disable-next-line @typescript-eslint/await-thenable -- DO RPC returns promise at runtime + const rigs = await town.listRigs(); + return c.json({ rigs }); +}); + +app.post('/debug/towns/:townId/sling-convoy', async c => { + if (c.env.ENVIRONMENT !== 'development') return c.json({ error: 'dev only' }, 403); + const townId = c.req.param('townId'); + const body: { + rigId: string; + convoyTitle: string; + tasks: Array<{ title: string; body?: string; depends_on?: number[] }>; + merge_mode?: 'review-then-land' | 'review-and-merge'; + staged?: boolean; + } = await c.req.json(); + if (!body.rigId || !body.convoyTitle || !Array.isArray(body.tasks)) { + return c.json({ error: 'Missing required fields: rigId, convoyTitle, tasks' }, 400); + } + const town = getTownDOStub(c.env, townId); + // eslint-disable-next-line @typescript-eslint/await-thenable -- DO RPC returns promise at runtime + const result = await town.slingConvoy({ + rigId: body.rigId, + convoyTitle: body.convoyTitle, + tasks: body.tasks, + merge_mode: body.merge_mode, + staged: body.staged, + }); + return c.json(result); +}); + +app.get('/debug/towns/:townId/convoys', async c => { + if (c.env.ENVIRONMENT !== 'development') return c.json({ error: 'dev only' }, 403); + const townId = c.req.param('townId'); + const town = getTownDOStub(c.env, townId); + // eslint-disable-next-line @typescript-eslint/await-thenable -- DO RPC returns promise at runtime + const convoys = await town.listConvoys(); + return c.json({ convoys }); +}); + // ── Town ID + Auth ────────────────────────────────────────────────────── // All rig routes live under /api/towns/:townId/rigs/:rigId so the townId // is always available from the URL path. @@ -672,8 +747,20 @@ app.use('/api/towns/:townId/mayor-id', async (c: Context, ne app.get('/api/towns/:townId/mayor-id', async c => { const townId = c.req.param('townId'); const town = getTownDOStub(c.env, townId); - const agentId = await town.getMayorAgentId(); - return c.json({ success: true, agentId }); + // Response contract (consumed by fetchMayorPrewarmContext in the + // container's process-manager.ts): + // - When the town has a mayor AND a kilocode token, return the full + // prewarm context so KILO_CONFIG_CONTENT matches what /agents/start + // will send (no eviction churn in ensureSDKServer). + // - When the mayor agent exists but no kilocode token is available, + // return { agentId } only — the container will skip prewarm. + // - When there is no mayor at all, return { agentId: null } — the + // container treats missing/null agentId as "no mayor, skip prewarm". + const ctx = await town.getMayorPrewarmContext(); + if (!ctx) { + return c.json({ success: true, agentId: null }); + } + return c.json({ success: true, ...ctx }); }); // ── Container Events ───────────────────────────────────────────────────── diff --git a/services/gastown/src/handlers/refresh-git-token.handler.ts b/services/gastown/src/handlers/refresh-git-token.handler.ts index 69afa61f28..c11d200016 100644 --- a/services/gastown/src/handlers/refresh-git-token.handler.ts +++ b/services/gastown/src/handlers/refresh-git-token.handler.ts @@ -52,7 +52,7 @@ export async function handleRefreshGitToken( platformIntegrationId: rigConfig?.platformIntegrationId, }); - if (!freshToken) { + if (!freshToken.ok) { console.warn( `[refresh-git-token] no token available for town=${params.townId} rig=${params.rigId}` ); @@ -62,5 +62,5 @@ export async function handleRefreshGitToken( console.log( `[refresh-git-token] refreshed git token for town=${params.townId} rig=${params.rigId}` ); - return c.json(resSuccess({ token: freshToken }), 200); + return c.json(resSuccess({ token: freshToken.token }), 200); } diff --git a/services/gastown/test/integration/pr-poll-errors.test.ts b/services/gastown/test/integration/pr-poll-errors.test.ts new file mode 100644 index 0000000000..bbc7dfb291 --- /dev/null +++ b/services/gastown/test/integration/pr-poll-errors.test.ts @@ -0,0 +1,101 @@ +/** + * Integration tests for PR poll error discrimination (#3149). + * + * Tests verify that the poll_pr action handler correctly handles the + * discriminated PRStatusOutcome: immediate-fail for no_token / non-transient + * HTTP errors, 3-strike threshold for invalid_response / unrecognized_url / + * host_mismatch, and 10-strike threshold for transient HTTP errors (5xx, 429). + * + * The no-token test runs fully end-to-end through the DO alarm. HTTP error + * scenarios are covered by unit tests (test/unit/pr-poll-*.test.ts) since + * mocking fetch is not practical in the Cloudflare Workers test runtime. + */ + +import { env, runDurableObjectAlarm } from 'cloudflare:test'; +import { describe, it, expect, beforeEach } from 'vitest'; + +function getTownStub(name = 'test-town') { + const id = env.TOWN.idFromName(name); + return env.TOWN.get(id); +} + +describe('PR poll error discrimination (#3149)', () => { + let town: ReturnType; + let townName: string; + + beforeEach(async () => { + townName = `pr-poll-${crypto.randomUUID()}`; + town = getTownStub(townName); + await town.setTownId(townName); + }); + + async function setupMrBeadWithPrUrl(prUrl: string) { + await town.addRig({ + rigId: 'rig-1', + name: 'main-rig', + gitUrl: 'https://github.com/test/repo.git', + defaultBranch: 'main', + }); + + const result = await town.slingConvoy({ + rigId: 'rig-1', + convoyTitle: 'PR Poll Test', + tasks: [{ title: 'Task 1' }], + }); + + await runDurableObjectAlarm(town); + + const beadId = result.beads[0].bead.bead_id; + const bead = await town.getBeadAsync(beadId); + const agentId = bead!.assignee_agent_bead_id!; + expect(agentId).toBeTruthy(); + + await town.agentDone(agentId, { + branch: 'gt/polecat/test-branch', + summary: 'Completed task', + }); + + await runDurableObjectAlarm(town); + + const allMrs = await town.listBeads({ type: 'merge_request' }); + const mrBead = allMrs.find(b => b.metadata?.source_bead_id === beadId); + expect(mrBead).toBeTruthy(); + + // Set the PR URL and put the MR bead back to in_progress so the + // reconciler will schedule a poll_pr action on the next alarm tick. + await town.updateBead( + mrBead!.bead_id, + { + metadata: { + ...(mrBead!.metadata ?? {}), + pr_url: prUrl, + }, + }, + 'system' + ); + await town.updateBeadStatus(mrBead!.bead_id, 'in_progress', 'system'); + + return { beadId, mrBeadId: mrBead!.bead_id, agentId, convoyId: result.convoy.id }; + } + + describe('no_token: town with no GitHub token', () => { + it('should fail the MR bead immediately (1 strike) with failureKind "no_token"', async () => { + // Town is set up with no git_auth, so resolveGitHubToken will return + // { ok: false, tried: [...] } and checkPRStatus will return a no_token error. + const { mrBeadId } = await setupMrBeadWithPrUrl('https://github.com/test/repo/pull/1'); + + // Run alarm — the reconciler should generate a poll_pr action, which + // calls checkPRStatus and gets a no_token error. Since no_token is an + // immediate-fail error, the bead should fail on the first poll. + await runDurableObjectAlarm(town); + + const mrBead = await town.getBeadAsync(mrBeadId); + expect(mrBead?.status).toBe('failed'); + expect(mrBead?.metadata?.failureReason).toBe('pr_poll_failed'); + expect(mrBead?.metadata?.failureKind).toBe('no_token'); + expect(mrBead?.metadata?.failureMessage).toContain('No GitHub token resolved'); + expect(mrBead?.metadata?.failureMessage).toContain('town.git_auth.github_token'); + expect(mrBead?.metadata?.failureMessage).toContain('town.github_cli_pat'); + }); + }); +}); diff --git a/services/gastown/test/unit/pr-poll-errors.test.ts b/services/gastown/test/unit/pr-poll-errors.test.ts new file mode 100644 index 0000000000..5bd471634f --- /dev/null +++ b/services/gastown/test/unit/pr-poll-errors.test.ts @@ -0,0 +1,318 @@ +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { + checkPRStatus, + resolveGitHubToken, + type SCMContext, + type PRStatusOutcome, +} from '../../src/dos/town/town-scm'; +import type { TownConfig } from '../../src/types'; + +function mockSCMContext(overrides: Partial = {}): SCMContext { + return { + env: {} as SCMContext['env'], + townId: 'test-town', + getTownConfig: async () => ({}) as TownConfig, + ...overrides, + }; +} + +describe('resolveGitHubToken', () => { + it('returns ok:true with source when github_token is configured', async () => { + const ctx = mockSCMContext({ + getTownConfig: async () => ({ git_auth: { github_token: 'ghp_abc123' } }) as TownConfig, + }); + const result = await resolveGitHubToken(ctx); + expect(result.ok).toBe(true); + if (result.ok) { + expect(result.token).toBe('ghp_abc123'); + expect(result.source).toBe('town.git_auth.github_token'); + } + }); + + it('prefers github_cli_pat over stored github_token', async () => { + const ctx = mockSCMContext({ + getTownConfig: async () => + ({ git_auth: { github_token: 'ghp_stored' }, github_cli_pat: 'ghp_pat123' }) as TownConfig, + }); + const result = await resolveGitHubToken(ctx); + expect(result.ok).toBe(true); + if (result.ok) { + expect(result.token).toBe('ghp_pat123'); + expect(result.source).toBe('town.github_cli_pat'); + } + }); + + it('returns ok:false with resolution chain when no token is configured', async () => { + const ctx = mockSCMContext({ + getTownConfig: async () => ({}) as TownConfig, + }); + const result = await resolveGitHubToken(ctx); + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.tried).toContain('town.git_auth.github_token'); + expect(result.tried).toContain('town.github_cli_pat'); + expect(result.tried).toContain('platform integration (none configured)'); + } + }); + + it('includes platform integration source label in resolution chain', async () => { + const ctx = mockSCMContext({ + platformIntegrationId: 'integration-123', + env: { GIT_TOKEN_SERVICE: { getToken: async () => null } } as unknown as SCMContext['env'], + getTownConfig: async () => + ({ git_auth: { platform_integration_id: 'integration-123' } }) as TownConfig, + }); + const result = await resolveGitHubToken(ctx); + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.tried).toContain('town platform integration'); + } + }); + + it('uses rig platform integration label when town has no platform_integration_id', async () => { + const ctx = mockSCMContext({ + platformIntegrationId: 'rig-integration-456', + env: { GIT_TOKEN_SERVICE: { getToken: async () => null } } as unknown as SCMContext['env'], + getTownConfig: async () => ({}) as TownConfig, + }); + const result = await resolveGitHubToken(ctx); + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.tried).toContain('rig platform integration'); + } + }); + + it('includes GIT_TOKEN_SERVICE not bound annotation when integrationId set but service missing', async () => { + const ctx = mockSCMContext({ + platformIntegrationId: 'integration-789', + env: {} as SCMContext['env'], + getTownConfig: async () => + ({ git_auth: { platform_integration_id: 'integration-789' } }) as TownConfig, + }); + const result = await resolveGitHubToken(ctx); + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.tried).toContain('town platform integration (GIT_TOKEN_SERVICE not bound)'); + } + }); +}); + +describe('checkPRStatus', () => { + beforeEach(() => { + vi.restoreAllMocks(); + }); + + it('returns no_token error when no GitHub token is available', async () => { + const ctx = mockSCMContext({ + getTownConfig: async () => ({}) as TownConfig, + }); + const outcome = await checkPRStatus(ctx, 'https://github.com/owner/repo/pull/1'); + expect(outcome.ok).toBe(false); + if (!outcome.ok) { + expect(outcome.error.kind).toBe('no_token'); + expect(outcome.error.provider).toBe('github'); + if (outcome.error.kind === 'no_token') { + expect(outcome.error.resolutionChain).toContain('town.git_auth.github_token'); + expect(outcome.error.resolutionChain).toContain('town.github_cli_pat'); + } + } + }); + + it('returns http_error with transient:true for 5xx status', async () => { + vi.spyOn(globalThis, 'fetch').mockResolvedValue( + new Response('Server Error', { status: 503, statusText: 'Service Unavailable' }) + ); + const ctx = mockSCMContext({ + getTownConfig: async () => ({ git_auth: { github_token: 'ghp_test' } }) as TownConfig, + }); + const outcome = await checkPRStatus(ctx, 'https://github.com/owner/repo/pull/1'); + expect(outcome.ok).toBe(false); + if (!outcome.ok && outcome.error.kind === 'http_error') { + expect(outcome.error.status).toBe(503); + expect(outcome.error.transient).toBe(true); + expect(outcome.error.provider).toBe('github'); + } + }); + + it('returns http_error with transient:true for 429 status', async () => { + vi.spyOn(globalThis, 'fetch').mockResolvedValue( + new Response('Rate Limited', { status: 429, statusText: 'Too Many Requests' }) + ); + const ctx = mockSCMContext({ + getTownConfig: async () => ({ git_auth: { github_token: 'ghp_test' } }) as TownConfig, + }); + const outcome = await checkPRStatus(ctx, 'https://github.com/owner/repo/pull/1'); + expect(outcome.ok).toBe(false); + if (!outcome.ok && outcome.error.kind === 'http_error') { + expect(outcome.error.status).toBe(429); + expect(outcome.error.transient).toBe(true); + } + }); + + it('returns http_error with transient:false for 401 status', async () => { + vi.spyOn(globalThis, 'fetch').mockResolvedValue( + new Response('Unauthorized', { status: 401, statusText: 'Unauthorized' }) + ); + const ctx = mockSCMContext({ + getTownConfig: async () => ({ git_auth: { github_token: 'ghp_bad' } }) as TownConfig, + }); + const outcome = await checkPRStatus(ctx, 'https://github.com/owner/repo/pull/1'); + expect(outcome.ok).toBe(false); + if (!outcome.ok && outcome.error.kind === 'http_error') { + expect(outcome.error.status).toBe(401); + expect(outcome.error.transient).toBe(false); + } + }); + + it('returns http_error with transient:false for 403 status', async () => { + vi.spyOn(globalThis, 'fetch').mockResolvedValue( + new Response('Forbidden', { status: 403, statusText: 'Forbidden' }) + ); + const ctx = mockSCMContext({ + getTownConfig: async () => ({ git_auth: { github_token: 'ghp_limited' } }) as TownConfig, + }); + const outcome = await checkPRStatus(ctx, 'https://github.com/owner/repo/pull/1'); + expect(outcome.ok).toBe(false); + if (!outcome.ok && outcome.error.kind === 'http_error') { + expect(outcome.error.status).toBe(403); + expect(outcome.error.transient).toBe(false); + } + }); + + it('returns http_error with transient:false for 404 status', async () => { + vi.spyOn(globalThis, 'fetch').mockResolvedValue( + new Response('Not Found', { status: 404, statusText: 'Not Found' }) + ); + const ctx = mockSCMContext({ + getTownConfig: async () => ({ git_auth: { github_token: 'ghp_test' } }) as TownConfig, + }); + const outcome = await checkPRStatus(ctx, 'https://github.com/owner/repo/pull/1'); + expect(outcome.ok).toBe(false); + if (!outcome.ok && outcome.error.kind === 'http_error') { + expect(outcome.error.status).toBe(404); + expect(outcome.error.transient).toBe(false); + } + }); + + it('returns invalid_response with json_parse when response body is not JSON', async () => { + vi.spyOn(globalThis, 'fetch').mockResolvedValue( + new Response('not json', { + status: 200, + headers: { 'Content-Type': 'text/plain' }, + }) + ); + const ctx = mockSCMContext({ + getTownConfig: async () => ({ git_auth: { github_token: 'ghp_test' } }) as TownConfig, + }); + const outcome = await checkPRStatus(ctx, 'https://github.com/owner/repo/pull/1'); + expect(outcome.ok).toBe(false); + if (!outcome.ok && outcome.error.kind === 'invalid_response') { + expect(outcome.error.reason).toBe('json_parse'); + expect(outcome.error.provider).toBe('github'); + } + }); + + it('returns invalid_response with schema_mismatch and sampleKeys when response shape is wrong', async () => { + vi.spyOn(globalThis, 'fetch').mockResolvedValue( + new Response(JSON.stringify({ id: 42, title: 'not a PR', random_field: true }), { + status: 200, + headers: { 'Content-Type': 'application/json' }, + }) + ); + const ctx = mockSCMContext({ + getTownConfig: async () => ({ git_auth: { github_token: 'ghp_test' } }) as TownConfig, + }); + const outcome = await checkPRStatus(ctx, 'https://github.com/owner/repo/pull/1'); + expect(outcome.ok).toBe(false); + if (!outcome.ok && outcome.error.kind === 'invalid_response') { + expect(outcome.error.reason).toBe('schema_mismatch'); + expect(outcome.error.sampleKeys).toBeDefined(); + expect(outcome.error.sampleKeys!.length).toBeGreaterThan(0); + expect(outcome.error.provider).toBe('github'); + } + }); + + it('returns unrecognized_url for non-GitHub/GitLab URLs', async () => { + const ctx = mockSCMContext({ + getTownConfig: async () => ({}) as TownConfig, + }); + const outcome = await checkPRStatus(ctx, 'https://example.com/something'); + expect(outcome.ok).toBe(false); + if (!outcome.ok && outcome.error.kind === 'unrecognized_url') { + expect(outcome.error.url).toBe('https://example.com/something'); + } + }); + + it('returns ok:true with merged status for merged PR', async () => { + vi.spyOn(globalThis, 'fetch').mockResolvedValue( + new Response(JSON.stringify({ state: 'closed', merged: true, mergeable_state: 'clean' }), { + status: 200, + headers: { 'Content-Type': 'application/json' }, + }) + ); + const ctx = mockSCMContext({ + getTownConfig: async () => ({ git_auth: { github_token: 'ghp_test' } }) as TownConfig, + }); + const outcome = await checkPRStatus(ctx, 'https://github.com/owner/repo/pull/1'); + expect(outcome.ok).toBe(true); + if (outcome.ok) { + expect(outcome.result.status).toBe('merged'); + } + }); + + it('returns ok:true with open status for open PR', async () => { + vi.spyOn(globalThis, 'fetch').mockResolvedValue( + new Response(JSON.stringify({ state: 'open', merged: false, mergeable_state: 'clean' }), { + status: 200, + headers: { 'Content-Type': 'application/json' }, + }) + ); + const ctx = mockSCMContext({ + getTownConfig: async () => ({ git_auth: { github_token: 'ghp_test' } }) as TownConfig, + }); + const outcome = await checkPRStatus(ctx, 'https://github.com/owner/repo/pull/1'); + expect(outcome.ok).toBe(true); + if (outcome.ok) { + expect(outcome.result.status).toBe('open'); + expect(outcome.result.mergeable_state).toBe('clean'); + } + }); + + it('returns host_mismatch for GitLab URL with unknown host', async () => { + const ctx = mockSCMContext({ + getTownConfig: async () => + ({ + git_auth: { + gitlab_token: 'glpat_test', + gitlab_instance_url: 'https://gitlab.mycompany.com', + }, + }) as TownConfig, + }); + const outcome = await checkPRStatus( + ctx, + 'https://gitlab.evil.com/group/project/-/merge_requests/5' + ); + expect(outcome.ok).toBe(false); + if (!outcome.ok && outcome.error.kind === 'host_mismatch') { + expect(outcome.error.got).toBe('gitlab.evil.com'); + expect(outcome.error.expected).toBe('gitlab.mycompany.com'); + } + }); + + it('allows gitlab.com without host validation', async () => { + vi.spyOn(globalThis, 'fetch').mockResolvedValue( + new Response(JSON.stringify({ state: 'merged' }), { + status: 200, + headers: { 'Content-Type': 'application/json' }, + }) + ); + const ctx = mockSCMContext({ + getTownConfig: async () => ({ git_auth: { gitlab_token: 'glpat_test' } }) as TownConfig, + }); + const outcome = await checkPRStatus(ctx, 'https://gitlab.com/group/project/-/merge_requests/5'); + expect(outcome.ok).toBe(true); + if (outcome.ok) { + expect(outcome.result.status).toBe('merged'); + } + }); +}); diff --git a/services/gastown/test/unit/pr-poll-thresholds.test.ts b/services/gastown/test/unit/pr-poll-thresholds.test.ts new file mode 100644 index 0000000000..d522fc0b6c --- /dev/null +++ b/services/gastown/test/unit/pr-poll-thresholds.test.ts @@ -0,0 +1,377 @@ +import { describe, it, expect } from 'vitest'; +import { + _failureMessageFor, + _nextPollCounterState, + _shouldFailImmediately, + _shouldCountAsTransient, +} from '../../src/dos/town/actions'; +import type { PRStatusError } from '../../src/dos/town/town-scm'; + +describe('failureMessageFor', () => { + it('produces actionable message for no_token with resolution chain', () => { + const error: PRStatusError = { + kind: 'no_token', + provider: 'github', + resolutionChain: [ + 'town.git_auth.github_token', + 'town.github_cli_pat', + 'town platform integration', + 'rig platform integration', + ], + }; + const msg = _failureMessageFor(error); + expect(msg).toContain('No GitHub token resolved'); + expect(msg).toContain('`town.git_auth.github_token`'); + expect(msg).toContain('`town.github_cli_pat`'); + expect(msg).toContain('`town platform integration`'); + expect(msg).toContain('`rig platform integration`'); + expect(msg).toContain('polecat agents use their own container credentials'); + }); + + it('produces specific message for HTTP 401', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 401, + statusText: 'Unauthorized', + transient: false, + }; + const msg = _failureMessageFor(error); + expect(msg).toContain('invalid or expired'); + expect(msg).toContain('HTTP 401'); + }); + + it('produces specific message for HTTP 403', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 403, + statusText: 'Forbidden', + transient: false, + }; + const msg = _failureMessageFor(error); + expect(msg).toContain('lacks permission'); + expect(msg).toContain('HTTP 403'); + }); + + it('produces specific message for HTTP 404', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 404, + statusText: 'Not Found', + transient: false, + }; + const msg = _failureMessageFor(error); + expect(msg).toContain('not found'); + expect(msg).toContain('HTTP 404'); + }); + + it('produces GitLab-specific messages', () => { + const noTokenMessage = _failureMessageFor({ + kind: 'no_token', + provider: 'gitlab', + resolutionChain: ['town.git_auth.gitlab_token'], + }); + expect(noTokenMessage).toContain('No GitLab token resolved'); + + const forbiddenMessage = _failureMessageFor({ + kind: 'http_error', + provider: 'gitlab', + status: 403, + statusText: 'Forbidden', + transient: false, + }); + expect(forbiddenMessage).toContain("Town's GitLab token lacks permission"); + expect(forbiddenMessage).toContain('merge requests'); + expect(forbiddenMessage).not.toContain('pull-requests: read'); + }); + + it('produces generic HTTP message for other status codes', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 422, + statusText: 'Unprocessable Entity', + transient: false, + }; + const msg = _failureMessageFor(error); + expect(msg).toContain('HTTP 422'); + expect(msg).toContain('Not retryable'); + }); + + it('indicates Retrying for transient HTTP errors', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 503, + statusText: 'Service Unavailable', + transient: true, + }; + const msg = _failureMessageFor(error); + expect(msg).toContain('Retrying'); + }); + + it('produces message with sampleKeys for schema_mismatch', () => { + const error: PRStatusError = { + kind: 'invalid_response', + provider: 'github', + reason: 'schema_mismatch', + sampleKeys: ['id', 'title', 'random_field'], + }; + const msg = _failureMessageFor(error); + expect(msg).toContain('unexpected response shape'); + expect(msg).toContain('schema_mismatch'); + expect(msg).toContain('id, title, random_field'); + expect(msg).toContain('file a bug'); + }); + + it('produces message without sampleKeys for json_parse', () => { + const error: PRStatusError = { + kind: 'invalid_response', + provider: 'github', + reason: 'json_parse', + }; + const msg = _failureMessageFor(error); + expect(msg).toContain('json_parse'); + expect(msg).not.toContain('top-level keys'); + }); + + it('produces message for unrecognized_url', () => { + const error: PRStatusError = { + kind: 'unrecognized_url', + url: 'https://bitbucket.org/repo/pull/1', + }; + const msg = _failureMessageFor(error); + expect(msg).toContain('not recognized'); + expect(msg).toContain('https://bitbucket.org/repo/pull/1'); + }); + + it('produces message for host_mismatch', () => { + const error: PRStatusError = { + kind: 'host_mismatch', + provider: 'gitlab', + expected: 'gitlab.mycompany.com', + got: 'gitlab.evil.com', + }; + const msg = _failureMessageFor(error); + expect(msg).toContain('Refusing to send GitLab token'); + expect(msg).toContain('gitlab.evil.com'); + expect(msg).toContain('gitlab.mycompany.com'); + }); +}); + +describe('shouldFailImmediately', () => { + it('returns true for no_token', () => { + const error: PRStatusError = { kind: 'no_token', provider: 'github', resolutionChain: [] }; + expect(_shouldFailImmediately(error)).toBe(true); + }); + + it('returns true for http_error with transient:false (401)', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 401, + statusText: 'Unauthorized', + transient: false, + }; + expect(_shouldFailImmediately(error)).toBe(true); + }); + + it('returns true for http_error with transient:false (403)', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 403, + statusText: 'Forbidden', + transient: false, + }; + expect(_shouldFailImmediately(error)).toBe(true); + }); + + it('returns true for http_error with transient:false (404)', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 404, + statusText: 'Not Found', + transient: false, + }; + expect(_shouldFailImmediately(error)).toBe(true); + }); + + it('returns false for http_error with transient:true (5xx)', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 503, + statusText: 'Service Unavailable', + transient: true, + }; + expect(_shouldFailImmediately(error)).toBe(false); + }); + + it('returns false for invalid_response', () => { + const error: PRStatusError = { + kind: 'invalid_response', + provider: 'github', + reason: 'schema_mismatch', + }; + expect(_shouldFailImmediately(error)).toBe(false); + }); + + it('returns true for unrecognized_url', () => { + const error: PRStatusError = { kind: 'unrecognized_url', url: 'https://example.com' }; + expect(_shouldFailImmediately(error)).toBe(true); + }); + + it('returns true for host_mismatch', () => { + const error: PRStatusError = { + kind: 'host_mismatch', + provider: 'gitlab', + expected: 'gitlab.com', + got: 'evil.com', + }; + expect(_shouldFailImmediately(error)).toBe(true); + }); +}); + +describe('shouldCountAsTransient', () => { + it('returns true for http_error with transient:true (5xx)', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 503, + statusText: 'Service Unavailable', + transient: true, + }; + expect(_shouldCountAsTransient(error)).toBe(true); + }); + + it('returns true for http_error with transient:true (429)', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 429, + statusText: 'Too Many Requests', + transient: true, + }; + expect(_shouldCountAsTransient(error)).toBe(true); + }); + + it('returns false for http_error with transient:false', () => { + const error: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 401, + statusText: 'Unauthorized', + transient: false, + }; + expect(_shouldCountAsTransient(error)).toBe(false); + }); + + it('returns false for non-http_error kinds', () => { + const error: PRStatusError = { kind: 'no_token', provider: 'github', resolutionChain: [] }; + expect(_shouldCountAsTransient(error)).toBe(false); + }); +}); + +describe('error categorization is mutually exclusive', () => { + const allKinds: PRStatusError[] = [ + { kind: 'no_token', provider: 'github', resolutionChain: [] }, + { + kind: 'http_error', + provider: 'github', + status: 401, + statusText: 'Unauthorized', + transient: false, + }, + { + kind: 'http_error', + provider: 'github', + status: 503, + statusText: 'Service Unavailable', + transient: true, + }, + { kind: 'invalid_response', provider: 'github', reason: 'schema_mismatch' }, + { kind: 'unrecognized_url', url: 'https://example.com' }, + { kind: 'host_mismatch', provider: 'gitlab', expected: 'gitlab.com', got: 'evil.com' }, + ]; + + it('each error kind falls into exactly one bucket', () => { + for (const error of allKinds) { + const buckets = [_shouldFailImmediately(error), _shouldCountAsTransient(error)].filter( + Boolean + ); + expect(buckets.length, `${error.kind} should match exactly one bucket`).toBeLessThanOrEqual( + 1 + ); + } + }); + + it('invalid_response is the only kind in the 3-strike (non-transient) bucket', () => { + for (const error of allKinds) { + const isNonTransient = !_shouldFailImmediately(error) && !_shouldCountAsTransient(error); + if (error.kind === 'invalid_response') { + expect(isNonTransient, `invalid_response should be in 3-strike bucket`).toBe(true); + } else { + expect(isNonTransient, `${error.kind} should NOT be in 3-strike bucket`).toBe(false); + } + } + }); +}); + +describe('counter cross-contamination', () => { + it('9 transient errors then 1 non-transient should not fail (separate counters)', () => { + const transientError: PRStatusError = { + kind: 'http_error', + provider: 'github', + status: 503, + statusText: 'Service Unavailable', + transient: true, + }; + const nonTransientError: PRStatusError = { + kind: 'invalid_response', + provider: 'github', + reason: 'schema_mismatch', + }; + + let state = { pollTransientCount: 0, pollNonTransientCount: 0, shouldFail: false }; + for (let i = 0; i < 9; i++) { + state = _nextPollCounterState(transientError, state); + } + + expect(state).toEqual({ + pollTransientCount: 9, + pollNonTransientCount: 0, + shouldFail: false, + }); + + state = _nextPollCounterState(nonTransientError, state); + + expect(state).toEqual({ + pollTransientCount: 0, + pollNonTransientCount: 1, + shouldFail: false, + }); + }); + + it('3 consecutive non-transient errors should fail (non-transient counter reaches threshold)', () => { + const nonTransientError: PRStatusError = { + kind: 'invalid_response', + provider: 'github', + reason: 'schema_mismatch', + }; + + let state = { pollTransientCount: 0, pollNonTransientCount: 0, shouldFail: false }; + for (let i = 0; i < 3; i++) { + state = _nextPollCounterState(nonTransientError, state); + } + + expect(state).toEqual({ + pollTransientCount: 0, + pollNonTransientCount: 3, + shouldFail: true, + }); + }); +}); diff --git a/services/gastown/test/unit/town-scm.test.ts b/services/gastown/test/unit/town-scm.test.ts new file mode 100644 index 0000000000..7f1d07cc18 --- /dev/null +++ b/services/gastown/test/unit/town-scm.test.ts @@ -0,0 +1,151 @@ +import { describe, it, expect } from 'vitest'; +import { resolveGitHubToken } from '../../src/dos/town/town-scm'; +import { TownConfigSchema, type TownConfig } from '../../src/types'; + +const STORED_GITHUB_TOKEN = 'ghs_stored_stale_token'; +const FRESH_INSTALLATION_TOKEN = 'ghs_fresh_from_integration'; +const USER_PAT = 'ghp_user_long_lived_pat'; +const INTEGRATION_ID = '119277743'; + +function buildConfig(overrides: { + github_token?: string; + github_cli_pat?: string; + platform_integration_id?: string; +}): TownConfig { + return TownConfigSchema.parse({ + git_auth: { + github_token: overrides.github_token, + platform_integration_id: overrides.platform_integration_id, + }, + github_cli_pat: overrides.github_cli_pat, + }); +} + +function fakeEnv(opts: { + tokenServiceResponse?: string | null; + tokenServiceShouldThrow?: boolean; +}): Env { + return { + GIT_TOKEN_SERVICE: { + getToken: async (_id: string) => { + if (opts.tokenServiceShouldThrow) { + throw new Error('integration lookup failed'); + } + return opts.tokenServiceResponse ?? FRESH_INSTALLATION_TOKEN; + }, + }, + } as unknown as Env; +} + +describe('resolveGitHubToken priority chain', () => { + it('prefers github_cli_pat over everything else', async () => { + const cfg = buildConfig({ + github_cli_pat: USER_PAT, + github_token: STORED_GITHUB_TOKEN, + platform_integration_id: INTEGRATION_ID, + }); + const result = await resolveGitHubToken({ + env: fakeEnv({}), + townId: 'town-1', + getTownConfig: () => Promise.resolve(cfg), + }); + expect(result).toEqual({ ok: true, token: USER_PAT, source: 'town.github_cli_pat' }); + }); + + it('returns a fresh integration token when an integration is configured, ignoring the stale stored token', async () => { + const cfg = buildConfig({ + github_token: STORED_GITHUB_TOKEN, + platform_integration_id: INTEGRATION_ID, + }); + const result = await resolveGitHubToken({ + env: fakeEnv({ tokenServiceResponse: FRESH_INSTALLATION_TOKEN }), + townId: 'town-1', + getTownConfig: () => Promise.resolve(cfg), + }); + expect(result).toEqual({ + ok: true, + token: FRESH_INSTALLATION_TOKEN, + source: 'town platform integration', + }); + }); + + it('falls back to stored github_token when no integration is configured', async () => { + const cfg = buildConfig({ github_token: STORED_GITHUB_TOKEN }); + const result = await resolveGitHubToken({ + env: fakeEnv({}), + townId: 'town-1', + getTownConfig: () => Promise.resolve(cfg), + }); + expect(result).toEqual({ + ok: true, + token: STORED_GITHUB_TOKEN, + source: 'town.git_auth.github_token', + }); + }); + + it('falls back to stored github_token when integration lookup throws', async () => { + const cfg = buildConfig({ + github_token: STORED_GITHUB_TOKEN, + platform_integration_id: INTEGRATION_ID, + }); + const result = await resolveGitHubToken({ + env: fakeEnv({ tokenServiceShouldThrow: true }), + townId: 'town-1', + getTownConfig: () => Promise.resolve(cfg), + }); + expect(result).toEqual({ + ok: true, + token: STORED_GITHUB_TOKEN, + source: 'town.git_auth.github_token', + }); + }); + + it('uses the rig-level platformIntegrationId when town config does not carry one', async () => { + const cfg = buildConfig({}); + const result = await resolveGitHubToken({ + env: fakeEnv({ tokenServiceResponse: FRESH_INSTALLATION_TOKEN }), + townId: 'town-1', + getTownConfig: () => Promise.resolve(cfg), + platformIntegrationId: INTEGRATION_ID, + }); + expect(result).toEqual({ + ok: true, + token: FRESH_INSTALLATION_TOKEN, + source: 'rig platform integration', + }); + }); + + it('returns ok:false with tried chain when nothing is configured', async () => { + const cfg = buildConfig({}); + const result = await resolveGitHubToken({ + env: fakeEnv({}), + townId: 'town-1', + getTownConfig: () => Promise.resolve(cfg), + }); + expect(result).toEqual({ + ok: false, + tried: [ + 'town.github_cli_pat', + 'platform integration (none configured)', + 'town.git_auth.github_token', + ], + }); + }); + + it('falls back to stored github_token when integration returns empty string', async () => { + const cfg = buildConfig({ + github_token: STORED_GITHUB_TOKEN, + platform_integration_id: INTEGRATION_ID, + }); + const result = await resolveGitHubToken({ + env: fakeEnv({ tokenServiceResponse: '' }), + townId: 'town-1', + getTownConfig: () => Promise.resolve(cfg), + }); + expect(result).toEqual({ + ok: true, + token: STORED_GITHUB_TOKEN, + source: 'town.git_auth.github_token', + }); + }); +}); diff --git a/services/gastown/wrangler.jsonc b/services/gastown/wrangler.jsonc index e613dc666a..8a92afc46b 100644 --- a/services/gastown/wrangler.jsonc +++ b/services/gastown/wrangler.jsonc @@ -19,7 +19,7 @@ "traces": { "enabled": true, "persist": true, - "head_sampling_rate": 1, + "head_sampling_rate": 0.1, }, }, "upload_source_maps": true, @@ -37,7 +37,7 @@ "class_name": "TownContainerDO", "image": "./container/Dockerfile", "instance_type": "standard-4", - "max_instances": 800, + "max_instances": 500, }, ],