From 88ebbce0027e5e9f4a2e6c797ecd4fa9cb9f67aa Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 26 May 2026 09:07:10 -0700 Subject: [PATCH] fix(hub): JWKS warm should not block port binding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Titanium JWKS warm at boot called process.exit(1) on any failure, which has been taking down every Coolify deploy of remo-code-hub since the upstream Keygen JWKS endpoint at `/v1/accounts/{ACCOUNT_ID}/.well-known/jwks.json` returns 404. Architecturally, a stalled Titanium JWKS endpoint should never block the hub from binding its port — the hub serves health checks, public webhooks (Coolify, Sentry intake), the web SPA, the scheduler, and the agent WebSocket, none of which require Titanium JWT verification. Only license-gated routes do, and those fail closed at request time via `verifyLicenseJwt` which already lazy-warms on first use through jose's `createRemoteJWKSet` resolver. Change: log the warm failure loudly and continue binding the port. Misconfiguration remains visible in deploy logs; production stays up. Unblocks PR #60 (started_at scheduler fix) from going live before the midnight cron fires. --- hub/src/index.ts | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/hub/src/index.ts b/hub/src/index.ts index b332e67..ffbb4cc 100644 --- a/hub/src/index.ts +++ b/hub/src/index.ts @@ -267,18 +267,25 @@ function decrementIp(ip: string) { else wsConnectionsPerIp.set(ip, count - 1) } -// Phase 07-A: warm Titanium JWKS BEFORE binding the port. The hub MUST NOT -// serve traffic without JWKS available once Titanium is configured. While -// `TITANIUM_KEYGEN_API_URL` is unset (Plan A pre-cutover state), this is a -// no-op so dev environments without Titanium continue to boot. +// Phase 07-A: warm Titanium JWKS at boot if configured. Previously this was a +// hard refuse-to-bind gate — that was the wrong call. JWKS warm failure must +// NOT block the hub from binding its port: the hub serves many surfaces that +// don't need Titanium JWT verification (health checks, public webhooks, the +// web SPA, the scheduler, agent WS), and a stalled/404 Titanium endpoint +// should never take production down. Auth-gated routes fail closed at request +// time via `verifyLicenseJwt`, which lazily warms on first use. We log +// loudly so misconfiguration is still obvious in deploy logs. if (config.titanium.keygenApiUrl) { try { const { warmJwksCache } = await import('./titanium-client') const keyCount = await warmJwksCache() console.log(`[titanium] JWKS warmed (${keyCount} keys)`) } catch (err) { - console.error('[titanium] JWKS warm failed — refusing to bind port:', (err as Error).message) - process.exit(1) + console.error( + '[titanium] JWKS warm failed at boot — continuing to bind port; ' + + 'auth-gated routes will retry warm on first verify and fail closed if still unavailable:', + (err as Error).message, + ) } }