diff --git a/netlify/functions/mcp.mjs b/netlify/functions/mcp.mjs index 3d98ec10..a5af7768 100644 --- a/netlify/functions/mcp.mjs +++ b/netlify/functions/mcp.mjs @@ -25,7 +25,7 @@ const makeRateLimiter = // -------------------- Config -------------------- -const SERVER_VERSION = '1.1.2' +const SERVER_VERSION = '1.1.3' // Hardcoded upstream const KAPA_MCP_SERVER_URL = 'https://redpanda.mcp.kapa.ai' @@ -140,6 +140,17 @@ function ensureKapaConnected() { } ) + // The transport keeps a persistent connection that's reused across warm + // invocations. When the container freezes/thaws, the idle socket is dropped + // and the error surfaces in the transport's background read loop. Handle it + // here (at the source) so it resets the cache rather than bubbling up as an + // unhandled rejection that crashes the invocation. + kapaTransport.onerror = (err) => { + console.warn('[mcp] kapa transport error; resetting connection', { error: err?.message || String(err) }) + resetKapaConnection() + } + kapaTransport.onclose = () => resetKapaConnection() + kapaConnectPromise = kapaClient.connect(kapaTransport) return kapaConnectPromise } @@ -175,6 +186,15 @@ function ensureBumpConnected() { new URL(BUMP_HUB_MCP_URL) ) + // Reset the cached connection if the persistent socket errors/closes in the + // background (e.g. dropped on container freeze/thaw) so the next request + // reconnects instead of the error crashing the invocation. See Kapa above. + transport.onerror = (err) => { + console.warn('[mcp] bump transport error; resetting connection', { error: err?.message || String(err) }) + resetBumpConnection() + } + transport.onclose = () => resetBumpConnection() + bumpConnectPromise = bumpClient.connect(transport) return bumpConnectPromise } @@ -596,15 +616,52 @@ Returns up to 10 pages per request. URLs must be from docs.redpanda.com/api/doc/ // -------------------- Netlify handler -------------------- +// Safety net: even with transport onerror/onclose handlers, a stray background +// socket error from a cached upstream connection can surface as an unhandled +// rejection — which the Lambda runtime treats as fatal ("Invalid request ID"). +// Recover by logging and resetting the cached connections so the next request +// reconnects, instead of crashing the invocation. Registered once per cold start. +const isUpstreamSocketError = (err) => + /ECONNRESET|socket hang up|EPIPE|ECONNREFUSED|\bsocket\b/i.test( + err instanceof Error ? err.message : String(err) + ) + +let processGuardsInstalled = false +function installProcessGuards() { + if (processGuardsInstalled) return + processGuardsInstalled = true + const reset = (label, err) => { + console.warn(`[mcp] ${label} (recovered, resetting upstream connections)`, { + error: err instanceof Error ? err.message : String(err), + }) + resetKapaConnection() + resetBumpConnection() + } + // The original incident: a background-read-loop rejection with no awaiter that + // the runtime treats as fatal. Recovering here (reset cached connections) is + // cheap and safe, and the error is logged either way. + process.on('unhandledRejection', (reason) => reset('unhandledRejection', reason)) + // uncaughtException is broader and can leave the process in a state Node's + // docs flag as unsafe, so only recover from known upstream socket drops; + // re-throw anything else so genuine bugs surface instead of being masked + // (re-throwing inside this handler terminates the process, as intended). + process.on('uncaughtException', (err) => { + if (isUpstreamSocketError(err)) { + reset('uncaughtException', err) + return + } + console.error('[mcp] fatal uncaughtException (not recovering)', { + error: err instanceof Error ? err.message : String(err), + stack: err instanceof Error ? err.stack : undefined, + }) + throw err + }) +} +installProcessGuards() + const baseHandler = handle({ server, pre: (app) => { - // IMPORTANT: - // Streamable HTTP opens a long-lived SSE stream via GET requests. - // Some rate limiter middleware can interfere with SSE and cause 500s on reconnect/idle. - // We therefore apply rate limiting ONLY to POST/DELETE (expensive operations), - // and allow GET (SSE stream) through un-limited. - const limiter = makeRateLimiter({ windowMs: 15 * 60 * 1000, // 15 minutes limit: 60, // limit each key to 60 requests per windowMs (tune as needed) @@ -616,8 +673,16 @@ const baseHandler = handle({ app.use('/mcp', async (c, next) => { const method = c.req.method if (method === 'GET') { - // Let SSE stream open/reconnect without limiter interference - return next() + // GET opens Streamable HTTP's optional server->client SSE stream. This + // server is request/response only (it never pushes server-initiated + // messages), so on serverless that stream just idles open until the + // function hits its max duration — a wasted full-length invocation per + // connected client. Decline it: the MCP spec allows 405 when the server + // doesn't offer an SSE stream on GET, and clients fall back to POST. + return c.text('Method Not Allowed', 405, { + Allow: 'POST, DELETE, OPTIONS', + 'Access-Control-Allow-Origin': '*', + }) } // Apply limiter to POST + DELETE (and anything else, if ever present) return limiter(c, next)