From 5b27720d70edc25403e21f581fb0c952c0a04871 Mon Sep 17 00:00:00 2001 From: JakeSCahill Date: Mon, 22 Jun 2026 12:02:01 +0100 Subject: [PATCH 1/2] Fix serverless crashes and 60s idle invocations in MCP function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two serverless-vs-long-lived-connection problems surfaced as Netlify function errors: 1. socket hang up / ECONNRESET -> Unhandled Promise Rejection -> 'Invalid request ID'. The cached upstream MCP clients (Kapa, Bump) hold persistent connections reused across warm invocations. When the container freezes/thaws, the idle socket is dropped and the error fires in the transport's background read loop with no awaiter, so the runtime kills the invocation. Fix: set onerror/onclose on each transport to reset the cached connection at the source, plus a process-level unhandledRejection/uncaughtException safety net that logs and resets instead of crashing. 2. Duration: 60000 ms. Every connected client opens the optional GET SSE stream; this server is request/response only, so on serverless that stream idles open until the function's max duration — a wasted full-length invocation per client. Fix: decline GET with 405 (spec allows this when no SSE stream is offered; clients use POST). Bumps SERVER_VERSION to 1.1.3. --- netlify/functions/mcp.mjs | 61 +++++++++++++++++++++++++++++++++------ 1 file changed, 52 insertions(+), 9 deletions(-) diff --git a/netlify/functions/mcp.mjs b/netlify/functions/mcp.mjs index 3d98ec10..ccf3f48a 100644 --- a/netlify/functions/mcp.mjs +++ b/netlify/functions/mcp.mjs @@ -25,7 +25,7 @@ const makeRateLimiter = // -------------------- Config -------------------- -const SERVER_VERSION = '1.1.2' +const SERVER_VERSION = '1.1.3' // Hardcoded upstream const KAPA_MCP_SERVER_URL = 'https://redpanda.mcp.kapa.ai' @@ -140,6 +140,17 @@ function ensureKapaConnected() { } ) + // The transport keeps a persistent connection that's reused across warm + // invocations. 
When the container freezes/thaws, the idle socket is dropped + // and the error surfaces in the transport's background read loop. Handle it + // here (at the source) so it resets the cache rather than bubbling up as an + // unhandled rejection that crashes the invocation. + kapaTransport.onerror = (err) => { + console.warn('[mcp] kapa transport error; resetting connection', { error: err?.message || String(err) }) + resetKapaConnection() + } + kapaTransport.onclose = () => resetKapaConnection() + kapaConnectPromise = kapaClient.connect(kapaTransport) return kapaConnectPromise } @@ -175,6 +186,15 @@ function ensureBumpConnected() { new URL(BUMP_HUB_MCP_URL) ) + // Reset the cached connection if the persistent socket errors/closes in the + // background (e.g. dropped on container freeze/thaw) so the next request + // reconnects instead of the error crashing the invocation. See Kapa above. + transport.onerror = (err) => { + console.warn('[mcp] bump transport error; resetting connection', { error: err?.message || String(err) }) + resetBumpConnection() + } + transport.onclose = () => resetBumpConnection() + bumpConnectPromise = bumpClient.connect(transport) return bumpConnectPromise } @@ -596,15 +616,30 @@ Returns up to 10 pages per request. URLs must be from docs.redpanda.com/api/doc/ // -------------------- Netlify handler -------------------- +// Safety net: even with transport onerror/onclose handlers, a stray background +// socket error from a cached upstream connection can surface as an unhandled +// rejection — which the Lambda runtime treats as fatal ("Invalid request ID"). +// Recover by logging and resetting the cached connections so the next request +// reconnects, instead of crashing the invocation. Registered once per cold start. 
+let processGuardsInstalled = false +function installProcessGuards() { + if (processGuardsInstalled) return + processGuardsInstalled = true + const recover = (label) => (err) => { + console.warn(`[mcp] ${label} (recovered, resetting upstream connections)`, { + error: err instanceof Error ? err.message : String(err), + }) + resetKapaConnection() + resetBumpConnection() + } + process.on('unhandledRejection', recover('unhandledRejection')) + process.on('uncaughtException', recover('uncaughtException')) +} +installProcessGuards() + const baseHandler = handle({ server, pre: (app) => { - // IMPORTANT: - // Streamable HTTP opens a long-lived SSE stream via GET requests. - // Some rate limiter middleware can interfere with SSE and cause 500s on reconnect/idle. - // We therefore apply rate limiting ONLY to POST/DELETE (expensive operations), - // and allow GET (SSE stream) through un-limited. - const limiter = makeRateLimiter({ windowMs: 15 * 60 * 1000, // 15 minutes limit: 60, // limit each key to 60 requests per windowMs (tune as needed) @@ -616,8 +651,16 @@ const baseHandler = handle({ app.use('/mcp', async (c, next) => { const method = c.req.method if (method === 'GET') { - // Let SSE stream open/reconnect without limiter interference - return next() + // GET opens Streamable HTTP's optional server->client SSE stream. This + // server is request/response only (it never pushes server-initiated + // messages), so on serverless that stream just idles open until the + // function hits its max duration — a wasted full-length invocation per + // connected client. Decline it: the MCP spec allows 405 when the server + // doesn't offer an SSE stream on GET, and clients fall back to POST. 
+ return c.text('Method Not Allowed', 405, { + Allow: 'POST, DELETE, OPTIONS', + 'Access-Control-Allow-Origin': '*', + }) } // Apply limiter to POST + DELETE (and anything else, if ever present) return limiter(c, next) From e0a1cb289ec5525628e23188ce3f0397382b62f5 Mon Sep 17 00:00:00 2001 From: JakeSCahill Date: Mon, 22 Jun 2026 20:05:46 +0100 Subject: [PATCH 2/2] Scope uncaughtException guard to upstream socket errors (review) Per review: the broad uncaughtException handler could mask real bugs and leave the process in an unsafe state. Recover only from known upstream socket drops (ECONNRESET / socket hang up / EPIPE / ECONNREFUSED, matched against err.code and err.message); log and re-throw anything else so it surfaces normally. unhandledRejection (the actual incident path) stays a recover-all, as reviewed. --- netlify/functions/mcp.mjs | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/netlify/functions/mcp.mjs b/netlify/functions/mcp.mjs index ccf3f48a..a5af7768 100644 --- a/netlify/functions/mcp.mjs +++ b/netlify/functions/mcp.mjs @@ -621,19 +621,41 @@ Returns up to 10 pages per request. URLs must be from docs.redpanda.com/api/doc/ // rejection — which the Lambda runtime treats as fatal ("Invalid request ID"). // Recover by logging and resetting the cached connections so the next request // reconnects, instead of crashing the invocation. Registered once per cold start. +const isUpstreamSocketError = (err) => + /ECONNRESET|socket hang up|EPIPE|ECONNREFUSED/i.test( + err instanceof Error ? `${err.code ?? ''} ${err.message}` : String(err) + ) + let processGuardsInstalled = false function installProcessGuards() { if (processGuardsInstalled) return processGuardsInstalled = true - const recover = (label) => (err) => { + const reset = (label, err) => { console.warn(`[mcp] ${label} (recovered, resetting upstream connections)`, { error: err instanceof Error ?
err.message : String(err), }) resetKapaConnection() resetBumpConnection() } - process.on('unhandledRejection', recover('unhandledRejection')) - process.on('uncaughtException', recover('uncaughtException')) + // The original incident: a background-read-loop rejection with no awaiter that + // the runtime treats as fatal. Recovering here (reset cached connections) is + // cheap and safe, and the error is logged either way. + process.on('unhandledRejection', (reason) => reset('unhandledRejection', reason)) + // uncaughtException is broader and can leave the process in a state Node's + // docs flag as unsafe, so only recover from known upstream socket drops; + // re-throw anything else so genuine bugs surface instead of being masked + // (re-throwing inside this handler terminates the process, as intended). + process.on('uncaughtException', (err) => { + if (isUpstreamSocketError(err)) { + reset('uncaughtException', err) + return + } + console.error('[mcp] fatal uncaughtException (not recovering)', { + error: err instanceof Error ? err.message : String(err), + stack: err instanceof Error ? err.stack : undefined, + }) + throw err + }) } installProcessGuards()