Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 74 additions & 9 deletions netlify/functions/mcp.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ const makeRateLimiter =

// -------------------- Config --------------------

const SERVER_VERSION = '1.1.2'
const SERVER_VERSION = '1.1.3'

// Hardcoded upstream
const KAPA_MCP_SERVER_URL = 'https://redpanda.mcp.kapa.ai'
Expand Down Expand Up @@ -140,6 +140,17 @@ function ensureKapaConnected() {
}
)

// The transport keeps a persistent connection that's reused across warm
// invocations. When the container freezes/thaws, the idle socket is dropped
// and the error surfaces in the transport's background read loop. Handle it
// here (at the source) so it resets the cache rather than bubbling up as an
// unhandled rejection that crashes the invocation.
kapaTransport.onerror = (err) => {
console.warn('[mcp] kapa transport error; resetting connection', { error: err?.message || String(err) })
resetKapaConnection()
}
kapaTransport.onclose = () => resetKapaConnection()

kapaConnectPromise = kapaClient.connect(kapaTransport)
return kapaConnectPromise
}
Expand Down Expand Up @@ -175,6 +186,15 @@ function ensureBumpConnected() {
new URL(BUMP_HUB_MCP_URL)
)

// Reset the cached connection if the persistent socket errors/closes in the
// background (e.g. dropped on container freeze/thaw) so the next request
// reconnects instead of the error crashing the invocation. See Kapa above.
transport.onerror = (err) => {
console.warn('[mcp] bump transport error; resetting connection', { error: err?.message || String(err) })
resetBumpConnection()
}
transport.onclose = () => resetBumpConnection()

bumpConnectPromise = bumpClient.connect(transport)
return bumpConnectPromise
}
Expand Down Expand Up @@ -596,15 +616,52 @@ Returns up to 10 pages per request. URLs must be from docs.redpanda.com/api/doc/

// -------------------- Netlify handler --------------------

// Safety net: even with transport onerror/onclose handlers, a stray background
// socket error from a cached upstream connection can surface as an unhandled
// rejection — which the Lambda runtime treats as fatal ("Invalid request ID").
// Recover by logging and resetting the cached connections so the next request
// reconnects, instead of crashing the invocation. Registered once per cold start.
const isUpstreamSocketError = (err) =>
/ECONNRESET|socket hang up|EPIPE|ECONNREFUSED|\bsocket\b/i.test(
err instanceof Error ? err.message : String(err)
)

let processGuardsInstalled = false
function installProcessGuards() {
if (processGuardsInstalled) return
processGuardsInstalled = true
const reset = (label, err) => {
console.warn(`[mcp] ${label} (recovered, resetting upstream connections)`, {
error: err instanceof Error ? err.message : String(err),
})
resetKapaConnection()
resetBumpConnection()
}
// The original incident: a background-read-loop rejection with no awaiter that
// the runtime treats as fatal. Recovering here (reset cached connections) is
// cheap and safe, and the error is logged either way.
process.on('unhandledRejection', (reason) => reset('unhandledRejection', reason))
// uncaughtException is broader and can leave the process in a state Node's
// docs flag as unsafe, so only recover from known upstream socket drops;
// re-throw anything else so genuine bugs surface instead of being masked
// (re-throwing inside this handler terminates the process, as intended).
process.on('uncaughtException', (err) => {
if (isUpstreamSocketError(err)) {
reset('uncaughtException', err)
return
}
console.error('[mcp] fatal uncaughtException (not recovering)', {
error: err instanceof Error ? err.message : String(err),
stack: err instanceof Error ? err.stack : undefined,
})
throw err
})
}
installProcessGuards()

const baseHandler = handle({
server,
pre: (app) => {
// IMPORTANT:
// Streamable HTTP opens a long-lived SSE stream via GET requests.
// Some rate limiter middleware can interfere with SSE and cause 500s on reconnect/idle.
// We therefore apply rate limiting ONLY to POST/DELETE (expensive operations),
// and allow GET (SSE stream) through un-limited.

const limiter = makeRateLimiter({
windowMs: 15 * 60 * 1000, // 15 minutes
limit: 60, // limit each key to 60 requests per windowMs (tune as needed)
Expand All @@ -616,8 +673,16 @@ const baseHandler = handle({
app.use('/mcp', async (c, next) => {
const method = c.req.method
if (method === 'GET') {
// Let SSE stream open/reconnect without limiter interference
return next()
// GET opens Streamable HTTP's optional server->client SSE stream. This
// server is request/response only (it never pushes server-initiated
// messages), so on serverless that stream just idles open until the
// function hits its max duration — a wasted full-length invocation per
// connected client. Decline it: the MCP spec allows 405 when the server
// doesn't offer an SSE stream on GET, and clients fall back to POST.
return c.text('Method Not Allowed', 405, {
Allow: 'POST, DELETE, OPTIONS',
'Access-Control-Allow-Origin': '*',
})
}
// Apply limiter to POST + DELETE (and anything else, if ever present)
return limiter(c, next)
Expand Down