diff --git a/README.md b/README.md
index 337ee4f..11e63fa 100644
--- a/README.md
+++ b/README.md
@@ -73,6 +73,8 @@ agentpay-backend/
 - [Billing units and settlement semantics](docs/billing-units.md) explains
   stroops, `priceStroops`, `billedStroops`, `/api/v1/billing/*`, and why
   `POST /api/v1/settle` drains backend counters without moving funds.
+- [Metrics](docs/metrics.md) documents Prometheus gauges, HTTP counters,
+  duration histograms, and error counters exposed by `GET /api/v1/metrics`.
 
 ## Quickstart
 
diff --git a/docs/metrics.md b/docs/metrics.md
new file mode 100644
index 0000000..2d65edb
--- /dev/null
+++ b/docs/metrics.md
@@ -0,0 +1,25 @@
+# Metrics
+
+`GET /api/v1/metrics` exposes Prometheus text format (`text/plain; version=0.0.4`).
+
+The endpoint keeps the existing operational gauges:
+
+- `agentpay_services_total`
+- `agentpay_api_keys_total`
+- `agentpay_usage_requests_total`
+- `agentpay_paused`
+
+It also records HTTP traffic metrics:
+
+- `agentpay_http_requests_total{method,route,status}` counts completed
+  responses.
+- `agentpay_http_request_duration_seconds{method,route,status}` is a histogram
+  with bucket, sum, and count samples.
+- `agentpay_http_errors_total{type}` counts requests that reached the terminal
+  Express error handler.
+
+`route` uses the matched Express route pattern, such as
+`/api/v1/usage/:agent/:serviceId`, rather than raw request paths. Unmatched
+routes and parser failures use `route="unmatched"` so agent IDs, service IDs,
+and other user-controlled path segments do not create high-cardinality metric
+labels.
diff --git a/src/metrics.test.ts b/src/metrics.test.ts
new file mode 100644
index 0000000..5dac452
--- /dev/null
+++ b/src/metrics.test.ts
@@ -0,0 +1,95 @@
+import { beforeEach, describe, it } from "node:test";
+import assert from "node:assert";
+import request from "supertest";
+import { createApp } from "./index.js";
+import { eventLog } from "./events.js";
+import { resetHttpMetrics } from "./metrics.js";
+import {
+  apiKeyStore,
+  pauseState,
+  rateBuckets,
+  servicesDisabled,
+  servicesMetadata,
+  servicesStore,
+  usageStore,
+  webhookStore,
+} from "./store/state.js";
+
+beforeEach(() => {
+  apiKeyStore.clear();
+  eventLog.length = 0;
+  rateBuckets.clear();
+  servicesDisabled.clear();
+  servicesMetadata.clear();
+  servicesStore.clear();
+  usageStore.clear();
+  webhookStore.clear();
+  pauseState.paused = false;
+  resetHttpMetrics();
+});
+
+void describe("Prometheus HTTP metrics", () => {
+  void it("records request counters and duration histograms by method, route, and status", async () => {
+    const app = createApp();
+
+    const write = await request(app)
+      .post("/api/v1/usage")
+      .send({ agent: "agent-metrics", serviceId: "svc-metrics", requests: 3 });
+    assert.strictEqual(write.status, 201);
+
+    const read = await request(app).get("/api/v1/usage/agent-metrics/svc-metrics");
+    assert.strictEqual(read.status, 200);
+
+    const metrics = await request(app).get("/api/v1/metrics");
+    assert.strictEqual(metrics.status, 200);
+    assert.match(
+      metrics.text,
+      /agentpay_http_requests_total\{method="POST",route="\/api\/v1\/usage",status="201"\} 1/
+    );
+    assert.match(
+      metrics.text,
+      /agentpay_http_requests_total\{method="GET",route="\/api\/v1\/usage\/:agent\/:serviceId",status="200"\} 1/
+    );
+    assert.match(
+      metrics.text,
+      /agentpay_http_request_duration_seconds_count\{method="POST",route="\/api\/v1\/usage",status="201"\} 1/
+    );
+    assert.match(
+      metrics.text,
+      /agentpay_http_request_duration_seconds_bucket\{method="POST",route="\/api\/v1\/usage",status="201",le="0\.5"\} 1/
+    );
+  });
+
+  void it("uses bounded route-pattern labels instead of raw request paths", async () => {
+    const app = createApp();
+
+    await request(app).get("/api/v1/usage/raw-agent/raw-service");
+
+    const metrics = await request(app).get("/api/v1/metrics");
+    assert.strictEqual(metrics.status, 200);
+    assert.ok(metrics.text.includes('route="/api/v1/usage/:agent/:serviceId"'));
+    assert.ok(!metrics.text.includes("raw-agent"));
+    assert.ok(!metrics.text.includes("raw-service"));
+  });
+
+  void it("increments the terminal error counter when the error handler runs", async () => {
+    const app = createApp();
+
+    const bad = await request(app)
+      .post("/api/v1/usage")
+      .set("Content-Type", "application/json")
+      .send("{not-json");
+    assert.strictEqual(bad.status, 500);
+
+    const metrics = await request(app).get("/api/v1/metrics");
+    assert.strictEqual(metrics.status, 200);
+    assert.match(
+      metrics.text,
+      /agentpay_http_errors_total\{type="entity\.parse\.failed"\} 1/
+    );
+    assert.match(
+      metrics.text,
+      /agentpay_http_requests_total\{method="POST",route="unmatched",status="500"\} 1/
+    );
+  });
+});
diff --git a/src/metrics.ts b/src/metrics.ts
new file mode 100644
index 0000000..b2212ce
--- /dev/null
+++ b/src/metrics.ts
@@ -0,0 +1,173 @@
+import type { Request } from "express";
+
+/** Label set attached to every HTTP request series. */
+type HttpMetricLabels = {
+  method: string;
+  route: string;
+  status: string;
+};
+
+/** Running counter and histogram state for one label combination. */
+type HttpMetricSample = HttpMetricLabels & {
+  count: number;
+  sumSeconds: number;
+  buckets: Map<number, number>;
+};
+
+/** Histogram upper bounds (`le` label values), in seconds, ascending. */
+const DURATION_BUCKETS_SECONDS = [
+  0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10,
+];
+/** Request samples keyed by `metricKey` of their labels. */
+const httpSamples = new Map<string, HttpMetricSample>();
+/** Error-handler invocation counts keyed by error type. */
+const httpErrorCounts = new Map<string, number>();
+
+/** Records one completed HTTP response for Prometheus counters and histograms. */
+export function recordHttpRequest(
+  req: Request,
+  statusCode: number,
+  durationSeconds: number
+): void {
+  const labels: HttpMetricLabels = {
+    method: req.method.toUpperCase(),
+    route: routePattern(req),
+    status: String(statusCode),
+  };
+  const key = metricKey(labels);
+  let sample = httpSamples.get(key);
+  if (!sample) {
+    sample = {
+      ...labels,
+      count: 0,
+      sumSeconds: 0,
+      buckets: new Map(DURATION_BUCKETS_SECONDS.map((bucket) => [bucket, 0])),
+    };
+    httpSamples.set(key, sample);
+  }
+
+  sample.count += 1;
+  sample.sumSeconds += durationSeconds;
+  // Buckets are cumulative: bump every bound the duration fits under.
+  for (const bucket of DURATION_BUCKETS_SECONDS) {
+    if (durationSeconds <= bucket) {
+      sample.buckets.set(bucket, (sample.buckets.get(bucket) ?? 0) + 1);
+    }
+  }
+}
+
+/** Records one terminal Express error-handler invocation. */
+export function recordHttpError(type: string): void {
+  httpErrorCounts.set(type, (httpErrorCounts.get(type) ?? 0) + 1);
+}
+
+/** Renders Prometheus request counters, histograms, and error counters. */
+export function renderHttpMetrics(): string[] {
+  const lines = [
+    "# HELP agentpay_http_requests_total HTTP responses by method, route, and status.",
+    "# TYPE agentpay_http_requests_total counter",
+  ];
+
+  for (const sample of sortedSamples()) {
+    lines.push(`agentpay_http_requests_total${labelSet(sample)} ${sample.count}`);
+  }
+
+  lines.push(
+    "# HELP agentpay_http_request_duration_seconds HTTP response duration in seconds.",
+    "# TYPE agentpay_http_request_duration_seconds histogram"
+  );
+  for (const sample of sortedSamples()) {
+    for (const bucket of DURATION_BUCKETS_SECONDS) {
+      lines.push(
+        `agentpay_http_request_duration_seconds_bucket${labelSet({
+          ...sample,
+          le: String(bucket),
+        })} ${sample.buckets.get(bucket) ?? 0}`
+      );
+    }
+    lines.push(
+      `agentpay_http_request_duration_seconds_bucket${labelSet({
+        ...sample,
+        le: "+Inf",
+      })} ${sample.count}`
+    );
+    lines.push(
+      `agentpay_http_request_duration_seconds_sum${labelSet(sample)} ${formatNumber(
+        sample.sumSeconds
+      )}`
+    );
+    lines.push(
+      `agentpay_http_request_duration_seconds_count${labelSet(sample)} ${sample.count}`
+    );
+  }
+
+  lines.push(
+    "# HELP agentpay_http_errors_total Terminal error-handler invocations by error type.",
+    "# TYPE agentpay_http_errors_total counter"
+  );
+  // Order by error type alone; a bare sort() would compare "type,count" strings.
+  const errorCounts = Array.from(httpErrorCounts.entries()).sort(([a], [b]) =>
+    a.localeCompare(b)
+  );
+  for (const [type, count] of errorCounts) {
+    lines.push(`agentpay_http_errors_total${labelSet({ type })} ${count}`);
+  }
+
+  return lines;
+}
+
+/** Clears all recorded request samples and error counts. */
+export function resetHttpMetrics(): void {
+  httpSamples.clear();
+  httpErrorCounts.clear();
+}
+
+/** Returns the recorded samples ordered by their metric key. */
+function sortedSamples(): HttpMetricSample[] {
+  return Array.from(httpSamples.values()).sort((a, b) =>
+    metricKey(a).localeCompare(metricKey(b))
+  );
+}
+
+/** Builds the map key for a label combination (newline-separated parts). */
+function metricKey(labels: HttpMetricLabels): string {
+  return `${labels.method}\n${labels.route}\n${labels.status}`;
+}
+
+/** Returns the matched route pattern, or "unmatched" if none is a string. */
+function routePattern(req: Request): string {
+  const routePath = req.route?.path;
+  if (typeof routePath === "string") {
+    return `${req.baseUrl}${routePath}`;
+  }
+  return "unmatched";
+}
+
+/** Formats the known label keys of `labels` as a Prometheus `{k="v",...}` set. */
+function labelSet(labels: Record<string, unknown>): string {
+  const entries = Object.entries(labels).filter(([key]) =>
+    ["method", "route", "status", "le", "type"].includes(key)
+  );
+  return `{${entries
+    .map(([key, value]) => `${key}="${escapeLabelValue(labelValue(value))}"`)
+    .join(",")}}`;
+}
+
+/** Converts a string or number label value to text; anything else becomes "". */
+function labelValue(value: unknown): string {
+  if (typeof value === "string") return value;
+  if (typeof value === "number") return String(value);
+  return "";
+}
+
+/** Escapes backslashes, newlines, and double quotes inside a label value. */
+function escapeLabelValue(value: string): string {
+  return value.replace(/\\/g, "\\\\").replace(/\n/g, "\\n").replace(/"/g, '\\"');
+}
+
+/** Formats a sample value with at most six decimals, trailing zeros trimmed. */
+function formatNumber(value: number): string {
+  return Number.isInteger(value)
+    ? String(value)
+    : value.toFixed(6).replace(/0+$/, "").replace(/\.$/, "");
+}
diff --git a/src/middleware/index.ts b/src/middleware/index.ts
index 53859a7..b36874d 100644
--- a/src/middleware/index.ts
+++ b/src/middleware/index.ts
@@ -12,6 +12,7 @@ import {
   RATE_LIMIT_PER_WINDOW,
   RATE_LIMIT_WINDOW_MS,
 } from "../store/state.js";
+import { recordHttpRequest } from "../metrics.js";
 import type { AgentPayRequest } from "../types.js";
 
 /**
@@ -20,6 +21,7 @@ import type { AgentPayRequest } from "../types.js";
  */
 export function installPreRouteMiddleware(app: Application): void {
   app.use(createCorsMiddleware());
+  app.use(requestTimerMiddleware);
   app.use(express.json({ limit: "100kb" }));
   app.use(securityHeadersMiddleware);
   app.use(requestIdMiddleware);
@@ -33,7 +35,6 @@ export function installRequestStateMiddleware(app: Application): void {
   app.use(apiKeyRecognitionMiddleware);
   app.use(pauseGuardMiddleware);
   app.use(rateLimitMiddleware);
-  app.use(requestTimerMiddleware);
 }
 
 /**
@@ -147,6 +148,7 @@ function requestTimerMiddleware(req: Request, res: Response, next: NextFunction)
   const startNs = process.hrtime.bigint();
   res.on("finish", () => {
     const ms = Number(process.hrtime.bigint() - startNs) / 1_000_000;
+    recordHttpRequest(req, res.statusCode, ms / 1000);
     if (!res.headersSent) {
       res.setHeader("Server-Timing", `app;dur=${ms.toFixed(1)}`);
     }
diff --git a/src/routes/errors.ts b/src/routes/errors.ts
index fd76332..ff80444 100644
--- a/src/routes/errors.ts
+++ b/src/routes/errors.ts
@@ -4,6 +4,7 @@ import {
   type Request,
   type Response,
 } from "express";
+import { recordHttpError } from "../metrics.js";
 import { getRequestId } from "../types.js";
 
 /**
@@ -19,6 +20,7 @@ export function installErrorHandlers(app: Application): void {
   });
 
   app.use((err: unknown, req: Request, res: Response, _next: NextFunction) => {
+    recordHttpError(errorType(err));
     if (
       err &&
       typeof err === "object" &&
@@ -42,3 +44,20 @@ export function installErrorHandlers(app: Application): void {
     });
   });
 }
+
+/**
+ * Chooses the `type` label for a handled error: a non-empty string `type`
+ * property wins, then a non-empty `Error` name, otherwise "unknown".
+ */
+function errorType(err: unknown): string {
+  if (err && typeof err === "object" && "type" in err) {
+    const type = (err as { type: unknown }).type;
+    if (typeof type === "string" && type.length > 0) {
+      return type;
+    }
+  }
+  if (err instanceof Error && err.name.length > 0) {
+    return err.name;
+  }
+  return "unknown";
+}
diff --git a/src/routes/metrics.ts b/src/routes/metrics.ts
index e8f2dd4..5e37c62 100644
--- a/src/routes/metrics.ts
+++ b/src/routes/metrics.ts
@@ -1,4 +1,5 @@
 import { Router, type Response } from "express";
+import { renderHttpMetrics } from "../metrics.js";
 import { apiKeyStore, pauseState, servicesStore, usageStore } from "../store/state.js";
 
 /**
@@ -23,6 +24,7 @@ export function createMetricsRouter(): Router {
       "# HELP agentpay_paused 1 if the backend is paused, 0 otherwise.",
       "# TYPE agentpay_paused gauge",
       `agentpay_paused ${pauseState.paused ? 1 : 0}`,
+      ...renderHttpMetrics(),
     ];
     res.setHeader("Content-Type", "text/plain; version=0.0.4");
     res.send(lines.join("\n") + "\n");