Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions src/server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ function parseUsageFromData(data: Record<string, unknown>): { inputTokens: numbe
* For non-streaming JSON responses, uses a bounded sliding-window regex scan.
*/
function createMetricsTransform(
ctx: { requestId: string; model: string; actualModel?: string; tier: string; startTime: number; fallbackMode?: "sequential" | "race"; sessionId?: string; _streamState?: StreamState },
ctx: { requestId: string; model: string; actualModel?: string; tier: string; startTime: number; fallbackMode?: "sequential" | "race"; sessionId?: string; _streamState?: StreamState; _streamStartTime?: number },
provider: string,
targetProvider: string,
metricsStore: MetricsStore,
Expand Down Expand Up @@ -239,8 +239,14 @@ function createMetricsTransform(
const recordMetrics = (inp: number, out: number, cacheRead: number = 0, cacheCreation: number = 0) => {
try {
const latencyMs = Date.now() - ctx.startTime;
const latencySec = latencyMs / 1000;
const tps = latencySec > 0 ? out / latencySec : 0;
// Compute TPS over the streaming-only duration (excluding the TTFB wait) when possible.
// The streaming duration is used only when it is at least 200ms; otherwise (or when no
// stream start time was captured) we fall back to the total request latency.
// Short durations (< 200ms) have a huge relative error due to Date.now() resolution
// (~1ms), producing inflated numbers like 64K tok/s.
const rawStreamDurMs = ctx._streamStartTime ? Date.now() - ctx._streamStartTime : 0;
const durMs = rawStreamDurMs >= 200 ? rawStreamDurMs : latencyMs;
const tpsSec = durMs / 1000;
const tps = tpsSec > 0 ? out / tpsSec : 0;

metricsStore.recordRequest({
requestId: ctx.requestId,
Expand Down Expand Up @@ -334,6 +340,7 @@ function createMetricsTransform(
const now = Date.now();
if (firstChunk || now - lastStreamEmit >= STREAM_THROTTLE_MS) {
lastStreamEmit = now;
if (firstChunk) ctx._streamStartTime = now; // capture streaming start (excludes TTFB)
firstChunk = false;
const contextWindow = getContextWindow(ctx.actualModel || ctx.model);
setImmediate(() => {
Expand Down Expand Up @@ -369,6 +376,7 @@ function createMetricsTransform(
const nowJson = Date.now();
if (firstChunk || nowJson - lastStreamEmit >= STREAM_THROTTLE_MS) {
lastStreamEmit = nowJson;
if (firstChunk) ctx._streamStartTime = nowJson; // capture streaming start (excludes TTFB)
firstChunk = false;
const contextWindow = getContextWindow(ctx.actualModel || ctx.model);
setImmediate(() => {
Expand Down
2 changes: 2 additions & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ export interface RequestContext {
hasDistribution?: boolean;
/** Tracks current StreamState for transition validation */
_streamState?: StreamState;
/** Timestamp (from Date.now(), in milliseconds) when the first streaming chunk arrived (after TTFB). Used for streaming-only TPS. */
_streamStartTime?: number;
/** Retry-after value (seconds) from the last provider 429/503 response */
_retryAfterMs?: number;
/** Set when all providers in the chain have health < UNHEALTHY_THRESHOLD.
Expand Down
Loading