From cd0cf3dc379e775526ca0347c30f7af917ba275a Mon Sep 17 00:00:00 2001 From: Revant Patel Date: Tue, 2 Jun 2026 15:16:13 -0700 Subject: [PATCH] default to glm-5.1 --- QUICKSTART.md | 10 ++--- README.md | 11 +++++- config.example.json | 16 ++++---- package-lock.json | 4 +- package.json | 2 +- scripts/claudia-claude.mjs | 8 ++-- scripts/presets.mjs | 20 +++++----- scripts/profile.mjs | 6 +-- scripts/providers.mjs | 4 +- src/openai.ts | 44 ++++++++++++++++++++- tests/claudia-config.test.ts | 1 + tests/openai.test.ts | 77 ++++++++++++++++++++++++++++++++++++ tests/profile.test.ts | 4 +- tests/status.test.ts | 2 +- 14 files changed, 168 insertions(+), 41 deletions(-) diff --git a/QUICKSTART.md b/QUICKSTART.md index 047b0b0..21327ee 100644 --- a/QUICKSTART.md +++ b/QUICKSTART.md @@ -44,7 +44,7 @@ If your NVIDIA key changes later, run `npm run key`. ## Advanced: NVIDIA NIM (Recommended for Quality) -NVIDIA hosted models like `stepfun-ai/step-3.5-flash` and `z-ai/glm4.7`. +NVIDIA hosted models like `z-ai/glm-5.1`, `z-ai/glm4.7`, and `qwen/qwen3.5-122b-a10b`. 
From the cloned repo root: ```sh @@ -130,10 +130,10 @@ claudia-claude --model local-model | `npm run release:check` | Release gate: typecheck + tests + build + package smoke | | `npm run config` | Re-run the configuration wizard | | `claudia-claude` | Launch Claude Code connected to the router | -| `npm run claude:fast` | Fast coding model (stepfun-ai/step-3.5-flash) | -| `npm run claude:glm` | High-quality model with thinking (z-ai/glm4.7) | -| `npm run claude:qwen` | Qwen coding model (qwen/qwen3.5-122b-a10b) | -| `npm run claude:smoke` | Quick smoke test (nemotron-mini-4b) | +| `npm run claude:fast` | Default long-context model (z-ai/glm-5.1) | +| `npm run claude:glm` | High-quality thinking model, slower on purpose (z-ai/glm4.7) | +| `npm run claude:qwen` | Backup coding model, less consistent on complex code (qwen/qwen3.5-122b-a10b) | +| `npm run claude:smoke` | Quick smoke test only (nemotron-mini-4b) | --- diff --git a/README.md b/README.md index 68966fe..d7fa000 100644 --- a/README.md +++ b/README.md @@ -125,7 +125,14 @@ npm run claude:fast -- --managed-auth If you see a managed-login warning, remove `--managed-auth`. Claude managed credentials are sent only to the local router; your NVIDIA key is sent to NVIDIA by the router. -The fast script and default wrapper route `claude-3-5-sonnet-latest` to NVIDIA `stepfun-ai/step-3.5-flash`. Use `npm run claude:glm` for the slower GLM quality profile, `npm run claude:qwen` for the Qwen fallback, or `npm run claude:smoke` to test routing with the smallest configured model. +The fast script and default wrapper route `claude-3-5-sonnet-latest` to NVIDIA `z-ai/glm-5.1`. Use `npm run claude:glm` for the slower thinking-heavy GLM quality profile, `npm run claude:qwen` for the Qwen fallback, or `npm run claude:smoke` to test routing with the smallest configured model. 
+ +Model tradeoffs: + +- `fast`: best default for long prompts and coding; slower than smaller models, but much less likely to hit context limits +- `glm`: stronger on hard tasks when it reasons longer, but slower +- `qwen`: backup option when you want a different model family, but less consistent on complex code +- `smoke`: smallest and quickest option for health checks, not real work ### Check the router @@ -169,7 +176,7 @@ LOG_LEVEL=info 2. Keep `defaultBackend` set to `nvidia` in `config.json`. -3. Use a mapped Claude-style model alias such as `claude-3-5-sonnet-latest`, or send any model name and Claudia Router will use the NVIDIA backend default model. +3. Use a mapped Claude-style model alias such as `claude-3-5-sonnet-latest`, or send any model name and Claudia Router will use the NVIDIA backend default model (`z-ai/glm-5.1`). If you want to switch providers later, use `npm run init -- --provider openrouter` or `npm run init -- --provider local`. Use `npm run config` if you prefer the interactive provider picker. 
diff --git a/config.example.json b/config.example.json index ad86d94..382691a 100644 --- a/config.example.json +++ b/config.example.json @@ -5,7 +5,7 @@ "nvidia": { "baseUrl": "https://integrate.api.nvidia.com/v1", "apiKeyEnv": "NVIDIA_API_KEY", - "defaultModel": "stepfun-ai/step-3.5-flash" + "defaultModel": "z-ai/glm-5.1" }, "openrouter": { "baseUrl": "https://openrouter.ai/api/v1", @@ -21,10 +21,10 @@ "modelProfiles": { "claude-3-5-sonnet-latest": { "backend": "nvidia", - "providerModel": "stepfun-ai/step-3.5-flash", + "providerModel": "z-ai/glm-5.1", "retryAttempts": 3, "retryBaseDelayMs": 500, - "notes": "Fast NVIDIA coding profile", + "notes": "Default long-context NVIDIA coding profile; better for big prompts, slightly slower than smaller models", "capabilities": { "toolCalls": true, "coding": true @@ -41,7 +41,7 @@ "clear_thinking": false } }, - "notes": "Higher-quality GLM coding profile; slower because thinking is enabled", + "notes": "Thinking-heavy GLM coding profile; slower, but stronger for hard coding tasks", "capabilities": { "toolCalls": true, "coding": true @@ -58,7 +58,7 @@ "clear_thinking": false } }, - "notes": "Explicit GLM 4.7 profile for harder coding tasks", + "notes": "Explicit GLM 4.7 profile for harder coding tasks; slower than the default profile", "capabilities": { "toolCalls": true, "coding": true @@ -69,7 +69,7 @@ "providerModel": "qwen/qwen3.5-122b-a10b", "retryAttempts": 3, "retryBaseDelayMs": 500, - "notes": "Qwen fallback NVIDIA coding profile", + "notes": "Qwen fallback NVIDIA coding profile; useful as a backup, but less consistent on complex code", "capabilities": { "toolCalls": true, "coding": true @@ -80,7 +80,7 @@ "providerModel": "nvidia/nemotron-mini-4b-instruct", "retryAttempts": 1, "retryBaseDelayMs": 250, - "notes": "Smoke-test/free-small NVIDIA profile", + "notes": "Smoke-test profile for quick checks; not meant for real coding sessions", "capabilities": { "toolCalls": false, "coding": false @@ -90,7 +90,7 @@ 
"modelMap": { "legacy-claude-3-5-sonnet-latest": { "backend": "nvidia", - "model": "stepfun-ai/step-3.5-flash" + "model": "z-ai/glm-5.1" } } } diff --git a/package-lock.json b/package-lock.json index c6cf7e9..a43cc04 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "claudia-router", - "version": "0.1.1", + "version": "0.1.2", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "claudia-router", - "version": "0.1.1", + "version": "0.1.2", "license": "MIT", "dependencies": { "dotenv": "^16.4.7", diff --git a/package.json b/package.json index cf93316..7306aa1 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "claudia-router", - "version": "0.1.1", + "version": "0.1.2", "description": "A lightweight Anthropic-compatible local router for sending Claude-style coding requests to OpenAI-compatible model backends.", "type": "module", "main": "dist/index.js", diff --git a/scripts/claudia-claude.mjs b/scripts/claudia-claude.mjs index 91e74e5..34eac95 100755 --- a/scripts/claudia-claude.mjs +++ b/scripts/claudia-claude.mjs @@ -91,10 +91,10 @@ Claudia Router Model Profiles Shortcuts (use with --model or in npm scripts): - --model fast Fast coding: stepfun-ai/step-3.5-flash (NVIDIA) - --model glm High-quality: z-ai/glm4.7 with thinking (NVIDIA) - --model qwen Fallback: qwen/qwen3.5-122b-a10b (NVIDIA) - --model smoke Lightweight: nvidia/nemotron-mini-4b-instruct (NVIDIA) + --model fast Default: z-ai/glm-5.1 (NVIDIA) — best long-context option, a bit slower + --model glm Thinking-heavy: z-ai/glm4.7 (NVIDIA) — slower, but better on hard tasks + --model qwen Fallback: qwen/qwen3.5-122b-a10b (NVIDIA) — useful fallback, less consistent + --model smoke Lightweight: nvidia/nemotron-mini-4b-instruct (NVIDIA) — for quick checks only Built-in npm scripts: diff --git a/scripts/presets.mjs b/scripts/presets.mjs index 2196d68..b92f4ea 100644 --- a/scripts/presets.mjs +++ b/scripts/presets.mjs @@ -4,30 +4,30 @@ export const 
PROFILE_PRESETS = { fast: { alias: "fast", model: "claude-3-5-sonnet-latest", - description: "Default coding preset", + description: "Default long-context preset", nextCommand: "npm run claude:fast", - notes: "Fast coding profile" + notes: "Default GLM-5.1 routing profile; strongest context window, but slower than smaller models" }, glm: { alias: "glm", model: "claude-3-5-sonnet-glm", - description: "Higher-quality GLM preset", + description: "Thinking-heavy preset", nextCommand: "npm run claude:glm", - notes: "Explicit GLM 4.7 profile for harder coding tasks" + notes: "More deliberate reasoning, but slower and better for hard coding tasks" }, qwen: { alias: "qwen", model: "claude-3-5-sonnet-qwen", - description: "Qwen fallback preset", + description: "Fallback preset", nextCommand: "npm run claude:qwen", - notes: "Qwen fallback NVIDIA coding profile" + notes: "Useful when GLM is unavailable, but less consistent on complex code" }, smoke: { alias: "smoke", model: "claude-3-haiku-latest", description: "Smallest smoke-test preset", nextCommand: "npm run claude:smoke", - notes: "Smoke-test/free-small NVIDIA profile" + notes: "Fast and cheap for checks, but not intended for real coding work" } }; @@ -69,9 +69,9 @@ export function buildInteractiveChoices(config) { key: alias, kind: "profile", profileName: alias, - label: alias, - description: PROFILE_PRESETS[alias].description - })); + label: alias, + description: PROFILE_PRESETS[alias].description + })); return [...profileChoices, ...INTERACTIVE_PROVIDER_CHOICES]; } diff --git a/scripts/profile.mjs b/scripts/profile.mjs index 16ae01d..dbbabe3 100644 --- a/scripts/profile.mjs +++ b/scripts/profile.mjs @@ -17,9 +17,9 @@ import { const USAGE = `Usage: claudia-router profile [name|show|list|toggle] Commands: - fast Set the active Claude profile to fast - glm Set the active Claude profile to GLM quality - qwen Set the active Claude profile to Qwen fallback + fast Set the active Claude profile to the default long-context 
model + glm Set the active Claude profile to the slower thinking-heavy model + qwen Set the active Claude profile to the backup model smoke Set the active Claude profile to the smoke-test model list Show all available profile presets toggle Switch between fast and glm diff --git a/scripts/providers.mjs b/scripts/providers.mjs index d906b4e..c55bd68 100644 --- a/scripts/providers.mjs +++ b/scripts/providers.mjs @@ -4,10 +4,10 @@ export const PROVIDERS = { name: "NVIDIA NIM", baseUrl: "https://integrate.api.nvidia.com/v1", apiKeyEnv: "NVIDIA_API_KEY", - defaultModel: "stepfun-ai/step-3.5-flash", + defaultModel: "z-ai/glm-5.1", smokeModel: "nvidia/nemotron-mini-4b-instruct", requiresKey: true, - description: "Fast, high-quality models hosted by NVIDIA" + description: "Long-context and coding-capable models hosted by NVIDIA" }, openrouter: { key: "openrouter", diff --git a/src/openai.ts b/src/openai.ts index a0bc1c9..79d7195 100644 --- a/src/openai.ts +++ b/src/openai.ts @@ -192,6 +192,7 @@ export async function callOpenAICompatibleBackend(args: { const timeout = setTimeout(() => controller.abort(), PROVIDER_TIMEOUT_MS); const maxAttempts = Math.max(1, args.retryAttempts ?? PROVIDER_DEFAULT_MAX_ATTEMPTS); const retryBaseDelayMs = Math.max(0, args.retryBaseDelayMs ?? 
PROVIDER_DEFAULT_RETRY_BASE_MS); + let requestToSend = args.request; try { const headers: Record<string, string> = { @@ -209,7 +210,7 @@ export async function callOpenAICompatibleBackend(args: { response = await fetch(`${args.backend.baseUrl}/chat/completions`, { method: "POST", headers, - body: JSON.stringify(args.request), + body: JSON.stringify(requestToSend), signal: controller.signal }); @@ -224,6 +225,31 @@ export async function callOpenAICompatibleBackend(args: { })); } + const contextLimit = parseContextLengthError(bodyText); + if ( + response.status === 400 && + contextLimit && + requestToSend.max_tokens > 1 + ) { + if (contextLimit.promptTokens >= contextLimit.limit) { + throw new ClaudiaError( + "invalid_request_error", + `Prompt exceeds the model context window of ${contextLimit.limit} tokens. Choose a larger-context model or shorten the conversation.`, + 400 + ); + } + + const adjustedMaxTokens = Math.max(1, contextLimit.limit - contextLimit.promptTokens - 1); + + if (adjustedMaxTokens < requestToSend.max_tokens) { + requestToSend = { + ...requestToSend, + max_tokens: adjustedMaxTokens + }; + continue; + } + } + if (response.ok || !shouldRetryProviderStatus(response.status) || attempt === maxAttempts) { break; } @@ -320,6 +346,22 @@ function truncateProviderBody(body: string): string { return body.length > 500 ? 
`${body.slice(0, 500)}...` : body; } +function parseContextLengthError(bodyText: string): { limit: number; promptTokens: number; completionTokens: number } | null { + const match = bodyText.match( + /maximum context length is (\d+) tokens[\s\S]*?requested (\d+) tokens \((\d+) in the messages, (\d+) in the completion\)/i + ); + + if (!match) { + return null; + } + + return { + limit: Number(match[1]), + promptTokens: Number(match[3]), + completionTokens: Number(match[4]) + }; +} + async function pollPendingProviderResponse(args: { backend: BackendConfig; headers: Record<string, string>; diff --git a/tests/claudia-config.test.ts b/tests/claudia-config.test.ts index 4429cbb..4ba8453 100644 --- a/tests/claudia-config.test.ts +++ b/tests/claudia-config.test.ts @@ -71,6 +71,7 @@ test("configuration wizard awaits remote connectivity before completion", async assert.match(logs.join("\n"), /OK Connected to nvidia successfully/); assert.match(logs.join("\n"), /Configuration complete!/); const config = JSON.parse(fs.readFileSync(path.join(cwd, "config.json"), "utf8")); + assert.equal(config.modelProfiles["claude-3-5-sonnet-latest"]?.providerModel, "z-ai/glm-5.1"); assert.equal(config.modelProfiles["claude-3-5-sonnet-glm"]?.providerModel, "z-ai/glm4.7"); assert.equal(config.modelProfiles["claude-3-5-sonnet-qwen"]?.providerModel, "qwen/qwen3.5-122b-a10b"); }); diff --git a/tests/openai.test.ts b/tests/openai.test.ts index 1590fb0..e80e5b5 100644 --- a/tests/openai.test.ts +++ b/tests/openai.test.ts @@ -275,6 +275,83 @@ test("polls pending NVIDIA responses until the result is ready", async () => { } }); +test("automatically shrinks completion budget when the provider reports a context overflow", async () => { + const originalFetch = globalThis.fetch; + const maxTokensSeen: number[] = []; + let attempts = 0; + + globalThis.fetch = async (_input, init) => { + attempts += 1; + const requestBody = JSON.parse(String(init?.body)) as { max_tokens: number }; + 
maxTokensSeen.push(requestBody.max_tokens); + + if (attempts === 1) { + return new Response( + JSON.stringify({ + error: + "This model's maximum context length is 4096 tokens. However, you requested 4439 tokens (343 in the messages, 4096 in the completion). Please reduce the length of the messages or completion." + }), + { + status: 400, + headers: { + "content-type": "application/json" + } + } + ); + } + + return new Response( + JSON.stringify({ + model: "test-model", + choices: [ + { + message: { + role: "assistant", + content: "ok" + }, + finish_reason: "stop" + } + ], + usage: { + prompt_tokens: 343, + completion_tokens: 12 + } + }), + { + status: 200, + headers: { + "content-type": "application/json" + } + } + ); + }; + + try { + const result = await callOpenAICompatibleBackend({ + backend: { + baseUrl: "https://provider.test/v1", + apiKeyEnv: "TEST_API_KEY", + defaultModel: "test-model" + }, + request: { + model: "test-model", + messages: [ + { + role: "user", + content: "Say ok" + } + ], + max_tokens: 4096 + } + }); + + assert.equal(result.text, "ok"); + assert.deepEqual(maxTokensSeen, [4096, 3752]); + } finally { + globalThis.fetch = originalFetch; + } +}); + test("formats completed responses as Anthropic SSE events", () => { const stream = buildAnthropicStream({ id: "msg_test", diff --git a/tests/profile.test.ts b/tests/profile.test.ts index 24e5ca1..f21521a 100644 --- a/tests/profile.test.ts +++ b/tests/profile.test.ts @@ -23,14 +23,14 @@ function writeNvidiaConfig(cwd: string): void { nvidia: { baseUrl: "https://integrate.api.nvidia.com/v1", apiKeyEnv: "NVIDIA_API_KEY", - defaultModel: "stepfun-ai/step-3.5-flash" + defaultModel: "z-ai/glm-5.1" } }, modelMap: {}, modelProfiles: { "claude-3-5-sonnet-latest": { backend: "nvidia", - providerModel: "stepfun-ai/step-3.5-flash" + providerModel: "z-ai/glm-5.1" }, "claude-3-5-sonnet-glm": { backend: "nvidia", diff --git a/tests/status.test.ts b/tests/status.test.ts index 2d9b76c..432e6bc 100644 --- 
a/tests/status.test.ts +++ b/tests/status.test.ts @@ -22,7 +22,7 @@ function createStatusDirectory(env = "NVIDIA_API_KEY=test-key\nCLAUDIA_CLAUDE_MO nvidia: { baseUrl: "https://example.invalid/v1", apiKeyEnv: "NVIDIA_API_KEY", - defaultModel: "stepfun-ai/step-3.5-flash" + defaultModel: "z-ai/glm-5.1" } }, modelMap: {},