Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions QUICKSTART.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ If your NVIDIA key changes later, run `npm run key`.

## Advanced: NVIDIA NIM (Recommended for Quality)

NVIDIA hosted models like `stepfun-ai/step-3.5-flash` and `z-ai/glm4.7`.
NVIDIA hosted models like `z-ai/glm-5.1`, `z-ai/glm4.7`, and `qwen/qwen3.5-122b-a10b`.
From the cloned repo root:

```sh
Expand Down Expand Up @@ -130,10 +130,10 @@ claudia-claude --model local-model
| `npm run release:check` | Release gate: typecheck + tests + build + package smoke |
| `npm run config` | Re-run the configuration wizard |
| `claudia-claude` | Launch Claude Code connected to the router |
| `npm run claude:fast` | Fast coding model (stepfun-ai/step-3.5-flash) |
| `npm run claude:glm` | High-quality model with thinking (z-ai/glm4.7) |
| `npm run claude:qwen` | Qwen coding model (qwen/qwen3.5-122b-a10b) |
| `npm run claude:smoke` | Quick smoke test (nemotron-mini-4b) |
| `npm run claude:fast` | Default long-context model (z-ai/glm-5.1) |
| `npm run claude:glm` | High-quality thinking model, slower on purpose (z-ai/glm4.7) |
| `npm run claude:qwen` | Backup coding model, less consistent on complex code (qwen/qwen3.5-122b-a10b) |
| `npm run claude:smoke` | Quick smoke test only (nemotron-mini-4b) |

---

Expand Down
11 changes: 9 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,14 @@ npm run claude:fast -- --managed-auth

If you see a managed-login warning, remove `--managed-auth`. Claude managed credentials are sent only to the local router; your NVIDIA key is sent to NVIDIA by the router.

The fast script and default wrapper route `claude-3-5-sonnet-latest` to NVIDIA `stepfun-ai/step-3.5-flash`. Use `npm run claude:glm` for the slower GLM quality profile, `npm run claude:qwen` for the Qwen fallback, or `npm run claude:smoke` to test routing with the smallest configured model.
The fast script and default wrapper route `claude-3-5-sonnet-latest` to NVIDIA `z-ai/glm-5.1`. Use `npm run claude:glm` for the slower thinking-heavy GLM quality profile, `npm run claude:qwen` for the Qwen fallback, or `npm run claude:smoke` to test routing with the smallest configured model.

Model tradeoffs:

- `fast`: best default for long prompts and coding; slower than smaller models, but much less likely to hit context limits
- `glm`: stronger on hard tasks when it reasons longer, but slower
- `qwen`: backup option when you want a different model family, but less consistent on complex code
- `smoke`: smallest and quickest option for health checks, not real work

### Check the router

Expand Down Expand Up @@ -169,7 +176,7 @@ LOG_LEVEL=info

2. Keep `defaultBackend` set to `nvidia` in `config.json`.

3. Use a mapped Claude-style model alias such as `claude-3-5-sonnet-latest`, or send any model name and Claudia Router will use the NVIDIA backend default model.
3. Use a mapped Claude-style model alias such as `claude-3-5-sonnet-latest`, or send any other model name and Claudia Router will fall back to the NVIDIA backend default model (`z-ai/glm-5.1`).

If you want to switch providers later, use `npm run init -- --provider openrouter` or `npm run init -- --provider local`. Use `npm run config` if you prefer the interactive provider picker.

Expand Down
16 changes: 8 additions & 8 deletions config.example.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"nvidia": {
"baseUrl": "https://integrate.api.nvidia.com/v1",
"apiKeyEnv": "NVIDIA_API_KEY",
"defaultModel": "stepfun-ai/step-3.5-flash"
"defaultModel": "z-ai/glm-5.1"
},
"openrouter": {
"baseUrl": "https://openrouter.ai/api/v1",
Expand All @@ -21,10 +21,10 @@
"modelProfiles": {
"claude-3-5-sonnet-latest": {
"backend": "nvidia",
"providerModel": "stepfun-ai/step-3.5-flash",
"providerModel": "z-ai/glm-5.1",
"retryAttempts": 3,
"retryBaseDelayMs": 500,
"notes": "Fast NVIDIA coding profile",
"notes": "Default long-context NVIDIA coding profile; better for big prompts, slightly slower than smaller models",
"capabilities": {
"toolCalls": true,
"coding": true
Expand All @@ -41,7 +41,7 @@
"clear_thinking": false
}
},
"notes": "Higher-quality GLM coding profile; slower because thinking is enabled",
"notes": "Thinking-heavy GLM coding profile; slower, but stronger for hard coding tasks",
"capabilities": {
"toolCalls": true,
"coding": true
Expand All @@ -58,7 +58,7 @@
"clear_thinking": false
}
},
"notes": "Explicit GLM 4.7 profile for harder coding tasks",
"notes": "Explicit GLM 4.7 profile for harder coding tasks; slower than the default profile",
"capabilities": {
"toolCalls": true,
"coding": true
Expand All @@ -69,7 +69,7 @@
"providerModel": "qwen/qwen3.5-122b-a10b",
"retryAttempts": 3,
"retryBaseDelayMs": 500,
"notes": "Qwen fallback NVIDIA coding profile",
"notes": "Qwen fallback NVIDIA coding profile; useful as a backup, but less consistent on complex code",
"capabilities": {
"toolCalls": true,
"coding": true
Expand All @@ -80,7 +80,7 @@
"providerModel": "nvidia/nemotron-mini-4b-instruct",
"retryAttempts": 1,
"retryBaseDelayMs": 250,
"notes": "Smoke-test/free-small NVIDIA profile",
"notes": "Smoke-test profile for quick checks; not meant for real coding sessions",
"capabilities": {
"toolCalls": false,
"coding": false
Expand All @@ -90,7 +90,7 @@
"modelMap": {
"legacy-claude-3-5-sonnet-latest": {
"backend": "nvidia",
"model": "stepfun-ai/step-3.5-flash"
"model": "z-ai/glm-5.1"
}
}
}
4 changes: 2 additions & 2 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "claudia-router",
"version": "0.1.1",
"version": "0.1.2",
"description": "A lightweight Anthropic-compatible local router for sending Claude-style coding requests to OpenAI-compatible model backends.",
"type": "module",
"main": "dist/index.js",
Expand Down
8 changes: 4 additions & 4 deletions scripts/claudia-claude.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,10 @@ Claudia Router Model Profiles

Shortcuts (use with --model or in npm scripts):

--model fast Fast coding: stepfun-ai/step-3.5-flash (NVIDIA)
--model glm High-quality: z-ai/glm4.7 with thinking (NVIDIA)
--model qwen Fallback: qwen/qwen3.5-122b-a10b (NVIDIA)
--model smoke Lightweight: nvidia/nemotron-mini-4b-instruct (NVIDIA)
--model fast Default: z-ai/glm-5.1 (NVIDIA) — best long-context option, a bit slower
--model glm Thinking-heavy: z-ai/glm4.7 (NVIDIA) — slower, but better on hard tasks
--model qwen Fallback: qwen/qwen3.5-122b-a10b (NVIDIA) — useful fallback, less consistent
--model smoke Lightweight: nvidia/nemotron-mini-4b-instruct (NVIDIA) — for quick checks only

Built-in npm scripts:

Expand Down
20 changes: 10 additions & 10 deletions scripts/presets.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -4,30 +4,30 @@ export const PROFILE_PRESETS = {
fast: {
alias: "fast",
model: "claude-3-5-sonnet-latest",
description: "Default coding preset",
description: "Default long-context preset",
nextCommand: "npm run claude:fast",
notes: "Fast coding profile"
notes: "Default GLM-5.1 routing profile; strongest context window, but slower than smaller models"
},
glm: {
alias: "glm",
model: "claude-3-5-sonnet-glm",
description: "Higher-quality GLM preset",
description: "Thinking-heavy preset",
nextCommand: "npm run claude:glm",
notes: "Explicit GLM 4.7 profile for harder coding tasks"
notes: "More deliberate reasoning, but slower and better for hard coding tasks"
},
qwen: {
alias: "qwen",
model: "claude-3-5-sonnet-qwen",
description: "Qwen fallback preset",
description: "Fallback preset",
nextCommand: "npm run claude:qwen",
notes: "Qwen fallback NVIDIA coding profile"
notes: "Useful when GLM is unavailable, but less consistent on complex code"
},
smoke: {
alias: "smoke",
model: "claude-3-haiku-latest",
description: "Smallest smoke-test preset",
nextCommand: "npm run claude:smoke",
notes: "Smoke-test/free-small NVIDIA profile"
notes: "Fast and cheap for checks, but not intended for real coding work"
}
};

Expand Down Expand Up @@ -69,9 +69,9 @@ export function buildInteractiveChoices(config) {
key: alias,
kind: "profile",
profileName: alias,
label: alias,
description: PROFILE_PRESETS[alias].description
}));
label: alias,
description: PROFILE_PRESETS[alias].description
}));

return [...profileChoices, ...INTERACTIVE_PROVIDER_CHOICES];
}
Expand Down
6 changes: 3 additions & 3 deletions scripts/profile.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ import {
const USAGE = `Usage: claudia-router profile [name|show|list|toggle]

Commands:
fast Set the active Claude profile to fast
glm Set the active Claude profile to GLM quality
qwen Set the active Claude profile to Qwen fallback
fast Set the active Claude profile to the default long-context model
glm Set the active Claude profile to the slower thinking-heavy model
qwen Set the active Claude profile to the backup model
smoke Set the active Claude profile to the smoke-test model
list Show all available profile presets
toggle Switch between fast and glm
Expand Down
4 changes: 2 additions & 2 deletions scripts/providers.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ export const PROVIDERS = {
name: "NVIDIA NIM",
baseUrl: "https://integrate.api.nvidia.com/v1",
apiKeyEnv: "NVIDIA_API_KEY",
defaultModel: "stepfun-ai/step-3.5-flash",
defaultModel: "z-ai/glm-5.1",
smokeModel: "nvidia/nemotron-mini-4b-instruct",
requiresKey: true,
description: "Fast, high-quality models hosted by NVIDIA"
description: "Long-context and coding-capable models hosted by NVIDIA"
},
openrouter: {
key: "openrouter",
Expand Down
44 changes: 43 additions & 1 deletion src/openai.ts
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ export async function callOpenAICompatibleBackend(args: {
const timeout = setTimeout(() => controller.abort(), PROVIDER_TIMEOUT_MS);
const maxAttempts = Math.max(1, args.retryAttempts ?? PROVIDER_DEFAULT_MAX_ATTEMPTS);
const retryBaseDelayMs = Math.max(0, args.retryBaseDelayMs ?? PROVIDER_DEFAULT_RETRY_BASE_MS);
let requestToSend = args.request;

try {
const headers: Record<string, string> = {
Expand All @@ -209,7 +210,7 @@ export async function callOpenAICompatibleBackend(args: {
response = await fetch(`${args.backend.baseUrl}/chat/completions`, {
method: "POST",
headers,
body: JSON.stringify(args.request),
body: JSON.stringify(requestToSend),
signal: controller.signal
});

Expand All @@ -224,6 +225,31 @@ export async function callOpenAICompatibleBackend(args: {
}));
}

const contextLimit = parseContextLengthError(bodyText);
if (
response.status === 400 &&
contextLimit &&
requestToSend.max_tokens > 1
) {
if (contextLimit.promptTokens >= contextLimit.limit) {
throw new ClaudiaError(
"invalid_request_error",
`Prompt exceeds the model context window of ${contextLimit.limit} tokens. Choose a larger-context model or shorten the conversation.`,
400
);
}

const adjustedMaxTokens = Math.max(1, contextLimit.limit - contextLimit.promptTokens - 1);

if (adjustedMaxTokens < requestToSend.max_tokens) {
requestToSend = {
...requestToSend,
max_tokens: adjustedMaxTokens
};
continue;
}
}

if (response.ok || !shouldRetryProviderStatus(response.status) || attempt === maxAttempts) {
break;
}
Expand Down Expand Up @@ -320,6 +346,22 @@ function truncateProviderBody(body: string): string {
return body.length > 500 ? `${body.slice(0, 500)}...` : body;
}

/**
 * Extracts token counts from a provider "maximum context length" error body.
 *
 * The text is searched (case-insensitively) for a message of the form
 * "maximum context length is N tokens ... requested T tokens (P in the
 * messages, C in the completion)". Arbitrary text, including newlines, may sit
 * between the two halves of the message.
 *
 * @param bodyText Raw response body text to inspect.
 * @returns The context limit (N), prompt token count (P) and completion token
 *   count (C) as numbers, or `null` when the text contains no such message.
 */
function parseContextLengthError(bodyText: string): { limit: number; promptTokens: number; completionTokens: number } | null {
const overflowPattern =
/maximum context length is (\d+) tokens[\s\S]*?requested (\d+) tokens \((\d+) in the messages, (\d+) in the completion\)/i;
const captured = overflowPattern.exec(bodyText);

if (captured === null) {
return null;
}

// Capture group 2 (the total requested tokens) is matched but not returned.
const [, limit, , promptTokens, completionTokens] = captured;

return {
limit: Number(limit),
promptTokens: Number(promptTokens),
completionTokens: Number(completionTokens)
};
}

async function pollPendingProviderResponse(args: {
backend: BackendConfig;
headers: Record<string, string>;
Expand Down
1 change: 1 addition & 0 deletions tests/claudia-config.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ test("configuration wizard awaits remote connectivity before completion", async
assert.match(logs.join("\n"), /OK Connected to nvidia successfully/);
assert.match(logs.join("\n"), /Configuration complete!/);
const config = JSON.parse(fs.readFileSync(path.join(cwd, "config.json"), "utf8"));
assert.equal(config.modelProfiles["claude-3-5-sonnet-latest"]?.providerModel, "z-ai/glm-5.1");
assert.equal(config.modelProfiles["claude-3-5-sonnet-glm"]?.providerModel, "z-ai/glm4.7");
assert.equal(config.modelProfiles["claude-3-5-sonnet-qwen"]?.providerModel, "qwen/qwen3.5-122b-a10b");
});
77 changes: 77 additions & 0 deletions tests/openai.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,83 @@ test("polls pending NVIDIA responses until the result is ready", async () => {
}
});

// Verifies the context-overflow recovery path: the first provider reply is a 400
// "maximum context length" error, and the follow-up request must carry a smaller
// max_tokens value that fits inside the reported limit.
test("automatically shrinks completion budget when the provider reports a context overflow", async () => {
// Builds a JSON Response with the given HTTP status.
const jsonResponse = (status: number, payload: unknown): Response =>
new Response(JSON.stringify(payload), {
status,
headers: {
"content-type": "application/json"
}
});

// Provider error reporting a 4096-token limit with 343 prompt tokens.
const overflowPayload = {
error:
"This model's maximum context length is 4096 tokens. However, you requested 4439 tokens (343 in the messages, 4096 in the completion). Please reduce the length of the messages or completion."
};

// Ordinary successful completion returned once the budget has been reduced.
const successPayload = {
model: "test-model",
choices: [
{
message: {
role: "assistant",
content: "ok"
},
finish_reason: "stop"
}
],
usage: {
prompt_tokens: 343,
completion_tokens: 12
}
};

const savedFetch = globalThis.fetch;
const observedBudgets: number[] = [];
let callCount = 0;

// Stub fetch: record the max_tokens of every outgoing request, fail the first call only.
globalThis.fetch = async (_input, init) => {
callCount += 1;
const outgoing = JSON.parse(String(init?.body)) as { max_tokens: number };
observedBudgets.push(outgoing.max_tokens);

return callCount === 1 ? jsonResponse(400, overflowPayload) : jsonResponse(200, successPayload);
};

try {
const backend = {
baseUrl: "https://provider.test/v1",
apiKeyEnv: "TEST_API_KEY",
defaultModel: "test-model"
};
const request = {
model: "test-model",
messages: [
{
role: "user",
content: "Say ok"
}
],
max_tokens: 4096
};

const result = await callOpenAICompatibleBackend({ backend, request });

assert.equal(result.text, "ok");
// 4096 (limit) - 343 (prompt tokens) - 1 = 3752 on the second attempt.
assert.deepEqual(observedBudgets, [4096, 3752]);
} finally {
// Always put the real fetch back so later tests are unaffected.
globalThis.fetch = savedFetch;
}
});

test("formats completed responses as Anthropic SSE events", () => {
const stream = buildAnthropicStream({
id: "msg_test",
Expand Down
Loading
Loading