ChunkyMonkey11 · ChunkyMonkey11 · Jun 2, 2026 · Jun 2, 2026
diff --git a/QUICKSTART.md b/QUICKSTART.md
@@ -44,7 +44,7 @@ If your NVIDIA key changes later, run `npm run key`.
 
 ## Advanced: NVIDIA NIM (Recommended for Quality)
 
-NVIDIA hosted models like `stepfun-ai/step-3.5-flash` and `z-ai/glm4.7`.
+NVIDIA NIM hosts models such as `z-ai/glm-5.1`, `z-ai/glm4.7`, and `qwen/qwen3.5-122b-a10b`.
 From the cloned repo root:
 
 ```sh
@@ -130,10 +130,10 @@ claudia-claude --model local-model
 | `npm run release:check` | Release gate: typecheck + tests + build + package smoke |
 | `npm run config` | Re-run the configuration wizard |
 | `claudia-claude` | Launch Claude Code connected to the router |
-| `npm run claude:fast` | Fast coding model (stepfun-ai/step-3.5-flash) |
-| `npm run claude:glm` | High-quality model with thinking (z-ai/glm4.7) |
-| `npm run claude:qwen` | Qwen coding model (qwen/qwen3.5-122b-a10b) |
-| `npm run claude:smoke` | Quick smoke test (nemotron-mini-4b) |
+| `npm run claude:fast` | Default long-context model (z-ai/glm-5.1) |
+| `npm run claude:glm` | High-quality thinking model, slower on purpose (z-ai/glm4.7) |
+| `npm run claude:qwen` | Backup coding model, less consistent on complex code (qwen/qwen3.5-122b-a10b) |
+| `npm run claude:smoke` | Quick smoke test only (nemotron-mini-4b) |
 
 ---
 

diff --git a/README.md b/README.md
@@ -125,7 +125,14 @@ npm run claude:fast -- --managed-auth
 
 If you see a managed-login warning, remove `--managed-auth`. Claude managed credentials are sent only to the local router; your NVIDIA key is sent to NVIDIA by the router.
 
-The fast script and default wrapper route `claude-3-5-sonnet-latest` to NVIDIA `stepfun-ai/step-3.5-flash`. Use `npm run claude:glm` for the slower GLM quality profile, `npm run claude:qwen` for the Qwen fallback, or `npm run claude:smoke` to test routing with the smallest configured model.
+The fast script and default wrapper route `claude-3-5-sonnet-latest` to NVIDIA `z-ai/glm-5.1`. Use `npm run claude:glm` for the slower thinking-heavy GLM quality profile, `npm run claude:qwen` for the Qwen fallback, or `npm run claude:smoke` to test routing with the smallest configured model.
+
+Model tradeoffs:
+
+- `fast`: best default for long prompts and coding; slower than smaller models, but much less likely to hit context limits
+- `glm`: stronger on hard tasks when it reasons longer, but slower
+- `qwen`: backup option when you want a different model family, but less consistent on complex code
+- `smoke`: smallest and quickest option for health checks, not real work
 
 ### Check the router
 
@@ -169,7 +176,7 @@ LOG_LEVEL=info
 
 2. Keep `defaultBackend` set to `nvidia` in `config.json`.
 
-3. Use a mapped Claude-style model alias such as `claude-3-5-sonnet-latest`, or send any model name and Claudia Router will use the NVIDIA backend default model.
+3. Use a mapped Claude-style model alias such as `claude-3-5-sonnet-latest`, or send any model name and Claudia Router will use the NVIDIA backend default model (`z-ai/glm-5.1`).
 
 If you want to switch providers later, use `npm run init -- --provider openrouter` or `npm run init -- --provider local`. Use `npm run config` if you prefer the interactive provider picker.
 

diff --git a/config.example.json b/config.example.json
@@ -5,7 +5,7 @@
     "nvidia": {
       "baseUrl": "https://integrate.api.nvidia.com/v1",
       "apiKeyEnv": "NVIDIA_API_KEY",
-      "defaultModel": "stepfun-ai/step-3.5-flash"
+      "defaultModel": "z-ai/glm-5.1"
     },
     "openrouter": {
       "baseUrl": "https://openrouter.ai/api/v1",
@@ -21,10 +21,10 @@
   "modelProfiles": {
     "claude-3-5-sonnet-latest": {
       "backend": "nvidia",
-      "providerModel": "stepfun-ai/step-3.5-flash",
+      "providerModel": "z-ai/glm-5.1",
       "retryAttempts": 3,
       "retryBaseDelayMs": 500,
-      "notes": "Fast NVIDIA coding profile",
+      "notes": "Default long-context NVIDIA coding profile; better for big prompts, slightly slower than smaller models",
       "capabilities": {
         "toolCalls": true,
         "coding": true
@@ -41,7 +41,7 @@
           "clear_thinking": false
         }
       },
-      "notes": "Higher-quality GLM coding profile; slower because thinking is enabled",
+      "notes": "Thinking-heavy GLM coding profile; slower, but stronger for hard coding tasks",
       "capabilities": {
         "toolCalls": true,
         "coding": true
@@ -58,7 +58,7 @@
           "clear_thinking": false
         }
       },
-      "notes": "Explicit GLM 4.7 profile for harder coding tasks",
+      "notes": "Explicit GLM 4.7 profile for harder coding tasks; slower than the default profile",
       "capabilities": {
         "toolCalls": true,
         "coding": true
@@ -69,7 +69,7 @@
       "providerModel": "qwen/qwen3.5-122b-a10b",
       "retryAttempts": 3,
       "retryBaseDelayMs": 500,
-      "notes": "Qwen fallback NVIDIA coding profile",
+      "notes": "Qwen fallback NVIDIA coding profile; useful as a backup, but less consistent on complex code",
       "capabilities": {
         "toolCalls": true,
         "coding": true
@@ -80,7 +80,7 @@
       "providerModel": "nvidia/nemotron-mini-4b-instruct",
       "retryAttempts": 1,
       "retryBaseDelayMs": 250,
-      "notes": "Smoke-test/free-small NVIDIA profile",
+      "notes": "Smoke-test profile for quick checks; not meant for real coding sessions",
       "capabilities": {
         "toolCalls": false,
         "coding": false
@@ -90,7 +90,7 @@
   "modelMap": {
     "legacy-claude-3-5-sonnet-latest": {
       "backend": "nvidia",
-      "model": "stepfun-ai/step-3.5-flash"
+      "model": "z-ai/glm-5.1"
     }
   }
 }
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "claudia-router",
-  "version": "0.1.1",
+  "version": "0.1.2",
   "description": "A lightweight Anthropic-compatible local router for sending Claude-style coding requests to OpenAI-compatible model backends.",
   "type": "module",
   "main": "dist/index.js",

diff --git a/scripts/claudia-claude.mjs b/scripts/claudia-claude.mjs
@@ -91,10 +91,10 @@ Claudia Router Model Profiles
 
 Shortcuts (use with --model or in npm scripts):
 
-  --model fast     Fast coding: stepfun-ai/step-3.5-flash (NVIDIA)
-  --model glm      High-quality: z-ai/glm4.7 with thinking (NVIDIA)
-  --model qwen     Fallback: qwen/qwen3.5-122b-a10b (NVIDIA)
-  --model smoke    Lightweight: nvidia/nemotron-mini-4b-instruct (NVIDIA)
+  --model fast     Default: z-ai/glm-5.1 (NVIDIA) — best long-context option, a bit slower
+  --model glm      Thinking-heavy: z-ai/glm4.7 (NVIDIA) — slower, but better on hard tasks
+  --model qwen     Fallback: qwen/qwen3.5-122b-a10b (NVIDIA) — useful fallback, less consistent
+  --model smoke    Lightweight: nvidia/nemotron-mini-4b-instruct (NVIDIA) — for quick checks only
 
   Built-in npm scripts:
 

diff --git a/scripts/presets.mjs b/scripts/presets.mjs
@@ -4,30 +4,30 @@ export const PROFILE_PRESETS = {
   fast: {
     alias: "fast",
     model: "claude-3-5-sonnet-latest",
-    description: "Default coding preset",
+    description: "Default long-context preset",
     nextCommand: "npm run claude:fast",
-    notes: "Fast coding profile"
+    notes: "Default GLM-5.1 routing profile; strongest context window, but slower than smaller models"
   },
   glm: {
     alias: "glm",
     model: "claude-3-5-sonnet-glm",
-    description: "Higher-quality GLM preset",
+    description: "Thinking-heavy preset",
     nextCommand: "npm run claude:glm",
-    notes: "Explicit GLM 4.7 profile for harder coding tasks"
+    notes: "More deliberate reasoning, but slower and better for hard coding tasks"
   },
   qwen: {
     alias: "qwen",
     model: "claude-3-5-sonnet-qwen",
-    description: "Qwen fallback preset",
+    description: "Fallback preset",
     nextCommand: "npm run claude:qwen",
-    notes: "Qwen fallback NVIDIA coding profile"
+    notes: "Useful when GLM is unavailable, but less consistent on complex code"
   },
   smoke: {
     alias: "smoke",
     model: "claude-3-haiku-latest",
     description: "Smallest smoke-test preset",
     nextCommand: "npm run claude:smoke",
-    notes: "Smoke-test/free-small NVIDIA profile"
+    notes: "Fast and cheap for checks, but not intended for real coding work"
   }
 };
 
@@ -69,9 +69,9 @@ export function buildInteractiveChoices(config) {
     key: alias,
     kind: "profile",
     profileName: alias,
-    label: alias,
-    description: PROFILE_PRESETS[alias].description
-  }));
+      label: alias,
+      description: PROFILE_PRESETS[alias].description
+    }));
 
   return [...profileChoices, ...INTERACTIVE_PROVIDER_CHOICES];
 }

diff --git a/scripts/profile.mjs b/scripts/profile.mjs
@@ -17,9 +17,9 @@ import {
 const USAGE = `Usage: claudia-router profile [name|show|list|toggle]
 
 Commands:
-  fast     Set the active Claude profile to fast
-  glm      Set the active Claude profile to GLM quality
-  qwen     Set the active Claude profile to Qwen fallback
+  fast     Set the active Claude profile to the default long-context model
+  glm      Set the active Claude profile to the slower thinking-heavy model
+  qwen     Set the active Claude profile to the backup model
   smoke    Set the active Claude profile to the smoke-test model
   list     Show all available profile presets
   toggle   Switch between fast and glm

diff --git a/scripts/providers.mjs b/scripts/providers.mjs
@@ -4,10 +4,10 @@ export const PROVIDERS = {
     name: "NVIDIA NIM",
     baseUrl: "https://integrate.api.nvidia.com/v1",
     apiKeyEnv: "NVIDIA_API_KEY",
-    defaultModel: "stepfun-ai/step-3.5-flash",
+    defaultModel: "z-ai/glm-5.1",
     smokeModel: "nvidia/nemotron-mini-4b-instruct",
     requiresKey: true,
-    description: "Fast, high-quality models hosted by NVIDIA"
+    description: "Long-context and coding-capable models hosted by NVIDIA"
   },
   openrouter: {
     key: "openrouter",

diff --git a/src/openai.ts b/src/openai.ts
@@ -192,6 +192,7 @@ export async function callOpenAICompatibleBackend(args: {
   const timeout = setTimeout(() => controller.abort(), PROVIDER_TIMEOUT_MS);
   const maxAttempts = Math.max(1, args.retryAttempts ?? PROVIDER_DEFAULT_MAX_ATTEMPTS);
   const retryBaseDelayMs = Math.max(0, args.retryBaseDelayMs ?? PROVIDER_DEFAULT_RETRY_BASE_MS);
+  let requestToSend = args.request;
 
   try {
     const headers: Record<string, string> = {
@@ -209,7 +210,7 @@ export async function callOpenAICompatibleBackend(args: {
       response = await fetch(`${args.backend.baseUrl}/chat/completions`, {
         method: "POST",
         headers,
-        body: JSON.stringify(args.request),
+        body: JSON.stringify(requestToSend),
         signal: controller.signal
       });
 
@@ -224,6 +225,31 @@ export async function callOpenAICompatibleBackend(args: {
         }));
       }
 
+      const contextLimit = parseContextLengthError(bodyText); // null unless the body matches the context-length error wording
+      if (
+        response.status === 400 &&
+        contextLimit &&
+        requestToSend.max_tokens > 1 // at 1 the completion budget cannot be reduced further
+      ) {
+        if (contextLimit.promptTokens >= contextLimit.limit) { // the prompt alone fills the window, so shrinking max_tokens cannot help
+          throw new ClaudiaError(
+            "invalid_request_error",
+            `Prompt exceeds the model context window of ${contextLimit.limit} tokens. Choose a larger-context model or shorten the conversation.`,
+            400
+          );
+        }
+        // Leave one token of headroom under the reported limit, but never request fewer than 1.
+        const adjustedMaxTokens = Math.max(1, contextLimit.limit - contextLimit.promptTokens - 1);
+        // Resend only when this actually lowers the completion budget.
+        if (adjustedMaxTokens < requestToSend.max_tokens) {
+          requestToSend = { // copy instead of mutating args.request
+            ...requestToSend,
+            max_tokens: adjustedMaxTokens
+          };
+          continue; // NOTE(review): the enclosing retry loop is outside this hunk — if `continue` advances `attempt`, an overflow on the final attempt would exit without resending the reduced request; confirm
+        }
+      }
+
       if (response.ok || !shouldRetryProviderStatus(response.status) || attempt === maxAttempts) {
         break;
       }
@@ -320,6 +346,22 @@ function truncateProviderBody(body: string): string {
   return body.length > 500 ? `${body.slice(0, 500)}...` : body;
 }
 
+function parseContextLengthError(bodyText: string): { limit: number; promptTokens: number; completionTokens: number } | null { // extracts token counts from a provider context-overflow error body
+  const match = bodyText.match( // case-insensitive; `[\s\S]*?` lets any text, including newlines, sit between the two sentences
+    /maximum context length is (\d+) tokens[\s\S]*?requested (\d+) tokens \((\d+) in the messages, (\d+) in the completion\)/i
+  );
+  // No match means the body is not in the recognized context-length wording.
+  if (!match) {
+    return null;
+  }
+
+  return { // match[2] (the total requested tokens) is captured but not returned
+    limit: Number(match[1]), // "maximum context length is N tokens"
+    promptTokens: Number(match[3]), // "N in the messages"
+    completionTokens: Number(match[4]) // "N in the completion"
+  };
+}
+
 async function pollPendingProviderResponse(args: {
   backend: BackendConfig;
   headers: Record<string, string>;

diff --git a/tests/claudia-config.test.ts b/tests/claudia-config.test.ts
@@ -71,6 +71,7 @@ test("configuration wizard awaits remote connectivity before completion", async
   assert.match(logs.join("\n"), /OK Connected to nvidia successfully/);
   assert.match(logs.join("\n"), /Configuration complete!/);
   const config = JSON.parse(fs.readFileSync(path.join(cwd, "config.json"), "utf8"));
+  assert.equal(config.modelProfiles["claude-3-5-sonnet-latest"]?.providerModel, "z-ai/glm-5.1");
   assert.equal(config.modelProfiles["claude-3-5-sonnet-glm"]?.providerModel, "z-ai/glm4.7");
   assert.equal(config.modelProfiles["claude-3-5-sonnet-qwen"]?.providerModel, "qwen/qwen3.5-122b-a10b");
 });
diff --git a/tests/openai.test.ts b/tests/openai.test.ts
@@ -275,6 +275,83 @@ test("polls pending NVIDIA responses until the result is ready", async () => {
   }
 });
 
+test("automatically shrinks completion budget when the provider reports a context overflow", async () => {
+  const originalFetch = globalThis.fetch;
+  const maxTokensSeen: number[] = [];
+  let attempts = 0;
+  // Stub fetch: record the max_tokens of every outgoing request; fail the first call, succeed afterwards.
+  globalThis.fetch = async (_input, init) => {
+    attempts += 1;
+    const requestBody = JSON.parse(String(init?.body)) as { max_tokens: number };
+    maxTokensSeen.push(requestBody.max_tokens);
+    // First attempt: HTTP 400 with a context-length error (limit 4096, 343 prompt tokens, 4096 completion tokens).
+    if (attempts === 1) {
+      return new Response(
+        JSON.stringify({
+          error:
+            "This model's maximum context length is 4096 tokens. However, you requested 4439 tokens (343 in the messages, 4096 in the completion). Please reduce the length of the messages or completion."
+        }),
+        {
+          status: 400,
+          headers: {
+            "content-type": "application/json"
+          }
+        }
+      );
+    }
+    // Any later attempt: an ordinary successful chat completion.
+    return new Response(
+      JSON.stringify({
+        model: "test-model",
+        choices: [
+          {
+            message: {
+              role: "assistant",
+              content: "ok"
+            },
+            finish_reason: "stop"
+          }
+        ],
+        usage: {
+          prompt_tokens: 343,
+          completion_tokens: 12
+        }
+      }),
+      {
+        status: 200,
+        headers: {
+          "content-type": "application/json"
+        }
+      }
+    );
+  };
+
+  try {
+    const result = await callOpenAICompatibleBackend({ // no retryAttempts passed, so the module's default attempt count applies
+      backend: {
+        baseUrl: "https://provider.test/v1",
+        apiKeyEnv: "TEST_API_KEY",
+        defaultModel: "test-model"
+      },
+      request: {
+        model: "test-model",
+        messages: [
+          {
+            role: "user",
+            content: "Say ok"
+          }
+        ],
+        max_tokens: 4096
+      }
+    });
+    // The second request must carry the reduced budget: 4096 - 343 - 1 = 3752.
+    assert.equal(result.text, "ok");
+    assert.deepEqual(maxTokensSeen, [4096, 3752]);
+  } finally {
+    globalThis.fetch = originalFetch; // always restore the real fetch for later tests
+  }
+});
+
 test("formats completed responses as Anthropic SSE events", () => {
   const stream = buildAnthropicStream({
     id: "msg_test",