diff --git a/.agents/types/agent-definition.ts b/.agents/types/agent-definition.ts
index abbcbc0cda..6323ec7b77 100644
--- a/.agents/types/agent-definition.ts
+++ b/.agents/types/agent-definition.ts
@@ -431,6 +431,7 @@ export type ModelName =
   | 'z-ai/glm-4.7-flash'
   | 'z-ai/glm-4.7-flash:nitro'
   | 'minimax/minimax-m2.5'
+  | 'minimax/minimax-m2.7'
   | (string & {})
 
 import type { ToolName, GetToolParams } from './tools'
diff --git a/agents/e2e/context-pruning-threshold.e2e.test.ts b/agents/e2e/context-pruning-threshold.e2e.test.ts
new file mode 100644
index 0000000000..e62d213461
--- /dev/null
+++ b/agents/e2e/context-pruning-threshold.e2e.test.ts
@@ -0,0 +1,645 @@
+/**
+ * E2E Test: Context Pruning Threshold Verification
+ *
+ * This test verifies that context pruning triggers at the correct token count
+ * threshold and not prematurely. It uses the real token counting API and
+ * a multi-turn conversation to accumulate context naturally.
+ *
+ * Background: A previous bug caused the token counting API to either fail
+ * (falling back to a local overcounting formula) or apply a 30% buffer
+ * for non-Anthropic models, causing pruning to trigger at ~140k instead
+ * of the 200k limit. This test ensures:
+ *
+ * 1. Pruning does NOT trigger when token count is well below the limit
+ * 2. Pruning DOES trigger when token count exceeds the limit
+ * 3. The token count reported by the API is accurate (no 30% buffer for Anthropic models)
+ * 4. After pruning, tool-call/tool-result pairs remain intact
+ *
+ * Detection strategy: We detect pruning by checking for significant message
+ * count reduction and token count reduction. The context-pruner may produce
+ * a <conversation_summary> message, OR the fallback trimMessagesToFitTokenLimit
+ * may produce <system>Previous message(s) omitted due to length</system>.
+ * Both count as successful pruning for our purposes.
+ */
+
+import { API_KEY_ENV_VAR } from '@codebuff/common/old-constants'
+import {
+  CodebuffClient,
+  initialSessionState,
+  withMessageHistory,
+  type AgentDefinition,
+  type Message,
+  type ToolMessage,
+  type JSONValue,
+} from '@codebuff/sdk'
+import { describe, expect, it } from 'bun:test'
+
+import contextPruner from '../context-pruner'
+
+import type { ToolCallPart } from '@codebuff/common/types/messages/content-part'
+
+/**
+ * Narrows an unknown content part to a tool-call part: a non-null object
+ * tagged `type: 'tool-call'` that carries a string `toolCallId`.
+ */
+function isToolCallPart(part: unknown): part is ToolCallPart {
+  // Reject primitives and null before using the `in` operator on them.
+  if (typeof part !== 'object' || part === null) return false
+  if (!('type' in part) || part.type !== 'tool-call') return false
+  if (!('toolCallId' in part)) return false
+
+  const { toolCallId } = part as ToolCallPart
+  return typeof toolCallId === 'string'
+}
+
+/**
+ * Narrows a message to a tool-result message whose `toolCallId` is a string.
+ */
+function isToolMessageWithId(
+  msg: Message,
+): msg is ToolMessage & { toolCallId: string } {
+  // Only role 'tool' messages qualify; anything else is rejected up front.
+  if (msg.role !== 'tool') return false
+  if (!('toolCallId' in msg)) return false
+
+  return typeof msg.toolCallId === 'string'
+}
+
+// Wraps plain text in a single-text-part message for the given role.
+const createMessage = (
+  role: 'user' | 'assistant',
+  content: string,
+): Message => {
+  const textPart = { type: 'text' as const, text: content }
+  return { role, content: [textPart] }
+}
+
+// Builds an assistant message whose only content part is one tool call.
+// The part carries the call id, the tool name and the tool input as given.
+const createToolCallMessage = (
+  toolCallId: string,
+  toolName: string,
+  input: Record<string, unknown>,
+): Message => {
+  const toolCall = {
+    type: 'tool-call' as const,
+    toolCallId,
+    toolName,
+    input,
+  }
+
+  return { role: 'assistant', content: [toolCall] }
+}
+
+// Builds a tool message that reports `value` as the JSON result of a call.
+const createToolResultMessage = (
+  toolCallId: string,
+  toolName: string,
+  value: JSONValue,
+): ToolMessage => {
+  const jsonPart = { type: 'json' as const, value }
+  const result: ToolMessage = { role: 'tool', toolCallId, toolName, content: [jsonPart] }
+
+  return result
+}
+
+/**
+ * Test agent whose handleSteps spawns context-pruner inline before every step,
+ * intended to mirror how base2 runs the pruner in production.
+ *
+ * handleSteps forwards the `params` it receives (or `{}` when absent) to the
+ * pruner, so client.run({ params: { maxContextLength: ... } }) is how each test
+ * sets the pruning limit. The loop ends once a step reports stepsComplete.
+ */
+const testAgent: AgentDefinition = {
+  id: 'context-pruning-threshold-test-agent',
+  displayName: 'Context Pruning Threshold Test Agent',
+  model: 'anthropic/claude-haiku-4.5',
+  includeMessageHistory: true,
+  toolNames: ['spawn_agents'],
+  spawnableAgents: ['context-pruner'],
+  instructionsPrompt: `You are a test agent for verifying context pruning behavior. When the user asks you to do something, do it briefly and concisely. Just say "OK" or "DONE" as requested.`,
+  handleSteps: function* ({ params }) {
+    while (true) {
+      // Spawn context-pruner inline ahead of every model step (same approach base2 uses)
+      yield {
+        toolName: 'spawn_agent_inline',
+        input: {
+          agent_type: 'context-pruner',
+          params: params ?? {},
+        },
+        includeToolCall: false,
+      } as any
+
+      const { stepsComplete } = yield 'STEP'
+      if (stepsComplete) break
+    }
+  },
+}
+
+/**
+ * Builds a message history targeting a specific approximate token count.
+ *
+ * Token estimation uses word-based content (NATO alphabet words repeated)
+ * which tokenizes at a predictable ~4 chars/token for Anthropic models.
+ * This is much more accurate than repeated 'x' characters which compress
+ * to ~5-6 chars/token, making estimates unreliable.
+ *
+ * Each round creates user (8k chars) + assistant (8k chars) +
+ * tool pair every other round (~4k chars). At ~4 chars/token:
+ * - User message: 8k/4 = 2k tokens
+ * - Assistant message: 8k/4 = 2k tokens
+ * - Tool pair (every other round avg): ~550 tokens
+ * - Tokens per round ≈ 4,550
+ * - Plus system prompt + tool definitions add ~15-20k tokens
+ */
+const LARGE_CONTENT_SIZE = 8_000 // chars in each user/assistant message
+const CHARS_PER_TOKEN = 4 // assumed density of the word filler
+// Average per-round token cost of the tool call + result added every other round.
+const TOOL_PAIR_TOKENS = 550
+const TEXT_TOKENS_PER_ROUND = (2 * LARGE_CONTENT_SIZE) / CHARS_PER_TOKEN
+const TOKENS_PER_ROUND = Math.ceil(TEXT_TOKENS_PER_ROUND + TOOL_PAIR_TOKENS)
+
+/**
+ * Diverse NATO-alphabet words that tokenize predictably at ~4 chars/token; repeated 'x'
+ * characters compress to ~5-6 chars/token in Anthropic's BPE tokenizer, skewing estimates.
+ */
+const WORD_FILLER =
+  'alpha bravo charlie delta echo foxtrot golf hotel india juliett kilo lima mike november oscar papa quebec romeo sierra tango uniform victor whiskey xray yankee zulu '
+// Pads prefix to `size` chars. The length is clamped at 0: repeat() throws on a negative count.
+function makeLargeContent(prefix: string, size: number): string {
+  const fillerLength = Math.max(0, size - prefix.length)
+  const filler = WORD_FILLER.repeat(Math.ceil(fillerLength / WORD_FILLER.length))
+  return prefix + filler.slice(0, fillerLength)
+}
+
+function buildMessageHistory(targetApproxTokens: number): Message[] {
+  // Always build at least one round, even for tiny targets.
+  const roundCount = Math.max(1, Math.ceil(targetApproxTokens / TOKENS_PER_ROUND))
+  const builtAt = Date.now()
+  const history: Message[] = []
+
+  console.log(
+    `  Building ${roundCount} rounds for ~${targetApproxTokens} tokens ` +
+    `(est ${TOKENS_PER_ROUND} tokens/round)`,
+  )
+
+  for (let round = 0; round < roundCount; round++) {
+    const label = round + 1
+    // Rounds are timestamped 30s apart, ending just before `builtAt`, so the
+    // context-pruner's cache-miss detection (>5min gap) is not tripped by accident.
+    const roundStart = builtAt - (roundCount - round) * 30_000
+
+    // User turn: diverse word content (~4 chars/token)
+    const userTurn = createMessage(
+      'user',
+      makeLargeContent(`Round ${label}: `, LARGE_CONTENT_SIZE),
+    )
+    userTurn.sentAt = roundStart
+
+    // Assistant turn, stamped 10s after the user turn
+    const assistantTurn = createMessage(
+      'assistant',
+      makeLargeContent(`Response ${label}: `, LARGE_CONTENT_SIZE),
+    )
+    assistantTurn.sentAt = roundStart + 10_000
+    history.push(userTurn, assistantTurn)
+
+    // Odd-indexed rounds get no tool traffic
+    if (round % 2 !== 0) continue
+
+    // Even-indexed rounds also get a read_files call/result pair for realism
+    const toolCallId = `call-${round}`
+    const filePath = `file-${round}.ts`
+    const fileContent = makeLargeContent('', LARGE_CONTENT_SIZE / 2)
+    history.push(
+      createToolCallMessage(toolCallId, 'read_files', { paths: [filePath] }),
+      createToolResultMessage(toolCallId, 'read_files', { content: fileContent }),
+    )
+  }
+
+  return history
+}
+
+/**
+ * Detects whether context pruning occurred by checking for:
+ * 1. <conversation_summary> tag in a user message (context-pruner's output)
+ * 2. <system>Previous message(s) omitted due to length</system> (trimMessagesToFitTokenLimit fallback)
+ * 3. Significant message count reduction (>50% fewer messages than original)
+ *
+ * @param finalMessages - message history after the run
+ * @param originalMessageCount - number of messages supplied before the run
+ * @returns each signal plus `wasPruned`, which is true when any signal fired
+ */
+function detectPruning(
+  finalMessages: Message[],
+  originalMessageCount: number,
+): {
+  wasPruned: boolean
+  hasSummary: boolean
+  hasTrimFallback: boolean
+  messageReduction: number
+} {
+  // Summary marker: only user messages are considered.
+  const hasSummary = finalMessages.some(
+    (msg) => msg.role === 'user' && messageHasText(msg, '<conversation_summary>'),
+  )
+  // Trim-fallback marker: may appear in a message of any role.
+  const hasTrimFallback = finalMessages.some((msg) =>
+    messageHasText(msg, 'Previous message(s) omitted'),
+  )
+
+  // Fraction of the original messages that disappeared (0 when none were supplied).
+  const messageReduction =
+    originalMessageCount > 0
+      ? 1 - finalMessages.length / originalMessageCount
+      : 0
+
+  const wasPruned = hasSummary || hasTrimFallback || messageReduction > 0.5
+
+  return { wasPruned, hasSummary, hasTrimFallback, messageReduction }
+}
+
+/**
+ * True when a text part of `msg` contains `needle`; null or non-object parts never match.
+ */
+function messageHasText(msg: Message, needle: string): boolean {
+  if (!Array.isArray(msg.content)) return false
+  return msg.content.some((part: unknown) => {
+    if (typeof part !== 'object' || part === null) return false
+    if (!('type' in part) || part.type !== 'text') return false
+    return 'text' in part && typeof part.text === 'string' && part.text.includes(needle)
+  })
+}
+
+/**
+ * Asserts that tool calls and tool results pair up by toolCallId in both directions.
+ * Anthropic API rejects requests with orphaned tool calls or results.
+ */
+function verifyToolCallPairIntegrity(messages: Message[]) {
+  const callIds = new Set<string>()
+  const resultIds = new Set<string>()
+
+  for (const message of messages) {
+    // A role 'tool' message contributes a result id and is never scanned for calls.
+    if (isToolMessageWithId(message)) {
+      resultIds.add(message.toolCallId)
+      continue
+    }
+    if (message.role !== 'assistant' || !Array.isArray(message.content)) continue
+
+    for (const part of message.content) {
+      if (isToolCallPart(part)) callIds.add(part.toolCallId)
+    }
+  }
+
+  // Every tool result must have a matching tool call
+  resultIds.forEach((resultId) => {
+    expect(callIds.has(resultId)).toBe(true)
+  })
+  // Every tool call must have a matching tool result
+  callIds.forEach((callId) => {
+    expect(resultIds.has(callId)).toBe(true)
+  })
+}
+
+describe('Context Pruning Threshold E2E', () => {
+  it(
+    'should NOT prune when token count is well below the limit',
+    async () => {
+      const apiKey = process.env[API_KEY_ENV_VAR]
+      if (!apiKey) {
+        console.log('Skipping: No API key found')
+        return
+      }
+
+      // Build message history targeting ~30k tokens of message content
+      // With maxContextLength=100k, this should be well below the pruning threshold
+      const messages = buildMessageHistory(30_000)
+
+      const client = new CodebuffClient({
+        apiKey,
+        agentDefinitions: [testAgent, contextPruner],
+      })
+
+      const sessionState = await initialSessionState({})
+      const runStateWithMessages = withMessageHistory({
+        runState: { sessionState, output: { type: 'error', message: '' } },
+        messages,
+      })
+
+      // Run the agent with maxContextLength=100k - context-pruner should NOT prune
+      const run = await client.run({
+        agent: testAgent.id,
+        prompt: 'Say "OK" and nothing else.',
+        previousRun: runStateWithMessages,
+        params: { maxContextLength: 100_000 },
+        handleEvent: (event) => {
+          if (event.type === 'text') {
+            console.log('  [below-limit] Agent text:', event.text.slice(0, 100))
+          }
+        },
+      })
+
+      // Should complete without error
+      if (run.output.type === 'error') {
+        console.error('Below-limit test error:', JSON.stringify(run.output, null, 2))
+      }
+      expect(run.output.type).not.toEqual('error')
+
+      // Check the final message history
+      const finalMessages =
+        run.sessionState?.mainAgentState.messageHistory ?? []
+      const tokenCount = run.sessionState?.mainAgentState.contextTokenCount ?? 0
+      const pruningResult = detectPruning(finalMessages, messages.length)
+
+      console.log('  [below-limit] Token count:', tokenCount)
+      console.log(
+        '  [below-limit] Message count:',
+        finalMessages.length,
+        '(original:',
+        messages.length,
+        ')',
+      )
+      console.log('  [below-limit] Pruning result:', pruningResult)
+
+      // Key assertion: pruning should NOT have happened
+      expect(pruningResult.wasPruned).toBe(false)
+
+      // Token count should be below the limit
+      expect(tokenCount).toBeLessThan(100_000)
+
+      // Sanity bounds on the reported count (expected ~50k with accurate counting).
+      // NOTE: a 30% buffer (~65k) would still pass the 80k bound below; the dedicated
+      // accuracy test in this file is what checks for over-reporting.
+      expect(tokenCount).toBeGreaterThan(10_000) // At least some tokens accumulated
+      expect(tokenCount).toBeLessThan(80_000) // Well below limit even with natural variance
+    },
+    { timeout: 120_000 },
+  )
+
+  it(
+    'should prune when token count exceeds the limit',
+    async () => {
+      const apiKey = process.env[API_KEY_ENV_VAR]
+      if (!apiKey) {
+        console.log('Skipping: No API key found')
+        return
+      }
+
+      // Build message history targeting ~80k tokens of message content
+      // With maxContextLength=50k, this should exceed the pruning threshold
+      const messages = buildMessageHistory(80_000)
+
+      const client = new CodebuffClient({
+        apiKey,
+        agentDefinitions: [testAgent, contextPruner],
+      })
+
+      const sessionState = await initialSessionState({})
+      const runStateWithMessages = withMessageHistory({
+        runState: { sessionState, output: { type: 'error', message: '' } },
+        messages,
+      })
+
+      // Run the agent with maxContextLength=50k - context-pruner SHOULD prune
+      const run = await client.run({
+        agent: testAgent.id,
+        prompt: 'Say "DONE" and nothing else.',
+        previousRun: runStateWithMessages,
+        params: { maxContextLength: 50_000 },
+        handleEvent: (event) => {
+          if (event.type === 'text') {
+            console.log('  [above-limit] Agent text:', event.text.slice(0, 100))
+          }
+        },
+      })
+
+      // Should complete without error
+      if (run.output.type === 'error') {
+        console.error('Above-limit test error:', JSON.stringify(run.output, null, 2))
+      }
+      expect(run.output.type).not.toEqual('error')
+
+      // Check the final message history
+      const finalMessages =
+        run.sessionState?.mainAgentState.messageHistory ?? []
+      const tokenCount = run.sessionState?.mainAgentState.contextTokenCount ?? 0
+      const pruningResult = detectPruning(finalMessages, messages.length)
+
+      console.log('  [above-limit] Token count:', tokenCount)
+      console.log(
+        '  [above-limit] Message count:',
+        finalMessages.length,
+        '(original:',
+        messages.length,
+        ')',
+      )
+      console.log('  [above-limit] Pruning result:', pruningResult)
+
+      // Key assertion: pruning SHOULD have happened
+      // We accept any form of pruning: conversation_summary, trimMessages fallback, or significant reduction
+      expect(pruningResult.wasPruned).toBe(true)
+
+      // After pruning, fewer messages must remain than were originally supplied
+      expect(finalMessages.length).toBeLessThan(messages.length)
+
+      // Verify tool-call/tool-result pair integrity after pruning
+      verifyToolCallPairIntegrity(finalMessages)
+
+      // After pruning, the token count should be below the limit
+      expect(tokenCount).toBeLessThan(50_000)
+    },
+    { timeout: 180_000 },
+  )
+
+  it(
+    'should verify token counting accuracy: no premature 30% buffer for Anthropic models',
+    async () => {
+      const apiKey = process.env[API_KEY_ENV_VAR]
+      if (!apiKey) {
+        console.log('Skipping: No API key found')
+        return
+      }
+
+      // Verifies accurate token counting for Anthropic models (no 30% buffer, no fallback overcount).
+      //
+      // Strategy: run the SAME message history twice:
+      //   1. Calibration run with 200k limit (no pruning) → measure TRUE token count
+      //   2. Test run with 100k limit → check if pruning triggers
+      // True tokens < 100k but pruning in the 100k run ⇒ the count is being over-reported.
+      //
+      // Target: ~95k estimated content tokens, close to (ideally just under) the 100k limit.
+      // E.g. with ~90k true tokens: accurate counting → no pruning in either run ✓
+      //   30% buffer:     reported as ~117k → premature pruning in 100k run ✗
+      //   local fallback: reported as ~135k+ → premature pruning in 100k run ✗
+
+      // Create a large history targeting ~95k estimated tokens of message content
+      const TARGET_ESTIMATED_TOKENS = 95_000
+      const messages = buildMessageHistory(TARGET_ESTIMATED_TOKENS)
+
+      const client = new CodebuffClient({
+        apiKey,
+        agentDefinitions: [testAgent, contextPruner],
+      })
+
+      // =========================================================================
+      // Step 1: CALIBRATION RUN — measure true token count with 200k limit (no pruning)
+      // =========================================================================
+      const sessionStateCal = await initialSessionState({})
+      const runStateCal = withMessageHistory({
+        runState: {
+          sessionState: sessionStateCal,
+          output: { type: 'error', message: '' },
+        },
+        messages,
+      })
+
+      console.log('  [accuracy] Running calibration with 200k limit...')
+      const calRun = await client.run({
+        agent: testAgent.id,
+        prompt: 'Say "CAL" and nothing else.',
+        previousRun: runStateCal,
+        params: { maxContextLength: 200_000 },
+        handleEvent: (event) => {
+          if (event.type === 'text') {
+            console.log('  [accuracy-cal] Agent text:', event.text.slice(0, 100))
+          }
+        },
+      })
+
+      if (calRun.output.type === 'error') {
+        console.error('Calibration run error:', JSON.stringify(calRun.output, null, 2))
+      }
+      expect(calRun.output.type).not.toEqual('error')
+
+      const trueTokenCount =
+        calRun.sessionState?.mainAgentState.contextTokenCount ?? 0
+      const calMessages =
+        calRun.sessionState?.mainAgentState.messageHistory ?? []
+      const calPruning = detectPruning(calMessages, messages.length)
+
+      console.log('  [accuracy] ========== CALIBRATION RESULTS ==========')
+      console.log('  [accuracy] TRUE token count (200k limit):', trueTokenCount)
+      console.log(
+        '  [accuracy] Cal message count:',
+        calMessages.length,
+        '(original:',
+        messages.length,
+        ')',
+      )
+      console.log('  [accuracy] Cal pruning result:', calPruning)
+      console.log(
+        '  [accuracy] Ratio true/estimated:',
+        (trueTokenCount / TARGET_ESTIMATED_TOKENS).toFixed(2),
+      )
+      console.log('  [accuracy] =========================================')
+
+      // Calibration should not have pruned (200k limit is very high)
+      expect(calPruning.wasPruned).toBe(false)
+      expect(trueTokenCount).toBeGreaterThan(50_000)
+
+      // =========================================================================
+      // Step 2: TEST RUN — same content with 100k limit
+      // =========================================================================
+      const sessionState = await initialSessionState({})
+      const runStateWithMessages = withMessageHistory({
+        runState: { sessionState, output: { type: 'error', message: '' } },
+        messages,
+      })
+
+      const MAX_CONTEXT_LENGTH = 100_000
+
+      console.log('  [accuracy] Running test with 100k limit...')
+      const run = await client.run({
+        agent: testAgent.id,
+        prompt: 'Say "ACK" and nothing else.',
+        previousRun: runStateWithMessages,
+        params: { maxContextLength: MAX_CONTEXT_LENGTH },
+        handleEvent: (event) => {
+          if (event.type === 'text') {
+            console.log('  [accuracy-100k] Agent text:', event.text.slice(0, 100))
+          }
+        },
+      })
+
+      if (run.output.type === 'error') {
+        console.error('Accuracy test error:', JSON.stringify(run.output, null, 2))
+      }
+      expect(run.output.type).not.toEqual('error')
+
+      const reportedTokenCount =
+        run.sessionState?.mainAgentState.contextTokenCount ?? 0
+      const finalMessages =
+        run.sessionState?.mainAgentState.messageHistory ?? []
+      const pruningResult = detectPruning(finalMessages, messages.length)
+
+      console.log('  [accuracy] ========== 100K LIMIT TEST RESULTS ==========')
+      console.log('  [accuracy] Reported token count:', reportedTokenCount)
+      console.log(
+        '  [accuracy] Final message count:',
+        finalMessages.length,
+        '(original:',
+        messages.length,
+        ')',
+      )
+      console.log('  [accuracy] Pruning result:', pruningResult)
+      console.log(
+        '  [accuracy] Was pruned:',
+        pruningResult.wasPruned,
+        '(true tokens were:',
+        trueTokenCount,
+        ', limit:',
+        MAX_CONTEXT_LENGTH,
+        ')',
+      )
+      console.log('  [accuracy] ================================================')
+
+      // =========================================================================
+      // DIAGNOSIS: Compare true tokens vs limit
+      // =========================================================================
+      if (trueTokenCount < MAX_CONTEXT_LENGTH && pruningResult.wasPruned) {
+        console.error(
+          `  ❌ BUG DETECTED: True tokens (${trueTokenCount}) < limit (${MAX_CONTEXT_LENGTH}), ` +
+            `but pruning was triggered! The token counting API is over-reporting.`,
+        )
+      } else if (
+        trueTokenCount < MAX_CONTEXT_LENGTH &&
+        !pruningResult.wasPruned
+      ) {
+        console.log(
+          `  ✅ No bug: True tokens (${trueTokenCount}) < limit (${MAX_CONTEXT_LENGTH}), ` +
+            `no pruning occurred.`,
+        )
+      } else {
+        console.log(
+          `  ⚠️ Content too large: True tokens (${trueTokenCount}) >= limit (${MAX_CONTEXT_LENGTH}). ` +
+            `Pruning is expected. Adjust content size.`,
+        )
+      }
+
+      // The ratio of true token count to our estimated content tokens.
+      // Our estimate is for message content only; the actual count includes
+      // system prompt + tool definitions. So ratio 1.0-1.3 is expected.
+      // A 30% buffer on the full count would push the ratio above 1.3.
+      const ratio = trueTokenCount / TARGET_ESTIMATED_TOKENS
+      console.log(
+        '  [accuracy] Ratio of true/estimated:',
+        ratio.toFixed(2),
+        '(expected: 1.0-1.3, 30% bug → 1.3+, fallback → 1.5+)',
+      )
+      expect(ratio).toBeLessThan(1.3)
+
+      // CRITICAL: If true tokens are under 100k, no pruning should have occurred.
+      // If true tokens >= 100k, pruning is expected and we skip this assertion.
+      if (trueTokenCount < MAX_CONTEXT_LENGTH) {
+        expect(pruningResult.wasPruned).toBe(false)
+      } else {
+        console.log(
+          `  [accuracy] Content too large: true tokens (${trueTokenCount}) >= limit (${MAX_CONTEXT_LENGTH}). Pruning is expected.`,
+        )
+      }
+    },
+    { timeout: 300_000 },
+  )
+})
diff --git a/agents/librarian/librarian.ts b/agents/librarian/librarian.ts
index 69dd157181..8498648c48 100644
--- a/agents/librarian/librarian.ts
+++ b/agents/librarian/librarian.ts
@@ -9,7 +9,7 @@ const librarian: AgentDefinition = {
   id: 'librarian',
   publisher,
   displayName: 'Librarian',
-  model: 'minimax/minimax-m2.5',
+  model: 'minimax/minimax-m2.7',
 
   spawnerPrompt:
     'Spawn the librarian agent to shallow-clone a GitHub repository into /tmp and answer questions about its code, structure, or documentation. The agent returns structured output with `answer`, `relevantFiles` (absolute paths in the cloned repo), and `cloneDir`. You can use `run_terminal_command` with `cat` to read the returned `relevantFiles` paths. Clean up `cloneDir` with `rm -rf` when done.',
diff --git a/agents/tmux-cli.ts b/agents/tmux-cli.ts
index 3a7877ae6e..a03066dab5 100644
--- a/agents/tmux-cli.ts
+++ b/agents/tmux-cli.ts
@@ -71,7 +71,7 @@ const outputSchema = {
 const definition: AgentDefinition = {
   id: 'tmux-cli',
   displayName: 'Tmux CLI Agent',
-  model: 'minimax/minimax-m2.5',
+  model: 'minimax/minimax-m2.7',
   // Provider options are tightly coupled to the model choice above.
   // If you change the model, update these accordingly.
   providerOptions: {
diff --git a/agents/types/agent-definition.ts b/agents/types/agent-definition.ts
index 522994ac27..b81fc69c88 100644
--- a/agents/types/agent-definition.ts
+++ b/agents/types/agent-definition.ts
@@ -432,6 +432,7 @@ export type ModelName =
   | 'z-ai/glm-4.7-flash'
   | 'z-ai/glm-4.7-flash:nitro'
   | 'minimax/minimax-m2.5'
+  | 'minimax/minimax-m2.7'
   | (string & {})
 
 import type { ToolName, GetToolParams } from './tools'
diff --git a/common/src/constants/free-agents.ts b/common/src/constants/free-agents.ts
index 3a9f5c9166..551500f3f5 100644
--- a/common/src/constants/free-agents.ts
+++ b/common/src/constants/free-agents.ts
@@ -18,7 +18,7 @@ export const FREE_COST_MODE = 'free' as const
  */
 export const FREE_MODE_AGENT_MODELS: Record<string, Set<string>> = {
   // Root orchestrator
-  'base2-free': new Set(['minimax/minimax-m2.5', 'z-ai/glm-5.1']),
+  'base2-free': new Set(['minimax/minimax-m2.7', 'z-ai/glm-5.1']),
 
   // File exploration agents
   'file-picker': new Set(['google/gemini-2.5-flash-lite']),
@@ -33,10 +33,10 @@ export const FREE_MODE_AGENT_MODELS: Record<string, Set<string>> = {
   'basher': new Set(['google/gemini-3.1-flash-lite-preview']),
 
   // Editor for free mode
-  'editor-lite': new Set(['minimax/minimax-m2.5', 'z-ai/glm-5.1']),
+  'editor-lite': new Set(['minimax/minimax-m2.7', 'z-ai/glm-5.1']),
 
   // Code reviewer for free mode
-  'code-reviewer-lite': new Set(['minimax/minimax-m2.5', 'z-ai/glm-5.1']),
+  'code-reviewer-lite': new Set(['minimax/minimax-m2.7', 'z-ai/glm-5.1']),
 
   // Thinker for free mode
   'thinker-with-files-gemini': new Set(['google/gemini-3.1-pro-preview']),
@@ -106,7 +106,7 @@ export function isFreeModeAllowedAgentModel(
   // Exact match first
   if (allowedModels.has(model)) return true
 
-  // OpenRouter may return dated variants (e.g. "minimax/minimax-m2.5-20260211")
+  // OpenRouter may return dated variants (e.g. "minimax/minimax-m2.7-20260211")
   // so also check if the returned model starts with any allowed model prefix.
   for (const allowed of allowedModels) {
     if (model.startsWith(allowed + '-')) return true
diff --git a/common/src/templates/initial-agents-dir/types/agent-definition.ts b/common/src/templates/initial-agents-dir/types/agent-definition.ts
index 522994ac27..b81fc69c88 100644
--- a/common/src/templates/initial-agents-dir/types/agent-definition.ts
+++ b/common/src/templates/initial-agents-dir/types/agent-definition.ts
@@ -432,6 +432,7 @@ export type ModelName =
   | 'z-ai/glm-4.7-flash'
   | 'z-ai/glm-4.7-flash:nitro'
   | 'minimax/minimax-m2.5'
+  | 'minimax/minimax-m2.7'
   | (string & {})
 
 import type { ToolName, GetToolParams } from './tools'
diff --git a/evals/buffbench/main-nightly.ts b/evals/buffbench/main-nightly.ts
index df3c6f0ea5..c96685c131 100644
--- a/evals/buffbench/main-nightly.ts
+++ b/evals/buffbench/main-nightly.ts
@@ -17,7 +17,7 @@ async function main() {
   const results = await runBuffBench({
     evalDataPaths: [ path.join(__dirname, 'eval-codebuff.json')],
     agents: ['base2-free'],
-    taskConcurrency: 3,
+    taskConcurrency: 6,
     saveTraces,
   })
 
diff --git a/evals/buffbench/main.ts b/evals/buffbench/main.ts
index aeb462abe3..471f6e6dbc 100644
--- a/evals/buffbench/main.ts
+++ b/evals/buffbench/main.ts
@@ -11,7 +11,7 @@ async function main() {
   await runBuffBench({
     evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')],
     agents: ['base2-free-evals'],
-    taskConcurrency: 5,
+    taskConcurrency: 10,
     saveTraces,
   })
 
diff --git a/web/src/llm-api/fireworks.ts b/web/src/llm-api/fireworks.ts
index d9825930c0..aa915f1529 100644
--- a/web/src/llm-api/fireworks.ts
+++ b/web/src/llm-api/fireworks.ts
@@ -29,7 +29,9 @@ const fireworksAgent = new Agent({
 /** Map from OpenRouter model IDs to Fireworks standard API model IDs */
 const FIREWORKS_MODEL_MAP: Record<string, string> = {
   'minimax/minimax-m2.5': 'accounts/fireworks/models/minimax-m2p5',
+  'minimax/minimax-m2.7': 'accounts/fireworks/models/minimax-m2p7',
   'z-ai/glm-5.1': 'accounts/fireworks/models/glm-5p1',
+  'moonshotai/kimi-k2.5': 'accounts/fireworks/models/kimi-k2p5',
 }
 
 /** Flag to enable custom Fireworks deployments (set to false to use global API only) */
@@ -152,11 +154,21 @@ const FIREWORKS_PRICING_MAP: Record<string, FireworksPricing> = {
     cachedInputCostPerToken: 0.03 / 1_000_000,
     outputCostPerToken: 1.20 / 1_000_000,
   },
+  'minimax/minimax-m2.7': {
+    inputCostPerToken: 0.30 / 1_000_000,
+    cachedInputCostPerToken: 0.06 / 1_000_000,
+    outputCostPerToken: 1.20 / 1_000_000,
+  },
   'z-ai/glm-5.1': {
     inputCostPerToken: 1.40 / 1_000_000,
     cachedInputCostPerToken: 0.26 / 1_000_000,
     outputCostPerToken: 4.40 / 1_000_000,
   },
+  'moonshotai/kimi-k2.5': {
+    inputCostPerToken: 0.60 / 1_000_000,
+    cachedInputCostPerToken: 0.10 / 1_000_000,
+    outputCostPerToken: 3.00 / 1_000_000,
+  },
 }
 
 function getFireworksPricing(model: string): FireworksPricing {