diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..48e8c04 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,53 @@ +# Version control +.git +.gitignore +.gitattributes + +# Python build / runtime caches +__pycache__ +*.py[cod] +*$py.class +*.egg-info +.pytest_cache +.hypothesis +.mypy_cache +.ruff_cache +.coverage +coverage.xml +htmlcov + +# Local virtualenvs (Dockerfile creates its own via Poetry) +.venv +venv +env + +# Local-only config and secrets +.env +.env.* +!.env.example +*.pem +*.key +*.pfx +id_rsa* +credentials* +secrets* + +# Tests / docs / examples are not needed at runtime +tests +docs +examples + +# Editor / OS cruft +.DS_Store +.idea +.vscode +*.swp +*~ + +# CI / local tooling state +.github +.claude + +# Planning / local notes +plans +*.tmp diff --git a/.env.example b/.env.example index 749c598..943e014 100644 --- a/.env.example +++ b/.env.example @@ -25,8 +25,36 @@ MAX_TIMEOUT=600000 CORS_ORIGINS=["*"] # Model Configuration -# Default Claude model to use when none specified in request -DEFAULT_MODEL=claude-sonnet-4-5-20250929 +# Default Claude model to use when none specified in request. +# When unset AND ANTHROPIC_API_KEY is configured, the wrapper resolves the +# latest Sonnet from Anthropic's live Models API at startup. Otherwise it +# falls back to claude-sonnet-4-6. +# DEFAULT_MODEL=claude-sonnet-4-6 + +# Speed/cost-optimized model alias. +# FAST_MODEL=claude-haiku-4-5-20251001 + +# Model Discovery (optional) +# ANTHROPIC_API_KEY unlocks two best-effort enhancements: +# 1. /v1/models returns Anthropic's live model list (cached for 1 hour) +# 2. DEFAULT_MODEL resolves to the latest Sonnet at startup +# It is NOT required to run the wrapper - Bedrock, Vertex, and Claude CLI +# subscription auth all work without it; /v1/models then returns the static +# fallback list. +# ANTHROPIC_API_KEY=sk-ant-... + +# Pin the advertised model list. Takes precedence over both live and static. +# CLAUDE_MODELS_OVERRIDE=claude-sonnet-4-6,claude-opus-4-6 + +# Cache TTL for live /v1/models results (seconds). +# MODEL_LIST_CACHE_TTL_SECONDS=3600 + +# Short cache TTL when the live fetch fails so transient outages don't +# suppress live discovery for the full hour. +# MODEL_LIST_ERROR_TTL_SECONDS=60 + +# HTTP timeout for the live model fetch. +# MODEL_LIST_REQUEST_TIMEOUT_SECONDS=5 # Rate Limiting Configuration RATE_LIMIT_ENABLED=true diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..be9f473 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,49 @@ +version: 2 +updates: + # Python dependencies via Poetry. claude-agent-sdk ships on a fast + # cadence (47 patch releases between 0.1.18 and 0.1.65 in the window + # covered by CHANGELOG 2.9.0); weekly checks keep the drift bounded + # without drowning review in daily PRs. + - package-ecosystem: pip + directory: "/" + schedule: + interval: weekly + day: monday + time: "06:00" + timezone: Etc/UTC + open-pull-requests-limit: 5 + labels: + - dependencies + - python + commit-message: + prefix: "chore(deps)" + include: scope + groups: + # Group minor/patch bumps so the review queue stays short. + python-minor-patch: + patterns: + - "*" + update-types: + - minor + - patch + # claude-agent-sdk stays exact-pinned on purpose (see pyproject.toml). + # Do not let Dependabot widen the constraint - it must only propose + # a new exact pin. + ignore: [] + + # GitHub Actions versions (actions/checkout, codecov/codecov-action, + # snok/install-poetry, etc). 
Weekly keeps supply-chain drift bounded. + - package-ecosystem: github-actions + directory: "/" + schedule: + interval: weekly + day: monday + time: "06:00" + timezone: Etc/UTC + open-pull-requests-limit: 5 + labels: + - dependencies + - github-actions + commit-message: + prefix: "chore(ci)" + include: scope diff --git a/.github/workflows/check-sdk-version.yml b/.github/workflows/check-sdk-version.yml new file mode 100644 index 0000000..76905e1 --- /dev/null +++ b/.github/workflows/check-sdk-version.yml @@ -0,0 +1,181 @@ +name: Check claude-agent-sdk version + +# Belt-and-suspenders on top of Dependabot: every Monday, fetch the +# latest claude-agent-sdk release from PyPI and compare to the pin in +# pyproject.toml. If we are behind, open a draft PR with the pin bump +# and regenerated poetry.lock so a human reviewer just adds the version +# bump + CHANGELOG entry before merging. Also runnable manually. +# +# Idempotent: skips PR creation when an open PR for that head branch +# already exists. Job summary fallback runs unconditionally on drift +# so the run page always carries the version delta even if PR creation +# can't run (existing PR, branch conflict, etc.). +# +# Workflow injection notes: schedule + workflow_dispatch are the only +# event sources, so no user-controlled event payload is involved. The +# values flowing into run blocks (pinned, latest, branch) are derived +# from pyproject.toml and pypi.org JSON, and are passed via env: so +# they never reach the shell via ${{ }} expression interpolation. + +on: + schedule: + - cron: "0 14 * * 1" # Mondays 14:00 UTC + workflow_dispatch: + +permissions: + contents: write + pull-requests: write + +jobs: + check: + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - uses: actions/checkout@v4 + + - name: Compare pinned SDK vs latest PyPI release + id: compare + run: | + set -euo pipefail + + pinned=$(python3 -c ' + import re, pathlib + text = pathlib.Path("pyproject.toml").read_text() + m = re.search(r"claude-agent-sdk\s*=\s*(?:\"([^\"]+)\"|\{version\s*=\s*\"([^\"]+)\")", text) + if not m: + raise SystemExit("Could not find claude-agent-sdk pin in pyproject.toml") + print((m.group(1) or m.group(2)).lstrip("^~=")) + ') + + latest=$(curl -sSf https://pypi.org/pypi/claude-agent-sdk/json | python3 -c 'import json,sys; print(json.load(sys.stdin)["info"]["version"])') + + echo "pinned=$pinned" >> "$GITHUB_OUTPUT" + echo "latest=$latest" >> "$GITHUB_OUTPUT" + + if [ "$pinned" = "$latest" ]; then + echo "up_to_date=true" >> "$GITHUB_OUTPUT" + echo "claude-agent-sdk pin $pinned matches latest PyPI release." + else + echo "up_to_date=false" >> "$GITHUB_OUTPUT" + echo "::warning::claude-agent-sdk pin ($pinned) is behind latest PyPI release ($latest)." + fi + + - name: Set up Python + if: steps.compare.outputs.up_to_date == 'false' + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install Poetry + if: steps.compare.outputs.up_to_date == 'false' + run: pipx install poetry==2.3.4 + + - name: Check for existing bump PR + id: existing + if: steps.compare.outputs.up_to_date == 'false' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + LATEST: ${{ steps.compare.outputs.latest }} + run: | + set -euo pipefail + branch="chore/sdk-bump-${LATEST}" + echo "branch=$branch" >> "$GITHUB_OUTPUT" + if [ -n "$(gh pr list --state open --head "$branch" --json number --jq '.[0].number')" ]; then + echo "exists=true" >> "$GITHUB_OUTPUT" + echo "An open PR already exists for $branch; skipping create step." 
+ else + echo "exists=false" >> "$GITHUB_OUTPUT" + fi + + - name: Bump pin, regenerate lock, and open draft PR + if: steps.compare.outputs.up_to_date == 'false' && steps.existing.outputs.exists == 'false' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PINNED: ${{ steps.compare.outputs.pinned }} + LATEST: ${{ steps.compare.outputs.latest }} + BRANCH: ${{ steps.existing.outputs.branch }} + run: | + set -euo pipefail + + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + git checkout -b "$BRANCH" + + python3 - <<'PY' + import os, pathlib, re + latest = os.environ["LATEST"] + path = pathlib.Path("pyproject.toml") + text = path.read_text() + # Prefer the table form first (which carries the [otel] extras). + table_pat = re.compile( + r'(claude-agent-sdk\s*=\s*\{[^}]*version\s*=\s*")[^"]+(")' + ) + new_text, n = table_pat.subn(r'\g<1>' + latest + r'\g<2>', text, count=1) + if n == 0: + # Fall back to the plain-string form. + string_pat = re.compile(r'(claude-agent-sdk\s*=\s*")[^"]+(")') + new_text, n = string_pat.subn(r'\g<1>' + latest + r'\g<2>', text, count=1) + if n == 0: + raise SystemExit("Failed to update claude-agent-sdk pin in pyproject.toml") + path.write_text(new_text) + PY + + poetry lock --no-interaction + + git add pyproject.toml poetry.lock + git commit -m "chore(deps): bump claude-agent-sdk $PINNED -> $LATEST" + git push origin "$BRANCH" + + gh pr create \ + --draft \ + --base main \ + --head "$BRANCH" \ + --title "chore(deps): bump claude-agent-sdk $PINNED -> $LATEST" \ + --body "Automated bump opened by the \`Check claude-agent-sdk version\` workflow. + + Bumps the SDK pin in \`pyproject.toml\` from \`$PINNED\` to \`$LATEST\` and regenerates \`poetry.lock\`. Scope is deliberately limited to the pin + lock so the human reviewer owns the release coordination. + + References: + - Release notes: https://github.com/anthropics/claude-agent-sdk-python/releases/tag/v$LATEST + - Full changelog: https://github.com/anthropics/claude-agent-sdk-python/compare/v$PINNED...v$LATEST + - PyPI: https://pypi.org/project/claude-agent-sdk/$LATEST/ + + Reviewer checklist before merging: + + - [ ] Bump version in \`pyproject.toml\` \`[tool.poetry] version\` and \`src/__init__.py\` + - [ ] Add a new \`## [x.y.z]\` section to \`CHANGELOG.md\` describing this bump + - [ ] Confirm the \`[otel]\` extra is still present on the pin (the SDK unconditionally imports \`opentelemetry.propagate\`) + - [ ] Push an empty commit (\`git commit --allow-empty\`) so the test matrix fires: PRs opened with the default \`GITHUB_TOKEN\` do not trigger downstream \`pull_request\` workflow runs by design + - [ ] Confirm all CI checks pass + + Mark the PR ready for review once the items above are in place." + + - name: Write drift summary + if: steps.compare.outputs.up_to_date == 'false' + env: + PINNED: ${{ steps.compare.outputs.pinned }} + LATEST: ${{ steps.compare.outputs.latest }} + BRANCH: ${{ steps.existing.outputs.branch }} + PR_EXISTS: ${{ steps.existing.outputs.exists }} + run: | + set -euo pipefail + { + echo "## claude-agent-sdk drift" + echo + echo "| | Version |" + echo "|---|---|" + echo "| Pinned | \`$PINNED\` |" + echo "| Latest on PyPI | \`$LATEST\` |" + echo + if [ "$PR_EXISTS" = "true" ]; then + echo "An open PR for branch \`$BRANCH\` already exists; no new PR was opened." + else + echo "Opened draft PR on branch \`$BRANCH\`." 
+ fi + echo + echo "Release notes: https://github.com/anthropics/claude-agent-sdk-python/releases/tag/v$LATEST" + echo "Full changelog: https://github.com/anthropics/claude-agent-sdk-python/compare/v$PINNED...v$LATEST" + echo "PyPI: https://pypi.org/project/claude-agent-sdk/$LATEST/" + echo + echo "The SDK unconditionally imports \`opentelemetry.propagate\`, so keep the \`[otel]\` extra on the pin." + } >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7df474a..b37e5b7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,11 +6,24 @@ on: pull_request: branches: [main] +# Minimum GITHUB_TOKEN scope the jobs need. The Codecov upload reads a +# separate CODECOV_TOKEN from repo secrets; it does not need a broader +# GITHUB_TOKEN scope. +permissions: + contents: read + jobs: test: runs-on: ubuntu-latest + # Hard ceiling so a hung pytest or stalled Poetry install can't chew + # through minutes silently. + timeout-minutes: 15 strategy: + # Don't cancel the rest of the matrix when one Python fails - we want + # to see whether a regression is specific to a single interpreter. + fail-fast: false matrix: + # pyproject.toml pins python = "^3.10"; test every supported minor. python-version: ["3.10", "3.11", "3.12"] steps: @@ -34,6 +47,13 @@ jobs: path: .venv key: venv-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }} + # Catch lockfile drift early. The project depends on an exact pin of + # claude-agent-sdk (see CHANGELOG 2.9.0) so pyproject <-> poetry.lock + # disagreement must fail CI rather than silently resolve at install + # time. + - name: Verify lockfile is in sync + run: poetry check --lock + - name: Install dependencies if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' run: poetry install --no-interaction --no-root @@ -48,11 +68,17 @@ jobs: run: poetry run mypy src --ignore-missing-imports continue-on-error: true - - name: Security scan + - name: Static security scan (bandit) run: poetry run bandit -r src/ -ll -x tests - - name: Dependency vulnerability scan - run: poetry run safety check || true + - name: Dependency vulnerability scan (pip-audit) + # pip-audit replaces the deprecated `safety check` command. It + # audits the packages resolved into the CI virtualenv (note this is + # a superset of the runtime image, which ships only the main group). + # Non-blocking: Dependabot / Trivy on the built image are the + # authoritative gates; this is a fast PR signal.
+ run: | + poetry run pip install --quiet pip-audit + poetry run pip-audit --strict --progress-spinner off || true continue-on-error: true - name: Run tests diff --git a/.github/workflows/claude-code-review.yml b/.github/workflows/claude-code-review.yml deleted file mode 100644 index 86b7ac9..0000000 --- a/.github/workflows/claude-code-review.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Claude Code Review -on: - pull_request_target: - types: [opened, synchronize, reopened] - -jobs: - claude-review: - runs-on: ubuntu-latest - permissions: - contents: read - pull-requests: write - issues: read - id-token: write - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - ref: ${{ github.event.pull_request.head.sha }} - fetch-depth: 1 - - name: Run Claude Code Review - id: claude-review - uses: anthropics/claude-code-action@v1 - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} - prompt: | - REPO: ${{ github.repository }} - PR NUMBER: ${{ github.event.pull_request.number }} - Please review this pull request and provide feedback on: - - Code quality and best practices - - Potential bugs or issues - - Performance considerations - - Security concerns - - Test coverage - Use the repository's CLAUDE.md for guidance on style and conventions. Be constructive and helpful in your feedback. - Use `gh pr comment` with your Bash tool to leave your review as a comment on the PR. - claude_args: '--allowed-tools "Bash(gh issue view:*),Bash(gh search:*),Bash(gh issue list:*),Bash(gh pr comment:*),Bash(gh pr diff:*),Bash(gh pr view:*),Bash(gh pr list:*)"' diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml index d300267..692f6c1 100644 --- a/.github/workflows/claude.yml +++ b/.github/workflows/claude.yml @@ -1,5 +1,10 @@ name: Claude Code +# Triggered by "@claude" mentions in issues, PR review comments, or PR reviews. +# Uses issue_comment / pull_request_review_comment / pull_request_review (NOT +# pull_request_target) so untrusted PR code is never checked out into a +# privileged context. + on: issue_comment: types: [created] @@ -12,6 +17,11 @@ on: jobs: claude: + # Only run when "@claude" actually appears in the triggering event body + # or title. The event payload fields (issue.title, issue.body, + # comment.body, review.body) are user-controlled; they are used only in + # a contains(...) gate here and are never interpolated into a run: + # block, so there is no shell-injection surface. if: | (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) || (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) || @@ -36,15 +46,16 @@ jobs: with: claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} - # This is an optional setting that allows Claude to read CI results on PRs + # Allow Claude to read CI results on PRs so it can reference + # failing checks in its responses. additional_permissions: | actions: read - # Optional: Give a custom prompt to Claude. If this is not specified, Claude will perform the instructions specified in the comment that tagged it. - # prompt: 'Update the pull request description to include a summary of changes.' 
- - # Optional: Add claude_args to customize behavior and configuration - # See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md - # or https://code.claude.com/docs/en/cli-reference for available options - # claude_args: '--allowed-tools Bash(gh pr:*)' + # Repo-specific tool allowlist for @claude runs. Intentionally + # narrow: read-only GitHub introspection, read-only git, and + # Poetry/pytest for verification. No write commands (no gh pr + # create, no gh pr merge, no git push, no editor invocations) - + # a human still owns any mutation. + claude_args: >- + --allowed-tools "Bash(gh pr view:*),Bash(gh pr diff:*),Bash(gh pr list:*),Bash(gh pr checks:*),Bash(gh pr comment:*),Bash(gh issue view:*),Bash(gh issue list:*),Bash(gh api:*),Bash(gh search:*),Bash(git log:*),Bash(git diff:*),Bash(git show:*),Bash(git blame:*),Bash(poetry run pytest:*),Bash(poetry run black --check:*),Bash(poetry run bandit:*)" diff --git a/.gitignore b/.gitignore index a59cdee..59011c5 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,7 @@ logs/ # Testing .coverage .pytest_cache/ +.hypothesis/ htmlcov/ # Claude Code @@ -57,4 +58,4 @@ test_debug_*.py test_performance_*.py test_user_*.py test_new_*.py -test_roocode_compatibility.py \ No newline at end of file +test_roocode_compatibility.py diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..0ceedb9 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,576 @@ +# Changelog + +All notable changes to the Claude Code OpenAI Wrapper project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [2.9.7] - 2026-05-12 + +### Added + +- Active CLI-auth health probe. When `CLAUDE_AUTH_METHOD=claude_cli`, + the lifespan schedules a periodic background coroutine that runs + the existing `claude_cli.verify_cli()` (a 1-turn + `query(prompt="Hello", max_turns=1)`) and updates a shared + `cli_health` state. Bounds the stale window between the bundled CLI + losing its session and a real chat request discovering it. + - Interval is configurable via `CLI_AUTH_PROBE_INTERVAL_SECONDS` + (default 600s / 10 min). Set to 0 to disable. Skipped entirely for + non-cli auth methods (API key / Bedrock / Vertex), which surface + upstream auth failures via the existing + `assistant_authentication_failed` -> 401 mapping. + - Probe results visible at `GET /v1/auth/status` under a new + `cli_health` block: `ok`, `last_probed_at`, `last_ok_at`, + `error_kind` (`auth_failure` | `unknown` | `null`), + `error_message`. + +### Changed + +- `POST /v1/chat/completions` and `POST /v1/messages` now return + **HTTP 401** with `error.type=authentication_error` and + `error.code=claude_cli_not_authenticated` when the latest CLI probe + failed, instead of letting the request fall through to a generic + 502 from the SDK or 503 from the config check. OpenAI / Anthropic + client libraries route 401 as `AuthenticationError`, giving callers a + durable signal to roll keys or re-`/login` rather than retrying a + doomed request. +- `_build_sdk_error_response` (the + `ClaudeResultError.subtype=error_during_execution` path) now scans + `error_message` + `stderr_tail` for the same CLI-auth-failure markers + the probe uses (`not logged in`, `please run /login`, + `invalid api key`, `authentication_error`, `401`).
On a match the + response is 401 + `authentication_error` and `cli_health` is seeded as + failed so the next request fails fast without a round-trip. +- Auth-failure responses now bypass the global `http_exception_handler` + (which previously rewrote the body as `error.type=api_error`) by + returning `JSONResponse` directly. Required for OpenAI / Anthropic + clients to read the authentication signal. + +### Tests + +- `tests/test_auth_unit.py::TestProbeCliAuth` - three async unit tests + covering `probe_cli_auth()`: success (`mark_ok`), `Not logged in` + stderr (`auth_failure`), generic exception (`unknown`). +- `tests/test_endpoints.py::TestChatCompletionsCliHealthGate` and + `tests/test_anthropic_messages.py::TestAnthropicMessagesCliHealthGate` + - in-process TestClient assertions that both endpoints return 401 + with `authentication_error` when `cli_health.ok=False`. +- `tests/test_error_path_unit.py::TestCliAuthFailureToFourOhOne` - + four tests for the stderr-marker mapping: 401 on `Not logged in`, + 401 on `Invalid API key`, 502 regression guard on `connection + refused`, and a seeding test confirming a real request flips + `cli_health.ok` to False. +- Suite total: 673 passed, 31 skipped (was 664/31 on v2.9.6; +9 new + tests). + +## [2.9.6] - 2026-05-11 + +### Changed + +- `claude-agent-sdk`: 0.1.68 -> 0.1.81. Thirteen patch releases since + the v2.9.5 cut. Pin keeps the `[otel]` extra (the SDK still imports + `opentelemetry.propagate` unconditionally). +- Sync upstream `RichardAtCT/claude-code-openai-wrapper#46`: dynamic + Anthropic Models API integration for `/v1/models`. When + `ANTHROPIC_API_KEY` is set the endpoint returns Anthropic's live + model list (cached for `MODEL_LIST_CACHE_TTL_SECONDS`, default 3600s) + and the wrapper resolves the latest Sonnet as `DEFAULT_MODEL` at + startup. When the key is absent (Bedrock, Vertex, Claude CLI + subscription auth) the existing static catalog is served and + `DEFAULT_MODEL_FALLBACK=claude-sonnet-4-6` is used. + `CLAUDE_MODELS_OVERRIDE` pins the advertised list regardless of + auth. Concurrent cache refreshes are serialized via an async lock + + double-check pattern; failed fetches use a short + `MODEL_LIST_ERROR_TTL_SECONDS` (default 60s) to keep transient + outages from suppressing live discovery for a full hour. The + pre-existing `model_service` (used by `/v1/models/refresh` and + `/v1/models/status`) is left in place alongside the new in-line + cache. + +### Security + +- `python-multipart`: ^0.0.26 -> ^0.0.27 (closes Dependabot alert #8, + `GHSA-pp6c-gr5w-3c5g` Denial of Service via unbounded multipart + part headers). Supersedes Dependabot PR #16, which was opened with + a Poetry 2.2.1 lockfile that would have introduced cosmetic + regressions in the lock header and constraint formatting. +- `urllib3` security floor: >=2.6.3 -> >=2.7.0 (closes Dependabot + alerts #9 `GHSA-mf9v-mfxr-j63j` decompression-bomb safeguard + bypass and #10 `GHSA-qccp-gfcp-xxvc` proxy redirect header leak). + +### CI + +- `check-sdk-version.yml`: when drift is detected the workflow now + opens a draft `chore/sdk-bump-<latest>` PR with the pin bump and + regenerated `poetry.lock` instead of only writing to the run + summary. The Monday cron pre-stages the upgrade; a human reviewer + bumps the project version, adds a CHANGELOG entry, and merges. The + existing `::warning::` annotation and `$GITHUB_STEP_SUMMARY` block + still fire as a fallback when PR creation can't run (existing + open PR for that pin, branch conflict, etc.). Idempotent by head + branch name.
Permissions widened to `contents: write` and + `pull-requests: write`. + +### Tests + +Full suite at 664 passed, 31 skipped (+14 from the upstream +`test_dynamic_models.py` suite added by PR #46). + +## [2.9.5] - 2026-04-27 + +### Changed + +- `claude-agent-sdk`: 0.1.65 -> 0.1.68. The SDK now declares + `sniffio >= 1.0.0` as a direct runtime dependency; lock file + picks up `sniffio 1.3.1` accordingly. Pin keeps the `[otel]` + extra (the SDK still imports `opentelemetry.propagate` + unconditionally). + +### CI + +- `check-sdk-version.yml`: replaced the issue-creation step with a + `GITHUB_STEP_SUMMARY` write. The repo has issues disabled, which + caused the weekly job to fail at `gh issue create` once the pin + fell behind PyPI. Drift is still announced via the existing + `::warning::` annotation; the summary block on the run page + carries the version delta and remediation notes. Dropped the + unused `issues: write` permission. + +### Tests + +Full suite stays at 650 passed, 31 skipped. + +## [2.9.4] - 2026-04-24 + +### Security + +Closes all seven open Dependabot alerts by bumping vulnerable deps. +Three affect the runtime image; four are dev-only and reach the image +only via a CI container. + +| Package | From | To | CVE | Severity | Runtime? | +|---------|------|----|-----|----------|----------| +| black | 24.10.0 | 26.3.1 | CVE-2026-32274 (arbitrary file write from unsanitized cache input) | high | dev only | +| filelock | 3.20.1 | 3.29.0 | CVE-2026-22701 (TOCTOU symlink) | medium | dev only | +| requests | 2.32.4 | 2.33.1 | CVE-2026-25645 (insecure tmp file reuse in `extract_zipped_paths`) | medium | runtime (also in dev group) | +| pytest | 8.4.1 | 9.0.3 | CVE-2025-71176 (tmpdir handling) | medium | dev only | +| python-multipart | 0.0.22 | 0.0.26 | CVE-2026-40347 (DoS via large preamble/epilogue) | medium | runtime | +| python-dotenv | 1.1.1 | 1.2.2 | CVE-2026-28684 (symlink follow in `set_key`) | medium | runtime | +| Pygments | 2.19.2 | 2.20.0 | CVE-2026-4539 (GUID ReDoS) | low | dev only (transitive) | + +### Build / tooling + +- `pyproject.toml`: direct constraint bumps to match the lock above, + plus `pytest-asyncio ^0.23 -> ^1.3.0` (required by the pytest 9 + upgrade - pytest-asyncio < 1.x does not support pytest 9). +- Reformatted three test files under `tests/` with black 26 so the + CI linting gate stays green. + +### Tests + +Full suite stays at 650 passed, 31 skipped under pytest 9.0.3. + +## [2.9.3] - 2026-04-24 + +### Fixed + +- **Runtime `ModuleNotFoundError: No module named 'opentelemetry'` at + first SDK connect**. `claude-agent-sdk 0.1.65` imports + `opentelemetry.propagate` unconditionally at + `_internal/transport/subprocess_cli.py:413`, but PyPI declares + `opentelemetry-api` only as an optional `[otel]` extra. The 2.9.2 + Docker image (rebuilt with `poetry install --only main`) therefore + shipped without OTel, and the first chat completion after a fresh + deploy raised during `connect()`. Fix: pin + `claude-agent-sdk = {version = "0.1.65", extras = ["otel"]}` in + `pyproject.toml` so the dependency resolves into the main group. + `poetry lock` regenerated; `opentelemetry-api 1.41.1` now ships in + the image. + +## [2.9.2] - 2026-04-24 + +### Build / CI + +- `Dockerfile`: `poetry install --only main` now excludes dev packages + from the runtime image. Removes the one Trivy HIGH with an upstream + fix (CVE-2026-32274, black < 26.3.1) and drops image size from + 1.18 GB to 775 MB. BUILD_INFO stamps cleanly. +- Added `.dockerignore` so `COPY . 
/app` stops pulling `.git`, `.venv`, + `.hypothesis`, `.pytest_cache`, `tests`, `docs`, `.env*`, and editor + cruft. +- Remaining Trivy HIGHs (7) are in the Debian 13.4 base (ncurses, + nghttp2, systemd); all `fix: null` upstream. Accepted risk until + `python:3.12-slim` rebases. +- `.github/workflows/ci.yml`: added `timeout-minutes: 15`, + `fail-fast: false`, `poetry check --lock` to catch lockfile drift, + and replaced deprecated `safety check` with `pip-audit`. No Docker + smoke-build step - images are built and pushed locally, CI only + gates Python-side checks. +- `.github/workflows/claude.yml`: repo-specific `claude_args` with a + read-only tool allowlist (no write commands, no PR mutations). +- Ran `black` across `src/` and `tests/` so the linting gate in CI + actually passes; 18 files reformatted with no behavioural change. +- Disabled the `Claude Code Review` workflow upstream; the file was + removed from the repo in 2.9.1 but `pull_request_target` kept + executing it from `main` until explicit disable. + +## [2.9.1] - 2026-04-24 + +### Security + +Closes the ten CodeQL code-scanning alerts open on `main`. + +- **Workflow: `claude-code-review.yml` removed** (alert #1, + `actions/untrusted-checkout/high`). The file checked out + `pull_request.head.sha` inside a `pull_request_target` job, exposing + repo secrets to untrusted code. Deleted entirely; automated PR review + can be reintroduced later behind a non-privileged trigger. +- **Workflow: `ci.yml` permissions pinned** (alert #2, + `actions/missing-workflow-permissions`). Added top-level + `permissions: {contents: read}`. +- **Error responses no longer leak exception detail** (alerts #7-#10, + `py/stack-trace-exposure`). `str(e)` has been replaced with static, + client-safe strings in: + - `_build_assistant_error_response` (new `_safe_assistant_error_message` + helper keyed on the upstream subtype); + - the `generate_streaming_response` SSE error chunk; + - the chat-completions and Anthropic-messages 500 HTTPException + handlers; + - `/v1/debug/request`, which is now entirely gated behind + `DEBUG_MODE`/`VERBOSE` and emits only the exception *type name* when + enabled. All server-side logging of the full exception is preserved. +- **`MessageAdapter.filter_content` regexes hardened against + polynomial ReDoS** (alerts #3-#6, `py/polynomial-redos`). The lazy + `.*?` patterns were rewritten to the non-backtracking + `[^<]*(?:<(?!/tag>)[^<]*)*` form and pre-compiled at module + scope. The image-reference pattern now uses fixed upper bounds + (`[^\]]{0,1024}` / `[^\s]{0,65536}`) instead of lazy quantifiers with + a lookahead. A 1 MB input length guard short-circuits + `filter_content` on pathological payloads. + +### Tests + +- New `tests/test_redos_safety.py`: six pathological inputs that the + pre-fix regexes would have spent seconds-to-minutes on each complete + in under 1 s. Plus behavioural regression tests asserting the + rewritten patterns still strip ``, extract nested + ``/``, replace image tokens, and return + oversized input unchanged. + +### Notes + +- Client-visible error message text has changed (now generic strings + like "Chat completion failed"). The OpenAI-style `type` and `code` + fields are unchanged, so programmatic error routing is unaffected. +- `/v1/debug/request` returns `{"debug_info": {"enabled": false, ...}}` + unless `DEBUG_MODE=true` or `VERBOSE=true` is set on the server. + +### Docker image + +- `Dockerfile`: `poetry install --no-root` is now scoped to `--only main`. 
+ Dev-group packages (black, bandit, pytest, mypy, safety, etc.) no + longer ship inside the runtime image. This removes the one Trivy + HIGH with an available fix (CVE-2026-32274, `black < 26.3.1`) and + drops the image from 1.18 GB to 775 MB. +- Added `.dockerignore` so `COPY . /app` stops pulling `.git`, `.venv`, + `.hypothesis`, `.pytest_cache`, `tests`, `docs`, `.env*`, and editor + cruft into the image. BUILD_INFO now stamps cleanly at build time. +- Remaining Trivy HIGHs (7) are in the Debian 13.4 base - ncurses + (CVE-2025-69720), nghttp2 (CVE-2026-27135), and systemd + (CVE-2026-29111). All have `fix: null` upstream at the time of this + release; they will clear when `python:3.12-slim` rebases. Accepted + risk. + +### Workflows + +- `.github/workflows/ci.yml`: added `timeout-minutes: 15`, + `fail-fast: false`, `poetry check --lock` (catches the lockfile + drift that burned us on the 0.1.65 SDK bump), replaced deprecated + `safety check` with `pip-audit`, and added a `docker` job that + smoke-builds the prod image on every PR. +- `.github/workflows/claude.yml`: repo-specific `claude_args` with a + read-only tool allowlist (no write commands, no PR mutations) and + inline documentation of why the `contains()` gate on user-controlled + event fields is safe. + +## [2.9.0] - 2026-04-23 + +### Changed + +- **`claude-agent-sdk` bumped from `0.1.18` to `0.1.65`** (exact pin). 47 patch releases worth of CLI and subprocess-handling fixes. The rationale for bumping specifically now is the background-constant `error_during_execution` rate observed on 2.8.2 in production (~48/hr, `num_turns=2`, `usage.input_tokens=0`, `stderr_tail_chars=0` — CLI dying silently before reaching Claude). Notable fixes in the range: + - **0.1.52** — `control_cancel_request` handling (#751): in-flight hook callbacks properly cancelled when the CLI abandons them. A plausible source of 2-turn silent abort. + - **0.1.53** — string-prompt deadlock fix (#780): spawned `wait_for_result_and_end_input()` as a background task to avoid hangs on hook/MCP-heavy calls. Related symptom class. + - **0.1.57** — thinking-config serialization fix (#796): `thinking={"type":"adaptive"}` and `{"type":"disabled"}` now use `--thinking` flag not `--max-thinking-tokens`. Directly affects the path the 2.8.0 `WRAPPER_MAP_MAX_TOKENS_TO_THINKING` opt-in touches. + - **0.1.60** — `setting_sources=[]` no longer silently dropped (#822). W3C distributed-tracing propagation added. + - **0.1.51** — preserve dropped fields on `AssistantMessage` and `ResultMessage` for forward compatibility (#718); `ResultMessage.errors` field now populated (#749). + - Bundled Claude CLI advanced from 2.0.72 (at 0.1.18) to 2.1.118 (at 0.1.65) — 46 CLI versions of bug fixes, auth handling, and error reporting. + +### Tests + +- Full suite green on 0.1.65: 640 passed, 31 skipped. No test changes required — existing fixtures still match the dict shapes parse_claude_message consumes. + +### Expected runtime impact + +- Fewer `error_during_execution` subprocess failures (hypothesis to be confirmed post-deploy). +- `ResultMessage.errors` may now carry actual strings on failure paths, so the `claude_sdk_error` log line's `errors=` field should start populating instead of always `errors=[]`. This is the data we've been missing. +- `max_thinking_tokens` semantics on 0.1.57+ differ from 0.1.18 — our `WRAPPER_MAP_MAX_TOKENS_TO_THINKING=false` default makes this a no-op, but anyone opting in should retest. 
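A quick way to watch the `errors=` point above is a one-turn probe, in the spirit of the `verify_cli()` check described under 2.9.7. A minimal sketch, assuming the `query`, `ClaudeAgentOptions`, and `ResultMessage` names this changelog attributes to `claude-agent-sdk`; exact signatures and fields vary across 0.1.x releases, so treat it as illustrative rather than the wrapper's actual code:

```python
import anyio
from claude_agent_sdk import ClaudeAgentOptions, ResultMessage, query


async def probe_result_metadata() -> None:
    # One cheap turn. On 0.1.65+ a failing run should carry real strings
    # in ResultMessage.errors instead of the always-empty list seen on
    # 0.1.18 (SDK change #749 cited above).
    async for message in query(
        prompt="Hello",
        options=ClaudeAgentOptions(max_turns=1),
    ):
        if isinstance(message, ResultMessage):
            print(
                f"subtype={message.subtype} is_error={message.is_error} "
                f"num_turns={message.num_turns} "
                # getattr because `errors` only exists on newer SDK pins.
                f"errors={getattr(message, 'errors', None)}"
            )


anyio.run(probe_result_metadata)
```

Logging exactly these fields per request is, in essence, what the `completion_result` and `claude_sdk_error` log lines introduced below boil down to.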
+ +## [2.8.2] - 2026-04-23 + +Dependency bump to clear trivy HIGH/CRITICAL findings against 2.8.1. +No code change. + +### Security + +Locked versions after `poetry lock` with the new constraints: + +| Package | Before | After | CVEs cleared | +|---|---|---|---| +| fastapi | 0.115.14 | 0.128.1 | (bumped to allow starlette >=0.49) | +| starlette | 0.46.2 | 0.50.0 | CVE-2025-62727 (HIGH, DoS via Range header) | +| urllib3 | 2.5.0 | 2.6.3 | CVE-2025-66418, CVE-2025-66471, CVE-2026-21441 (HIGH) | +| python-multipart | 0.0.18 | 0.0.22 | CVE-2026-24486 (HIGH, path traversal) | +| cryptography | 46.0.3 | 46.0.7 | CVE-2026-26007 (HIGH) | +| pyjwt | 2.10.1 | 2.12.1 | CVE-2026-32597 (HIGH) | +| authlib | 1.6.6 | 1.7.0 | CVE-2026-27962 (CRITICAL), CVE-2026-28802, CVE-2026-28490, CVE-2026-28498 (HIGH) | +| mcp | 1.20.0 | 1.27.0 | CVE-2025-66416 (HIGH) | +| nltk | 3.9.2 | 3.9.4 | CVE-2025-14009 (CRITICAL), CVE-2026-0846 (HIGH) | + +### Remaining (no fix available upstream) + +- nltk CVE-2026-33231, CVE-2026-33236 (XML path traversal) — no patched version published; track upstream +- Debian base-image packages: libncursesw6, libnghttp2-14, libsystemd0, libtinfo6, libudev1, ncurses-base, ncurses-bin — no fix in current debian:13 stream; addressed when base image is rebased + +### Changed + +- `pyproject.toml`: explicit security-floor pins added for `starlette`, `urllib3`, `cryptography`, `pyjwt`, `authlib`, `mcp`, `nltk`. Each is a transitive of fastapi/claude-agent-sdk/bundled CLI but needs a minimum version higher than the parent's ceiling allowed, so we list them directly. `fastapi` widened to `>=0.119,<1.0` to allow starlette 0.49.x+. + +## [2.8.1] - 2026-04-23 + +Hotfix on top of 2.8.0 after observing breaker cascade during live +reprocessing. Three small fixes; no new behavior. + +### Fixed + +- **Structured log extras now render in plain-text logs** (`src/main.py`): replaced every `logger.xxx("event", extra={...})` call with `logger.xxx(_kv("event", **fields))`. The wrapper's default format is `%(asctime)s - %(name)s - %(levelname)s - %(message)s` with no extras-printer, so `circuit_breaker_open`, `completion_result`, `claude_sdk_error*`, `claude_sdk_assistant_error`, and the streaming-path variants were all shipping to Loki with the state dict silently dropped. They now serialize inline as `event key=value key=value ...`. +- **Circuit breaker defaults loosened** (`src/circuit_breaker.py`): `min_requests_for_trip` raised from 10 to 20; `failure_ratio_threshold` raised from 0.5 to 0.75. The previous values tripped mid-way through a single episode's 6-8 detection windows when the upstream SDK returned a transient burst of `error_during_execution` (5/10 = 0.5), turning a recoverable hiccup into a full-episode outage via 503 cascade. All thresholds plus enable-state are now env-configurable: `WRAPPER_CIRCUIT_BREAKER_ENABLED`, `WRAPPER_CIRCUIT_BREAKER_THRESHOLD`, `WRAPPER_CIRCUIT_BREAKER_MIN_REQUESTS`, `WRAPPER_CIRCUIT_BREAKER_OPEN_SECONDS`, `WRAPPER_CIRCUIT_BREAKER_WINDOW_SECONDS`. Setting `WRAPPER_CIRCUIT_BREAKER_ENABLED=false` short-circuits both `allow_request()` and `record()`, acting as a kill switch for situations where the breaker itself is the problem. + +### Added + +- **CLI subprocess stderr capture** (`src/claude_cli.py`): bounded ring buffer (40 lines) installed as `ClaudeAgentOptions.stderr` callback on every request. 
On non-success `ResultMessage`, the tail is logged at WARNING level with the session id and num_turns, AND attached to the yielded dict as `stderr_tail` so downstream `parse_claude_message` forwards it onto `ClaudeResultError.stderr_tail`. The `chat_completions` error handler now logs it alongside the `claude_sdk_error` k/v line. Fixes the 2.8.0 gap where `error_during_execution` with `input_tokens=0, num_turns=2` gave us no insight into WHY the CLI subprocess died. + +### Changed + +- `ClaudeResultError` gained a `stderr_tail` attribute (default `None`). +- Breaker snapshot dict now also includes `enabled` and `min_requests_for_trip` so the snapshot body on `503 circuit_breaker_open` responses matches what the env var set. + +## [2.8.0] - 2026-04-23 + +### Fixed + +- **SDK `error_max_turns` no longer leaks `[Request interrupted by user]` as response content** (`src/claude_cli.py`): `parse_claude_message` now raises `ClaudeResultError` when any `ResultMessage` has `is_error=True` or a subtype in `{error_max_turns, error_during_execution, error}`. The SDK inserts a synthetic `UserMessage(text='[Request interrupted by user]')` right before those results; previously the fallback loop returned that text as the assistant response, which shipped as valid content to OpenAI clients and propagated into downstream artifacts (e.g. MinusPod chapter titles). `UserMessage` is now explicitly filtered out of response-text collection (identifiable by `uuid` field with no `model` field). +- **`max_turns=1` when `enable_tools=False` raised to `3`** (`src/main.py:_build_claude_options`): the hardcoded `max_turns=1` caused `error_max_turns` on any prompt where the agent engaged extended thinking and then needed a second turn to emit the final assistant message. New default is configurable via `WRAPPER_DEFAULT_MAX_TURNS`. +- **`max_tokens -> max_thinking_tokens` mapping is off by default** (`src/models.py`): OpenAI `max_tokens` is a response-length cap; the Claude Agent SDK has no direct equivalent. Mapping it to `max_thinking_tokens` caused short prompts (e.g. `max_tokens=500` for a title) to burn the thinking budget before emitting output, occasionally busting `max_turns`. Opt in to the legacy mapping via `WRAPPER_MAP_MAX_TOKENS_TO_THINKING=true`. +- **Non-success `ResultMessage` now produces a proper OpenAI-shaped HTTP response** (`src/main.py`): `error_max_turns` -> `200` with `finish_reason="length"` and empty `content`; other SDK errors -> `502` with a structured error body; streaming path emits a terminal SSE event with the matching `finish_reason` and `[DONE]`. + +### Added + +- **`ClaudeResultError` exception** (`src/claude_cli.py`): typed error surface for SDK failures. Carries `subtype`, `num_turns`, `errors`, `stop_reason`, and `error_message`. +- **Structured AssistantMessage error taxonomy** (`src/main.py`): `AssistantMessage.error` literals map to HTTP status codes -- `rate_limit` -> 429 with `Retry-After: 30`, `billing_error` -> 402, `authentication_failed` -> 401, `invalid_request` -> 400, `server_error`/`unknown` -> 502. Parser also detects `RateLimitInfo` messages (SDK 0.1.49+, future-compatible). +- **Circuit breaker on SDK errors** (`src/circuit_breaker.py`): in-process rolling-window breaker. Default: opens when >=50% of the last 60s are failures and >=10 requests, 30s cool-off, half-opens with a single probe. Completion handler returns `503 Retry-After: 30` with a structured body when the breaker is open. 
+- **`/healthz/deep` endpoint** (`src/main.py`): end-to-end probe that actually exercises the completion path. Tracks a rolling window of 10 outcomes and returns `503` when the failure rate exceeds 20%. Unlike `/health` (process liveness only), this catches upstream-SDK incidents that leave the wrapper process up while returning garbage. +- **Structured `completion_result` log line** (`src/main.py`): one INFO-level record per successful completion with `request_id`, `session_id`, `subtype`, `num_turns`, `duration_ms`, `total_cost_usd`, `is_error`, `finish_reason`, `model`, and token counts. Simplifies Grafana triage. +- **`BUILD_INFO` image stamp** (`Dockerfile`): records the installed `claude-agent-sdk` version and bundled-CLI presence at build time. Logged at startup via `_log_build_info()`. +- **Multi-stage Dockerfile with `dev` and `prod` targets**: `dev` keeps `--reload` for local iteration; `prod` runs with `--workers 2 --no-access-log` (override via `UVICORN_WORKERS`). `docker-compose.yml` defaults to the `prod` target. +- **Regression tests** covering the sentinel leak and the error taxonomy: `tests/test_claude_cli_unit.py` (`test_error_max_turns_raises_instead_of_returning_sentinel`, `test_user_message_content_never_leaks_as_response`, `test_is_error_true_raises_even_when_subtype_missing`, `test_assistant_rate_limit_raises`), `tests/test_error_path_unit.py` (HTTP-shape translations for each error class), `tests/test_circuit_breaker_unit.py` (state machine). + +### Changed + +- **SDK pinned exactly** (`pyproject.toml`): `claude-agent-sdk = "0.1.18"` (was `^0.1.18`). The caret range resolved to whatever 0.1.x was latest at install time, which let semantics drift between Docker builds without a code change (SDK 0.1.57 changed how thinking config is serialized to the CLI). Bump this pin deliberately and regenerate `poetry.lock` in the same commit. Upstream latest at time of pin: `0.1.65`. +- **`docker-compose.yml`**: adds `build.target: prod`, documents new env vars (`UVICORN_WORKERS`, `WRAPPER_DEFAULT_MAX_TURNS`, `WRAPPER_MAP_MAX_TOKENS_TO_THINKING`). + +### Notes + +- `claude-agent-sdk` stays pinned to `0.1.18` because that's the version the production image has been running. Bump to `0.1.65` in a separate commit after validating behavior changes across `0.1.18..0.1.65` (particularly `0.1.57` thinking handling and `0.1.49` `RateLimitInfo` surfacing). +- Upstream consumer affected by the `error_max_turns` leak was MinusPod; see that project's `2.0.12` release notes for the consumer-side defensive changes landing in parallel. 
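To make the 2.8.0 breaker semantics concrete: requests are refused once at least 50% of the requests recorded in the rolling 60-second window have failed and at least 10 requests were seen; after the 30-second cool-off a single probe is admitted, and its outcome decides between closing and re-opening. A stripped-down sketch of that state machine (illustrative only - the real implementation lives in `src/circuit_breaker.py` and, per 2.8.1, is env-configurable and includes a kill switch):

```python
import time
from collections import deque


class RollingWindowBreaker:
    """Sketch of the rolling-window breaker described above."""

    def __init__(self, window_s: float = 60.0, min_requests: int = 10,
                 failure_ratio: float = 0.5, open_s: float = 30.0) -> None:
        self.window_s = window_s
        self.min_requests = min_requests
        self.failure_ratio = failure_ratio
        self.open_s = open_s
        self._events: deque[tuple[float, bool]] = deque()  # (timestamp, ok)
        self._opened_at: float | None = None               # None => closed
        self._probe_inflight = False

    def allow_request(self) -> bool:
        now = time.monotonic()
        if self._opened_at is None:
            return True
        if now - self._opened_at < self.open_s:
            return False  # open: shed load during the cool-off
        if self._probe_inflight:
            return False  # half-open: only one probe at a time
        self._probe_inflight = True
        return True

    def record(self, ok: bool) -> None:
        now = time.monotonic()
        if self._opened_at is not None:
            # Half-open probe result: close on success, re-open on failure.
            # (Simplification: treats any outcome recorded while open as
            # the probe's; the real class has more bookkeeping.)
            self._probe_inflight = False
            if ok:
                self._opened_at = None
                self._events.clear()
            else:
                self._opened_at = now
            return
        self._events.append((now, ok))
        while self._events and now - self._events[0][0] > self.window_s:
            self._events.popleft()
        failures = sum(1 for _, was_ok in self._events if not was_ok)
        if (len(self._events) >= self.min_requests
                and failures / len(self._events) >= self.failure_ratio):
            self._opened_at = now
```

The single-probe half-open step is what keeps a thundering herd from re-tripping the breaker the instant the cool-off expires.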
+ +## [2.7.0] - 2026-04-16 + +### Added + +- **Claude Opus 4.7** (`claude-opus-4-7`): new flagship model -- 1M token context window, 128K max output, $5/$25 per MTok, falls back to `claude-sonnet-4-6` on overload + +### Changed + +- **Model metadata corrections** (`src/constants.py`): aligned with Anthropic docs (`platform.claude.com/docs/en/about-claude/models/overview`) + - `claude-opus-4-6`: context window 200K -> 1M + - `claude-sonnet-4-6`: context window 200K -> 1M, max output 128K -> 64K (synchronous Messages API) + - `claude-opus-4-1-20250805`: max output 64K -> 32K + - `claude-opus-4-20250514`: max output 64K -> 32K +- **Default model example**: `.env.example` `DEFAULT_MODEL` now matches code default (`claude-sonnet-4-6`) +- **Landing page quickstart** (`src/main.py`): uses `claude-sonnet-4-6` instead of dated Sonnet 4.5 snapshot +- **Debug endpoint example**: `example_valid_request.model` updated from retired `claude-3-sonnet-20240229` to `claude-sonnet-4-6` + +### Removed + +- **Retired models** removed from `CLAUDE_MODELS`, `MODEL_METADATA`, `MODEL_PRICING`: + - `claude-3-7-sonnet-20250219` (retired 2026-02-19) + - `claude-3-5-sonnet-20241022` (retired 2025-10-28) + - `claude-3-5-haiku-20241022` (retired 2026-02-19) +- `_PRICING_HAIKU_35` constant (no remaining consumers) + +## [2.6.0] - 2026-04-02 + +### Added + +- **OpenAI Function Calling** (`src/function_calling.py`): Simulates OpenAI tool/function calling via system prompt injection and response parsing + - Converts `tools` array and `tool_choice` into Claude-compatible system prompts + - Parses Claude's response for ```tool_calls``` blocks and bare JSON arrays + - Returns OpenAI-format `tool_calls` in the response with generated call IDs + - Handles multi-turn conversations: assistant tool_calls and tool result messages converted to text +- **JSON Schema in response_format**: Support for `response_format.type = "json_schema"` with schema definition + - Schema injected into user prompt (not system_prompt) for SDK subprocess compatibility + - Includes explicit rules for required properties, exact names, and exact types +- **Streaming Fence Stripping** (`JsonFenceStripper` in `src/message_adapter.py`): Real-time removal of markdown ```json fences during streaming + - Hold-back buffers detect and strip opening/closing fences across chunk boundaries + - Replaces full-buffer strategy for JSON streaming -- chunks flow in real-time +- **CPU Watchdog** (`src/cpu_watchdog.py`): Background CPU monitor for Docker/Linux deployments + - Reads /proc/self/stat every 30s, sends SIGTERM after 3 consecutive strikes above 80% CPU + - Disabled by default, enable with `WATCHDOG_ENABLED=true` + - Configurable interval, threshold, and strike count via env vars + +### Changed + +- **Message model**: Added `tool` role, `tool_calls`, `tool_call_id` fields for function calling support +- **ResponseFormat model**: Extended with `json_schema` type and `JsonSchema` model +- **Choice/StreamChoice**: Added `tool_calls` finish reason + +## [2.5.2] - 2026-04-01 + +### Fixed + +- **Removed fake tools**: Removed BashOutput, KillShell, and SlashCommand from tool inventory -- these do not exist in Claude Code's tool registry and were diversions in the source + +### Added + +- **11 real tools**: Added Brief, Config, ListPeers, REPL, Sleep, Monitor, SendUserFile, PushNotification, ListMcpResources, ReadMcpResource, VerifyPlanExecution -- all verified against Claude Code source (`src/tools.ts:getAllBaseTools()`) + +### Changed + +- Tool count: 33 -> 41 (removed 3 
fake, added 11 real) + +## [2.5.1] - 2026-04-01 + +### Fixed + +- **GitHub URL**: Corrected repository link from aaronlippold fork to ttlequals0/claude-code-openai-wrapper +- **OpenAPI Version**: FastAPI docs version now uses dynamic `__version__` instead of hardcoded "1.0.0" + +### Changed + +- **Landing Page Redesign**: Complete UI overhaul replacing generic AI-generated aesthetics with a clean, utilitarian developer dashboard + - Dropped Pico CSS in favor of custom minimal CSS + - Typography: DM Sans headings, JetBrains Mono for code paths + - Muted neutral color palette with method-specific badge colors (blue GET, amber POST, red DELETE) + - Removed gradient logo container, pulsing animations, and decorative section icons +- **Endpoint Documentation**: Landing page now lists all 25 endpoints grouped into 8 categories (Core API, Models, Sessions, Tools, MCP Servers, Cache, Auth/Debug, System) -- previously showed only 9 +- **Configuration Section**: Condensed from a full card into a compact footer line + +## [2.5.0] - 2026-03-31 + +### Added + +- **Model Metadata**: Per-model context window sizes, default/max output token limits sourced from open-sourced Claude Code CLI +- **Model Pricing Data**: Per-model pricing (input, output, cache read/write) for all supported models, sourced from Claude Code source +- **Cost Tracker** (`src/cost_tracker.py`): New module for per-request and per-session cost estimation using authoritative pricing data + - Tracks input/output tokens, cache tokens, web search requests + - Per-model usage breakdown per session +- **Retry Logic** (`src/retry.py`): New module implementing retry with exponential backoff and jitter + - Configurable max retries (default 10), base delay (500ms), max delay (30s) + - Model fallback: after 3 consecutive 529 (overloaded) errors, falls back from Opus to Sonnet + - Retryable status codes: 429, 529, 5xx, 401, 400 +- **New Tools**: Added 18 tools to match Claude Code's actual tool inventory: + - `Agent` (with `Task` as backward-compatible alias) + - `SendMessage`, `TaskCreate`, `TaskUpdate`, `TaskGet`, `TaskList`, `TaskOutput`, `TaskStop` + - `EnterPlanMode`, `ExitPlanMode`, `EnterWorktree`, `ExitWorktree` + - `ToolSearch`, `AskUserQuestion` + - `CronCreate`, `CronDelete`, `CronList`, `RemoteTrigger` +- **Effort Level Support**: New `X-Claude-Effort` header (low, medium, high, max) +- **Thinking Mode Support**: New `X-Claude-Thinking` header (adaptive, enabled, disabled) +- **Max Tokens Validation**: Model-specific max_tokens validation and capping via `ParameterValidator.validate_max_tokens()` +- **Model Fallback Map**: Automatic Opus-to-Sonnet fallback mapping for overload resilience + +### Changed + +- **Model List Updated**: Added `claude-sonnet-4-6` (latest) and re-added Claude 3.x models (`claude-3-7-sonnet-20250219`, `claude-3-5-sonnet-20241022`, `claude-3-5-haiku-20241022`) which are confirmed supported by Claude Code +- **Default Model**: Changed from `claude-sonnet-4-5-20250929` to `claude-sonnet-4-6` (latest Sonnet) +- **Tool Safety Classifications**: Updated based on Claude Code source -- `Bash` now marked as requiring permissions, `Agent`/`SendMessage`/`RemoteTrigger` marked as unsafe +- **Default Disallowed Tools**: Added `SendMessage` and `RemoteTrigger` to default disallow list + +## [2.4.2] - 2026-02-06 + +### Added + +- **Auth Method Awareness in Model Service**: Model refresh now respects `CLAUDE_AUTH_METHOD` configuration + - `anthropic` auth: Full support for dynamic model fetching from API + - `cli`, `bedrock`, 
`vertex` auth: Uses static fallback model list (API key not available) +- **Auth Method in Responses**: `/v1/models/refresh` and `/v1/models/status` responses now include `auth_method` field +- **Landing Page Updates**: Added `/v1/models/status` and `/v1/models/refresh` endpoint cards to the dashboard UI with interactive refresh button +- **Unit Tests**: Comprehensive tests for different auth method behaviors in model service + +### Changed + +- **Updated Model List**: Added `claude-opus-4-6` (latest), removed outdated `claude-opus-4-5-20250929` from static fallback list +- **Improved Error Messages**: Refresh endpoint now returns clear message when using non-anthropic auth methods + +## [2.4.1] - 2026-02-06 + +### Added + +- **Dynamic Model Refresh**: New `POST /v1/models/refresh` endpoint to refresh models from Anthropic API at runtime without server restart +- **Model Service Status**: New `GET /v1/models/status` endpoint returning service status including source (api/fallback) and last refresh timestamp +- **Refresh Tracking**: ModelService now tracks `_last_refresh` timestamp and `_source` (api or fallback) for observability +- **Unit Tests**: Comprehensive tests for model refresh functionality including success/failure scenarios, timestamp tracking, and status reporting + +### Changed + +- **ModelService**: Enhanced with `refresh_models()` async method and `get_status()` method for runtime model management + +## [2.4.0] - 2026-02-04 + +### Added + +- **Improved JSON Mode Instructions**: Enhanced system prompt instructions with numbered rules format, explicit prohibition of preambles, and stronger emphasis on first/last character requirements +- **Common Preamble Detection**: New `COMMON_PREAMBLES` constant with 19 common Claude preambles that are automatically stripped +- **Balanced JSON Extraction**: New `_find_balanced_json()` helper method using brace/bracket matching that correctly handles escaped quotes and braces inside strings +- **JSON Extraction Metadata**: New `JsonExtractionResult` dataclass and `extract_json_with_metadata()` method providing detailed extraction information +- **Metadata-Enabled Enforcement**: New `enforce_json_format_with_metadata()` method returning both extracted content and extraction details +- **Enhanced Extraction Diagnostics**: New `_log_extraction_diagnostics()` method for detailed debugging of extraction failures +- **Request Deduplication Cache**: Optional caching layer for identical requests with LRU eviction and TTL expiration + - Configure via environment variables: `REQUEST_CACHE_ENABLED`, `REQUEST_CACHE_MAX_SIZE`, `REQUEST_CACHE_TTL_SECONDS` + - Enable per-request via `X-Enable-Cache: true` header +- **Cache Management Endpoints**: + - `GET /v1/cache/stats` - View cache statistics + - `POST /v1/cache/clear` - Clear all cached entries +- **Unit Tests**: Comprehensive tests for balanced JSON extraction, metadata tracking, and request cache + +### Changed + +- **JSON Extraction Priority**: Reordered extraction methods for better reliability: + 1. Pure JSON (fast path) + 2. Preamble removal + parse + 3. Markdown code block extraction + 4. Balanced brace/bracket matching + 5. 
First-to-last fallback +- **Improved Logging**: JSON enforcement now logs extraction method used (e.g., `method=preamble_removed`) +- **Debug Output**: Enhanced debug logging with extraction metadata in both streaming and non-streaming modes + +### Fixed + +- JSON extraction now correctly handles escaped quotes (`\"`) within strings +- JSON extraction no longer confused by braces/brackets inside string values + +## [2.3.1] - Previous Release + +Initial tracked version with JSON mode support. diff --git a/Dockerfile b/Dockerfile index 43f90bf..424e218 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,30 +1,63 @@ -FROM python:3.12-slim +# syntax=docker/dockerfile:1.6 -# Install system deps (curl for Poetry installer) -RUN apt-get update && apt-get install -y \ +# ----------------------------------------------------------------------------- +# Base: install Poetry and app dependencies. Shared by dev and prod stages. +# ----------------------------------------------------------------------------- +FROM python:3.12-slim AS base + +# Install system deps (curl for Poetry installer). +RUN apt-get update && apt-get install -y --no-install-recommends \ curl \ + ca-certificates \ && rm -rf /var/lib/apt/lists/* -# Install Poetry globally +# Install Poetry globally. RUN curl -sSL https://install.python-poetry.org | python3 - - -# Add Poetry to PATH ENV PATH="/root/.local/bin:${PATH}" -# Note: Claude Code CLI is bundled with claude-agent-sdk >= 0.1.8 -# No separate Node.js/npm installation required - -# Copy the app code -COPY . /app +# Note: Claude Code CLI is bundled with claude-agent-sdk >= 0.1.8. +# No separate Node.js/npm installation required. -# Set working directory WORKDIR /app -# Install Python dependencies with Poetry -RUN poetry install --no-root +# Copy manifests first so dependency install is cached when source changes. +COPY pyproject.toml poetry.lock* /app/ +# --only main excludes the dev group (black, bandit, pytest, mypy, etc.), +# which are only needed in CI and would otherwise ship inside the image and +# expand the vulnerability surface (e.g. CVE-2026-32274 black < 26.3.1). +RUN poetry install --no-root --only main --no-interaction --no-ansi + +# Copy the application source. +COPY . /app + +# Build-info stamp: record the installed SDK and bundled Claude CLI versions +# so the running container advertises what it actually ships. This turns +# "which SDK shipped in the image?" from guesswork into a one-shot `cat`. +# Must run via `poetry run` because dependencies are installed into the +# Poetry-managed virtualenv, not the system site-packages. +RUN poetry run python -c "\ +import importlib.metadata, pathlib, claude_agent_sdk;\ +sdk = importlib.metadata.version('claude-agent-sdk');\ +cli = pathlib.Path(claude_agent_sdk.__file__).parent / '_bundled' / 'claude';\ +open('/app/BUILD_INFO', 'w').write(f'claude-agent-sdk={sdk}\\nbundled_cli_present={cli.exists()}\\nbundled_cli_path={cli}\\n')\ +" || echo "BUILD_INFO stamp skipped (non-fatal)" -# Expose the port (default 8000) EXPOSE 8000 -# Run the app with Uvicorn (development mode with reload; switch to --no-reload for prod) -CMD ["poetry", "run", "uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] \ No newline at end of file +# ----------------------------------------------------------------------------- +# Dev stage: --reload watches the filesystem for changes. Not suitable for prod +# because it interferes with long-lived streaming connections and adds startup +# cost; keep it strictly for local iteration. 
+# ----------------------------------------------------------------------------- +FROM base AS dev +CMD ["poetry", "run", "uvicorn", "src.main:app", \ + "--host", "0.0.0.0", "--port", "8000", "--reload"] + +# ----------------------------------------------------------------------------- +# Prod stage: multi-worker, no reload. Default target for deployment images. +# Override worker count via the UVICORN_WORKERS env var at runtime if needed. +# ----------------------------------------------------------------------------- +FROM base AS prod +ENV UVICORN_WORKERS=2 +CMD ["sh", "-c", "poetry run uvicorn src.main:app \ + --host 0.0.0.0 --port 8000 --workers ${UVICORN_WORKERS} --no-access-log"] diff --git a/README.md b/README.md index 47c67e3..1981fd2 100644 --- a/README.md +++ b/README.md @@ -1,399 +1,218 @@ # Claude Code OpenAI API Wrapper -An OpenAI API-compatible wrapper for Claude Code, allowing you to use Claude Code with any OpenAI client library. **Now powered by the official Claude Agent SDK v0.1.18** with enhanced authentication and features. +OpenAI API-compatible wrapper for Claude Code. Drop it in front of any OpenAI client library and talk to Claude instead. ## Version -**Current Version:** 2.2.0 -- **Interactive Landing Page:** API explorer at root URL with live endpoint testing -- **Anthropic Messages API:** Native `/v1/messages` endpoint alongside OpenAI format -- **Explicit Auth Selection:** New `CLAUDE_AUTH_METHOD` env var for auth control -- **Tool Execution Fix:** `enable_tools: true` now properly enables Claude Code tools +**Current:** 2.9.7 -**Upgrading from v1.x?** -1. Pull latest code: `git pull origin main` -2. Update dependencies: `poetry install` -3. Restart server - that's it! +Highlights of recent releases (full history in [CHANGELOG.md](./CHANGELOG.md)): -**Migration Resources:** -- [MIGRATION_STATUS.md](./MIGRATION_STATUS.md) - Detailed v2.0.0 migration status -- [UPGRADE_PLAN.md](./UPGRADE_PLAN.md) - Comprehensive migration strategy and technical details +- **2.9.7** - Active Claude-CLI auth health probe (10-minute default, configurable via `CLI_AUTH_PROBE_INTERVAL_SECONDS`). `/v1/chat/completions` and `/v1/messages` now return **HTTP 401** with `error.type=authentication_error` when the bundled CLI loses its session, so OpenAI / Anthropic client libraries route the failure as `AuthenticationError` instead of a transient 502/503. `/v1/auth/status` exposes the new `cli_health` block. Defense-in-depth: `error_during_execution` results whose stderr matches `Not logged in / Please run /login / Invalid API key` also map to 401 and seed `cli_health` failed. +- **2.9.6** - `claude-agent-sdk` 0.1.68 -> 0.1.81. urllib3 floor raised to 2.7.0 and `python-multipart` to 0.0.27 to close three HIGH Dependabot alerts. Pulled in upstream `RichardAtCT#46` so `/v1/models` returns Anthropic's live catalogue when `ANTHROPIC_API_KEY` is set (cached, with a short error TTL so transient outages do not stick for an hour). `check-sdk-version.yml` now opens a draft bump PR on drift instead of writing only to the job summary. +- **2.9.x** (earlier) - CodeQL hardening: sanitised error responses (no more `str(e)` to clients), `filter_content` rewrite against polynomial ReDoS, `/v1/debug/request` gated behind `DEBUG_MODE`/`VERBOSE`, workflow permissions pinned. Image trimmed via `poetry install --only main` and a real `.dockerignore`. +- **2.8.x** - Security dep bumps, breaker defaults loosened, CLI stderr capture, structured-log state unmasked. 
+- **2.7.0** - Added `claude-opus-4-7`; retired `claude-3-*` family; corrected context-window and max-output metadata. +- **2.6.0** - OpenAI function calling simulation (`tools` / `tool_choice`), JSON schema support in `response_format`, real-time streaming fence stripping, CPU watchdog. +- **2.5.x** - Landing-page redesign, model catalogue from the open-sourced Claude Code source, 41 tools tracked, retry + model fallback, cost tracking, `X-Claude-Effort` / `X-Claude-Thinking` headers. ## Status -🎉 **Production Ready** - All core features working and tested: -- ✅ Chat completions endpoint with **official Claude Agent SDK v0.1.18** -- ✅ **Anthropic Messages API** (`/v1/messages`) for native compatibility -- ✅ Streaming and non-streaming responses -- ✅ Full OpenAI SDK compatibility -- ✅ **Interactive landing page** with API explorer -- ✅ **Multi-provider authentication** (API key, Bedrock, Vertex AI, CLI auth) -- ✅ **System prompt support** via SDK options -- ✅ Model selection support with validation -- ✅ **Fast by default** - Tools disabled for OpenAI compatibility (5-10x faster) -- ✅ Optional tool usage (Read, Write, Bash, etc.) when explicitly enabled -- ✅ **Real-time cost and token tracking** from SDK -- ✅ **Session continuity** with conversation history across requests -- ✅ **Session management endpoints** for full session control -- ✅ Health, auth status, and models endpoints -- ✅ **Development mode** with auto-reload - -## Features - -### 🔥 **Core API Compatibility** -- OpenAI-compatible `/v1/chat/completions` endpoint -- Anthropic-compatible `/v1/messages` endpoint -- Support for both streaming and non-streaming responses -- Compatible with OpenAI Python SDK and all OpenAI client libraries -- Automatic model validation and selection - -### 🛠 **Claude Agent SDK Integration** -- **Official Claude Agent SDK** integration (v0.1.18) 🆕 -- **Real-time cost tracking** - actual costs from SDK metadata -- **Accurate token counting** - input/output tokens from SDK -- **Session management** - proper session IDs and continuity -- **Enhanced error handling** with detailed authentication diagnostics -- **Modern SDK features** - Latest capabilities and improvements - -### 🔐 **Multi-Provider Authentication** -- **Automatic detection** of authentication method -- **Claude CLI auth** - works with existing `claude auth` setup -- **Direct API key** - `ANTHROPIC_API_KEY` environment variable -- **AWS Bedrock** - enterprise authentication with AWS credentials -- **Google Vertex AI** - GCP authentication support - -### ⚡ **Advanced Features** -- **System prompt support** via SDK options -- **Optional tool usage** - Enable Claude Code tools (Read, Write, Bash, etc.) when needed -- **Fast default mode** - Tools disabled by default for OpenAI API compatibility -- **Development mode** with auto-reload (`uvicorn --reload`) -- **Interactive API key protection** - Optional security with auto-generated tokens -- **Comprehensive logging** and debugging capabilities - -### 🌐 **Interactive Landing Page** -- **API Explorer** at root URL (`http://localhost:8000/`) -- **Live endpoint testing** - Expandable accordions fetch real-time data -- **Light/dark theme toggle** - Persists preference in localStorage -- **Copy-to-clipboard** - One-click copy for Quick Start commands -- **Version badge** and GitHub link +Production ready. **673 tests passing (31 skipped)**. Streaming works. Sessions work. JSON mode works. Function calling works. Tools are off by default for speed - pass `enable_tools: true` to turn them on. 
Auth supports API key, Bedrock, Vertex AI, and CLI. ## Quick Start -Get started in under 2 minutes: - ```bash -# 1. Clone and setup the wrapper -git clone https://github.com/RichardAtCT/claude-code-openai-wrapper +# Clone and install +git clone https://github.com/ttlequals0/claude-code-openai-wrapper cd claude-code-openai-wrapper -poetry install # Installs SDK with bundled Claude Code CLI +poetry install -# 2. Authenticate (choose one method) -export ANTHROPIC_API_KEY=your-api-key # Recommended -# OR use CLI auth: claude auth login +# Authenticate (pick one) +export ANTHROPIC_API_KEY=your-api-key +# or: claude auth login -# 3. Start the server +# Start poetry run uvicorn src.main:app --reload --port 8000 -# 4. Test it works -poetry run python test_endpoints.py +# Test +poetry run pytest tests/ ``` -🎉 **That's it!** Your OpenAI-compatible Claude Code API is running on `http://localhost:8000` +Server is at `http://localhost:8000`. Point your OpenAI client there. ## Prerequisites -1. **Python 3.10+**: Required for the server (supports Python 3.10, 3.11, 3.12, 3.13) - -2. **Poetry**: For dependency management +1. **Python 3.10+** +2. **Poetry** for dependency management: ```bash - # Install Poetry (if not already installed) curl -sSL https://install.python-poetry.org | python3 - ``` +3. **Authentication** (pick one): + - `export ANTHROPIC_API_KEY=your-api-key` (recommended) + - `claude auth login` (CLI auth) + - AWS Bedrock or Google Vertex AI (see Configuration) -3. **Authentication**: Choose one method: - - **Option A**: Set environment variable (Recommended) - ```bash - export ANTHROPIC_API_KEY=your-api-key - ``` - - **Option B**: Authenticate via CLI - ```bash - claude auth login - ``` - - **Option C**: Use AWS Bedrock or Google Vertex AI (see Configuration section) - -> **Note:** The Claude Code CLI is bundled with the SDK (v0.1.18+). No separate Node.js or npm installation required! +The Claude Code CLI comes bundled with the SDK. No Node.js or npm needed. ## Installation -1. Clone the repository: - ```bash - git clone https://github.com/RichardAtCT/claude-code-openai-wrapper - cd claude-code-openai-wrapper - ``` - -2. Install dependencies with Poetry: - ```bash - poetry install - ``` - - This will create a virtual environment and install all dependencies. - -3. 
Configure environment: - ```bash - cp .env.example .env - # Edit .env with your preferences - ``` +```bash +git clone https://github.com/ttlequals0/claude-code-openai-wrapper +cd claude-code-openai-wrapper +poetry install +cp .env.example .env # edit with your preferences +``` ## Configuration -Edit the `.env` file: +Edit `.env`: ```env -# Claude CLI path (usually just "claude") -CLAUDE_CLI_PATH=claude - -# Explicit authentication method (optional) -# Options: cli, api_key, bedrock, vertex -# If not set, auto-detects based on available credentials -# CLAUDE_AUTH_METHOD=cli +# Auth (optional - auto-detects if not set) +# CLAUDE_AUTH_METHOD=cli|api_key|bedrock|vertex -# Optional API key for client authentication -# If not set, server will prompt for interactive API key protection on startup +# Optional client API key protection # API_KEY=your-optional-api-key -# Server port PORT=8000 - -# Timeout in milliseconds -MAX_TIMEOUT=600000 - -# CORS origins -CORS_ORIGINS=["*"] - -# Working directory for Claude Code (optional) -# If not set, uses an isolated temporary directory for security -# CLAUDE_CWD=/path/to/your/workspace +MAX_TIMEOUT=600000 # milliseconds (10 min default) +# CLAUDE_CWD=/path/to/workspace # defaults to isolated temp dir +# DEFAULT_MODEL=claude-sonnet-4-6 # override default model ``` -### 📁 **Working Directory Configuration** - -By default, Claude Code runs in an **isolated temporary directory** to prevent it from accessing the wrapper's source code. This enhances security by ensuring Claude Code only has access to the workspace you intend. - -**Configuration Options:** +### Working Directory -1. **Default (Recommended)**: Automatically creates a temporary isolated workspace - ```bash - # No configuration needed - secure by default - poetry run python main.py - ``` +By default, Claude Code runs in an isolated temporary directory so it can't access the wrapper's own source. Set `CLAUDE_CWD` to point it at a specific project instead. -2. **Custom Directory**: Set a specific workspace directory - ```bash - export CLAUDE_CWD=/path/to/your/project - poetry run python main.py - ``` +### API Key Protection -3. **Via .env file**: Add to your `.env` file - ```env - CLAUDE_CWD=/home/user/my-workspace - ``` +If no `API_KEY` is set, the server prompts on startup whether to generate one. Useful for remote access over VPN or Tailscale. -**Important Notes:** -- The temporary directory is automatically cleaned up when the server stops -- This prevents Claude Code from accidentally modifying the wrapper's own code -- Cross-platform compatible (Windows, macOS, Linux) +### Rate Limiting -### 🔐 **API Security Configuration** +Per-IP rate limiting is on by default. Per-endpoint defaults and the env vars that override them: -The server supports **interactive API key protection** for secure remote access: +| Endpoint group | Default | Env var | +|----------------|---------|---------| +| `/v1/chat/completions`, `/v1/messages` | 10/min | `RATE_LIMIT_CHAT_PER_MINUTE` | +| `/v1/debug/request` | 2/min | `RATE_LIMIT_DEBUG_PER_MINUTE` | +| `/v1/auth/status` | 10/min | `RATE_LIMIT_AUTH_PER_MINUTE` | +| `/v1/sessions/*` | 15/min | `RATE_LIMIT_SESSION_PER_MINUTE` | +| `/health`, `/healthz/deep` | 30/min | `RATE_LIMIT_HEALTH_PER_MINUTE` | +| everything else | 30/min | `RATE_LIMIT_PER_MINUTE` | -1. **No API key set**: Server prompts "Enable API key protection? 
(y/N)" on startup - - Choose **No** (default): Server runs without authentication - - Choose **Yes**: Server generates and displays a secure API key +Disable entirely with `RATE_LIMIT_ENABLED=false`. -2. **Environment API key set**: Uses the configured `API_KEY` without prompting +## Running the Server ```bash -# Example: Interactive protection enabled -poetry run python main.py - -# Output: -# ============================================================ -# 🔐 API Endpoint Security Configuration -# ============================================================ -# Would you like to protect your API endpoint with an API key? -# This adds a security layer when accessing your server remotely. -# -# Enable API key protection? (y/N): y -# -# 🔑 API Key Generated! -# ============================================================ -# API Key: Xf8k2mN9-vLp3qR5_zA7bW1cE4dY6sT0uI -# ============================================================ -# 📋 IMPORTANT: Save this key - you'll need it for API calls! -# Example usage: -# curl -H "Authorization: Bearer Xf8k2mN9-vLp3qR5_zA7bW1cE4dY6sT0uI" \ -# http://localhost:8000/v1/models -# ============================================================ -``` - -**Perfect for:** -- 🏠 **Local development** - No authentication needed -- 🌐 **Remote access** - Secure with generated tokens -- 🔒 **VPN/Tailscale** - Add security layer for remote endpoints - -### 🛡️ **Rate Limiting** - -Built-in rate limiting protects against abuse and ensures fair usage: - -- **Chat Completions** (`/v1/chat/completions`): 10 requests/minute -- **Debug Requests** (`/v1/debug/request`): 2 requests/minute -- **Auth Status** (`/v1/auth/status`): 10 requests/minute -- **Health Check** (`/health`): 30 requests/minute - -Rate limits are applied per IP address using a fixed window algorithm. When exceeded, the API returns HTTP 429 with a structured error response: - -```json -{ - "error": { - "message": "Rate limit exceeded. Try again in 60 seconds.", - "type": "rate_limit_exceeded", - "code": "too_many_requests", - "retry_after": 60 - } -} -``` - -Configure rate limiting through environment variables: +# Development (auto-reload) +poetry run uvicorn src.main:app --reload --port 8000 -```bash -RATE_LIMIT_ENABLED=true -RATE_LIMIT_CHAT_PER_MINUTE=10 -RATE_LIMIT_DEBUG_PER_MINUTE=2 -RATE_LIMIT_AUTH_PER_MINUTE=10 -RATE_LIMIT_HEALTH_PER_MINUTE=30 +# Production +poetry run claude-wrapper ``` -## Running the Server - -1. Verify Claude Code is installed and working: - ```bash - claude --version - claude --print --model claude-haiku-4-5-20251001 "Hello" # Test with fastest model - ``` - -2. Start the server: - - **Development mode (recommended - auto-reloads on changes):** - ```bash - poetry run uvicorn src.main:app --reload --port 8000 - ``` - - **Production mode:** - ```bash - poetry run python main.py - ``` - - **Port Options for production mode:** - - Default: Uses port 8000 (or PORT from .env) - - If port is in use, automatically finds next available port - - Specify custom port: `poetry run python main.py 9000` - - Set in environment: `PORT=9000 poetry run python main.py` - ## Docker -Build and run the wrapper in a Docker container. - -### Build - -```bash -docker build -t claude-wrapper:latest . -``` - -### Run +Pre-built image on Docker Hub: `ttlequals0/claude-code-openai-wrapper`. 
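+
+Each image carries the `/app/BUILD_INFO` stamp written at build time (see the
+Dockerfile). Assuming that stamp step succeeded for the tag you pulled, a
+one-shot way to check which SDK an image ships:
+
+```bash
+docker run --rm ttlequals0/claude-code-openai-wrapper:latest cat /app/BUILD_INFO
+```
+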
-**Production:** ```bash +# Pull and run docker run -d -p 8000:8000 \ -v ~/.claude:/root/.claude \ --name claude-wrapper \ - claude-wrapper:latest -``` + ttlequals0/claude-code-openai-wrapper:latest -**With custom workspace:** -```bash +# Pin to a specific version docker run -d -p 8000:8000 \ -v ~/.claude:/root/.claude \ - -v /path/to/project:/workspace \ - -e CLAUDE_CWD=/workspace \ - claude-wrapper:latest -``` + --name claude-wrapper \ + ttlequals0/claude-code-openai-wrapper:2.9.6 -**Development (hot reload):** -```bash -docker run -d -p 8000:8000 \ - -v ~/.claude:/root/.claude \ - -v $(pwd):/app \ - claude-wrapper:latest \ - poetry run uvicorn src.main:app --host 0.0.0.0 --port 8000 --reload +# Or build locally (prod stage is the default target) +docker build --platform linux/amd64 -t claude-wrapper:local . ``` -### Docker Compose +Docker Compose (matches `docker-compose.yml` in the repo): ```yaml version: '3.8' services: claude-wrapper: - build: . + image: ttlequals0/claude-code-openai-wrapper:latest + pull_policy: always # redeploy webhooks re-pull :latest + build: + context: . + target: prod + container_name: claude-wrapper ports: - "8000:8000" volumes: - ~/.claude:/root/.claude environment: - PORT=8000 - - MAX_TIMEOUT=600 + - MAX_TIMEOUT=600000 restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 10s ``` -Run: `docker-compose up -d` | Stop: `docker-compose down` +### Environment variables -### Environment Variables +Listed in roughly the order you will reach for them. | Variable | Description | Default | |----------|-------------|---------| | `PORT` | Server port | `8000` | -| `MAX_TIMEOUT` | Request timeout (seconds) | `300` | -| `CLAUDE_CWD` | Working directory | temp dir | -| `CLAUDE_AUTH_METHOD` | Auth method: `cli`, `api_key`, `bedrock`, `vertex` | auto-detect | -| `ANTHROPIC_API_KEY` | Direct API key | - | -| `API_KEYS` | Comma-separated client API keys | - | - -### Management - -```bash -docker logs -f claude-wrapper # View logs -docker stop claude-wrapper # Stop -docker start claude-wrapper # Start -docker rm claude-wrapper # Remove -``` - -### Test - -```bash -curl http://localhost:8000/health -curl http://localhost:8000/v1/models -``` +| `CLAUDE_WRAPPER_HOST` | Bind address (`127.0.0.1` for local-only, `0.0.0.0` for all) | `0.0.0.0` | +| `MAX_TIMEOUT` | Per-request timeout (ms) | `600000` (10 min) | +| `MAX_REQUEST_SIZE` | Max request body size (bytes) | `10485760` (10 MB) | +| `CLAUDE_CWD` | Working directory Claude Code runs in | isolated temp dir | +| `CLAUDE_AUTH_METHOD` | `cli`, `api_key`, `bedrock`, `vertex` | auto-detect | +| `API_KEY` | Require this key on every request; prompts at startup if unset | interactive prompt | +| `ANTHROPIC_API_KEY` | Direct API key (for `api_key` auth). Optional — also unlocks live `/v1/models` discovery and dynamic latest-Sonnet default. | - | +| `CLAUDE_CODE_USE_BEDROCK` | Enable AWS Bedrock backend | `false` | +| `AWS_REGION` / `AWS_DEFAULT_REGION` / `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` | Bedrock credentials | - | +| `CLAUDE_CODE_USE_VERTEX` | Enable Google Vertex AI backend | `false` | +| `ANTHROPIC_VERTEX_PROJECT_ID` / `CLOUD_ML_REGION` / `GOOGLE_APPLICATION_CREDENTIALS` | Vertex credentials | - | +| `DEFAULT_MODEL` | Default model id when request omits one. When unset and `ANTHROPIC_API_KEY` is configured, the wrapper resolves the latest Sonnet at startup; otherwise falls back to `claude-sonnet-4-6`. 
| auto | +| `FAST_MODEL` | Speed/cost-optimized model alias used internally. | `claude-haiku-4-5-20251001` | +| `CLAUDE_MODELS_OVERRIDE` | Comma-separated model IDs to advertise via `/v1/models`. Takes precedence over both live and static lists. | - | +| `MODEL_LIST_CACHE_TTL_SECONDS` | Cache TTL for live `/v1/models` results. | `3600` | +| `MODEL_LIST_ERROR_TTL_SECONDS` | Short cache TTL applied when the live fetch fails so transient outages don't suppress live discovery for the full hour. | `60` | +| `MODEL_LIST_REQUEST_TIMEOUT_SECONDS` | HTTP timeout for the live model fetch (seconds). | `5` | +| `ANTHROPIC_MODELS_URL` | Override the live models endpoint. Point at a proxy or staging URL during testing. | `https://api.anthropic.com/v1/models` | +| `ANTHROPIC_VERSION` | `anthropic-version` header sent to the Models API. | `2023-06-01` | +| `ANTHROPIC_BETA` / `ANTHROPIC_BETA_HEADER` | Optional `anthropic-beta` header forwarded to the Models API for beta-gated features. | - | +| `CLI_AUTH_PROBE_INTERVAL_SECONDS` | Background CLI-auth probe cadence when `CLAUDE_AUTH_METHOD=claude_cli`. Each probe is a 1-turn `query` (~$0.001 at Sonnet pricing); a failure flips `cli_health.ok` so `/v1/chat/completions` and `/v1/messages` return 401 instead of letting the SDK fail loudly. Set `0` to disable. Ignored for non-cli auth methods. | `600` (10 min) | +| `DEBUG_MODE` | Enable debug logging and unlock `/v1/debug/request` | `false` | +| `VERBOSE` | Same unlock effect on `/v1/debug/request` | `false` | +| `CORS_ORIGINS` | Allowed CORS origins (JSON array) | `["*"]` | +| `REQUEST_CACHE_ENABLED` | Enable request-dedup cache | `false` | +| `REQUEST_CACHE_TTL_SECONDS` | Cache entry TTL | service-managed | +| `REQUEST_CACHE_MAX_SIZE` | Max cached entries | service-managed | +| `WRAPPER_DEFAULT_MAX_TURNS` | Default `max_turns` when caller does not enable tools | `3` | +| `WRAPPER_MAP_MAX_TOKENS_TO_THINKING` | Map OpenAI `max_tokens` to Claude `max_thinking_tokens` (legacy) | `false` | +| `WATCHDOG_ENABLED` | Enable CPU watchdog (for Docker) | `true` | +| `WATCHDOG_CPU_THRESHOLD` / `WATCHDOG_INTERVAL` / `WATCHDOG_STRIKES` | Watchdog tuning | see `src/cpu_watchdog.py` | +| `UVICORN_WORKERS` | Worker count for the prod image | `2` | +| `RATE_LIMIT_ENABLED` / `RATE_LIMIT_*_PER_MINUTE` | See rate-limit section above | - | ## Usage Examples -### Using curl +### curl ```bash -# Basic chat completion (no auth) curl -X POST http://localhost:8000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "claude-sonnet-4-5-20250929", + "model": "claude-sonnet-4-6", "messages": [ {"role": "user", "content": "What is 2 + 2?"} ] @@ -404,370 +223,259 @@ curl -X POST http://localhost:8000/v1/chat/completions \ -H "Content-Type: application/json" \ -H "Authorization: Bearer your-generated-api-key" \ -d '{ - "model": "claude-sonnet-4-5-20250929", + "model": "claude-sonnet-4-6", "messages": [ {"role": "user", "content": "Write a Python hello world script"} ], "stream": true }' + ``` -### Using OpenAI Python SDK +### OpenAI Python SDK ```python from openai import OpenAI -# Configure client (automatically detects auth requirements) client = OpenAI( base_url="http://localhost:8000/v1", - api_key="your-api-key-if-required" # Only needed if protection enabled + api_key="your-api-key-if-required" ) -# Alternative: Let examples auto-detect authentication -# The wrapper's example files automatically check server auth status - -# Basic chat completion +# Basic completion response = client.chat.completions.create( - 
model="claude-sonnet-4-5-20250929", + model="claude-sonnet-4-6", messages=[ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "What files are in the current directory?"} ] ) - print(response.choices[0].message.content) -# Output: Fast response without tool usage (default behaviour) -# Enable tools when you need them (e.g., to read files) +# With tools enabled response = client.chat.completions.create( - model="claude-sonnet-4-5-20250929", + model="claude-sonnet-4-6", messages=[ {"role": "user", "content": "What files are in the current directory?"} ], - extra_body={"enable_tools": True} # Enable tools for file access + extra_body={"enable_tools": True} ) -print(response.choices[0].message.content) -# Output: Claude will actually read your directory and list the files! - -# Check real costs and tokens -print(f"Cost: ${response.usage.total_tokens * 0.000003:.6f}") # Real cost tracking -print(f"Tokens: {response.usage.total_tokens} ({response.usage.prompt_tokens} + {response.usage.completion_tokens})") # Streaming stream = client.chat.completions.create( - model="claude-sonnet-4-5-20250929", - messages=[ - {"role": "user", "content": "Explain quantum computing"} - ], + model="claude-sonnet-4-6", + messages=[{"role": "user", "content": "Explain quantum computing"}], stream=True ) - for chunk in stream: if chunk.choices[0].delta.content: print(chunk.choices[0].delta.content, end="") ``` -## Supported Models - -All Claude models through November 2025 are supported: +### Claude-specific headers -### Claude 4.5 Family (Latest - Fall 2025) -- **`claude-opus-4-5-20250929`** 🎯 Most Capable - Latest Opus with enhanced reasoning and capabilities -- **`claude-sonnet-4-5-20250929`** ⭐ Recommended - Best coding model, superior reasoning and math -- **`claude-haiku-4-5-20251001`** ⚡ Fast & Cheap - Similar performance to Sonnet 4 at 1/3 cost +Claude-specific options via HTTP headers: -### Claude 4.1 & 4.0 Family -- **`claude-opus-4-1-20250805`** - Upgraded Opus 4 with improved agentic tasks and reasoning -- `claude-opus-4-20250514` - Original Opus 4 with extended thinking mode -- `claude-sonnet-4-20250514` - Original Sonnet 4 with hybrid reasoning +| Header | Values | Description | +|--------|--------|-------------| +| `X-Claude-Max-Turns` | integer | Max conversation turns | +| `X-Claude-Allowed-Tools` | comma-separated | Tools to allow | +| `X-Claude-Permission-Mode` | `default`, `acceptEdits`, `bypassPermissions`, `plan` | Permission mode | +| `X-Claude-Effort` | `low`, `medium`, `high`, `max` | Model effort level | +| `X-Claude-Thinking` | `adaptive`, `enabled`, `disabled` | Extended thinking mode | +| `X-Claude-Max-Thinking-Tokens` | integer | Thinking token budget | +| `X-Enable-Cache` | `true` / `1` / `yes` | Opt in to response cache on this request | -### Claude 3.x Family -- `claude-3-7-sonnet-20250219` - Hybrid model with rapid/thoughtful response modes -- `claude-3-5-sonnet-20241022` - Previous generation Sonnet -- `claude-3-5-haiku-20241022` - Previous generation fast model +## Supported Models -**Note:** The model parameter is passed to Claude Code via the SDK's model selection. +Model IDs, context windows, and pricing are sourced from the Anthropic models docs (`platform.claude.com/docs/en/about-claude/models/overview`) and mirrored in `src/constants.py`. 
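+
+To see which catalogue a running instance actually serves, here is a minimal
+sketch using the OpenAI client pointed at the wrapper (the `api_key` value
+only matters when client API key protection is enabled):
+
+```python
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:8000/v1",
+                api_key="your-api-key-if-required")
+
+# Prints the advertised model list: live (Models API), static fallback,
+# or whatever CLAUDE_MODELS_OVERRIDE pins.
+for model in client.models.list():
+    print(model.id)
+```
+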
-## Session Continuity 🆕 +With `ANTHROPIC_API_KEY` set, `/v1/models` returns Anthropic's live catalogue (cached for `MODEL_LIST_CACHE_TTL_SECONDS`, default 1 hour) and the wrapper picks the latest Sonnet as `DEFAULT_MODEL` at startup. Without it (Bedrock, Vertex, or Claude CLI auth), the static list below is served and `claude-sonnet-4-6` is the fallback. `CLAUDE_MODELS_OVERRIDE=a,b,c` pins the list regardless of auth. -The wrapper now supports **session continuity**, allowing you to maintain conversation context across multiple requests. This is a powerful feature that goes beyond the standard OpenAI API. +### Latest +| Model | Context | Max Output | Input $/MTok | Output $/MTok | +|-------|---------|-----------|-------------|--------------| +| `claude-opus-4-7` | 1M | 128K | $5 | $25 | +| `claude-sonnet-4-6` (default) | 1M | 64K | $3 | $15 | +| `claude-haiku-4-5-20251001` | 200K | 64K | $1 | $5 | -### How It Works +### Legacy (active, consider migrating) +| Model | Context | Max Output | Input $/MTok | Output $/MTok | +|-------|---------|-----------|-------------|--------------| +| `claude-opus-4-6` | 1M | 128K | $5 | $25 | +| `claude-opus-4-5-20251101` | 200K | 64K | $5 | $25 | +| `claude-opus-4-1-20250805` | 200K | 32K | $15 | $75 | +| `claude-sonnet-4-5-20250929` | 200K | 64K | $3 | $15 | -- **Stateless Mode** (default): Each request is independent, just like the standard OpenAI API -- **Session Mode**: Include a `session_id` to maintain conversation history across requests +### Deprecated (retires 2026-06-15) +| Model | Context | Max Output | Input $/MTok | Output $/MTok | Replacement | +|-------|---------|-----------|-------------|--------------|-------------| +| `claude-sonnet-4-20250514` | 200K | 64K | $3 | $15 | `claude-sonnet-4-6` | +| `claude-opus-4-20250514` | 200K | 32K | $15 | $75 | `claude-opus-4-7` | -### Using Sessions with OpenAI SDK +**Note:** Claude 3.x models are not supported by the Claude Agent SDK. -```python -import openai +## Session Continuity -client = openai.OpenAI( - base_url="http://localhost:8000/v1", - api_key="not-needed" -) +Pass a `session_id` to keep conversation context across requests: -# Start a conversation with session continuity +```python +# Start a conversation response1 = client.chat.completions.create( - model="claude-sonnet-4-5-20250929", - messages=[ - {"role": "user", "content": "Hello! My name is Alice and I'm learning Python."} - ], - extra_body={"session_id": "my-learning-session"} + model="claude-sonnet-4-6", + messages=[{"role": "user", "content": "My name is Alice."}], + extra_body={"session_id": "my-session"} ) -# Continue the conversation - Claude remembers the context +# Continue it - Claude remembers the context response2 = client.chat.completions.create( - model="claude-sonnet-4-5-20250929", - messages=[ - {"role": "user", "content": "What's my name and what am I learning?"} - ], - extra_body={"session_id": "my-learning-session"} # Same session ID + model="claude-sonnet-4-6", + messages=[{"role": "user", "content": "What's my name?"}], + extra_body={"session_id": "my-session"} ) -# Claude will remember: "Your name is Alice and you're learning Python." 
-``` - -### Using Sessions with curl - -```bash -# First message (add -H "Authorization: Bearer your-key" if auth enabled) -curl -X POST http://localhost:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "claude-sonnet-4-5-20250929", - "messages": [{"role": "user", "content": "My favourite color is blue."}], - "session_id": "my-session" - }' - -# Follow-up message - context is maintained -curl -X POST http://localhost:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "claude-sonnet-4-5-20250929", - "messages": [{"role": "user", "content": "What's my favourite color?"}], - "session_id": "my-session" - }' ``` -### Session Management +Sessions expire after 1 hour of inactivity. Management endpoints: +- `GET /v1/sessions` - list active sessions +- `GET /v1/sessions/{id}` - session details +- `DELETE /v1/sessions/{id}` - delete session +- `GET /v1/sessions/stats` - session statistics -The wrapper provides endpoints to manage active sessions: +See `examples/session_continuity.py` for Python and curl examples. -- `GET /v1/sessions` - List all active sessions -- `GET /v1/sessions/{session_id}` - Get session details -- `DELETE /v1/sessions/{session_id}` - Delete a session -- `GET /v1/sessions/stats` - Get session statistics +## API Endpoints -```bash -# List active sessions -curl http://localhost:8000/v1/sessions +### Core API +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/` | GET | Landing page with API explorer | +| `/v1/chat/completions` | POST | OpenAI-compatible chat | +| `/v1/messages` | POST | Anthropic-compatible messages | + +### Models +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/v1/models` | GET | List available models | +| `/v1/models/status` | GET | Model service status | +| `/v1/models/refresh` | POST | Refresh model catalogue | + +### Sessions +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/v1/sessions` | GET | List active sessions | +| `/v1/sessions/stats` | GET | Session statistics | +| `/v1/sessions/{id}` | GET | Get session by ID | +| `/v1/sessions/{id}` | DELETE | Delete session | + +### Tools +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/v1/tools` | GET | List available tools | +| `/v1/tools/config` | GET | Get tool configuration | +| `/v1/tools/config` | POST | Update tool configuration | +| `/v1/tools/stats` | GET | Tool usage statistics | + +### MCP Servers +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/v1/mcp/servers` | GET | List MCP servers | +| `/v1/mcp/servers` | POST | Register MCP server | +| `/v1/mcp/connect` | POST | Connect to MCP server | +| `/v1/mcp/disconnect` | POST | Disconnect MCP server | +| `/v1/mcp/stats` | GET | MCP statistics | + +### Cache / Auth / System +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/v1/cache/stats` | GET | Cache statistics | +| `/v1/cache/clear` | POST | Clear request cache | +| `/v1/auth/status` | GET | Auth status | +| `/v1/compatibility` | POST | Parameter compatibility check | +| `/v1/debug/request` | POST | Request debugging; **emits only `{"enabled": false}` unless `DEBUG_MODE` or `VERBOSE` is set** | +| `/health` | GET | Liveness probe (no upstream call) | +| `/healthz/deep` | GET | Deep readiness probe (performs an SDK round-trip) | +| `/version` | GET | Wrapper version | + +## Function Calling + +Pass OpenAI-format tool definitions. 
The wrapper injects them into Claude's system prompt and parses structured responses back into `tool_calls` format. -# Get session details -curl http://localhost:8000/v1/sessions/my-session +```python +response = client.chat.completions.create( + model="claude-sonnet-4-6", + messages=[{"role": "user", "content": "What's the weather in NYC?"}], + tools=[{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current weather for a location", + "parameters": { + "type": "object", + "properties": {"location": {"type": "string"}}, + "required": ["location"], + }, + }, + }], + tool_choice="auto", +) -# Delete a session -curl -X DELETE http://localhost:8000/v1/sessions/my-session +# Response includes tool_calls when Claude decides to call a function +if response.choices[0].finish_reason == "tool_calls": + for tc in response.choices[0].message.tool_calls: + print(f"Call: {tc.function.name}({tc.function.arguments})") ``` -### Session Features - -- **Automatic Expiration**: Sessions expire after 1 hour of inactivity -- **Streaming Support**: Session continuity works with both streaming and non-streaming requests -- **Memory Persistence**: Full conversation history is maintained within the session -- **Efficient Storage**: Only active sessions are kept in memory - -### Examples - -See `examples/session_continuity.py` for comprehensive Python examples and `examples/session_curl_example.sh` for curl examples. - -## API Endpoints - -### Core Endpoints -- `GET /` - Interactive landing page with API explorer -- `POST /v1/chat/completions` - OpenAI-compatible chat completions (supports `session_id`) -- `POST /v1/messages` - Anthropic-compatible messages endpoint -- `GET /v1/models` - List available models -- `GET /v1/auth/status` - Check authentication status and configuration -- `GET /version` - Get API version -- `GET /health` - Health check endpoint - -### Session Management Endpoints 🆕 -- `GET /v1/sessions` - List all active sessions -- `GET /v1/sessions/{session_id}` - Get detailed session information -- `DELETE /v1/sessions/{session_id}` - Delete a specific session -- `GET /v1/sessions/stats` - Get session manager statistics - -## Limitations & Roadmap - -### 🚫 **Current Limitations** -- **Images in messages** are converted to text placeholders -- **Function calling** not supported (tools work automatically based on prompts) -- **OpenAI parameters** not yet mapped: `temperature`, `top_p`, `max_tokens`, `logit_bias`, `presence_penalty`, `frequency_penalty` -- **Multiple responses** (`n > 1`) not supported - -### 🛣 **Planned Enhancements** -- [ ] **Tool configuration** - allowed/disallowed tools endpoints -- [ ] **OpenAI parameter mapping** - temperature, top_p, max_tokens support -- [ ] **Enhanced streaming** - better chunk handling -- [ ] **MCP integration** - Model Context Protocol server support - -### ✅ **Recent Improvements (v2.2.0)** -- **Interactive Landing Page**: API explorer with live endpoint testing -- **Anthropic Messages API**: Native `/v1/messages` endpoint -- **Explicit Auth Selection**: `CLAUDE_AUTH_METHOD` env var -- **Tool Execution Fix**: `enable_tools: true` now works correctly - -### ✅ **v2.0.0 - v2.1.0 Features** -- Claude Agent SDK v0.1.18 with bundled CLI -- Multi-provider auth (CLI, API key, Bedrock, Vertex AI) -- Session continuity and management -- Real-time cost and token tracking -- System prompt support - -## Troubleshooting - -1. 
**Claude CLI not found**: - ```bash - # Check Claude is in PATH - which claude - # Update CLAUDE_CLI_PATH in .env if needed - ``` +Supports `tool_choice`: `"auto"` (default), `"required"`, `"none"`, or `{"type": "function", "function": {"name": "..."}}`. -2. **Authentication errors**: - ```bash - # Test authentication with fastest model - claude --print --model claude-haiku-4-5-20251001 "Hello" - # If this fails, re-authenticate if needed - ``` +Multi-turn tool conversations work - pass assistant messages with `tool_calls` and `tool` role result messages back. The wrapper converts them to text for Claude. -3. **Timeout errors**: - - Increase `MAX_TIMEOUT` in `.env` - - Note: Claude Code can take time for complex requests +## JSON Response Mode -## Testing +Set `response_format` to get JSON back: -### 🧪 **Quick Test Suite** -Test all endpoints with a simple script: -```bash -# Make sure server is running first -poetry run python test_endpoints.py +```python +response = client.chat.completions.create( + model="claude-sonnet-4-6", + messages=[{"role": "user", "content": "List 3 colors with hex codes"}], + response_format={"type": "json_object"} +) ``` -### 📝 **Basic Test Suite** -Run the comprehensive test suite: -```bash -# Make sure server is running first -poetry run python test_basic.py +With `json_object` mode, the wrapper adds system prompt instructions for JSON output, strips preambles like "Here is the JSON:", and uses brace-matching extraction as a fallback. Works streaming and non-streaming. JSON schema is also accepted via `response_format={"type": "json_schema", "json_schema": {...}}`. -# With API key protection enabled, set TEST_API_KEY: -TEST_API_KEY=your-generated-key poetry run python test_basic.py -``` +## Limitations -The test suite automatically detects whether API key protection is enabled and provides helpful guidance for providing the necessary authentication. +- Images in messages are converted to text placeholders. +- `temperature` and `top_p` are applied via system-prompt instructions (best-effort approximation, not native SDK parameters). +- `presence_penalty` and `frequency_penalty` are accepted but ignored. +- Multiple responses (`n > 1`) are not supported. -### 🔍 **Authentication Test** -Check authentication status: -```bash -curl http://localhost:8000/v1/auth/status | python -m json.tool -``` +## Testing -### ⚙️ **Development Tools** ```bash -# Install development dependencies -poetry install --with dev - -# Format code -poetry run black . - -# Run full tests (when implemented) +# Run the full test suite (673 tests, ~3 s on a laptop) poetry run pytest tests/ -``` - -### ✅ **Expected Results** -All tests should show: -- **4/4 endpoint tests passing** -- **4/4 basic tests passing** -- **Authentication method detected** (claude_cli, anthropic, bedrock, or vertex) -- **Real cost tracking** (e.g., $0.001-0.005 per test call) -- **Accurate token counts** from SDK metadata -## Terms Compliance - -This wrapper is designed to be compliant with [Anthropic's Terms of Service](https://www.anthropic.com/legal). - -### Requirements for Users - -> **Important:** You must have your own valid Claude subscription or API access to use this wrapper. 
- -- **Claude Pro or Max subscription** - For CLI authentication (`claude auth login`) -- **Anthropic API key** - Available at [platform.claude.com](https://platform.claude.com) -- **AWS Bedrock or Google Vertex AI** - For enterprise cloud authentication - -This wrapper does not provide Claude access - it provides an OpenAI-compatible interface to Claude services you already have access to. - -### How This Wrapper Works - -- **Uses the official Claude Agent SDK** - The same SDK Anthropic provides for developers -- **Each user authenticates individually** - No credential sharing or pooling -- **Format translation only** - Converts OpenAI-format requests to Claude SDK calls -- **No reselling** - Users access Claude through their own subscriptions/API keys - -### Personal vs Commercial Use - -| Use Case | Recommended Authentication | Notes | -|----------|---------------------------|-------| -| Personal projects | CLI Auth (Pro/Max) or API Key | Acceptable at moderate scale | -| Business/Commercial | API Key, Bedrock, or Vertex AI | Use [platform.claude.com](https://platform.claude.com) | -| High-scale applications | Bedrock or Vertex AI | Enterprise authentication recommended | - -**Note on Consumer Plans:** Claude Pro and Max subscriptions are primarily designed for individual, interactive use. Using them through wrappers or automated implementations is acceptable for personal projects at moderate scale. For business use or applications that scale significantly, Anthropic's commercial API offerings at [platform.claude.com](https://platform.claude.com) are more appropriate. - -### Authentication Methods - -| Method | Terms | Compliance | -|--------|-------|------------| -| `ANTHROPIC_API_KEY` | Commercial Terms | Explicitly allowed for programmatic access | -| AWS Bedrock | Commercial Terms | Explicitly allowed for programmatic access | -| Google Vertex AI | Commercial Terms | Explicitly allowed for programmatic access | -| CLI Auth (Pro/Max) | Consumer Terms | Uses official SDK with official auth methods | - -### CLI Authentication Note - -Using CLI auth (`claude auth login`) with this wrapper is functionally equivalent to using Claude Code directly - both use the Claude Agent SDK with your personal subscription. Anthropic provides the SDK with CLI auth support, and this wrapper simply provides an alternative interface format. - -### What This Wrapper Does NOT Do - -- Does not share or pool credentials between users -- Does not include or expose API keys or credentials -- Does not resell API access -- Does not train competing AI models -- Does not scrape or harvest data -- Does not bypass authentication or rate limits - -### User Responsibilities - -By using this wrapper, you agree to: -- Comply with [Anthropic's Terms of Service](https://www.anthropic.com/legal/consumer-terms) -- Comply with [Anthropic's Usage Policy](https://www.anthropic.com/legal/aup) -- Use your own valid Claude subscription or API access -- Not share your credentials with others -- Use commercial API access for business applications +# Quick endpoint test (server must be running) +poetry run python tests/test_endpoints.py +``` -### Disclaimer +## Terms -This is an independent open-source project, not affiliated with or endorsed by Anthropic. Users are responsible for ensuring their own usage complies with Anthropic's terms. Anthropic reserves the right to modify their Terms of Service at any time. +You need your own Claude subscription or API access. 
This wrapper translates request formats - it does not provide Claude access. -When in doubt, use `ANTHROPIC_API_KEY` authentication which is explicitly permitted for programmatic access under the Commercial Terms. +| Use Case | Recommended Auth | +|----------|------------------| +| Personal projects | CLI Auth or API Key | +| Business / commercial | API Key, Bedrock, or Vertex AI | +| High-scale | Bedrock or Vertex AI | -For Anthropic's official terms, see: -- [Usage Policy](https://www.anthropic.com/legal/aup) -- [Consumer Terms](https://www.anthropic.com/legal/consumer-terms) -- [Commercial Terms](https://www.anthropic.com/legal/commercial-terms) +See [Anthropic's Terms of Service](https://www.anthropic.com/legal). -## Licence +## License -MIT Licence +MIT ## Contributing -Contributions are welcome! Please open an issue or submit a pull request. +PRs welcome. diff --git a/docker-compose.yml b/docker-compose.yml index 6d0d141..bc801eb 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,16 +1,54 @@ -version: '3' +version: '3.8' services: claude-wrapper: - build: . + image: ttlequals0/claude-code-openai-wrapper:latest + # Always pull a fresh :latest on stack redeploy so the Portainer + # webhook actually rolls forward. Without this, docker compose up + # reuses the locally cached :latest layer and the new image sits on + # Docker Hub unused. + pull_policy: always + # Build from local Dockerfile targeting the prod stage. Dev iteration + # should override with `target: dev` (or use `docker build --target dev`). + build: + context: . + target: prod + container_name: claude-wrapper ports: - "8000:8000" volumes: + # Mount Claude CLI credentials - ~/.claude:/root/.claude # Optional: Mount a specific workspace directory - # Uncomment and modify the line below to use a custom workspace # - ./workspace:/workspace environment: - PORT=8000 + - MAX_TIMEOUT=600000 + # Worker count for the prod image. Default 2. + # - UVICORN_WORKERS=2 + # Default max_turns when the caller does not enable_tools. Raising from + # the legacy 1 to 3 prevents error_max_turns on short prompts where the + # agent engages extended thinking and needs a second turn to emit the + # final message. + # - WRAPPER_DEFAULT_MAX_TURNS=3 + # Opt-in: map OpenAI max_tokens to Claude max_thinking_tokens (legacy + # behavior; now off by default because it's a semantic mismatch). + # - WRAPPER_MAP_MAX_TOKENS_TO_THINKING=false + # Authentication (choose one method): + # Option 1: Direct API key (recommended) + # - ANTHROPIC_API_KEY=your-api-key + # Option 2: Explicit auth method selection + # - CLAUDE_AUTH_METHOD=cli # Options: cli, api_key, bedrock, vertex # Optional: Set Claude's working directory (defaults to isolated temp dir) - # Uncomment and modify the line below to set a custom working directory # - CLAUDE_CWD=/workspace + # Optional: Enable debug logging + # - DEBUG_MODE=true + # Optional: Rate limiting configuration + # - RATE_LIMIT_ENABLED=true + # - RATE_LIMIT_CHAT_PER_MINUTE=10 + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 10s diff --git a/docs/MIGRATION_STATUS.md b/docs/MIGRATION_STATUS.md deleted file mode 100644 index bdb586a..0000000 --- a/docs/MIGRATION_STATUS.md +++ /dev/null @@ -1,132 +0,0 @@ -# Claude Agent SDK Migration Status - -**Date:** 2025-11-02 -**Status:** ✅ **MIGRATION COMPLETE** (Testing limited by environment) - -## ✅ Completed - -1. 
**Dependency Updates** - - ✅ Updated `pyproject.toml` from `claude-code-sdk ^0.0.14` to `claude-agent-sdk ^0.1.6` - - ✅ Updated version to 2.0.0 - - ✅ Successfully ran `poetry lock` and `poetry install` - - ✅ Verified claude-agent-sdk 0.1.6 installation - -2. **Code Updates** - - ✅ Updated imports: `claude_code_sdk` → `claude_agent_sdk` - - ✅ Renamed `ClaudeCodeOptions` → `ClaudeAgentOptions` throughout codebase - - ✅ Updated all SDK references in log messages and comments - - ✅ Fixed f-string syntax error in `main.py` line 149 - - ✅ Updated compatibility endpoint response field names - -3. **Files Modified** - - ✅ `pyproject.toml` - Dependencies and version - - ✅ `claude_cli.py` - Imports, options class, logging - - ✅ `main.py` - SDK references, syntax fix - -4. **Basic Testing** - - ✅ SDK imports successfully (`from claude_agent_sdk import query, ClaudeAgentOptions, Message`) - - ✅ Server starts without import errors - - ✅ Health endpoint works (`/health`) - - ✅ Models endpoint works (`/v1/models`) - - ✅ Auth status endpoint works (`/v1/auth/status`) - -## ⚠️ Environment-Specific Issue (Not a Migration Problem) - -### Issue: SDK Query Hangs During Testing - -**Root Cause Identified:** -The testing environment is **INSIDE Claude Code's own container** (`CLAUDE_CODE_REMOTE=true`), which creates a recursive situation when trying to use the Claude Code SDK from within Claude Code itself. - -**Environment Details:** -``` -CLAUDE_CODE_VERSION=2.0.25 -CLAUDE_CODE_REMOTE=true -CLAUDE_CODE_ENTRYPOINT=remote -CLAUDE_CODE_CONTAINER_ID=container_011CUjNxa7A9jwwXtRTAocKf... -``` - -**Why This Happens:** -- The wrapper is designed to run in a **normal environment** (user's machine, VPS, Docker container) -- It then calls Claude Code CLI as an external tool -- Testing from within Claude Code itself creates recursion/nesting issues -- This is NOT a problem with the migration code itself - -**Expected Behavior in Production:** -The wrapper is designed to be deployed to: -- ✅ User's local machine (macOS, Linux, Windows) -- ✅ Docker container (standalone) -- ✅ VPS/cloud server (AWS, GCP, DigitalOcean, etc.) -- ✅ Any standard Python environment with Claude Code CLI installed - -**Current Workaround for Testing:** -- Disabled SDK verification during startup to allow server to start -- Basic endpoints (health, models, auth) work fine -- Chat completions cannot be fully tested in this environment - -## ✅ Migration Assessment - -**The migration is COMPLETE and CORRECT.** - -All code changes have been successfully implemented: -- Dependencies updated -- Imports changed -- Class names renamed -- Syntax errors fixed -- References updated - -**The hanging issue is environmental, not a code problem.** - -When deployed to a proper environment (not inside Claude Code), the wrapper will work as expected with the new Claude Agent SDK v0.1.6. - -## 📋 Deployment Checklist - -For users deploying the migrated wrapper: - -### Prerequisites -1. ✅ Python 3.10+ -2. ✅ Node.js installed -3. ✅ Claude Code 2.0.0+ installed: `npm install -g @anthropic-ai/claude-code` -4. 
✅ Authentication configured (API key, Bedrock, Vertex, or CLI auth) - -### Installation -```bash -git clone https://github.com/RichardAtCT/claude-code-openai-wrapper -cd claude-code-openai-wrapper -git checkout claude/research-api-updates-011CUjNxYatBANZZq6bssaxN -poetry install -poetry run uvicorn src.main:app --host 0.0.0.0 --port 8000 -``` - -### Verification -```bash -# Test endpoints -curl http://localhost:8000/health -curl http://localhost:8000/v1/models - -# Test chat completion -curl -X POST http://localhost:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "claude-3-5-haiku-20241022", - "messages": [{"role": "user", "content": "Hello!"}] - }' -``` - -## 📚 References - -- [Claude Agent SDK PyPI](https://pypi.org/project/claude-agent-sdk/) -- [Migration Guide](https://docs.claude.com/en/docs/claude-code/sdk/migration-guide) -- [UPGRADE_PLAN.md](./UPGRADE_PLAN.md) - Original migration plan -- [GitHub Issue #289](https://github.com/anthropics/claude-agent-sdk-python/issues/289) - System prompt defaults - -## 💡 Next Steps - -1. **For Maintainer:** Update README.md to reflect v2.0.0 and new SDK -2. **For Users:** Deploy to proper environment and test end-to-end -3. **Future Work:** Consider OpenAI API 2025 enhancements (Phase 2 of upgrade plan) - ---- - -**Last Updated:** 2025-11-02 17:52:00 UTC -**Updated By:** Claude (Migration Assistant) -**Status:** ✅ Migration Complete (Environmental testing limitations noted) diff --git a/docs/UPGRADE_PLAN.md b/docs/UPGRADE_PLAN.md deleted file mode 100644 index a2348ea..0000000 --- a/docs/UPGRADE_PLAN.md +++ /dev/null @@ -1,807 +0,0 @@ -# Claude Code OpenAI Wrapper - Upgrade Plan - -**Date:** 2025-11-02 -**Current Version:** claude-code-sdk 0.0.14 -**Target Version:** claude-agent-sdk 0.1.6 - -## Executive Summary - -This document outlines a comprehensive plan to upgrade the Claude Code OpenAI Wrapper to use the latest Claude Agent SDK (v0.1.6) and implement the latest OpenAI API standards as of 2025. The upgrade involves a critical SDK migration and implementation of new OpenAI API features. - ---- - -## 1. 
Claude Agent SDK Migration - -### 1.1 Current State Analysis - -**Current Implementation:** -- **SDK:** `claude-code-sdk` version 0.0.14 (deprecated) -- **Import:** `from claude_code_sdk import query, ClaudeCodeOptions, Message` -- **Main File:** `claude_cli.py` (lines 11, 114-131) -- **Usage Pattern:** Direct SDK `query()` function with `ClaudeCodeOptions` - -**Issues with Current Version:** -- The `claude-code-sdk` package is deprecated (last version 0.0.25) -- Missing latest features and improvements -- No longer maintained or supported -- Security and performance improvements not available - -### 1.2 Target State - -**Target SDK:** `claude-agent-sdk` version 0.1.6 -- **Released:** October 31, 2025 -- **Python Requirements:** Python >=3.10 -- **Additional Requirements:** - - Node.js - - Claude Code 2.0.0+ (`npm install -g @anthropic-ai/claude-code`) - -### 1.3 Breaking Changes & Migration Steps - -#### 1.3.1 Package Installation Changes - -**Before:** -```bash -pip install claude-code-sdk -``` - -**After:** -```bash -pip uninstall claude-code-sdk -pip install claude-agent-sdk -``` - -**pyproject.toml Update:** -```toml -# Before: -claude-code-sdk = "^0.0.14" - -# After: -claude-agent-sdk = "^0.1.6" -``` - -#### 1.3.2 Import Statement Changes - -**Before (claude_cli.py:11):** -```python -from claude_code_sdk import query, ClaudeCodeOptions, Message -``` - -**After:** -```python -from claude_agent_sdk import query, ClaudeAgentOptions, Message -``` - -#### 1.3.3 Options Class Rename - -**Breaking Change:** `ClaudeCodeOptions` → `ClaudeAgentOptions` - -**Files to Update:** -- `claude_cli.py` (lines 11, 63, 114) - -**Before:** -```python -options = ClaudeCodeOptions( - max_turns=max_turns, - cwd=self.cwd -) -``` - -**After:** -```python -options = ClaudeAgentOptions( - max_turns=max_turns, - cwd=self.cwd -) -``` - -#### 1.3.4 System Prompt Configuration Changes - -**Critical Breaking Change:** System prompt no longer defaults to Claude Code preset. - -**Current Implementation (claude_cli.py:124-125):** -```python -if system_prompt: - options.system_prompt = system_prompt -``` - -**New Implementation:** -```python -if system_prompt: - # New structured system prompt format - options.system_prompt = { - "type": "text", - "text": system_prompt - } -else: - # Restore Claude Code default behavior (RECOMMENDED) - options.system_prompt = { - "type": "preset", - "preset": "claude_code" - } -``` - -**Alternative Approaches:** -1. **Keep current behavior:** Set `type: "text"` with custom system prompts -2. **Use Claude Code preset:** Set `type: "preset", preset: "claude_code"` -3. **No system prompt:** Omit the field entirely for vanilla Claude behavior - -#### 1.3.5 Settings Sources Configuration - -**Breaking Change:** SDK no longer reads filesystem settings by default. - -**Current Behavior:** Automatically loads from: -- `CLAUDE.md` -- `settings.json` -- Slash commands -- User/project settings - -**New Behavior:** Must explicitly enable: -```python -options = ClaudeAgentOptions( - max_turns=max_turns, - cwd=self.cwd, - setting_sources=['user', 'project', 'local'] # Add if needed -) -``` - -**Recommendation:** Only add if the wrapper needs to load filesystem settings. - -#### 1.3.6 New Features Available - -The Claude Agent SDK provides several new capabilities: - -**1. 
In-Process MCP Servers (Custom Tools)** -```python -from claude_agent_sdk import tool, create_sdk_mcp_server - -@tool("custom_tool", "Description", {"arg": str}) -async def custom_tool(args): - return {"content": [{"type": "text", "text": "Result"}]} - -server = create_sdk_mcp_server( - name="wrapper-tools", - version="1.0.0", - tools=[custom_tool] -) -``` - -**Benefits:** -- No subprocess overhead -- Better performance than external MCP servers -- Easier debugging -- Simplified deployment - -**2. Hooks for Deterministic Processing** -```python -async def validate_tool(input_data, tool_use_id, context): - # Validate before execution - pass - -options = ClaudeAgentOptions( - hooks={ - "PreToolUse": [ - HookMatcher(matcher="Bash", hooks=[validate_tool]) - ] - } -) -``` - -**3. ClaudeSDKClient for Bidirectional Conversations** -```python -from claude_agent_sdk import ClaudeSDKClient - -async with ClaudeSDKClient(options=options) as client: - await client.query("Your prompt") - async for msg in client.receive_response(): - print(msg) -``` - -### 1.4 Migration Implementation Plan - -#### Phase 1: Dependency Update -- [ ] Update `pyproject.toml` with `claude-agent-sdk = "^0.1.6"` -- [ ] Remove `claude-code-sdk` from dependencies -- [ ] Run `poetry lock` and `poetry install` -- [ ] Verify installation: `poetry show claude-agent-sdk` - -#### Phase 2: Code Updates -- [ ] Update imports in `claude_cli.py` -- [ ] Rename `ClaudeCodeOptions` to `ClaudeAgentOptions` -- [ ] Update system prompt handling with new structured format -- [ ] Add Claude Code preset as default system prompt -- [ ] Review and update authentication flow (if needed) - -#### Phase 3: Testing -- [ ] Update verification tests in `verify_cli()` method -- [ ] Test all existing functionality: - - Basic completions - - Streaming responses - - Session continuity - - Tool usage (enable/disable) - - Authentication methods -- [ ] Run existing test suite: `test_endpoints.py`, `test_basic.py` -- [ ] Test with different authentication methods -- [ ] Verify Docker deployment still works - -#### Phase 4: Documentation Updates -- [ ] Update README.md with new SDK version -- [ ] Update installation instructions -- [ ] Document breaking changes for users -- [ ] Update Docker image with new dependencies -- [ ] Update example files if needed - ---- - -## 2. OpenAI API Standards Update (2025) - -### 2.1 Current OpenAI API Compliance Status - -**Currently Supported:** -- ✅ Chat completions endpoint (`/v1/chat/completions`) -- ✅ Basic streaming with `stream: true` -- ✅ Message roles (system, user, assistant) -- ✅ Model selection -- ✅ Session management (custom extension) - -**Currently Not Supported:** -- ❌ `temperature` parameter (0-2) -- ❌ `max_tokens` / `max_completion_tokens` parameter -- ❌ `top_p` parameter (nucleus sampling) -- ❌ `frequency_penalty` parameter -- ❌ `presence_penalty` parameter -- ❌ `logit_bias` parameter -- ❌ `n` parameter (multiple completions) -- ❌ `stop` sequences -- ❌ `stream_options` for usage data in streaming -- ❌ Image content in messages (currently converted to placeholders) -- ❌ Function calling / tools (OpenAI format) - -### 2.2 New OpenAI API Features (2025) - -#### 2.2.1 Max Tokens Evolution - -**Breaking Change:** `max_tokens` deprecated in favor of `max_completion_tokens` for certain models. 
-
-**2. Hooks for Deterministic Processing**
-```python
-from claude_agent_sdk import ClaudeAgentOptions, HookMatcher
-
-async def validate_tool(input_data, tool_use_id, context):
-    # Validate before execution; an empty result lets the tool call proceed
-    return {}
-
-options = ClaudeAgentOptions(
-    hooks={
-        "PreToolUse": [
-            HookMatcher(matcher="Bash", hooks=[validate_tool])
-        ]
-    }
-)
-```
-
-**3. ClaudeSDKClient for Bidirectional Conversations**
-```python
-from claude_agent_sdk import ClaudeSDKClient
-
-async with ClaudeSDKClient(options=options) as client:
-    await client.query("Your prompt")
-    async for msg in client.receive_response():
-        print(msg)
-```
-
-### 1.4 Migration Implementation Plan
-
-#### Phase 1: Dependency Update
-- [ ] Update `pyproject.toml` with `claude-agent-sdk = "^0.1.6"`
-- [ ] Remove `claude-code-sdk` from dependencies
-- [ ] Run `poetry lock` and `poetry install`
-- [ ] Verify installation: `poetry show claude-agent-sdk`
-
-#### Phase 2: Code Updates
-- [ ] Update imports in `claude_cli.py`
-- [ ] Rename `ClaudeCodeOptions` to `ClaudeAgentOptions`
-- [ ] Update system prompt handling (plain string or `claude_code` preset)
-- [ ] Add Claude Code preset as the default system prompt
-- [ ] Review and update authentication flow (if needed)
-
-#### Phase 3: Testing
-- [ ] Update verification tests in `verify_cli()` method
-- [ ] Test all existing functionality:
-  - Basic completions
-  - Streaming responses
-  - Session continuity
-  - Tool usage (enable/disable)
-  - Authentication methods
-- [ ] Run existing test suite: `test_endpoints.py`, `test_basic.py`
-- [ ] Test with different authentication methods
-- [ ] Verify Docker deployment still works
-
-#### Phase 4: Documentation Updates
-- [ ] Update README.md with new SDK version
-- [ ] Update installation instructions
-- [ ] Document breaking changes for users
-- [ ] Update Docker image with new dependencies
-- [ ] Update example files if needed
-
----
-
-## 2. OpenAI API Standards Update (2025)
-
-### 2.1 Current OpenAI API Compliance Status
-
-**Currently Supported:**
-- ✅ Chat completions endpoint (`/v1/chat/completions`)
-- ✅ Basic streaming with `stream: true`
-- ✅ Message roles (system, user, assistant)
-- ✅ Model selection
-- ✅ Session management (custom extension)
-
-**Currently Not Supported:**
-- ❌ `temperature` parameter (0-2)
-- ❌ `max_tokens` / `max_completion_tokens` parameter
-- ❌ `top_p` parameter (nucleus sampling)
-- ❌ `frequency_penalty` parameter
-- ❌ `presence_penalty` parameter
-- ❌ `logit_bias` parameter
-- ❌ `n` parameter (multiple completions)
-- ❌ `stop` sequences
-- ❌ `stream_options` for usage data in streaming
-- ❌ Image content in messages (currently converted to placeholders)
-- ❌ Function calling / tools (OpenAI format)
-
-### 2.2 New OpenAI API Features (2025)
-
-#### 2.2.1 Max Tokens Evolution
-
-**Breaking Change:** `max_tokens` is deprecated in favor of `max_completion_tokens` for certain models.
-
-**Current Parameter:** `max_tokens`
-**New Parameter:** `max_completion_tokens` (for o1-series models)
-
-**Reason:** Support for "hidden" reasoning tokens in reasoning models (o1-preview, o1-mini)
-
-**Implementation Strategy:**
-```python
-# In models.py ChatCompletionRequest
-max_tokens: Optional[int] = None  # Legacy support
-max_completion_tokens: Optional[int] = None  # New standard
-
-# Map to Claude options
-def to_claude_options(self):
-    options = {}
-    # Prefer max_completion_tokens if available
-    max_tok = self.max_completion_tokens or self.max_tokens
-    if max_tok:
-        options['max_thinking_tokens'] = max_tok  # Closest Claude knob; not an exact output cap
-    return options
-```
-
-#### 2.2.2 Stream Options Enhancement
-
-**New Feature:** `stream_options` parameter for usage data in streaming responses.
-
-**Current Implementation:** No usage data in streaming
-**New Implementation:**
-```python
-# Request:
-{
-    "stream": true,
-    "stream_options": {
-        "include_usage": true
-    }
-}
-
-# Response: Additional final chunk with usage data
-{
-    "id": "chatcmpl-...",
-    "usage": {
-        "prompt_tokens": 100,
-        "completion_tokens": 50,
-        "total_tokens": 150
-    }
-}
-```
-
-**Files to Update:**
-- `models.py`: Add `stream_options` field to `ChatCompletionRequest`
-- `main.py`: Update `generate_streaming_response()` to emit the usage chunk (sketched below)
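-
-A sketch of the final-chunk helper for `generate_streaming_response()` (the helper itself is hypothetical; the chunk shape follows the OpenAI streaming spec):
-
-```python
-import json
-import time
-
-def usage_chunk_sse(completion_id: str, model: str,
-                    prompt_tokens: int, completion_tokens: int) -> str:
-    # The extra final chunk carries an empty choices list plus the usage
-    # block; it is only sent when the client requested include_usage.
-    chunk = {
-        "id": completion_id,
-        "object": "chat.completion.chunk",
-        "created": int(time.time()),
-        "model": model,
-        "choices": [],
-        "usage": {
-            "prompt_tokens": prompt_tokens,
-            "completion_tokens": completion_tokens,
-            "total_tokens": prompt_tokens + completion_tokens,
-        },
-    }
-    return f"data: {json.dumps(chunk)}\n\n"
-```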
-
-#### 2.2.3 GPT-5 New Parameters (Optional)
-
-If targeting cutting-edge compatibility:
-
-**1. Verbosity Parameter:**
-```python
-verbosity: Optional[Literal["low", "medium", "high"]] = None
-# Controls response length/detail
-```
-
-**2. Reasoning Effort Parameter:**
-```python
-reasoning_effort: Optional[Literal["minimal", "low", "medium", "high"]] = None
-# For reasoning models - control depth of reasoning
-```
-
-**Note:** These are GPT-5 specific. Implementation is optional for the Claude wrapper.
-
-### 2.3 Priority Parameter Implementation
-
-Based on user demand and compatibility, prioritize:
-
-#### Priority 1 (High Impact):
-1. **`temperature`** - Most commonly used parameter
-2. **`max_tokens` / `max_completion_tokens`** - Essential for output control
-3. **`stream_options.include_usage`** - Better streaming experience
-
-#### Priority 2 (Medium Impact):
-4. **`top_p`** - Alternative to temperature
-5. **`stop`** - Stop sequences for generation control
-6. **`presence_penalty` / `frequency_penalty`** - Fine-tuning repetition
-
-#### Priority 3 (Low Impact):
-7. **`n`** - Multiple completions (complex to implement with Claude)
-8. **`logit_bias`** - Advanced use case
-9. **GPT-5 specific parameters** - Future-proofing
-
-### 2.4 Parameter Mapping Strategy
-
-**Challenge:** Map OpenAI parameters to Claude SDK parameters.
-
-**Temperature Mapping:**
-```python
-# OpenAI: 0-2 (default 1)
-# Claude: No direct equivalent in SDK
-
-# Options:
-# 1. Include in system prompt
-# 2. Use custom headers if SDK supports
-# 3. Document as unsupported with warning
-```
-
-**Max Tokens Mapping:**
-```python
-# OpenAI: max_tokens / max_completion_tokens
-# Claude: max_thinking_tokens (for extended thinking)
-
-# Map in to_claude_options():
-if self.max_completion_tokens or self.max_tokens:
-    options['max_thinking_tokens'] = self.max_completion_tokens or self.max_tokens
-```
-
-**Top-P Mapping:**
-```python
-# Similar to temperature - no direct Claude SDK equivalent
-# Could combine with temperature in a system prompt instruction
-```
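-
-A consolidated sketch of this strategy for `to_claude_options()` (the warn-instead-of-fail behavior is a proposal, not existing wrapper code):
-
-```python
-import warnings
-
-def to_claude_options(self) -> dict:
-    options: dict = {}
-    # max_completion_tokens wins over legacy max_tokens
-    max_tok = self.max_completion_tokens or self.max_tokens
-    if max_tok:
-        options["max_thinking_tokens"] = max_tok
-    # Accepted for compatibility, but with no Claude SDK equivalent:
-    # surface a warning rather than rejecting the request.
-    for name in ("temperature", "top_p"):
-        if getattr(self, name, None) is not None:
-            warnings.warn(f"{name} has no Claude SDK equivalent; ignored")
-    return options
-```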
-
-### 2.5 OpenAI API Implementation Plan
-
-#### Phase 1: Core Parameters
-- [ ] Add `max_completion_tokens` to request model
-- [ ] Add backward compatibility for `max_tokens`
-- [ ] Implement parameter mapping to Claude options
-- [ ] Add validation for parameter ranges
-
-#### Phase 2: Streaming Enhancements
-- [ ] Add `stream_options` to request model
-- [ ] Implement usage tracking in streaming responses
-- [ ] Emit final usage chunk when `include_usage: true`
-
-#### Phase 3: Advanced Parameters
-- [ ] Add `temperature` (document limitations)
-- [ ] Add `top_p` (document limitations)
-- [ ] Add `stop` sequences
-- [ ] Add `presence_penalty` / `frequency_penalty`
-- [ ] Document which parameters are best-effort vs full support
-
-#### Phase 4: Testing & Documentation
-- [ ] Test parameter validation
-- [ ] Test parameter mapping
-- [ ] Create compatibility matrix in README
-- [ ] Update API documentation
-- [ ] Add examples for new parameters
-
----
-
-## 3. Implementation Priorities & Timeline
-
-### 3.1 Recommended Approach
-
-**Option A: Sequential Migration** (Lower Risk)
-1. Complete the Claude Agent SDK migration first
-2. Test thoroughly
-3. Then implement the OpenAI API updates
-
-**Option B: Parallel Development** (Faster but Higher Risk)
-1. Create feature branches for each workstream
-2. Develop simultaneously
-3. Integrate and test together
-
-**Recommendation:** Option A for stability; Option B only if the timeline is critical.
-
-### 3.2 Estimated Timeline
-
-**Phase 1: Claude Agent SDK Migration**
-- Dependency updates: 1-2 hours
-- Code updates: 2-4 hours
-- Testing: 2-3 hours
-- **Total: 1 day**
-
-**Phase 2: OpenAI API Core Parameters**
-- Model updates: 2-3 hours
-- Implementation: 3-4 hours
-- Testing: 2-3 hours
-- **Total: 1 day**
-
-**Phase 3: Streaming & Advanced Features**
-- Implementation: 4-6 hours
-- Testing: 2-3 hours
-- **Total: 1 day**
-
-**Phase 4: Documentation & Polish**
-- Documentation: 3-4 hours
-- Final testing: 2-3 hours
-- **Total: 0.5 day**
-
-**Total Estimated Time:** 3.5-4 days
-
-### 3.3 Risk Assessment
-
-**High Risk Items:**
-1. ⚠️ System prompt migration (breaking change)
-2. ⚠️ Behavior changes from SDK defaults
-3. ⚠️ Authentication flow changes
-
-**Medium Risk Items:**
-1. ⚠️ Parameter mapping accuracy
-2. ⚠️ Streaming usage data implementation
-3. ⚠️ Backward compatibility
-
-**Low Risk Items:**
-1. Dependency updates
-2. Import statement changes
-3. Documentation updates
-
-### 3.4 Rollback Strategy
-
-**If Migration Fails:**
-1. Revert `pyproject.toml` changes
-2. Run `poetry lock && poetry install`
-3. Restore original code from git
-
-**Recommended:**
-- Create migration branch: `feature/sdk-migration`
-- Test thoroughly before merging to main
-- Tag current version before migration: `git tag v1.0.0-pre-migration`
-
----
-
-## 4. Compatibility Matrix (Post-Upgrade)
-
-### 4.1 Claude SDK Features
-
-| Feature | Current (0.0.14) | Target (0.1.6) | Status |
-|---------|-----------------|----------------|--------|
-| Basic completions | ✅ | ✅ | Maintained |
-| Streaming | ✅ | ✅ | Maintained |
-| System prompts | ✅ | ✅ | Breaking change |
-| Tool control | ✅ | ✅ | Maintained |
-| Session continuity | ✅ | ✅ | Maintained |
-| In-process MCP | ❌ | ✅ | **New** |
-| Hooks | ❌ | ✅ | **New** |
-| Settings sources | Auto | Manual | Breaking change |
-
-### 4.2 OpenAI API Compliance
-
-| Feature | Pre-Upgrade | Post-Upgrade | Notes |
-|---------|------------|--------------|-------|
-| Chat completions | ✅ | ✅ | Core feature |
-| Streaming | ✅ | ✅ | Enhanced with usage |
-| `model` | ✅ | ✅ | Maintained |
-| `messages` | ✅ | ✅ | Maintained |
-| `temperature` | ❌ | ⚠️ | Best-effort |
-| `max_tokens` | ❌ | ✅ | **New** |
-| `max_completion_tokens` | ❌ | ✅ | **New** |
-| `stream_options` | ❌ | ✅ | **New** |
-| `top_p` | ❌ | ⚠️ | Best-effort |
-| `stop` | ❌ | 🔄 | Planned |
-| `n` | ❌ | ❌ | Not supported |
-| Function calling | ❌ | ❌ | Not supported |
-
-**Legend:**
-- ✅ Fully supported
-- ⚠️ Partial/best-effort support
-- 🔄 Planned for implementation
-- ❌ Not supported
-
----
-
-## 5. Testing Strategy
-
-### 5.1 Test Coverage Requirements
-
-**Unit Tests:**
-- [ ] SDK initialization with new `ClaudeAgentOptions`
-- [ ] System prompt configuration variations
-- [ ] Parameter validation for new OpenAI params
-- [ ] Parameter mapping to Claude options
-
-**Integration Tests:**
-- [ ] End-to-end completion request
-- [ ] Streaming with usage data
-- [ ] Session continuity across SDK versions
-- [ ] Authentication methods (API key, Bedrock, Vertex)
-
-**Regression Tests:**
-- [ ] All existing `test_endpoints.py` tests pass
-- [ ] All existing `test_basic.py` tests pass
-- [ ] Session tests still functional
-- [ ] Docker deployment works
-
-### 5.2 Test Files to Update
-
-1. **`test_endpoints.py`**
-   - Update expected behaviors
-   - Add tests for new parameters
-
-2. **`test_basic.py`**
-   - Verify SDK migration doesn't break basics
-   - Add streaming usage tests
-
-3. **`test_session_continuity.py`**
-   - Ensure sessions work with new SDK
-   - Test session persistence
-
-4. **New Test Files Needed:**
-   - `test_parameter_mapping.py` - Test OpenAI → Claude parameter mapping (sketched below)
-   - `test_sdk_migration.py` - Verify SDK upgrade behaviors
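-
-A starting point for `test_parameter_mapping.py`, assuming the `ChatCompletionRequest` fields and `to_claude_options()` method proposed in Section 2 (not yet implemented):
-
-```python
-from models import ChatCompletionRequest
-
-MESSAGES = [{"role": "user", "content": "hi"}]
-
-def test_max_completion_tokens_wins_over_max_tokens():
-    req = ChatCompletionRequest(
-        model="claude-sonnet-4-5", messages=MESSAGES,
-        max_tokens=100, max_completion_tokens=50,
-    )
-    assert req.to_claude_options()["max_thinking_tokens"] == 50
-
-def test_temperature_is_accepted_but_not_mapped():
-    req = ChatCompletionRequest(
-        model="claude-sonnet-4-5", messages=MESSAGES, temperature=0.2,
-    )
-    assert "temperature" not in req.to_claude_options()
-```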
-
-### 5.3 Manual Testing Checklist
-
-- [ ] Basic chat completion works
-- [ ] Streaming works with usage data
-- [ ] Temperature parameter accepted (even if best-effort)
-- [ ] Max tokens limiting works
-- [ ] Session continuity maintained
-- [ ] All authentication methods work
-- [ ] Docker container builds and runs
-- [ ] Example files work (`examples/openai_sdk.py`, etc.)
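-
-Several of these boxes can be exercised in one pass with the OpenAI client against a local instance (base URL, API key handling, and model name are deployment-specific assumptions):
-
-```python
-from openai import OpenAI
-
-client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")
-
-stream = client.chat.completions.create(
-    model="claude-sonnet-4-5",
-    messages=[{"role": "user", "content": "Say hello"}],
-    temperature=0.7,  # accepted, best-effort
-    max_tokens=64,    # exercises the token-limit mapping
-    stream=True,
-    stream_options={"include_usage": True},
-)
-for chunk in stream:
-    if chunk.usage:  # the final chunk carries usage when requested
-        print(chunk.usage)
-```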
-
----
-
-## 6. Documentation Updates Required
-
-### 6.1 README.md Updates
-
-**Sections to Update:**
-1. **Status section** - Update SDK version to 0.1.6
-2. **Features section** - Add new OpenAI parameter support
-3. **Prerequisites** - Update Claude Code version requirement (2.0.0+)
-4. **Installation** - Update dependency instructions
-5. **Limitations & Roadmap** - Update with implemented features
-6. **Supported Models** - Verify model list is current
-
-**New Sections to Add:**
-- **Parameter Support Matrix** - Document OpenAI parameter compatibility
-- **Migration Guide** - For users upgrading from older versions
-
-### 6.2 Code Documentation
-
-- [ ] Update docstrings in `claude_cli.py`
-- [ ] Update comments explaining new SDK behavior
-- [ ] Document system prompt configuration options
-- [ ] Add examples for new parameters
-
-### 6.3 Example Files
-
-Files to review/update:
-- `examples/openai_sdk.py` - Add parameter examples
-- `examples/streaming.py` - Add stream_options example
-- `examples/session_continuity.py` - Verify compatibility
-
----
-
-## 7. Rollout Plan
-
-### 7.1 Pre-Release Steps
-
-1. **Create feature branch:** `feature/upgrade-sdk-and-api`
-2. **Tag current version:** `git tag v1.0.0-stable`
-3. **Update dependencies** in the branch
-4. **Implement changes** following this plan
-5. **Test thoroughly** with all test suites
-6. **Update documentation** completely
-7. **Test Docker build** and deployment
-
-### 7.2 Release Steps
-
-1. **Merge to main** after all tests pass
-2. **Tag new version:** `git tag v2.0.0` (major version due to breaking changes)
-3. **Update GitHub release notes** with:
-   - Breaking changes
-   - New features
-   - Migration instructions
-4. **Update Docker Hub** with new image
-5. **Notify users** via GitHub discussions/issues
-
-### 7.3 Post-Release Monitoring
-
-- Monitor GitHub issues for migration problems
-- Be ready to provide support for breaking changes
-- Consider creating a `v1.x` maintenance branch for critical fixes
-
----
-
-## 8. Breaking Changes for End Users
-
-### 8.1 System Prompt Behavior
-
-**Breaking Change:** Default system prompt behavior changes.
-
-**Impact:** Users relying on the Claude Code default system prompt may see different behavior.
-
-**Migration:**
-- No action needed if using custom system prompts
-- Default now restored via `preset: "claude_code"` in SDK options
-
-### 8.2 Settings Files
-
-**Breaking Change:** Settings files are no longer auto-loaded.
-
-**Impact:** Users with `CLAUDE.md` or a custom `settings.json` may see different behavior.
-
-**Migration:**
-- Explicitly enable via `setting_sources` if needed
-- Most users won't be affected (the wrapper doesn't rely on these)
-
-### 8.3 Dependency Requirements
-
-**Change:** New package name and version requirements.
-
-**Impact:** Users building from source need to update dependencies.
-
-**Migration:**
-```bash
-poetry lock --no-update
-poetry install
-# Or for Docker:
-docker build --no-cache -t claude-wrapper:v2 .
-```
-
----
-
-## 9. Success Criteria
-
-The upgrade is considered successful when:
-
-✅ **Functional Requirements:**
-- [ ] All existing tests pass with new SDK
-- [ ] Streaming responses work correctly
-- [ ] Session continuity maintained
-- [ ] Authentication methods all functional
-- [ ] Docker deployment successful
-- [ ] At least 3 new OpenAI parameters implemented (`max_tokens`, `temperature`, `stream_options`)
-
-✅ **Quality Requirements:**
-- [ ] No regressions in existing functionality
-- [ ] Response times similar to or better than before
-- [ ] Error handling maintains quality
-- [ ] Documentation complete and accurate
-
-✅ **User Experience:**
-- [ ] Clear migration guide available
-- [ ] Breaking changes well documented
-- [ ] Examples updated and working
-- [ ] GitHub issues addressed proactively
-
----
-
-## 10. Additional Recommendations
-
-### 10.1 Consider Future Enhancements
-
-**After migration is stable:**
-1. **Implement In-Process MCP Tools** - Leverage the new SDK capability for custom tools
-2. **Add Hooks for Validation** - Use SDK hooks for tool usage validation
-3. **Explore ClaudeSDKClient** - For more interactive conversation patterns
-4. **Function Calling Translation** - Map OpenAI function calls to Claude tools
-
-### 10.2 Monitoring & Observability
-
-Consider adding:
-- **Metrics collection** - Track SDK performance and error rates
-- **Usage analytics** - Understand which parameters are most used
-- **Error reporting** - Better error tracking for debugging
-
-### 10.3 Community Engagement
-
-- Share migration experience in GitHub discussions
-- Contribute back to the Claude Agent SDK if bugs are found
-- Update examples and share best practices
-
----
-
-## Appendix A: Quick Reference
-
-### Key Code Changes
-
-**Import Change:**
-```python
-# Before
-from claude_code_sdk import query, ClaudeCodeOptions, Message
-
-# After
-from claude_agent_sdk import query, ClaudeAgentOptions, Message
-```
-
-**Options Change:**
-```python
-# Before
-options = ClaudeCodeOptions(max_turns=1, cwd="/path")
-
-# After
-options = ClaudeAgentOptions(
-    max_turns=1,
-    cwd="/path",
-    system_prompt={"type": "preset", "preset": "claude_code"}
-)
-```
-
-**Dependency Change:**
-```toml
-# Before
-claude-code-sdk = "^0.0.14"
-
-# After
-claude-agent-sdk = "^0.1.6"
-```
-
-### Key Commands
-
-```bash
-# Update dependencies
-poetry remove claude-code-sdk
-poetry add claude-agent-sdk@^0.1.6
-poetry lock
-poetry install
-
-# Test changes
-poetry run python test_endpoints.py
-poetry run python test_basic.py
-
-# Build Docker
-docker build -t claude-wrapper:v2 .
-
-# Tag for release
-git tag v2.0.0
-git push origin v2.0.0
-```
-
----
-
-## Appendix B: Reference Links
-
-### Official Documentation
-- [Claude Agent SDK PyPI](https://pypi.org/project/claude-agent-sdk/)
-- [Claude Agent SDK GitHub](https://github.com/anthropics/claude-agent-sdk-python)
-- [Migration Guide](https://docs.claude.com/en/docs/claude-code/sdk/migration-guide)
-- [OpenAI API Reference](https://platform.openai.com/docs/api-reference)
-
-### Related Issues
-- [System prompt defaults issue #289](https://github.com/anthropics/claude-agent-sdk-python/issues/289)
-
-### Community Resources
-- [Claude Agent SDK Migration Guide Blog](https://kane.mx/posts/2025/claude-agent-sdk-update/)
-
----
-
-**Document Version:** 1.0
-**Last Updated:** 2025-11-02
-**Next Review:** After Phase 1 completion
diff --git a/poetry.lock b/poetry.lock index 03d8e92..8b44797 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,16 @@ -# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.3.4 and should not be changed by hand. + +[[package]] +name = "annotated-doc" +version = "0.0.4" +description = "Document parameters, class attributes, return types, and variables inline, with Annotated." +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "annotated_doc-0.0.4-py3-none-any.whl", hash = "sha256:571ac1dc6991c450b25a9c2d84a3705e2ae7a53467b5d111c24fa8baabbed320"}, + {file = "annotated_doc-0.0.4.tar.gz", hash = "sha256:fbcda96e87e9c92ad167c2e53839e57503ecfda18804ea28102353485033faa4"}, +] [[package]] name = "annotated-types" @@ -49,18 +61,32 @@ files = [ [[package]] name = "authlib" -version = "1.6.6" +version = "1.7.0" description = "The ultimate Python library in building OAuth and OpenID Connect servers and clients."
optional = false -python-versions = ">=3.9" -groups = ["dev"] +python-versions = ">=3.10" +groups = ["main", "dev"] files = [ - {file = "authlib-1.6.6-py2.py3-none-any.whl", hash = "sha256:7d9e9bc535c13974313a87f53e8430eb6ea3d1cf6ae4f6efcd793f2e949143fd"}, - {file = "authlib-1.6.6.tar.gz", hash = "sha256:45770e8e056d0f283451d9996fbb59b70d45722b45d854d58f32878d0a40c38e"}, + {file = "authlib-1.7.0-py2.py3-none-any.whl", hash = "sha256:e36817afb02f6f0b6bf55f150782499ddd6ddf44b402bb055d3263cc65ac9ae0"}, + {file = "authlib-1.7.0.tar.gz", hash = "sha256:b3e326c9aa9cc3ea95fe7d89fd880722d3608da4d00e8a27e061e64b48d801d5"}, ] [package.dependencies] cryptography = "*" +joserfc = ">=1.6.0" + +[[package]] +name = "backports-asyncio-runner" +version = "1.2.0" +description = "Backport of asyncio.Runner, a context manager that controls event loop life cycle." +optional = false +python-versions = "<3.11,>=3.8" +groups = ["dev"] +markers = "python_version == \"3.10\"" +files = [ + {file = "backports_asyncio_runner-1.2.0-py3-none-any.whl", hash = "sha256:0da0a936a8aeb554eccb426dc55af3ba63bcdc69fa1a600b5bb305413a4477b5"}, + {file = "backports_asyncio_runner-1.2.0.tar.gz", hash = "sha256:a5aa7b2b7d8f8bfcaa2b57313f70792df84e32a2a746f585213373f900b42162"}, +] [[package]] name = "backports-datetime-fromisoformat" @@ -147,42 +173,48 @@ yaml = ["PyYAML"] [[package]] name = "black" -version = "24.10.0" +version = "26.3.1" description = "The uncompromising code formatter." optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" groups = ["dev"] files = [ - {file = "black-24.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e6668650ea4b685440857138e5fe40cde4d652633b1bdffc62933d0db4ed9812"}, - {file = "black-24.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1c536fcf674217e87b8cc3657b81809d3c085d7bf3ef262ead700da345bfa6ea"}, - {file = "black-24.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:649fff99a20bd06c6f727d2a27f401331dc0cc861fb69cde910fe95b01b5928f"}, - {file = "black-24.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:fe4d6476887de70546212c99ac9bd803d90b42fc4767f058a0baa895013fbb3e"}, - {file = "black-24.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5a2221696a8224e335c28816a9d331a6c2ae15a2ee34ec857dcf3e45dbfa99ad"}, - {file = "black-24.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f9da3333530dbcecc1be13e69c250ed8dfa67f43c4005fb537bb426e19200d50"}, - {file = "black-24.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4007b1393d902b48b36958a216c20c4482f601569d19ed1df294a496eb366392"}, - {file = "black-24.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:394d4ddc64782e51153eadcaaca95144ac4c35e27ef9b0a42e121ae7e57a9175"}, - {file = "black-24.10.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b5e39e0fae001df40f95bd8cc36b9165c5e2ea88900167bddf258bacef9bbdc3"}, - {file = "black-24.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d37d422772111794b26757c5b55a3eade028aa3fde43121ab7b673d050949d65"}, - {file = "black-24.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:14b3502784f09ce2443830e3133dacf2c0110d45191ed470ecb04d0f5f6fcb0f"}, - {file = "black-24.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:30d2c30dc5139211dda799758559d1b049f7f14c580c409d6ad925b74a4208a8"}, - {file = "black-24.10.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = 
"sha256:1cbacacb19e922a1d75ef2b6ccaefcd6e93a2c05ede32f06a21386a04cedb981"}, - {file = "black-24.10.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1f93102e0c5bb3907451063e08b9876dbeac810e7da5a8bfb7aeb5a9ef89066b"}, - {file = "black-24.10.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ddacb691cdcdf77b96f549cf9591701d8db36b2f19519373d60d31746068dbf2"}, - {file = "black-24.10.0-cp313-cp313-win_amd64.whl", hash = "sha256:680359d932801c76d2e9c9068d05c6b107f2584b2a5b88831c83962eb9984c1b"}, - {file = "black-24.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:17374989640fbca88b6a448129cd1745c5eb8d9547b464f281b251dd00155ccd"}, - {file = "black-24.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:63f626344343083322233f175aaf372d326de8436f5928c042639a4afbbf1d3f"}, - {file = "black-24.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccfa1d0cb6200857f1923b602f978386a3a2758a65b52e0950299ea014be6800"}, - {file = "black-24.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:2cd9c95431d94adc56600710f8813ee27eea544dd118d45896bb734e9d7a0dc7"}, - {file = "black-24.10.0-py3-none-any.whl", hash = "sha256:3bb2b7a1f7b685f85b11fed1ef10f8a9148bceb49853e47a294a3dd963c1dd7d"}, - {file = "black-24.10.0.tar.gz", hash = "sha256:846ea64c97afe3bc677b761787993be4991810ecc7a4a937816dd6bddedc4875"}, + {file = "black-26.3.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:86a8b5035fce64f5dcd1b794cf8ec4d31fe458cf6ce3986a30deb434df82a1d2"}, + {file = "black-26.3.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5602bdb96d52d2d0672f24f6ffe5218795736dd34807fd0fd55ccd6bf206168b"}, + {file = "black-26.3.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6c54a4a82e291a1fee5137371ab488866b7c86a3305af4026bdd4dc78642e1ac"}, + {file = "black-26.3.1-cp310-cp310-win_amd64.whl", hash = "sha256:6e131579c243c98f35bce64a7e08e87fb2d610544754675d4a0e73a070a5aa3a"}, + {file = "black-26.3.1-cp310-cp310-win_arm64.whl", hash = "sha256:5ed0ca58586c8d9a487352a96b15272b7fa55d139fc8496b519e78023a8dab0a"}, + {file = "black-26.3.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:28ef38aee69e4b12fda8dba75e21f9b4f979b490c8ac0baa7cb505369ac9e1ff"}, + {file = "black-26.3.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bf9bf162ed91a26f1adba8efda0b573bc6924ec1408a52cc6f82cb73ec2b142c"}, + {file = "black-26.3.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:474c27574d6d7037c1bc875a81d9be0a9a4f9ee95e62800dab3cfaadbf75acd5"}, + {file = "black-26.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:5e9d0d86df21f2e1677cc4bd090cd0e446278bcbbe49bf3659c308c3e402843e"}, + {file = "black-26.3.1-cp311-cp311-win_arm64.whl", hash = "sha256:9a5e9f45e5d5e1c5b5c29b3bd4265dcc90e8b92cf4534520896ed77f791f4da5"}, + {file = "black-26.3.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b5e6f89631eb88a7302d416594a32faeee9fb8fb848290da9d0a5f2903519fc1"}, + {file = "black-26.3.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:41cd2012d35b47d589cb8a16faf8a32ef7a336f56356babd9fcf70939ad1897f"}, + {file = "black-26.3.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f76ff19ec5297dd8e66eb64deda23631e642c9393ab592826fd4bdc97a4bce7"}, + {file = "black-26.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:ddb113db38838eb9f043623ba274cfaf7d51d5b0c22ecb30afe58b1bb8322983"}, + {file = 
"black-26.3.1-cp312-cp312-win_arm64.whl", hash = "sha256:dfdd51fc3e64ea4f35873d1b3fb25326773d55d2329ff8449139ebaad7357efb"}, + {file = "black-26.3.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:855822d90f884905362f602880ed8b5df1b7e3ee7d0db2502d4388a954cc8c54"}, + {file = "black-26.3.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8a33d657f3276328ce00e4d37fe70361e1ec7614da5d7b6e78de5426cb56332f"}, + {file = "black-26.3.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f1cd08e99d2f9317292a311dfe578fd2a24b15dbce97792f9c4d752275c1fa56"}, + {file = "black-26.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:c7e72339f841b5a237ff14f7d3880ddd0fc7f98a1199e8c4327f9a4f478c1839"}, + {file = "black-26.3.1-cp313-cp313-win_arm64.whl", hash = "sha256:afc622538b430aa4c8c853f7f63bc582b3b8030fd8c80b70fb5fa5b834e575c2"}, + {file = "black-26.3.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:2d6bfaf7fd0993b420bed691f20f9492d53ce9a2bcccea4b797d34e947318a78"}, + {file = "black-26.3.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:f89f2ab047c76a9c03f78d0d66ca519e389519902fa27e7a91117ef7611c0568"}, + {file = "black-26.3.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b07fc0dab849d24a80a29cfab8d8a19187d1c4685d8a5e6385a5ce323c1f015f"}, + {file = "black-26.3.1-cp314-cp314-win_amd64.whl", hash = "sha256:0126ae5b7c09957da2bdbd91a9ba1207453feada9e9fe51992848658c6c8e01c"}, + {file = "black-26.3.1-cp314-cp314-win_arm64.whl", hash = "sha256:92c0ec1f2cc149551a2b7b47efc32c866406b6891b0ee4625e95967c8f4acfb1"}, + {file = "black-26.3.1-py3-none-any.whl", hash = "sha256:2bd5aa94fc267d38bb21a70d7410a89f1a1d318841855f698746f8e7f51acd1b"}, + {file = "black-26.3.1.tar.gz", hash = "sha256:2c50f5063a9641c7eed7795014ba37b0f5fa227f3d408b968936e24bc0566b07"}, ] [package.dependencies] click = ">=8.0.0" mypy-extensions = ">=0.4.3" packaging = ">=22.0" -pathspec = ">=0.9.0" +pathspec = ">=1.0.0" platformdirs = ">=2" +pytokens = ">=0.4.0,<0.5.0" tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} typing-extensions = {version = ">=4.0.1", markers = "python_version < \"3.11\""} @@ -190,7 +222,7 @@ typing-extensions = {version = ">=4.0.1", markers = "python_version < \"3.11\""} colorama = ["colorama (>=0.4.3)"] d = ["aiohttp (>=3.10)"] jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] -uvloop = ["uvloop (>=0.15.2)"] +uvloop = ["uvloop (>=0.15.2) ; sys_platform != \"win32\"", "winloop (>=0.5.0) ; sys_platform == \"win32\""] [[package]] name = "certifi" @@ -406,26 +438,31 @@ files = [ [[package]] name = "claude-agent-sdk" -version = "0.1.18" +version = "0.1.81" description = "Python SDK for Claude Code" optional = false python-versions = ">=3.10" groups = ["main"] files = [ - {file = "claude_agent_sdk-0.1.18-py3-none-macosx_11_0_arm64.whl", hash = "sha256:9e45b4e3c20c072c3e3325fa60bab9a4b5a7cbbce64ca274b8d7d0af42dd9dd8"}, - {file = "claude_agent_sdk-0.1.18-py3-none-manylinux_2_17_aarch64.whl", hash = "sha256:3c41bd8f38848609ae0d5da8d7327a4c2d7057a363feafb6fd70df611ea204cc"}, - {file = "claude_agent_sdk-0.1.18-py3-none-manylinux_2_17_x86_64.whl", hash = "sha256:983f15e51253f40c55136a86d7cc63e023a3576428b05fa1459093d461b2d215"}, - {file = "claude_agent_sdk-0.1.18-py3-none-win_amd64.whl", hash = "sha256:36f5b84d5c3c8773ee9b56aeb5ab345d1033231db37f80d1f20ac15239bef41c"}, - {file = "claude_agent_sdk-0.1.18.tar.gz", hash = "sha256:4fcb8730cc77dea562fbe9aa48c65eced3ef58a6bb1f34f77e50e8258902477d"}, + {file = 
"claude_agent_sdk-0.1.81-py3-none-macosx_11_0_arm64.whl", hash = "sha256:e4bc8797cc2bc882031cf6b287a550ae2bb38a3822aa081e9ffc81bb4bed51da"}, + {file = "claude_agent_sdk-0.1.81-py3-none-macosx_11_0_x86_64.whl", hash = "sha256:a3cdbc00e18ed6b0f11387833bf2d4b7779e0f5f3a9ea63f27b6d6e62f304256"}, + {file = "claude_agent_sdk-0.1.81-py3-none-manylinux_2_17_aarch64.whl", hash = "sha256:e08a03b414af5814573cf89646653c1398193557f536914103f8f0708068ed27"}, + {file = "claude_agent_sdk-0.1.81-py3-none-manylinux_2_17_x86_64.whl", hash = "sha256:a75b3421eeabc57c31ee2515a7c58ddf17886a3166ee9481f0750ddb27eba8d8"}, + {file = "claude_agent_sdk-0.1.81-py3-none-win_amd64.whl", hash = "sha256:4214cef9c4fb4f6b850d23f5f931e0e556803f4c32c1ae9f87206d2327b4a1a8"}, + {file = "claude_agent_sdk-0.1.81.tar.gz", hash = "sha256:9a3e873c99cd98b2e11ae5e65fd250f38ea192c3a8ddd117ed69a10bbf2b913b"}, ] [package.dependencies] anyio = ">=4.0.0" -mcp = ">=0.1.0" +mcp = ">=1.19.0" +opentelemetry-api = {version = ">=1.20.0", optional = true, markers = "extra == \"otel\""} +sniffio = ">=1.0.0" typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""} [package.extras] dev = ["anyio[trio] (>=4.0.0)", "mypy (>=1.0.0)", "pytest (>=7.0.0)", "pytest-asyncio (>=0.20.0)", "pytest-cov (>=4.0.0)", "ruff (>=0.1.0)"] +examples = ["asyncpg (>=0.27.0)", "boto3 (>=1.28.0)", "fakeredis (>=2.20.0)", "moto[s3] (>=5.0.0)", "redis (>=4.2.0)"] +otel = ["opentelemetry-api (>=1.20.0)"] [[package]] name = "click" @@ -565,66 +602,61 @@ toml = ["tomli ; python_full_version <= \"3.11.0a6\""] [[package]] name = "cryptography" -version = "46.0.3" +version = "46.0.7" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." optional = false python-versions = "!=3.9.0,!=3.9.1,>=3.8" groups = ["main", "dev"] files = [ - {file = "cryptography-46.0.3-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:109d4ddfadf17e8e7779c39f9b18111a09efb969a301a31e987416a0191ed93a"}, - {file = "cryptography-46.0.3-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:09859af8466b69bc3c27bdf4f5d84a665e0f7ab5088412e9e2ec49758eca5cbc"}, - {file = "cryptography-46.0.3-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:01ca9ff2885f3acc98c29f1860552e37f6d7c7d013d7334ff2a9de43a449315d"}, - {file = "cryptography-46.0.3-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:6eae65d4c3d33da080cff9c4ab1f711b15c1d9760809dad6ea763f3812d254cb"}, - {file = "cryptography-46.0.3-cp311-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e5bf0ed4490068a2e72ac03d786693adeb909981cc596425d09032d372bcc849"}, - {file = "cryptography-46.0.3-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:5ecfccd2329e37e9b7112a888e76d9feca2347f12f37918facbb893d7bb88ee8"}, - {file = "cryptography-46.0.3-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a2c0cd47381a3229c403062f764160d57d4d175e022c1df84e168c6251a22eec"}, - {file = "cryptography-46.0.3-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:549e234ff32571b1f4076ac269fcce7a808d3bf98b76c8dd560e42dbc66d7d91"}, - {file = "cryptography-46.0.3-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:c0a7bb1a68a5d3471880e264621346c48665b3bf1c3759d682fc0864c540bd9e"}, - {file = "cryptography-46.0.3-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:10b01676fc208c3e6feeb25a8b83d81767e8059e1fe86e1dc62d10a3018fa926"}, - {file = "cryptography-46.0.3-cp311-abi3-musllinux_1_2_aarch64.whl", hash = 
"sha256:0abf1ffd6e57c67e92af68330d05760b7b7efb243aab8377e583284dbab72c71"}, - {file = "cryptography-46.0.3-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a04bee9ab6a4da801eb9b51f1b708a1b5b5c9eb48c03f74198464c66f0d344ac"}, - {file = "cryptography-46.0.3-cp311-abi3-win32.whl", hash = "sha256:f260d0d41e9b4da1ed1e0f1ce571f97fe370b152ab18778e9e8f67d6af432018"}, - {file = "cryptography-46.0.3-cp311-abi3-win_amd64.whl", hash = "sha256:a9a3008438615669153eb86b26b61e09993921ebdd75385ddd748702c5adfddb"}, - {file = "cryptography-46.0.3-cp311-abi3-win_arm64.whl", hash = "sha256:5d7f93296ee28f68447397bf5198428c9aeeab45705a55d53a6343455dcb2c3c"}, - {file = "cryptography-46.0.3-cp314-cp314t-macosx_10_9_universal2.whl", hash = "sha256:00a5e7e87938e5ff9ff5447ab086a5706a957137e6e433841e9d24f38a065217"}, - {file = "cryptography-46.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c8daeb2d2174beb4575b77482320303f3d39b8e81153da4f0fb08eb5fe86a6c5"}, - {file = "cryptography-46.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:39b6755623145ad5eff1dab323f4eae2a32a77a7abef2c5089a04a3d04366715"}, - {file = "cryptography-46.0.3-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:db391fa7c66df6762ee3f00c95a89e6d428f4d60e7abc8328f4fe155b5ac6e54"}, - {file = "cryptography-46.0.3-cp314-cp314t-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:78a97cf6a8839a48c49271cdcbd5cf37ca2c1d6b7fdd86cc864f302b5e9bf459"}, - {file = "cryptography-46.0.3-cp314-cp314t-manylinux_2_28_ppc64le.whl", hash = "sha256:dfb781ff7eaa91a6f7fd41776ec37c5853c795d3b358d4896fdbb5df168af422"}, - {file = "cryptography-46.0.3-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:6f61efb26e76c45c4a227835ddeae96d83624fb0d29eb5df5b96e14ed1a0afb7"}, - {file = "cryptography-46.0.3-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:23b1a8f26e43f47ceb6d6a43115f33a5a37d57df4ea0ca295b780ae8546e8044"}, - {file = "cryptography-46.0.3-cp314-cp314t-manylinux_2_34_ppc64le.whl", hash = "sha256:b419ae593c86b87014b9be7396b385491ad7f320bde96826d0dd174459e54665"}, - {file = "cryptography-46.0.3-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:50fc3343ac490c6b08c0cf0d704e881d0d660be923fd3076db3e932007e726e3"}, - {file = "cryptography-46.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:22d7e97932f511d6b0b04f2bfd818d73dcd5928db509460aaf48384778eb6d20"}, - {file = "cryptography-46.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:d55f3dffadd674514ad19451161118fd010988540cee43d8bc20675e775925de"}, - {file = "cryptography-46.0.3-cp314-cp314t-win32.whl", hash = "sha256:8a6e050cb6164d3f830453754094c086ff2d0b2f3a897a1d9820f6139a1f0914"}, - {file = "cryptography-46.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:760f83faa07f8b64e9c33fc963d790a2edb24efb479e3520c14a45741cd9b2db"}, - {file = "cryptography-46.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:516ea134e703e9fe26bcd1277a4b59ad30586ea90c365a87781d7887a646fe21"}, - {file = "cryptography-46.0.3-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:cb3d760a6117f621261d662bccc8ef5bc32ca673e037c83fbe565324f5c46936"}, - {file = "cryptography-46.0.3-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4b7387121ac7d15e550f5cb4a43aef2559ed759c35df7336c402bb8275ac9683"}, - {file = "cryptography-46.0.3-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:15ab9b093e8f09daab0f2159bb7e47532596075139dd74365da52ecc9cb46c5d"}, - {file = 
"cryptography-46.0.3-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:46acf53b40ea38f9c6c229599a4a13f0d46a6c3fa9ef19fc1a124d62e338dfa0"}, - {file = "cryptography-46.0.3-cp38-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:10ca84c4668d066a9878890047f03546f3ae0a6b8b39b697457b7757aaf18dbc"}, - {file = "cryptography-46.0.3-cp38-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:36e627112085bb3b81b19fed209c05ce2a52ee8b15d161b7c643a7d5a88491f3"}, - {file = "cryptography-46.0.3-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:1000713389b75c449a6e979ffc7dcc8ac90b437048766cef052d4d30b8220971"}, - {file = "cryptography-46.0.3-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:b02cf04496f6576afffef5ddd04a0cb7d49cf6be16a9059d793a30b035f6b6ac"}, - {file = "cryptography-46.0.3-cp38-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:71e842ec9bc7abf543b47cf86b9a743baa95f4677d22baa4c7d5c69e49e9bc04"}, - {file = "cryptography-46.0.3-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:402b58fc32614f00980b66d6e56a5b4118e6cb362ae8f3fda141ba4689bd4506"}, - {file = "cryptography-46.0.3-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ef639cb3372f69ec44915fafcd6698b6cc78fbe0c2ea41be867f6ed612811963"}, - {file = "cryptography-46.0.3-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:3b51b8ca4f1c6453d8829e1eb7299499ca7f313900dd4d89a24b8b87c0a780d4"}, - {file = "cryptography-46.0.3-cp38-abi3-win32.whl", hash = "sha256:6276eb85ef938dc035d59b87c8a7dc559a232f954962520137529d77b18ff1df"}, - {file = "cryptography-46.0.3-cp38-abi3-win_amd64.whl", hash = "sha256:416260257577718c05135c55958b674000baef9a1c7d9e8f306ec60d71db850f"}, - {file = "cryptography-46.0.3-cp38-abi3-win_arm64.whl", hash = "sha256:d89c3468de4cdc4f08a57e214384d0471911a3830fcdaf7a8cc587e42a866372"}, - {file = "cryptography-46.0.3-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a23582810fedb8c0bc47524558fb6c56aac3fc252cb306072fd2815da2a47c32"}, - {file = "cryptography-46.0.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:e7aec276d68421f9574040c26e2a7c3771060bc0cff408bae1dcb19d3ab1e63c"}, - {file = "cryptography-46.0.3-pp311-pypy311_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7ce938a99998ed3c8aa7e7272dca1a610401ede816d36d0693907d863b10d9ea"}, - {file = "cryptography-46.0.3-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:191bb60a7be5e6f54e30ba16fdfae78ad3a342a0599eb4193ba88e3f3d6e185b"}, - {file = "cryptography-46.0.3-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:c70cc23f12726be8f8bc72e41d5065d77e4515efae3690326764ea1b07845cfb"}, - {file = "cryptography-46.0.3-pp311-pypy311_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:9394673a9f4de09e28b5356e7fff97d778f8abad85c9d5ac4a4b7e25a0de7717"}, - {file = "cryptography-46.0.3-pp311-pypy311_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:94cd0549accc38d1494e1f8de71eca837d0509d0d44bf11d158524b0e12cebf9"}, - {file = "cryptography-46.0.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:6b5063083824e5509fdba180721d55909ffacccc8adbec85268b48439423d78c"}, - {file = "cryptography-46.0.3.tar.gz", hash = "sha256:a8b17438104fed022ce745b362294d9ce35b4c2e45c1d958ad4a4b019285f4a1"}, + {file = "cryptography-46.0.7-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:ea42cbe97209df307fdc3b155f1b6fa2577c0defa8f1f7d3be7d31d189108ad4"}, + {file = "cryptography-46.0.7-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b36a4695e29fe69215d75960b22577197aca3f7a25b9cf9d165dcfe9d80bc325"}, + {file = 
"cryptography-46.0.7-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5ad9ef796328c5e3c4ceed237a183f5d41d21150f972455a9d926593a1dcb308"}, + {file = "cryptography-46.0.7-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:73510b83623e080a2c35c62c15298096e2a5dc8d51c3b4e1740211839d0dea77"}, + {file = "cryptography-46.0.7-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:cbd5fb06b62bd0721e1170273d3f4d5a277044c47ca27ee257025146c34cbdd1"}, + {file = "cryptography-46.0.7-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:420b1e4109cc95f0e5700eed79908cef9268265c773d3a66f7af1eef53d409ef"}, + {file = "cryptography-46.0.7-cp311-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:24402210aa54baae71d99441d15bb5a1919c195398a87b563df84468160a65de"}, + {file = "cryptography-46.0.7-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:8a469028a86f12eb7d2fe97162d0634026d92a21f3ae0ac87ed1c4a447886c83"}, + {file = "cryptography-46.0.7-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:9694078c5d44c157ef3162e3bf3946510b857df5a3955458381d1c7cfc143ddb"}, + {file = "cryptography-46.0.7-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:42a1e5f98abb6391717978baf9f90dc28a743b7d9be7f0751a6f56a75d14065b"}, + {file = "cryptography-46.0.7-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:91bbcb08347344f810cbe49065914fe048949648f6bd5c2519f34619142bbe85"}, + {file = "cryptography-46.0.7-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5d1c02a14ceb9148cc7816249f64f623fbfee39e8c03b3650d842ad3f34d637e"}, + {file = "cryptography-46.0.7-cp311-abi3-win32.whl", hash = "sha256:d23c8ca48e44ee015cd0a54aeccdf9f09004eba9fc96f38c911011d9ff1bd457"}, + {file = "cryptography-46.0.7-cp311-abi3-win_amd64.whl", hash = "sha256:397655da831414d165029da9bc483bed2fe0e75dde6a1523ec2fe63f3c46046b"}, + {file = "cryptography-46.0.7-cp314-cp314t-macosx_10_9_universal2.whl", hash = "sha256:d151173275e1728cf7839aaa80c34fe550c04ddb27b34f48c232193df8db5842"}, + {file = "cryptography-46.0.7-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:db0f493b9181c7820c8134437eb8b0b4792085d37dbb24da050476ccb664e59c"}, + {file = "cryptography-46.0.7-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ebd6daf519b9f189f85c479427bbd6e9c9037862cf8fe89ee35503bd209ed902"}, + {file = "cryptography-46.0.7-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:b7b412817be92117ec5ed95f880defe9cf18a832e8cafacf0a22337dc1981b4d"}, + {file = "cryptography-46.0.7-cp314-cp314t-manylinux_2_28_ppc64le.whl", hash = "sha256:fbfd0e5f273877695cb93baf14b185f4878128b250cc9f8e617ea0c025dfb022"}, + {file = "cryptography-46.0.7-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:ffca7aa1d00cf7d6469b988c581598f2259e46215e0140af408966a24cf086ce"}, + {file = "cryptography-46.0.7-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:60627cf07e0d9274338521205899337c5d18249db56865f943cbe753aa96f40f"}, + {file = "cryptography-46.0.7-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:80406c3065e2c55d7f49a9550fe0c49b3f12e5bfff5dedb727e319e1afb9bf99"}, + {file = "cryptography-46.0.7-cp314-cp314t-manylinux_2_34_ppc64le.whl", hash = "sha256:c5b1ccd1239f48b7151a65bc6dd54bcfcc15e028c8ac126d3fada09db0e07ef1"}, + {file = "cryptography-46.0.7-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:d5f7520159cd9c2154eb61eb67548ca05c5774d39e9c2c4339fd793fe7d097b2"}, + {file = "cryptography-46.0.7-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = 
"sha256:fcd8eac50d9138c1d7fc53a653ba60a2bee81a505f9f8850b6b2888555a45d0e"}, + {file = "cryptography-46.0.7-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:65814c60f8cc400c63131584e3e1fad01235edba2614b61fbfbfa954082db0ee"}, + {file = "cryptography-46.0.7-cp314-cp314t-win32.whl", hash = "sha256:fdd1736fed309b4300346f88f74cd120c27c56852c3838cab416e7a166f67298"}, + {file = "cryptography-46.0.7-cp314-cp314t-win_amd64.whl", hash = "sha256:e06acf3c99be55aa3b516397fe42f5855597f430add9c17fa46bf2e0fb34c9bb"}, + {file = "cryptography-46.0.7-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:462ad5cb1c148a22b2e3bcc5ad52504dff325d17daf5df8d88c17dda1f75f2a4"}, + {file = "cryptography-46.0.7-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:84d4cced91f0f159a7ddacad249cc077e63195c36aac40b4150e7a57e84fffe7"}, + {file = "cryptography-46.0.7-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:128c5edfe5e5938b86b03941e94fac9ee793a94452ad1365c9fc3f4f62216832"}, + {file = "cryptography-46.0.7-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:5e51be372b26ef4ba3de3c167cd3d1022934bc838ae9eaad7e644986d2a3d163"}, + {file = "cryptography-46.0.7-cp38-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:cdf1a610ef82abb396451862739e3fc93b071c844399e15b90726ef7470eeaf2"}, + {file = "cryptography-46.0.7-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:1d25aee46d0c6f1a501adcddb2d2fee4b979381346a78558ed13e50aa8a59067"}, + {file = "cryptography-46.0.7-cp38-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:cdfbe22376065ffcf8be74dc9a909f032df19bc58a699456a21712d6e5eabfd0"}, + {file = "cryptography-46.0.7-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:abad9dac36cbf55de6eb49badd4016806b3165d396f64925bf2999bcb67837ba"}, + {file = "cryptography-46.0.7-cp38-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:935ce7e3cfdb53e3536119a542b839bb94ec1ad081013e9ab9b7cfd478b05006"}, + {file = "cryptography-46.0.7-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:35719dc79d4730d30f1c2b6474bd6acda36ae2dfae1e3c16f2051f215df33ce0"}, + {file = "cryptography-46.0.7-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:7bbc6ccf49d05ac8f7d7b5e2e2c33830d4fe2061def88210a126d130d7f71a85"}, + {file = "cryptography-46.0.7-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a1529d614f44b863a7b480c6d000fe93b59acee9c82ffa027cfadc77521a9f5e"}, + {file = "cryptography-46.0.7-cp38-abi3-win32.whl", hash = "sha256:f247c8c1a1fb45e12586afbb436ef21ff1e80670b2861a90353d9b025583d246"}, + {file = "cryptography-46.0.7-cp38-abi3-win_amd64.whl", hash = "sha256:506c4ff91eff4f82bdac7633318a526b1d1309fc07ca76a3ad182cb5b686d6d3"}, + {file = "cryptography-46.0.7-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:fc9ab8856ae6cf7c9358430e49b368f3108f050031442eaeb6b9d87e4dcf4e4f"}, + {file = "cryptography-46.0.7-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d3b99c535a9de0adced13d159c5a9cf65c325601aa30f4be08afd680643e9c15"}, + {file = "cryptography-46.0.7-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d02c738dacda7dc2a74d1b2b3177042009d5cab7c7079db74afc19e56ca1b455"}, + {file = "cryptography-46.0.7-pp311-pypy311_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:04959522f938493042d595a736e7dbdff6eb6cc2339c11465b3ff89343b65f65"}, + {file = "cryptography-46.0.7-pp311-pypy311_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:3986ac1dee6def53797289999eabe84798ad7817f3e97779b5061a95b0ee4968"}, + {file = "cryptography-46.0.7-pp311-pypy311_pp73-win_amd64.whl", hash = 
"sha256:258514877e15963bd43b558917bc9f54cf7cf866c38aa576ebf47a77ddbc43a4"}, + {file = "cryptography-46.0.7.tar.gz", hash = "sha256:e4cfd68c5f3e0bfdad0d38e023239b96a2fe84146481852dffbcca442c245aa5"}, ] [package.dependencies] @@ -638,7 +670,7 @@ nox = ["nox[uv] (>=2024.4.15)"] pep8test = ["check-sdist", "click (>=8.0.1)", "mypy (>=1.14)", "ruff (>=0.11.11)"] sdist = ["build (>=1.0.0)"] ssh = ["bcrypt (>=3.1.5)"] -test = ["certifi (>=2024)", "cryptography-vectors (==46.0.3)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"] +test = ["certifi (>=2024)", "cryptography-vectors (==46.0.7)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"] test-randomorder = ["pytest-randomly"] [[package]] @@ -714,35 +746,37 @@ test = ["pytest (>=6)"] [[package]] name = "fastapi" -version = "0.115.14" +version = "0.128.1" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["main"] files = [ - {file = "fastapi-0.115.14-py3-none-any.whl", hash = "sha256:6c0c8bf9420bd58f565e585036d971872472b4f7d3f6c73b698e10cffdefb3ca"}, - {file = "fastapi-0.115.14.tar.gz", hash = "sha256:b1de15cdc1c499a4da47914db35d0e4ef8f1ce62b624e94e0e5824421df99739"}, + {file = "fastapi-0.128.1-py3-none-any.whl", hash = "sha256:ee82146bbf91ea5bbf2bb8629e4c6e056c4fbd997ea6068501b11b15260b50fb"}, + {file = "fastapi-0.128.1.tar.gz", hash = "sha256:ce5be4fa26d4ce6f54debcc873d1fb8e0e248f5c48d7502ba6c61457ab2dc766"}, ] [package.dependencies] -pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0 || >2.0.0,<2.0.1 || >2.0.1,<2.1.0 || >2.1.0,<3.0.0" -starlette = ">=0.40.0,<0.47.0" +annotated-doc = ">=0.0.2" +pydantic = ">=2.7.0" +starlette = ">=0.40.0,<0.51.0" typing-extensions = ">=4.8.0" [package.extras] -all = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.5)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=3.1.5)", "orjson (>=3.2.1)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.18)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"] -standard = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.5)", "httpx (>=0.23.0)", "jinja2 (>=3.1.5)", "python-multipart (>=0.0.18)", "uvicorn[standard] (>=0.12.0)"] +all = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.8)", "httpx (>=0.23.0,<1.0.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=3.1.5)", "orjson (>=3.2.1)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.18)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"] +standard = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.8)", "httpx (>=0.23.0,<1.0.0)", "jinja2 (>=3.1.5)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.18)", "uvicorn[standard] (>=0.12.0)"] +standard-no-fastapi-cloud-cli = ["email-validator (>=2.0.0)", "fastapi-cli[standard-no-fastapi-cloud-cli] (>=0.0.8)", "httpx (>=0.23.0,<1.0.0)", "jinja2 (>=3.1.5)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.18)", "uvicorn[standard] (>=0.12.0)"] [[package]] name = "filelock" -version = "3.20.1" +version = "3.29.0" description = "A platform independent file lock." 
optional = false python-versions = ">=3.10" groups = ["dev"] files = [ - {file = "filelock-3.20.1-py3-none-any.whl", hash = "sha256:15d9e9a67306188a44baa72f569d2bfd803076269365fdea0934385da4dc361a"}, - {file = "filelock-3.20.1.tar.gz", hash = "sha256:b8360948b351b80f420878d8516519a2204b07aefcdcfd24912a5d33127f188c"}, + {file = "filelock-3.29.0-py3-none-any.whl", hash = "sha256:96f5f6344709aa1572bbf631c640e4ebeeb519e08da902c39a001882f30ac258"}, + {file = "filelock-3.29.0.tar.gz", hash = "sha256:69974355e960702e789734cb4871f884ea6fe50bd8404051a3530bc07809cf90"}, ] [[package]] @@ -922,6 +956,30 @@ files = [ [package.extras] all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] +[[package]] +name = "importlib-metadata" +version = "8.7.1" +description = "Read metadata from Python packages" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151"}, + {file = "importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb"}, +] + +[package.dependencies] +zipp = ">=3.20" + +[package.extras] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +enabler = ["pytest-enabler (>=3.4)"] +perf = ["ipython"] +test = ["flufl.flake8", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] +type = ["mypy (<1.19) ; platform_python_implementation == \"PyPy\"", "pytest-mypy (>=1.0.1)"] + [[package]] name = "iniconfig" version = "2.1.0" @@ -1045,12 +1103,30 @@ version = "1.5.3" description = "Lightweight pipelining with Python functions" optional = false python-versions = ">=3.9" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713"}, {file = "joblib-1.5.3.tar.gz", hash = "sha256:8561a3269e6801106863fd0d6d84bb737be9e7631e33aaed3fb9ce5953688da3"}, ] +[[package]] +name = "joserfc" +version = "1.6.4" +description = "The ultimate Python library for JOSE RFCs, including JWS, JWE, JWK, JWA, JWT" +optional = false +python-versions = ">=3.9" +groups = ["main", "dev"] +files = [ + {file = "joserfc-1.6.4-py3-none-any.whl", hash = "sha256:3e4a22b509b41908989237a045e25c8308d5fd47ab96bdae2dd8057c6451003a"}, + {file = "joserfc-1.6.4.tar.gz", hash = "sha256:34ce5f499bfcc5e9ad4cc75077f9278ab3227b71da9aaf28f9ab705f8a560d3c"}, +] + +[package.dependencies] +cryptography = ">=45.0.1" + +[package.extras] +drafts = ["pycryptodome"] + [[package]] name = "jsonschema" version = "4.25.1" @@ -1065,7 +1141,7 @@ files = [ [package.dependencies] attrs = ">=22.2.0" -jsonschema-specifications = ">=2023.03.6" +jsonschema-specifications = ">=2023.3.6" referencing = ">=0.28.4" rpds-py = ">=0.7.1" @@ -1350,14 +1426,14 @@ tests = ["pytest", "simplejson"] [[package]] name = "mcp" -version = "1.20.0" +version = "1.27.0" description = "Model Context Protocol SDK" optional = false python-versions = ">=3.10" groups = ["main"] files = [ - {file = "mcp-1.20.0-py3-none-any.whl", hash = "sha256:d0dc06f93653f7432ff89f694721c87f79876b6f93741bf628ad1e48f7ac5e5d"}, - {file = "mcp-1.20.0.tar.gz", hash = "sha256:9ccc09eaadbfbcbbdab1c9723cfe2e0d1d9e324d7d3ce7e332ef90b09ed35177"}, + {file 
= "mcp-1.27.0-py3-none-any.whl", hash = "sha256:5ce1fa81614958e267b21fb2aa34e0aea8e2c6ede60d52aba45fd47246b4d741"}, + {file = "mcp-1.27.0.tar.gz", hash = "sha256:d3dc35a7eec0d458c1da4976a48f982097ddaab87e278c5511d5a4a56e852b83"}, ] [package.dependencies] @@ -1372,6 +1448,8 @@ python-multipart = ">=0.0.9" pywin32 = {version = ">=310", markers = "sys_platform == \"win32\""} sse-starlette = ">=1.6.1" starlette = ">=0.27" +typing-extensions = ">=4.9.0" +typing-inspection = ">=0.4.1" uvicorn = {version = ">=0.31.1", markers = "sys_platform != \"emscripten\""} [package.extras] @@ -1467,14 +1545,14 @@ files = [ [[package]] name = "nltk" -version = "3.9.2" +version = "3.9.4" description = "Natural Language Toolkit" optional = false -python-versions = ">=3.9" -groups = ["dev"] +python-versions = ">=3.10" +groups = ["main", "dev"] files = [ - {file = "nltk-3.9.2-py3-none-any.whl", hash = "sha256:1e209d2b3009110635ed9709a67a1a3e33a10f799490fa71cf4bec218c11c88a"}, - {file = "nltk-3.9.2.tar.gz", hash = "sha256:0f409e9b069ca4177c1903c3e843eef90c7e92992fa4931ae607da6de49e1419"}, + {file = "nltk-3.9.4-py3-none-any.whl", hash = "sha256:f2fa301c3a12718ce4a0e9305c5675299da5ad9e26068218b69d692fda84828f"}, + {file = "nltk-3.9.4.tar.gz", hash = "sha256:ed03bc098a40481310320808b2db712d95d13ca65b27372f8a403949c8b523d0"}, ] [package.dependencies] @@ -1519,6 +1597,22 @@ datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] realtime = ["websockets (>=13,<16)"] voice-helpers = ["numpy (>=2.0.2)", "sounddevice (>=0.5.1)"] +[[package]] +name = "opentelemetry-api" +version = "1.41.1" +description = "OpenTelemetry Python API" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "opentelemetry_api-1.41.1-py3-none-any.whl", hash = "sha256:a22df900e75c76dc08440710e51f52f1aa6b451b429298896023e60db5b3139f"}, + {file = "opentelemetry_api-1.41.1.tar.gz", hash = "sha256:0ad1814d73b875f84494387dae86ce0b12c68556331ce6ce8fe789197c949621"}, +] + +[package.dependencies] +importlib-metadata = ">=6.0,<8.8.0" +typing-extensions = ">=4.5.0" + [[package]] name = "packaging" version = "25.0" @@ -1533,16 +1627,21 @@ files = [ [[package]] name = "pathspec" -version = "0.12.1" +version = "1.1.0" description = "Utility library for gitignore style pattern matching of file paths." optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["dev"] files = [ - {file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"}, - {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"}, + {file = "pathspec-1.1.0-py3-none-any.whl", hash = "sha256:574b128f7456bd899045ccd142dd446af7e6cfd0072d63ad73fbc55fbb4aaa42"}, + {file = "pathspec-1.1.0.tar.gz", hash = "sha256:f5d7c555da02fd8dde3e4a2354b6aba817a89112fa8f333f7917a2a4834dd080"}, ] +[package.extras] +hyperscan = ["hyperscan (>=0.7)"] +optional = ["typing-extensions (>=4)"] +re2 = ["google-re2 (>=1.1)"] + [[package]] name = "platformdirs" version = "4.3.8" @@ -1749,14 +1848,14 @@ yaml = ["pyyaml (>=6.0.1)"] [[package]] name = "pygments" -version = "2.19.2" +version = "2.20.0" description = "Pygments is a syntax highlighting package written in Python." 
optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["dev"] files = [ - {file = "pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b"}, - {file = "pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887"}, + {file = "pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176"}, + {file = "pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f"}, ] [package.extras] @@ -1764,42 +1863,43 @@ windows-terminal = ["colorama (>=0.4.6)"] [[package]] name = "pyjwt" -version = "2.10.1" +version = "2.12.1" description = "JSON Web Token implementation in Python" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "PyJWT-2.10.1-py3-none-any.whl", hash = "sha256:dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb"}, - {file = "pyjwt-2.10.1.tar.gz", hash = "sha256:3cc5772eb20009233caf06e9d8a0577824723b44e6648ee0a2aedb6cf9381953"}, + {file = "pyjwt-2.12.1-py3-none-any.whl", hash = "sha256:28ca37c070cad8ba8cd9790cd940535d40274d22f80ab87f3ac6a713e6e8454c"}, + {file = "pyjwt-2.12.1.tar.gz", hash = "sha256:c74a7a2adf861c04d002db713dd85f84beb242228e671280bf709d765b03672b"}, ] [package.dependencies] cryptography = {version = ">=3.4.0", optional = true, markers = "extra == \"crypto\""} +typing_extensions = {version = ">=4.0", markers = "python_version < \"3.11\""} [package.extras] crypto = ["cryptography (>=3.4.0)"] -dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pytest (>=6.0.0,<7.0.0)", "sphinx", "sphinx-rtd-theme", "zope.interface"] +dev = ["coverage[toml] (==7.10.7)", "cryptography (>=3.4.0)", "pre-commit", "pytest (>=8.4.2,<9.0.0)", "sphinx", "sphinx-rtd-theme", "zope.interface"] docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"] -tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] +tests = ["coverage[toml] (==7.10.7)", "pytest (>=8.4.2,<9.0.0)"] [[package]] name = "pytest" -version = "8.4.1" +version = "9.0.3" description = "pytest: simple powerful testing with Python" optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" groups = ["dev"] files = [ - {file = "pytest-8.4.1-py3-none-any.whl", hash = "sha256:539c70ba6fcead8e78eebbf1115e8b589e7565830d7d006a8723f19ac8a0afb7"}, - {file = "pytest-8.4.1.tar.gz", hash = "sha256:7c67fd69174877359ed9371ec3af8a3d2b04741818c51e5e99cc1742251fa93c"}, + {file = "pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9"}, + {file = "pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c"}, ] [package.dependencies] colorama = {version = ">=0.4", markers = "sys_platform == \"win32\""} exceptiongroup = {version = ">=1", markers = "python_version < \"3.11\""} -iniconfig = ">=1" -packaging = ">=20" +iniconfig = ">=1.0.1" +packaging = ">=22" pluggy = ">=1.5,<2" pygments = ">=2.7.2" tomli = {version = ">=1", markers = "python_version < \"3.11\""} @@ -1809,21 +1909,23 @@ dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "requests [[package]] name = "pytest-asyncio" -version = "0.23.8" +version = "1.3.0" description = "Pytest support for asyncio" optional = false -python-versions = ">=3.8" +python-versions = ">=3.10" groups = ["dev"] files = [ - {file = "pytest_asyncio-0.23.8-py3-none-any.whl", hash = 
"sha256:50265d892689a5faefb84df80819d1ecef566eb3549cf915dfb33569359d1ce2"}, - {file = "pytest_asyncio-0.23.8.tar.gz", hash = "sha256:759b10b33a6dc61cce40a8bd5205e302978bbbcc00e279a8b61d9a6a3c82e4d3"}, + {file = "pytest_asyncio-1.3.0-py3-none-any.whl", hash = "sha256:611e26147c7f77640e6d0a92a38ed17c3e9848063698d5c93d5aa7aa11cebff5"}, + {file = "pytest_asyncio-1.3.0.tar.gz", hash = "sha256:d7f52f36d231b80ee124cd216ffb19369aa168fc10095013c6b014a34d3ee9e5"}, ] [package.dependencies] -pytest = ">=7.0.0,<9" +backports-asyncio-runner = {version = ">=1.1,<2", markers = "python_version < \"3.11\""} +pytest = ">=8.2,<10" +typing-extensions = {version = ">=4.12", markers = "python_version < \"3.13\""} [package.extras] -docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1.0)"] +docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1)"] testing = ["coverage (>=6.2)", "hypothesis (>=5.7.1)"] [[package]] @@ -1848,14 +1950,14 @@ testing = ["process-tests", "pytest-xdist", "virtualenv"] [[package]] name = "python-dotenv" -version = "1.1.1" +version = "1.2.2" description = "Read key-value pairs from a .env file and set them as environment variables" optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" groups = ["main"] files = [ - {file = "python_dotenv-1.1.1-py3-none-any.whl", hash = "sha256:31f23644fe2602f88ff55e1f5c79ba497e01224ee7737937930c448e4d0e24dc"}, - {file = "python_dotenv-1.1.1.tar.gz", hash = "sha256:a8a6399716257f45be6a007360200409fce5cda2661e3dec71d23dc15f6189ab"}, + {file = "python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a"}, + {file = "python_dotenv-1.2.2.tar.gz", hash = "sha256:2c371a91fbd7ba082c2c1dc1f8bf89ca22564a087c2c287cd9b662adde799cf3"}, ] [package.extras] @@ -1863,16 +1965,71 @@ cli = ["click (>=5.0)"] [[package]] name = "python-multipart" -version = "0.0.18" +version = "0.0.27" description = "A streaming multipart parser for Python" optional = false -python-versions = ">=3.8" +python-versions = ">=3.10" groups = ["main"] files = [ - {file = "python_multipart-0.0.18-py3-none-any.whl", hash = "sha256:efe91480f485f6a361427a541db4796f9e1591afc0fb8e7a4ba06bfbc6708996"}, - {file = "python_multipart-0.0.18.tar.gz", hash = "sha256:7a68db60c8bfb82e460637fa4750727b45af1d5e2ed215593f917f64694d34fe"}, + {file = "python_multipart-0.0.27-py3-none-any.whl", hash = "sha256:6fccfad17a27334bd0193681b369f476eda3409f17381a2d65aa7df3f7275645"}, + {file = "python_multipart-0.0.27.tar.gz", hash = "sha256:9870a6a8c5a20a5bf4f07c017bd1489006ff8836cff097b6933355ee2b49b602"}, +] + +[[package]] +name = "pytokens" +version = "0.4.1" +description = "A Fast, spec compliant Python 3.14+ tokenizer that runs on older Pythons." 
+optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "pytokens-0.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2a44ed93ea23415c54f3face3b65ef2b844d96aeb3455b8a69b3df6beab6acc5"}, + {file = "pytokens-0.4.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:add8bf86b71a5d9fb5b89f023a80b791e04fba57960aa790cc6125f7f1d39dfe"}, + {file = "pytokens-0.4.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:670d286910b531c7b7e3c0b453fd8156f250adb140146d234a82219459b9640c"}, + {file = "pytokens-0.4.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:4e691d7f5186bd2842c14813f79f8884bb03f5995f0575272009982c5ac6c0f7"}, + {file = "pytokens-0.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:27b83ad28825978742beef057bfe406ad6ed524b2d28c252c5de7b4a6dd48fa2"}, + {file = "pytokens-0.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d70e77c55ae8380c91c0c18dea05951482e263982911fc7410b1ffd1dadd3440"}, + {file = "pytokens-0.4.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4a58d057208cb9075c144950d789511220b07636dd2e4708d5645d24de666bdc"}, + {file = "pytokens-0.4.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b49750419d300e2b5a3813cf229d4e5a4c728dae470bcc89867a9ad6f25a722d"}, + {file = "pytokens-0.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d9907d61f15bf7261d7e775bd5d7ee4d2930e04424bab1972591918497623a16"}, + {file = "pytokens-0.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:ee44d0f85b803321710f9239f335aafe16553b39106384cef8e6de40cb4ef2f6"}, + {file = "pytokens-0.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:140709331e846b728475786df8aeb27d24f48cbcf7bcd449f8de75cae7a45083"}, + {file = "pytokens-0.4.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6d6c4268598f762bc8e91f5dbf2ab2f61f7b95bdc07953b602db879b3c8c18e1"}, + {file = "pytokens-0.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:24afde1f53d95348b5a0eb19488661147285ca4dd7ed752bbc3e1c6242a304d1"}, + {file = "pytokens-0.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5ad948d085ed6c16413eb5fec6b3e02fa00dc29a2534f088d3302c47eb59adf9"}, + {file = "pytokens-0.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:3f901fe783e06e48e8cbdc82d631fca8f118333798193e026a50ce1b3757ea68"}, + {file = "pytokens-0.4.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8bdb9d0ce90cbf99c525e75a2fa415144fd570a1ba987380190e8b786bc6ef9b"}, + {file = "pytokens-0.4.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5502408cab1cb18e128570f8d598981c68a50d0cbd7c61312a90507cd3a1276f"}, + {file = "pytokens-0.4.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:29d1d8fb1030af4d231789959f21821ab6325e463f0503a61d204343c9b355d1"}, + {file = "pytokens-0.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:970b08dd6b86058b6dc07efe9e98414f5102974716232d10f32ff39701e841c4"}, + {file = "pytokens-0.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:9bd7d7f544d362576be74f9d5901a22f317efc20046efe2034dced238cbbfe78"}, + {file = "pytokens-0.4.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4a14d5f5fc78ce85e426aa159489e2d5961acf0e47575e08f35584009178e321"}, + {file = 
"pytokens-0.4.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97f50fd18543be72da51dd505e2ed20d2228c74e0464e4262e4899797803d7fa"}, + {file = "pytokens-0.4.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dc74c035f9bfca0255c1af77ddd2d6ae8419012805453e4b0e7513e17904545d"}, + {file = "pytokens-0.4.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:f66a6bbe741bd431f6d741e617e0f39ec7257ca1f89089593479347cc4d13324"}, + {file = "pytokens-0.4.1-cp314-cp314-win_amd64.whl", hash = "sha256:b35d7e5ad269804f6697727702da3c517bb8a5228afa450ab0fa787732055fc9"}, + {file = "pytokens-0.4.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:8fcb9ba3709ff77e77f1c7022ff11d13553f3c30299a9fe246a166903e9091eb"}, + {file = "pytokens-0.4.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79fc6b8699564e1f9b521582c35435f1bd32dd06822322ec44afdeba666d8cb3"}, + {file = "pytokens-0.4.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d31b97b3de0f61571a124a00ffe9a81fb9939146c122c11060725bd5aea79975"}, + {file = "pytokens-0.4.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:967cf6e3fd4adf7de8fc73cd3043754ae79c36475c1c11d514fc72cf5490094a"}, + {file = "pytokens-0.4.1-cp314-cp314t-win_amd64.whl", hash = "sha256:584c80c24b078eec1e227079d56dc22ff755e0ba8654d8383b2c549107528918"}, + {file = "pytokens-0.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:da5baeaf7116dced9c6bb76dc31ba04a2dc3695f3d9f74741d7910122b456edc"}, + {file = "pytokens-0.4.1-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:11edda0942da80ff58c4408407616a310adecae1ddd22eef8c692fe266fa5009"}, + {file = "pytokens-0.4.1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0fc71786e629cef478cbf29d7ea1923299181d0699dbe7c3c0f4a583811d9fc1"}, + {file = "pytokens-0.4.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:dcafc12c30dbaf1e2af0490978352e0c4041a7cde31f4f81435c2a5e8b9cabb6"}, + {file = "pytokens-0.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:42f144f3aafa5d92bad964d471a581651e28b24434d184871bd02e3a0d956037"}, + {file = "pytokens-0.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:34bcc734bd2f2d5fe3b34e7b3c0116bfb2397f2d9666139988e7a3eb5f7400e3"}, + {file = "pytokens-0.4.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:941d4343bf27b605e9213b26bfa1c4bf197c9c599a9627eb7305b0defcfe40c1"}, + {file = "pytokens-0.4.1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3ad72b851e781478366288743198101e5eb34a414f1d5627cdd585ca3b25f1db"}, + {file = "pytokens-0.4.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:682fa37ff4d8e95f7df6fe6fe6a431e8ed8e788023c6bcc0f0880a12eab80ad1"}, + {file = "pytokens-0.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:30f51edd9bb7f85c748979384165601d028b84f7bd13fe14d3e065304093916a"}, + {file = "pytokens-0.4.1-py3-none-any.whl", hash = "sha256:26cef14744a8385f35d0e095dc8b3a7583f6c953c2e3d269c7f82484bf5ad2de"}, + {file = "pytokens-0.4.1.tar.gz", hash = "sha256:292052fe80923aae2260c073f822ceba21f3872ced9a68bb7953b348e561179a"}, ] +[package.extras] +dev = ["black", "build", "mypy", "pytest", "pytest-cov", "setuptools", "tox", "twine", "wheel"] + [[package]] name = "pywin32" version = "311" @@ -1990,7 +2147,7 @@ version = "2025.11.3" 
description = "Alternative regular expression module, to replace re." optional = false python-versions = ">=3.9" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "regex-2025.11.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:2b441a4ae2c8049106e8b39973bfbddfb25a179dda2bdb99b0eeb60c40a6a3af"}, {file = "regex-2025.11.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2fa2eed3f76677777345d2f81ee89f5de2f5745910e805f7af7386a920fa7313"}, @@ -2111,25 +2268,25 @@ files = [ [[package]] name = "requests" -version = "2.32.4" +version = "2.33.1" description = "Python HTTP for Humans." optional = false -python-versions = ">=3.8" +python-versions = ">=3.10" groups = ["dev"] files = [ - {file = "requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c"}, - {file = "requests-2.32.4.tar.gz", hash = "sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422"}, + {file = "requests-2.33.1-py3-none-any.whl", hash = "sha256:4e6d1ef462f3626a1f0a0a9c42dd93c63bad33f9f1c1937509b8c5c8718ab56a"}, + {file = "requests-2.33.1.tar.gz", hash = "sha256:18817f8c57c6263968bc123d237e3b8b08ac046f5456bd1e307ee8f4250d3517"}, ] [package.dependencies] -certifi = ">=2017.4.17" +certifi = ">=2023.5.7" charset_normalizer = ">=2,<4" idna = ">=2.5,<4" -urllib3 = ">=1.21.1,<3" +urllib3 = ">=1.26,<3" [package.extras] socks = ["PySocks (>=1.5.6,!=1.5.7)"] -use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<8)"] [[package]] name = "rich" @@ -2499,18 +2656,19 @@ uvicorn = ["uvicorn (>=0.34.0)"] [[package]] name = "starlette" -version = "0.46.2" +version = "0.50.0" description = "The little ASGI library that shines." optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" groups = ["main"] files = [ - {file = "starlette-0.46.2-py3-none-any.whl", hash = "sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35"}, - {file = "starlette-0.46.2.tar.gz", hash = "sha256:7f7361f34eed179294600af672f565727419830b54b7b084efe44bb82d2fccd5"}, + {file = "starlette-0.50.0-py3-none-any.whl", hash = "sha256:9e5391843ec9b6e472eed1365a78c8098cfceb7a74bfd4d6b1c0c0095efb3bca"}, + {file = "starlette-0.50.0.tar.gz", hash = "sha256:a2a17b22203254bcbc2e1f926d2d55f3f9497f769416b3190768befe598fa3ca"}, ] [package.dependencies] anyio = ">=3.6.2,<5" +typing-extensions = {version = ">=4.10.0", markers = "python_version < \"3.13\""} [package.extras] full = ["httpx (>=0.27.0,<0.29.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.18)", "pyyaml"] @@ -2604,7 +2762,7 @@ version = "4.67.1" description = "Fast, Extensible Progress Meter" optional = false python-versions = ">=3.7" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2"}, {file = "tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2"}, @@ -2667,21 +2825,21 @@ typing-extensions = ">=4.12.0" [[package]] name = "urllib3" -version = "2.5.0" +version = "2.7.0" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false -python-versions = ">=3.9" -groups = ["dev"] +python-versions = ">=3.10" +groups = ["main", "dev"] files = [ - {file = "urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc"}, - {file = "urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760"}, + {file = "urllib3-2.7.0-py3-none-any.whl", hash = "sha256:9fb4c81ebbb1ce9531cce37674bbc6f1360472bc18ca9a553ede278ef7276897"}, + {file = "urllib3-2.7.0.tar.gz", hash = "sha256:231e0ec3b63ceb14667c67be60f2f2c40a518cb38b03af60abc813da26505f4c"}, ] [package.extras] -brotli = ["brotli (>=1.0.9) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\""] +brotli = ["brotli (>=1.2.0) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=1.2.0.0) ; platform_python_implementation != \"CPython\""] h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] -zstd = ["zstandard (>=0.18.0)"] +zstd = ["backports-zstd (>=1.0.0) ; python_version < \"3.14\""] [[package]] name = "uvicorn" @@ -3050,7 +3208,27 @@ files = [ {file = "wrapt-1.17.2.tar.gz", hash = "sha256:41388e9d4d1522446fe79d3213196bd9e3b301a336965b9e27ca2788ebd122f3"}, ] +[[package]] +name = "zipp" +version = "3.23.1" +description = "Backport of pathlib-compatible object wrapper for zip files" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "zipp-3.23.1-py3-none-any.whl", hash = "sha256:0b3596c50a5c700c9cb40ba8d86d9f2cc4807e9bedb06bcdf7fac85633e444dc"}, + {file = "zipp-3.23.1.tar.gz", hash = "sha256:32120e378d32cd9714ad503c1d024619063ec28aad2248dc6672ad13edfa5110"}, +] + +[package.extras] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +enabler = ["pytest-enabler (>=2.2)"] +test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more_itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] +type = ["pytest-mypy"] + [metadata] lock-version = "2.1" python-versions = "^3.10" -content-hash = "995cbb6b6bfbf14612eff7e0690ca47fc7b0c01fd2ef3351dea01d6940be0ed6" +content-hash = "e93f0b093845d634e17489539d1ac9fdcd9c916e5635fc62c169b53304804ae3" diff --git a/pyproject.toml b/pyproject.toml index e0cc381..05e6c9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "claude-code-openai-wrapper" -version = "2.2.0" +version = "2.9.7" description = "OpenAI API-compatible wrapper for Claude Code" authors = ["Richard Atkinson "] readme = "README.md" @@ -9,21 +9,48 @@ packages = [{include = "src"}] [tool.poetry.dependencies] python = "^3.10" -fastapi = "^0.115.0" +# fastapi bumped from ^0.115.0 so the transitive starlette floor can reach +# >=0.49.1 (CVE-2025-62727). 0.115.x caps starlette at <0.47; 0.118.x +# caps at <0.49; 0.119.x allows 0.49. Range stays under the next major. +fastapi = ">=0.119.0,<1.0" uvicorn = {extras = ["standard"], version = "^0.32.0"} pydantic = "^2.10.0" -python-dotenv = "^1.0.1" +python-dotenv = "^1.2.2" httpx = "^0.27.2" sse-starlette = "^2.1.3" -python-multipart = "^0.0.18" -claude-agent-sdk = "^0.1.18" +python-multipart = "^0.0.27" +# Pin the SDK exactly. Bump deliberately and regenerate poetry.lock in the +# same commit so Docker builds stay reproducible. 
2.9.0 bump from 0.1.18 to
+# 0.1.65 to pull in 47 patch releases worth of CLI fixes aimed at the
+# silent `error_during_execution` path observed on 2.8.2.
+claude-agent-sdk = {version = "0.1.81", extras = ["otel"]}
 slowapi = "^0.1.9"
+# Security floors for transitive dependencies. Each one pinned here is a
+# CVE that showed up in a trivy HIGH/CRITICAL scan against a prior build
+# image and has no fix until the transitive version is advanced past the
+# vulnerable one. Poetry resolves these against the rest of the graph and
+# fails the lock step if they're incompatible with their parents, which is
+# the early warning we want for each dep bump.
+# - starlette: fastapi 0.119.x allows 0.49.x (see the fastapi note above)
+# - urllib3: httpx 0.27.x pulls via requests/certifi chain
+#   (>=2.7.0 closes GHSA-qccp-gfcp-xxvc proxy redirect header leak and
+#   GHSA-mf9v-mfxr-j63j decompression-bomb safeguard bypass)
+# - cryptography, PyJWT, Authlib, mcp: via claude-agent-sdk 0.1.18
+# - nltk: via the bundled claude CLI
+starlette = ">=0.49.1"
+urllib3 = ">=2.7.0"
+cryptography = ">=46.0.5"
+pyjwt = ">=2.12.0"
+authlib = ">=1.6.9"
+mcp = ">=1.23.0"
+nltk = ">=3.9.3"
+
 [tool.poetry.group.dev.dependencies]
-black = "^24.0.0"
-pytest = "^8.0.0"
-pytest-asyncio = "^0.23.0"
-requests = "^2.32.0"
+black = "^26.3.1"
+pytest = "^9.0.3"
+pytest-asyncio = "^1.3.0"
+requests = "^2.33.0"
 openai = "^1.0.0"
 pytest-cov = "^7.0.0"
 mypy = "^1.14.0"
diff --git a/src/__init__.py b/src/__init__.py
index ca47b3b..32465bc 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -1,3 +1,3 @@
 """Claude Code OpenAI Wrapper - A FastAPI-based OpenAI-compatible API for Claude Code."""
 
-__version__ = "2.2.0"
+__version__ = "2.9.7"
diff --git a/src/auth.py b/src/auth.py
index 7b23e69..ed7f83d 100644
--- a/src/auth.py
+++ b/src/auth.py
@@ -1,5 +1,7 @@
 import os
 import logging
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
 from typing import Optional, Dict, Any, Tuple
 from fastapi import HTTPException, Request
 from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
@@ -284,3 +286,106 @@ def get_claude_code_auth_info() -> Dict[str, Any]:
         "status": auth_manager.auth_status,
         "environment_variables": list(auth_manager.get_claude_code_env_vars().keys()),
     }
+
+
+# Markers the Claude CLI emits to stderr (or wraps in SDK exceptions) when its
+# stored session is missing or expired. Compared case-insensitively against the
+# concatenation of an exception's str() and any captured stderr_tail.
+_CLI_AUTH_FAILURE_MARKERS = (
+    "not logged in",
+    "please run /login",
+    "invalid api key",
+    "authentication_error",
+    "401",
+)
+
+
+def _classify_probe_error(blob: str) -> str:
+    lowered = (blob or "").lower()
+    if any(marker in lowered for marker in _CLI_AUTH_FAILURE_MARKERS):
+        return "auth_failure"
+    return "unknown"
+
+
+@dataclass
+class CliHealth:
+    """Latest observed health of the Claude CLI auth path.
+
+    The probe loop (run only when auth_method == 'claude_cli') refreshes this
+    on an interval; the chat / messages handlers consult `ok` to short-circuit
+    with HTTP 401 before round-tripping through the SDK.
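+
+    Sketch of that consult (the handler wiring lives in src/main and is
+    assumed here, not shown)::
+
+        if not cli_health.ok:
+            raise HTTPException(status_code=401, detail=cli_health.as_dict())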
+ """ + + ok: bool = True + last_probed_at: Optional[datetime] = None + last_ok_at: Optional[datetime] = None + error_kind: Optional[str] = None + error_message: Optional[str] = None + + def mark_ok(self) -> None: + now = datetime.now(timezone.utc) + self.ok = True + self.last_probed_at = now + self.last_ok_at = now + self.error_kind = None + self.error_message = None + + def mark_failed(self, kind: str, message: str) -> None: + self.ok = False + self.last_probed_at = datetime.now(timezone.utc) + self.error_kind = kind + # Trim to keep logs and /v1/auth/status compact. + self.error_message = (message or "")[:500] + + def as_dict(self) -> Dict[str, Any]: + return { + "ok": self.ok, + "last_probed_at": self.last_probed_at.isoformat() if self.last_probed_at else None, + "last_ok_at": self.last_ok_at.isoformat() if self.last_ok_at else None, + "error_kind": self.error_kind, + "error_message": self.error_message, + } + + +cli_health = CliHealth() + + +async def probe_cli_auth(cli=None) -> bool: + """Run a 1-turn CLI probe and update `cli_health`. + + Reuses `claude_cli.verify_cli()` (which already issues a short + `query(prompt="Hello", max_turns=1)`); on any exception, classifies the + failure as auth_failure if the marker set matches, else unknown. + + `cli` is the ClaudeCodeCLI instance to probe. When omitted, lazy-resolves + the module-level singleton from `src.main` so a periodic probe exercises + exactly the same instance that real requests use. The parameter is + primarily there for tests, which inject a mock. + + Returns True when the probe succeeded, False otherwise. Never raises. + """ + if cli is None: + # Lazy import - src.main imports src.auth at module load. + from src import main as _main # noqa: WPS433 - intentional lazy import + + cli = _main.claude_cli + + try: + ok = await cli.verify_cli() + if ok: + cli_health.mark_ok() + logger.info("cli_auth_probe_ok") + return True + cli_health.mark_failed("unknown", "verify_cli returned False") + logger.warning("cli_auth_probe_failed kind=unknown reason=verify_cli_returned_false") + return False + except Exception as exc: # noqa: BLE001 - the probe must never propagate + message = str(exc) + kind = _classify_probe_error(message) + cli_health.mark_failed(kind, message) + logger.warning( + "cli_auth_probe_failed kind=%s error=%s", + kind, + message[:200].replace("\n", " "), + ) + return False diff --git a/src/circuit_breaker.py b/src/circuit_breaker.py new file mode 100644 index 0000000..5ef92b5 --- /dev/null +++ b/src/circuit_breaker.py @@ -0,0 +1,210 @@ +"""Simple in-process circuit breaker for upstream-SDK failures. + +When the Claude Agent SDK returns a high rate of errors over a short window, +continuing to forward requests just amplifies load on an already-bad upstream +and delays each caller by the full wall-clock of a doomed attempt. This +breaker cuts that loop: once the recent failure rate crosses a threshold, +new requests fail-fast with 503 for a short cool-off period, then half-open +by allowing a single probe request through. A success closes the breaker; +another failure re-opens it. + +The breaker is intentionally small and has no external dependencies. It is +suitable for a single wrapper process; multi-replica deployments should +either accept independent breaker state or place a shared breaker +(e.g. via Redis) in front. 
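+
+Intended call pattern (a sketch - the completions-handler integration is
+assumed, and ``call_sdk`` stands in for the real upstream call)::
+
+    if not sdk_circuit_breaker.allow_request():
+        raise HTTPException(status_code=503, detail="upstream circuit open")
+    try:
+        result = await call_sdk()
+        sdk_circuit_breaker.record(success=True)
+    except Exception:
+        sdk_circuit_breaker.record(success=False)
+        raise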
+""" + +from __future__ import annotations + +import os +import threading +import time +from collections import deque +from dataclasses import dataclass +from typing import Deque, Tuple + + +def _env_bool(name: str, default: bool) -> bool: + raw = os.getenv(name) + if raw is None: + return default + return raw.strip().lower() in ("true", "1", "yes", "on") + + +def _env_float(name: str, default: float) -> float: + raw = os.getenv(name) + if raw is None or raw.strip() == "": + return default + try: + return float(raw) + except ValueError: + return default + + +def _env_int(name: str, default: int) -> int: + raw = os.getenv(name) + if raw is None or raw.strip() == "": + return default + try: + return int(raw) + except ValueError: + return default + + +@dataclass(frozen=True) +class CircuitBreakerConfig: + """Breaker thresholds. + + Defaults tuned from the MinusPod incident on 2026-04-23: the original + ``min_requests_for_trip=10, failure_ratio_threshold=0.5`` tripped mid-way + through a single episode's 6-8 window detection sequence, turning a + recoverable upstream hiccup into a full-episode outage. The new defaults + (20 / 0.75) require sustained failure across multiple caller-initiated + retries before the breaker shuts the door. + + All fields respect env overrides: + WRAPPER_CIRCUIT_BREAKER_WINDOW_SECONDS + WRAPPER_CIRCUIT_BREAKER_THRESHOLD + WRAPPER_CIRCUIT_BREAKER_MIN_REQUESTS + WRAPPER_CIRCUIT_BREAKER_OPEN_SECONDS + """ + + window_seconds: float = 60.0 + failure_ratio_threshold: float = 0.75 + min_requests_for_trip: int = 20 + open_seconds: float = 30.0 + + @classmethod + def from_env(cls) -> "CircuitBreakerConfig": + return cls( + window_seconds=_env_float("WRAPPER_CIRCUIT_BREAKER_WINDOW_SECONDS", 60.0), + failure_ratio_threshold=_env_float("WRAPPER_CIRCUIT_BREAKER_THRESHOLD", 0.75), + min_requests_for_trip=_env_int("WRAPPER_CIRCUIT_BREAKER_MIN_REQUESTS", 20), + open_seconds=_env_float("WRAPPER_CIRCUIT_BREAKER_OPEN_SECONDS", 30.0), + ) + + +def circuit_breaker_enabled() -> bool: + """When ``WRAPPER_CIRCUIT_BREAKER_ENABLED=false`` the breaker module-level + singleton always answers ``allow_request() == True`` and ``record()`` is a + no-op. Used as a kill switch while the upstream SDK is misbehaving and the + breaker would just amplify the outage for every concurrent caller.""" + return _env_bool("WRAPPER_CIRCUIT_BREAKER_ENABLED", True) + + +class CircuitBreakerState: + CLOSED = "closed" + OPEN = "open" + HALF_OPEN = "half_open" + + +class CircuitBreaker: + """Rolling-window failure-rate breaker. + + Thread-safe. Every request records an outcome with ``record()``, and + ``allow_request()`` returns False when the breaker is open and no cool-off + probe window has elapsed yet. On half-open, a single probe is allowed + through; its outcome either closes or re-opens the breaker. 
+ """ + + def __init__( + self, + config: CircuitBreakerConfig | None = None, + enabled: bool | None = None, + ) -> None: + self._cfg = config or CircuitBreakerConfig() + self._enabled = enabled if enabled is not None else True + self._history: Deque[Tuple[float, bool]] = deque() + self._lock = threading.Lock() + self._state = CircuitBreakerState.CLOSED + self._opened_at: float | None = None + self._probe_in_flight = False + + def _prune(self, now: float) -> None: + cutoff = now - self._cfg.window_seconds + while self._history and self._history[0][0] < cutoff: + self._history.popleft() + + def _failure_ratio_locked(self, now: float) -> Tuple[int, float]: + self._prune(now) + n = len(self._history) + if n == 0: + return 0, 0.0 + failures = sum(1 for _, ok in self._history if not ok) + return n, failures / n + + def allow_request(self) -> bool: + if not self._enabled: + return True + now = time.monotonic() + with self._lock: + if self._state == CircuitBreakerState.OPEN: + if self._opened_at is not None and now - self._opened_at >= self._cfg.open_seconds: + # Enter half-open and let exactly one probe through. + self._state = CircuitBreakerState.HALF_OPEN + self._probe_in_flight = True + return True + return False + if self._state == CircuitBreakerState.HALF_OPEN: + if self._probe_in_flight: + # Another probe is already out; shed new load until it + # resolves. + return False + self._probe_in_flight = True + return True + return True + + def record(self, success: bool) -> None: + if not self._enabled: + return + now = time.monotonic() + with self._lock: + self._history.append((now, success)) + if self._state == CircuitBreakerState.HALF_OPEN: + self._probe_in_flight = False + if success: + self._state = CircuitBreakerState.CLOSED + self._opened_at = None + else: + self._state = CircuitBreakerState.OPEN + self._opened_at = now + return + + if self._state == CircuitBreakerState.CLOSED: + n, ratio = self._failure_ratio_locked(now) + if ( + n >= self._cfg.min_requests_for_trip + and ratio >= self._cfg.failure_ratio_threshold + ): + self._state = CircuitBreakerState.OPEN + self._opened_at = now + + @property + def state(self) -> str: + with self._lock: + return self._state + + def snapshot(self) -> dict: + now = time.monotonic() + with self._lock: + n, ratio = self._failure_ratio_locked(now) + return { + "state": self._state, + "enabled": self._enabled, + "window_size": n, + "failure_ratio": round(ratio, 3), + "threshold": self._cfg.failure_ratio_threshold, + "min_requests_for_trip": self._cfg.min_requests_for_trip, + "window_seconds": self._cfg.window_seconds, + "opened_at_monotonic": self._opened_at, + } + + +# Module-level singleton used by the completions handler. Config + enabled +# flag both come from env at import time so the same image can be tuned +# without a code change (WRAPPER_CIRCUIT_BREAKER_ENABLED, _THRESHOLD, +# _MIN_REQUESTS, _OPEN_SECONDS, _WINDOW_SECONDS). +sdk_circuit_breaker = CircuitBreaker( + config=CircuitBreakerConfig.from_env(), + enabled=circuit_breaker_enabled(), +) diff --git a/src/claude_cli.py b/src/claude_cli.py index d87057e..8a88288 100644 --- a/src/claude_cli.py +++ b/src/claude_cli.py @@ -8,9 +8,84 @@ from claude_agent_sdk import query, ClaudeAgentOptions +from src.retry import RetryState, retry_delay + logger = logging.getLogger(__name__) +# ResultMessage subtypes that mean the SDK failed to produce a valid response. 
+# The SDK inserts a synthetic UserMessage(text='[Request interrupted by user]') +# before emitting a ResultMessage with one of these subtypes; without explicit +# handling, the sentinel leaks into the OpenAI response body. +_ERROR_RESULT_SUBTYPES = frozenset( + { + "error_max_turns", + "error_during_execution", + "error", + } +) + +# AssistantMessage.error literal values that the SDK attaches when the +# upstream API fails mid-response. Source: claude_agent_sdk.types +# AssistantMessageError = Literal["authentication_failed", "billing_error", +# "rate_limit", "invalid_request", "server_error", "unknown"]. +_ASSISTANT_ERROR_VALUES = frozenset( + { + "authentication_failed", + "billing_error", + "rate_limit", + "invalid_request", + "server_error", + "unknown", + } +) + + +def _extract_text_blocks(content: List[Any]) -> List[str]: + """Flatten a list of SDK content blocks into plain text strings. + + Accepts TextBlock objects (with a ``.text`` attribute), dict blocks of the + form ``{"type": "text", "text": ...}``, and bare strings. Ignores other + block types (e.g. ``ToolUseBlock``). + """ + text_parts: List[str] = [] + for block in content: + if hasattr(block, "text"): + text_parts.append(block.text) + elif isinstance(block, dict) and block.get("type") == "text": + text_parts.append(block.get("text", "")) + elif isinstance(block, str): + text_parts.append(block) + return text_parts + + +class ClaudeResultError(Exception): + """Raised when the Claude Agent SDK emits a non-success ResultMessage. + + Callers in the HTTP layer translate this into a proper OpenAI-compatible + response: error_max_turns -> 200 with finish_reason='length' and empty + content; other subtypes -> 5xx with a structured error body. + """ + + def __init__( + self, + subtype: Optional[str], + num_turns: Optional[int] = None, + errors: Optional[List[str]] = None, + stop_reason: Optional[str] = None, + error_message: Optional[str] = None, + stderr_tail: Optional[str] = None, + ): + self.subtype = subtype + self.num_turns = num_turns + self.errors = errors or [] + self.stop_reason = stop_reason + self.error_message = error_message + self.stderr_tail = stderr_tail + detail = error_message or (self.errors[0] if self.errors else subtype) + super().__init__(f"Claude SDK returned {subtype} after {num_turns} turns: {detail}") + + class ClaudeCodeCLI: def __init__(self, timeout: int = 600000, cwd: Optional[str] = None): self.timeout = timeout / 1000 # Convert ms to seconds @@ -104,6 +179,8 @@ async def run_completion( session_id: Optional[str] = None, continue_session: bool = False, permission_mode: Optional[str] = None, + effort: Optional[str] = None, + thinking: Optional[str] = None, ) -> AsyncGenerator[Dict[str, Any], None]: """Run Claude Agent using the Python SDK and yield response chunks.""" @@ -116,8 +193,23 @@ async def run_completion( os.environ[key] = value try: + # Capture the CLI subprocess's stderr into a bounded ring so we + # can attach it to non-success ResultMessage log lines. The + # bundled Claude CLI prints its real failure reason + # (auth rejection, permission denial, network error) to + # stderr, but previously we only saw the typed SDK error + # subtype (``error_during_execution``) and zero context. 
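+            # e.g. with _STDERR_MAX_LINES = 3, appending lines "1".."5" one
+            # at a time leaves ["3", "4", "5"]: the del below trims from the
+            # front so only the most recent stderr lines survive.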
+ stderr_buffer: List[str] = [] + _STDERR_MAX_LINES = 40 + + def _stderr_capture(line: str) -> None: + stderr_buffer.append(line) + if len(stderr_buffer) > _STDERR_MAX_LINES: + del stderr_buffer[: len(stderr_buffer) - _STDERR_MAX_LINES] + # Build SDK options options = ClaudeAgentOptions(max_turns=max_turns, cwd=self.cwd) + options.stderr = _stderr_capture # Set model if specified if model: @@ -141,37 +233,91 @@ async def run_completion( if permission_mode: options.permission_mode = permission_mode + # Set effort level and thinking mode if specified + if effort: + options.effort = effort + if thinking: + options.thinking = thinking + # Handle session continuity if continue_session: options.continue_session = True elif session_id: options.resume = session_id - # Run the query and yield messages - async for message in query(prompt=prompt, options=options): - # Debug logging - logger.debug(f"Raw SDK message type: {type(message)}") - logger.debug(f"Raw SDK message: {message}") - - # Convert message object to dict if needed - if hasattr(message, "__dict__") and not isinstance(message, dict): - # Convert object to dict for consistent handling - message_dict = {} - - # Get all attributes from the object - for attr_name in dir(message): - if not attr_name.startswith("_"): # Skip private attributes - try: - attr_value = getattr(message, attr_name) - if not callable(attr_value): # Skip methods - message_dict[attr_name] = attr_value - except: - pass - - logger.debug(f"Converted message dict: {message_dict}") - yield message_dict - else: - yield message + # Run the query with retry logic + retry_state = RetryState() + current_model = model + + while True: + try: + if current_model and current_model != model: + options.model = current_model + + async for message in query(prompt=prompt, options=options): + logger.debug(f"Raw SDK message type: {type(message)}") + logger.debug(f"Raw SDK message: {message}") + + if hasattr(message, "__dict__") and not isinstance(message, dict): + message_dict = {} + for attr_name in dir(message): + if not attr_name.startswith("_"): + try: + attr_value = getattr(message, attr_name) + if not callable(attr_value): + message_dict[attr_name] = attr_value + except: + pass + logger.debug(f"Converted message dict: {message_dict}") + + # If the SDK is reporting a non-success result, + # surface whatever the CLI subprocess wrote to + # stderr so triage doesn't have to guess why it + # died. Attach to the dict too so callers + # (parse_claude_message, HTTP layer) can relay it. 
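+                            # A failing ResultMessage dict looks roughly
+                            # like (field names per the SDK shapes handled
+                            # above; values invented for illustration):
+                            #   {"type": "result",
+                            #    "subtype": "error_during_execution",
+                            #    "is_error": True, "num_turns": 2,
+                            #    "session_id": "abc123"}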
+ subtype = message_dict.get("subtype") + is_error = message_dict.get("is_error") is True + if subtype in _ERROR_RESULT_SUBTYPES or is_error: + stderr_tail = "\n".join(stderr_buffer).strip() + if stderr_tail: + logger.warning( + f"SDK {subtype} stderr tail " + f"(session={message_dict.get('session_id')}, " + f"num_turns={message_dict.get('num_turns')}):\n" + f"{stderr_tail}" + ) + message_dict["stderr_tail"] = stderr_tail + else: + logger.warning( + f"SDK {subtype} with empty stderr " + f"(session={message_dict.get('session_id')}, " + f"num_turns={message_dict.get('num_turns')})" + ) + + yield message_dict + else: + yield message + + break # Success, exit retry loop + + except Exception as query_error: + error_str = str(query_error) + status_code = getattr(query_error, "status_code", None) + + retry_state.record_attempt(status_code) + + # Check for model fallback on overload + if current_model: + fallback = retry_state.get_fallback_model(current_model) + if fallback: + current_model = fallback + options.model = current_model + + if retry_state.should_retry(status_code=status_code, error=query_error): + await retry_delay(retry_state) + continue + + raise # Not retryable, propagate finally: # Restore original environment (if we changed anything) @@ -184,12 +330,16 @@ async def run_completion( except Exception as e: logger.error(f"Claude Agent SDK error: {e}") - # Yield error message in the expected format + # Emit a dict that matches the shape parse_claude_message expects + # for a ResultMessage, so the HTTP layer surfaces the failure via + # ClaudeResultError rather than silently returning empty content. yield { "type": "result", "subtype": "error_during_execution", "is_error": True, "error_message": str(e), + "num_turns": 0, + "duration_ms": 0, } def parse_claude_message(self, messages: List[Dict[str, Any]]) -> Optional[str]: @@ -197,41 +347,105 @@ def parse_claude_message(self, messages: List[Dict[str, Any]]) -> Optional[str]: Prioritizes ResultMessage.result for multi-turn conversations, falls back to last AssistantMessage content. + + Raises: + ClaudeResultError: if any ResultMessage indicates an error (e.g. + error_max_turns, error_during_execution) or has is_error=True. + The SDK inserts a synthetic UserMessage with text + '[Request interrupted by user]' immediately before such a + ResultMessage; without this check the sentinel leaks as + response content. Callers translate this into a proper + HTTP response. """ - # First, check for ResultMessage with 'result' field (multi-turn completion) + # Reject errored ResultMessages outright. The SDK puts a synthetic + # UserMessage('[Request interrupted by user]') just before these, and + # we must not let that text escape as response content. + for message in messages: + subtype = message.get("subtype") + is_error = message.get("is_error") is True + if subtype in _ERROR_RESULT_SUBTYPES or is_error: + raise ClaudeResultError( + subtype=subtype, + num_turns=message.get("num_turns"), + errors=message.get("errors"), + stop_reason=message.get("stop_reason"), + error_message=message.get("error_message"), + stderr_tail=message.get("stderr_tail"), + ) + + # AssistantMessage.error carries upstream-API failure details (rate + # limit, billing, auth). Surface those as ClaudeResultError too so the + # HTTP layer can map each literal to the right status code (429, 402, + # 401, 400, 502) rather than returning partial content with finish_reason=stop. 
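+        # e.g. a rate-limited turn arrives as a dict roughly like
+        # {"content": [...], "model": "...", "error": "rate_limit"} (values
+        # illustrative), which the loop below surfaces as
+        # ClaudeResultError with subtype "assistant_rate_limit".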
+ for message in messages: + assistant_error = message.get("error") + if isinstance(assistant_error, str) and assistant_error in _ASSISTANT_ERROR_VALUES: + raise ClaudeResultError( + subtype=f"assistant_{assistant_error}", + num_turns=None, + errors=[assistant_error], + stop_reason=message.get("stop_reason"), + error_message=None, + ) + + # RateLimitInfo messages (SDK 0.1.49+): emitted by the CLI when the + # rate-limit state changes. If status is 'rejected', the upstream has + # cut us off and callers should back off rather than consume the + # partial response. + for message in messages: + if ( + isinstance(message, dict) + and message.get("status") == "rejected" + and "resets_at" in message + and "rate_limit_type" in message + ): + resets_at = message.get("resets_at") + raise ClaudeResultError( + subtype="assistant_rate_limit", + num_turns=None, + errors=["rate_limit"], + stop_reason=None, + error_message=f"upstream rate_limit ({message.get('rate_limit_type')}); resets_at={resets_at}", + ) + + # Prefer ResultMessage.result (multi-turn completion). for message in messages: if message.get("subtype") == "success" and "result" in message: return message["result"] - # Collect all text from AssistantMessages (take the last one with text) + # Fall back to AssistantMessage content. Skip SDK UserMessage dicts + # (the wrapper's dict conversion produces a UserMessage with a uuid + # field and no model field; the AssistantMessage has model). last_text = None for message in messages: - # Look for AssistantMessage type (new SDK format) - if "content" in message and isinstance(message["content"], list): - text_parts = [] - for block in message["content"]: - # Handle TextBlock objects - if hasattr(block, "text"): - text_parts.append(block.text) - elif isinstance(block, dict) and block.get("type") == "text": - text_parts.append(block.get("text", "")) - elif isinstance(block, str): - text_parts.append(block) + if not isinstance(message, dict): + continue + + # Skip UserMessage shapes so the synthetic interrupt sentinel + # cannot leak through as response text. + if ( + isinstance(message.get("content"), list) + and "uuid" in message + and "model" not in message + ): + continue + # Primary path: any message with a content list is treated as an + # AssistantMessage (same as the pre-fix behavior) once UserMessage + # is excluded above. + if isinstance(message.get("content"), list): + text_parts = _extract_text_blocks(message["content"]) if text_parts: last_text = "\n".join(text_parts) + continue - # Fallback: look for old format - elif message.get("type") == "assistant" and "message" in message: + # Legacy fallback: { type: "assistant", message: { content: ... 
} } + if message.get("type") == "assistant" and "message" in message: sdk_message = message["message"] if isinstance(sdk_message, dict) and "content" in sdk_message: content = sdk_message["content"] if isinstance(content, list) and len(content) > 0: - # Handle content blocks (Anthropic SDK format) - text_parts = [] - for block in content: - if isinstance(block, dict) and block.get("type") == "text": - text_parts.append(block.get("text", "")) + text_parts = _extract_text_blocks(content) if text_parts: last_text = "\n".join(text_parts) elif isinstance(content, str): @@ -239,6 +453,18 @@ def parse_claude_message(self, messages: List[Dict[str, Any]]) -> Optional[str]: return last_text + @staticmethod + def _extract_text_blocks(content: List[Any]) -> List[str]: + text_parts = [] + for block in content: + if hasattr(block, "text"): + text_parts.append(block.text) + elif isinstance(block, dict) and block.get("type") == "text": + text_parts.append(block.get("text", "")) + elif isinstance(block, str): + text_parts.append(block) + return text_parts + def extract_metadata(self, messages: List[Dict[str, Any]]) -> Dict[str, Any]: """Extract metadata like costs, tokens, and session info from SDK messages.""" metadata = { diff --git a/src/constants.py b/src/constants.py index 5fb452b..acc0bf2 100644 --- a/src/constants.py +++ b/src/constants.py @@ -25,26 +25,51 @@ async def chat_endpoint(): ... """ import os +from typing import Optional -# Claude Agent SDK Tool Names -# These are the built-in tools available in the Claude Agent SDK -# See: https://docs.anthropic.com/en/docs/claude-code/sdk +# Claude Code tool inventory (sourced from open-sourced Claude Code CLI) CLAUDE_TOOLS = [ - "Task", # Launch agents for complex tasks - "Bash", # Execute bash commands - "Glob", # File pattern matching - "Grep", # Search file contents - "Read", # Read files - "Edit", # Edit files - "Write", # Write files - "NotebookEdit", # Edit Jupyter notebooks - "WebFetch", # Fetch web content - "TodoWrite", # Manage todo lists - "WebSearch", # Search the web - "BashOutput", # Get bash output - "KillShell", # Kill bash shells - "Skill", # Execute skills - "SlashCommand", # Execute slash commands + "Agent", + "Task", + "SendMessage", + "ListPeers", + "Bash", + "Glob", + "Grep", + "Read", + "Edit", + "Write", + "NotebookEdit", + "WebFetch", + "WebSearch", + "TaskCreate", + "TaskUpdate", + "TaskGet", + "TaskList", + "TaskOutput", + "TaskStop", + "EnterPlanMode", + "ExitPlanMode", + "VerifyPlanExecution", + "EnterWorktree", + "ExitWorktree", + "ToolSearch", + "AskUserQuestion", + "CronCreate", + "CronDelete", + "CronList", + "RemoteTrigger", + "TodoWrite", + "Skill", + "Brief", + "Config", + "REPL", + "Sleep", + "Monitor", + "SendUserFile", + "PushNotification", + "ListMcpResources", + "ReadMcpResource", ] # Default tools to allow when tools are enabled @@ -58,40 +83,133 @@ async def chat_endpoint(): ... 
"Edit", ] -# Tools to disallow by default (potentially dangerous or slow) +# Tools to disallow by default (potentially dangerous or resource-intensive) DEFAULT_DISALLOWED_TOOLS = [ - "Task", # Can spawn sub-agents + "Agent", # Can spawn sub-agents + "Task", # Alias for Agent "WebFetch", # External network access "WebSearch", # External network access + "SendMessage", # External communication + "RemoteTrigger", # Remote execution ] -# Claude Models -# Models supported by Claude Agent SDK (as of November 2025) -# NOTE: Claude Agent SDK only supports Claude 4+ models, not Claude 3.x -CLAUDE_MODELS = [ - # Claude 4.5 Family (Latest - Fall 2025) - RECOMMENDED - "claude-opus-4-5-20250929", # Latest Opus 4.5 - Most capable - "claude-sonnet-4-5-20250929", # Recommended - best coding model - "claude-haiku-4-5-20251001", # Fast & cheap - # Claude 4.1 - "claude-opus-4-1-20250805", # Upgraded Opus 4 - # Claude 4.0 Family (Original - May 2025) - "claude-opus-4-20250514", +# Model metadata (sourced from open-sourced Claude Code CLI) +# Only models that differ from the default are listed explicitly. +_DEFAULT_MODEL_META = { + "context_window": 200_000, + "default_max_output": 32_000, + "max_output_limit": 64_000, +} + +_MODEL_OVERRIDES = { + "claude-opus-4-7": { + "context_window": 1_000_000, + "default_max_output": 64_000, + "max_output_limit": 128_000, + }, + "claude-opus-4-6": { + "context_window": 1_000_000, + "default_max_output": 64_000, + "max_output_limit": 128_000, + }, + "claude-sonnet-4-6": {"context_window": 1_000_000}, + "claude-opus-4-1-20250805": {"default_max_output": 32_000, "max_output_limit": 32_000}, + "claude-opus-4-20250514": {"default_max_output": 32_000, "max_output_limit": 32_000}, +} + +# Static fallback list (order: newest first). Exposed by /v1/models and +# accepted by validation when the live Anthropic Models API is unavailable +# or not configured. Operators can override the advertised list without +# rebuilding the image via CLAUDE_MODELS_OVERRIDE=model-a,model-b. +# NOTE: Claude Agent SDK only supports Claude 4+ models, not Claude 3.x. +_ALL_MODEL_IDS = [ + "claude-opus-4-7", + "claude-opus-4-6", + "claude-sonnet-4-6", + "claude-opus-4-5-20251101", + "claude-sonnet-4-5-20250929", + "claude-haiku-4-5-20251001", + "claude-opus-4-1-20250805", "claude-sonnet-4-20250514", - # Claude 3.x Family - NOT SUPPORTED by Claude Agent SDK - # These models work with Anthropic API but NOT with Claude Code - # Uncomment only if using direct Anthropic API (not Claude Agent SDK) - # "claude-3-7-sonnet-20250219", - # "claude-3-5-sonnet-20241022", - # "claude-3-5-haiku-20241022", + "claude-opus-4-20250514", ] -# Default model (recommended for most use cases) -# Can be overridden via DEFAULT_MODEL environment variable -DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "claude-sonnet-4-5-20250929") - -# Fast model (for speed/cost optimization) -FAST_MODEL = "claude-haiku-4-5-20251001" +MODEL_METADATA = { + model_id: {**_DEFAULT_MODEL_META, **_MODEL_OVERRIDES.get(model_id, {})} + for model_id in _ALL_MODEL_IDS +} + +# CLAUDE_MODELS is derived from MODEL_METADATA so the metadata table is the +# single source of truth; CLAUDE_MODELS_OVERRIDE replaces the advertised list +# without touching the metadata catalog (validation still consults the catalog). 
+DEFAULT_CLAUDE_MODELS = list(MODEL_METADATA.keys()) +_models_override = os.getenv("CLAUDE_MODELS_OVERRIDE", "").strip() +CLAUDE_MODELS = ( + [model.strip() for model in _models_override.split(",") if model.strip()] + if _models_override + else DEFAULT_CLAUDE_MODELS +) + +# Default model selection. +# DEFAULT_MODEL_ENV is the explicit operator override; when unset, the wrapper +# resolves the latest Sonnet from Anthropic's live Models API at startup and +# stores it in RESOLVED_DEFAULT_MODEL. DEFAULT_MODEL_FALLBACK is used until/if +# that resolution succeeds. +DEFAULT_MODEL_ENV: Optional[str] = os.getenv("DEFAULT_MODEL") +DEFAULT_MODEL_FALLBACK = "claude-sonnet-4-6" +DEFAULT_MODEL = DEFAULT_MODEL_ENV or DEFAULT_MODEL_FALLBACK +RESOLVED_DEFAULT_MODEL: Optional[str] = None + +# Fast model (for speed/cost optimization). +# Can be overridden via FAST_MODEL environment variable. +FAST_MODEL = os.getenv("FAST_MODEL", "claude-haiku-4-5-20251001") + +# Anthropic Models API configuration for dynamically refreshing /v1/models. +ANTHROPIC_MODELS_URL = os.getenv("ANTHROPIC_MODELS_URL", "https://api.anthropic.com/v1/models") +ANTHROPIC_VERSION = os.getenv("ANTHROPIC_VERSION", "2023-06-01") +MODEL_LIST_CACHE_TTL_SECONDS = int(os.getenv("MODEL_LIST_CACHE_TTL_SECONDS", "3600")) +# Shorter TTL applied when the live fetch fails so a transient blip doesn't +# suppress live discovery for a full hour. +MODEL_LIST_ERROR_TTL_SECONDS = int(os.getenv("MODEL_LIST_ERROR_TTL_SECONDS", "60")) +MODEL_LIST_REQUEST_TIMEOUT_SECONDS = float(os.getenv("MODEL_LIST_REQUEST_TIMEOUT_SECONDS", "5")) + +# Pricing tiers (per million tokens, USD) +# Sourced from open-sourced Claude Code CLI (src/utils/modelCost.ts) +_PRICING_SONNET = {"input": 3.0, "output": 15.0, "cache_read": 0.30, "cache_write": 3.75} +_PRICING_OPUS = {"input": 5.0, "output": 25.0, "cache_read": 0.50, "cache_write": 6.25} +_PRICING_OPUS_LEGACY = {"input": 15.0, "output": 75.0, "cache_read": 1.50, "cache_write": 18.75} +_PRICING_HAIKU_45 = {"input": 1.0, "output": 5.0, "cache_read": 0.10, "cache_write": 1.25} + +MODEL_PRICING = { + "claude-opus-4-7": _PRICING_OPUS, + "claude-opus-4-6": _PRICING_OPUS, + "claude-opus-4-5-20251101": _PRICING_OPUS, + "claude-opus-4-1-20250805": _PRICING_OPUS_LEGACY, + "claude-opus-4-20250514": _PRICING_OPUS_LEGACY, + "claude-sonnet-4-6": _PRICING_SONNET, + "claude-sonnet-4-5-20250929": _PRICING_SONNET, + "claude-sonnet-4-20250514": _PRICING_SONNET, + "claude-haiku-4-5-20251001": _PRICING_HAIKU_45, +} + +# Web search cost (per request, all models) +WEB_SEARCH_COST_USD = 0.01 + +# Fallback model mapping: when an Opus model is overloaded, fall back to Sonnet +# Sourced from Claude Code's FallbackTriggeredError pattern +MODEL_FALLBACK_MAP = { + "claude-opus-4-7": "claude-sonnet-4-6", + "claude-opus-4-6": "claude-sonnet-4-6", + "claude-opus-4-5-20251101": "claude-sonnet-4-5-20250929", + "claude-opus-4-1-20250805": "claude-sonnet-4-20250514", + "claude-opus-4-20250514": "claude-sonnet-4-20250514", +} + +# Effort levels supported by Claude API +VALID_EFFORT_LEVELS = {"low", "medium", "high", "max"} + +# Thinking modes supported by Claude API +VALID_THINKING_MODES = {"adaptive", "enabled", "disabled"} # System Prompt Types SYSTEM_PROMPT_TYPE_TEXT = "text" diff --git a/src/cost_tracker.py b/src/cost_tracker.py new file mode 100644 index 0000000..316f96e --- /dev/null +++ b/src/cost_tracker.py @@ -0,0 +1,184 @@ +""" +Cost tracking for Claude API usage. + +Calculates estimated costs per request and accumulates per session. 
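+
+Worked example at the Sonnet tier ($3 / $15 per million input / output
+tokens): a request with 1,000 input and 500 output tokens costs
+0.001 * 3.0 + 0.0005 * 15.0 = $0.0105.
+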
+Pricing sourced from open-sourced Claude Code CLI (src/utils/modelCost.ts). +""" + +import asyncio +import logging +import time +from typing import Dict, Any, Optional +from dataclasses import dataclass, field + +from src.constants import MODEL_PRICING, WEB_SEARCH_COST_USD, SESSION_MAX_AGE_MINUTES + +logger = logging.getLogger(__name__) + +# Default pricing tier (Sonnet) for unknown models +_DEFAULT_PRICING = MODEL_PRICING.get( + "claude-sonnet-4-6", + { + "input": 3.0, + "output": 15.0, + "cache_read": 0.30, + "cache_write": 3.75, + }, +) + +_KEY_INPUT = "input" +_KEY_OUTPUT = "output" +_KEY_CACHE_READ = "cache_read" +_KEY_CACHE_WRITE = "cache_write" + + +@dataclass +class UsageRecord: + """Token usage for a single request.""" + + input_tokens: int = 0 + output_tokens: int = 0 + cache_read_tokens: int = 0 + cache_creation_tokens: int = 0 + web_search_requests: int = 0 + + +@dataclass +class SessionCost: + """Accumulated cost for a session.""" + + total_cost_usd: float = 0.0 + total_input_tokens: int = 0 + total_output_tokens: int = 0 + total_cache_read_tokens: int = 0 + total_cache_creation_tokens: int = 0 + total_web_search_requests: int = 0 + request_count: int = 0 + model_usage: Dict[str, Dict[str, Any]] = field(default_factory=dict) + last_updated: float = field(default_factory=time.time) + + +def calculate_cost(model: str, usage: UsageRecord) -> float: + """Calculate the cost in USD for a given model and usage.""" + pricing = MODEL_PRICING.get(model, _DEFAULT_PRICING) + + cost = 0.0 + cost += (usage.input_tokens / 1_000_000) * pricing[_KEY_INPUT] + cost += (usage.output_tokens / 1_000_000) * pricing[_KEY_OUTPUT] + cost += (usage.cache_read_tokens / 1_000_000) * pricing[_KEY_CACHE_READ] + cost += (usage.cache_creation_tokens / 1_000_000) * pricing[_KEY_CACHE_WRITE] + cost += usage.web_search_requests * WEB_SEARCH_COST_USD + + return cost + + +class CostTracker: + """Tracks costs per session. Uses asyncio.Lock for async-safe access.""" + + def __init__(self, max_age_minutes: int = SESSION_MAX_AGE_MINUTES): + self._sessions: Dict[str, SessionCost] = {} + self._lock = asyncio.Lock() + self._max_age_seconds = max_age_minutes * 60 + + async def record_usage( + self, + session_id: str, + model: str, + usage: UsageRecord, + ) -> float: + """Record usage for a session. Returns the cost for this request.""" + cost = calculate_cost(model, usage) + + async with self._lock: + if session_id not in self._sessions: + self._sessions[session_id] = SessionCost() + + session = self._sessions[session_id] + session.total_cost_usd += cost + session.total_input_tokens += usage.input_tokens + session.total_output_tokens += usage.output_tokens + session.total_cache_read_tokens += usage.cache_read_tokens + session.total_cache_creation_tokens += usage.cache_creation_tokens + session.total_web_search_requests += usage.web_search_requests + session.request_count += 1 + session.last_updated = time.time() + + if model not in session.model_usage: + session.model_usage[model] = { + "input_tokens": 0, + "output_tokens": 0, + "cost_usd": 0.0, + "requests": 0, + } + session.model_usage[model]["input_tokens"] += usage.input_tokens + session.model_usage[model]["output_tokens"] += usage.output_tokens + session.model_usage[model]["cost_usd"] += cost + session.model_usage[model]["requests"] += 1 + + logger.debug( + f"Session {session_id}: request cost=${cost:.6f}, " + f"total=${session.total_cost_usd:.6f}" + ) + return cost + + async def cleanup_expired(self) -> int: + """Remove sessions older than max_age. 
Returns count of removed sessions.""" + now = time.time() + async with self._lock: + expired = [ + sid + for sid, s in self._sessions.items() + if (now - s.last_updated) > self._max_age_seconds + ] + for sid in expired: + del self._sessions[sid] + if expired: + logger.info(f"Cleaned up {len(expired)} expired cost tracker sessions") + return len(expired) + + async def get_session_cost(self, session_id: str) -> Optional[SessionCost]: + """Get accumulated cost for a session.""" + async with self._lock: + return self._sessions.get(session_id) + + async def get_session_summary(self, session_id: str) -> Dict[str, Any]: + """Get a summary dict for a session's costs.""" + async with self._lock: + session = self._sessions.get(session_id) + if not session: + return {"session_id": session_id, "total_cost_usd": 0.0, "request_count": 0} + + return { + "session_id": session_id, + "total_cost_usd": round(session.total_cost_usd, 6), + "total_input_tokens": session.total_input_tokens, + "total_output_tokens": session.total_output_tokens, + "total_cache_read_tokens": session.total_cache_read_tokens, + "total_cache_creation_tokens": session.total_cache_creation_tokens, + "total_web_search_requests": session.total_web_search_requests, + "request_count": session.request_count, + "model_usage": dict(session.model_usage), + } + + async def delete_session(self, session_id: str) -> bool: + """Remove cost tracking for a session.""" + async with self._lock: + if session_id in self._sessions: + del self._sessions[session_id] + return True + return False + + async def get_all_sessions_summary(self) -> Dict[str, Any]: + """Get cost summary across all sessions.""" + async with self._lock: + total_cost = sum(s.total_cost_usd for s in self._sessions.values()) + total_requests = sum(s.request_count for s in self._sessions.values()) + return { + "active_sessions": len(self._sessions), + "total_cost_usd": round(total_cost, 6), + "total_requests": total_requests, + } + + +# Global singleton instance +cost_tracker = CostTracker() diff --git a/src/cpu_watchdog.py b/src/cpu_watchdog.py new file mode 100644 index 0000000..5ea3e5c --- /dev/null +++ b/src/cpu_watchdog.py @@ -0,0 +1,99 @@ +"""CPU watchdog for detecting and recovering from epoll busy-loops.""" + +import asyncio +import logging +import os +import signal +import sys +import time + +logger = logging.getLogger(__name__) + +# Configurable via environment variables +WATCHDOG_ENABLED = os.getenv("WATCHDOG_ENABLED", "false").lower() == "true" +WATCHDOG_INTERVAL = int(os.getenv("WATCHDOG_INTERVAL", "30")) +WATCHDOG_CPU_THRESHOLD = float(os.getenv("WATCHDOG_CPU_THRESHOLD", "80")) +WATCHDOG_STRIKES = int(os.getenv("WATCHDOG_STRIKES", "3")) + + +class CPUWatchdog: + def __init__(self): + self._task = None + self._strikes = 0 + self._last_cpu_time = None + self._last_wall_time = None + self._is_linux = sys.platform.startswith("linux") + + def _get_cpu_percent(self): + """Read CPU usage from /proc/self/stat. 
Returns 0-100 float.""" + if not self._is_linux: + return 0.0 + try: + with open("/proc/self/stat") as f: + fields = f.read().split() + # fields[13] = utime, fields[14] = stime (in clock ticks) + cpu_time = int(fields[13]) + int(fields[14]) + wall_time = time.monotonic() + ticks_per_sec = os.sysconf("SC_CLK_TCK") + + if self._last_cpu_time is not None: + cpu_delta = (cpu_time - self._last_cpu_time) / ticks_per_sec + wall_delta = wall_time - self._last_wall_time + if wall_delta > 0: + percent = (cpu_delta / wall_delta) * 100.0 + else: + percent = 0.0 + else: + percent = 0.0 + + self._last_cpu_time = cpu_time + self._last_wall_time = wall_time + return percent + except (FileNotFoundError, IndexError, ValueError, OSError): + return 0.0 + + async def _loop(self): + while True: + await asyncio.sleep(WATCHDOG_INTERVAL) + try: + cpu = self._get_cpu_percent() + if cpu > WATCHDOG_CPU_THRESHOLD: + self._strikes += 1 + logger.warning( + f"CPU watchdog: {cpu:.1f}% > {WATCHDOG_CPU_THRESHOLD}% " + f"(strike {self._strikes}/{WATCHDOG_STRIKES})" + ) + if self._strikes >= WATCHDOG_STRIKES: + logger.error( + f"CPU watchdog: {WATCHDOG_STRIKES} consecutive strikes, " + f"sending SIGTERM for clean restart" + ) + os.kill(os.getpid(), signal.SIGTERM) + return + else: + if self._strikes > 0: + logger.info(f"CPU watchdog: {cpu:.1f}% -- strikes reset") + self._strikes = 0 + except Exception as e: + logger.debug(f"CPU watchdog check failed: {e}") + + def start(self): + if not WATCHDOG_ENABLED: + logger.info("CPU watchdog disabled (set WATCHDOG_ENABLED=true to enable)") + return + if not self._is_linux: + logger.info("CPU watchdog skipped (Linux-only, use in Docker)") + return + logger.info( + f"CPU watchdog started: interval={WATCHDOG_INTERVAL}s, " + f"threshold={WATCHDOG_CPU_THRESHOLD}%, strikes={WATCHDOG_STRIKES}" + ) + self._task = asyncio.create_task(self._loop()) + + def stop(self): + if self._task and not self._task.done(): + self._task.cancel() + logger.info("CPU watchdog stopped") + + +cpu_watchdog = CPUWatchdog() diff --git a/src/function_calling.py b/src/function_calling.py new file mode 100644 index 0000000..5c35df4 --- /dev/null +++ b/src/function_calling.py @@ -0,0 +1,154 @@ +"""Simulate OpenAI function calling via system prompt injection and response parsing.""" + +import json +import logging +import re +from uuid import uuid4 + +from src.models import Message, ToolCall, FunctionCall + +logger = logging.getLogger(__name__) + +_TOOL_CALL_FORMAT = """IMPORTANT: When you want to call a function, respond with ONLY a code block using the tool_calls language tag: + +```tool_calls +[ + {"name": "function_name", "arguments": {"param1": "value1"}} +] +``` + +You can call multiple functions in one response. 
Do not include any text outside the code block when calling functions.""" + + +def build_tools_system_prompt(tools: list, tool_choice=None) -> str: + if not tools and (tool_choice is None or tool_choice == "none"): + return "" + + if tool_choice == "none": + return "" + + parts = ["# Available Functions\n"] + + for tool in tools: + func = tool.get("function", {}) + name = func.get("name", "unknown") + description = func.get("description", "No description") + parameters = func.get("parameters", {}) + parts.append(f"## {name}\n{description}\nParameters: {json.dumps(parameters)}\n") + + if isinstance(tool_choice, dict): + forced_name = tool_choice.get("function", {}).get("name", "unknown") + parts.append(f"\nYou MUST call function {forced_name}.\n") + elif tool_choice == "required": + parts.append("\nYou MUST call at least one function.\n") + else: + parts.append("\nYou MAY call functions if helpful.\n") + + parts.append(_TOOL_CALL_FORMAT) + + return "\n".join(parts) + + +def parse_tool_calls(response_text: str) -> tuple[list, str]: + # Primary: fenced tool_calls block + pattern = r"```tool_calls\s*\n(.*?)```" + match = re.search(pattern, response_text, re.DOTALL) + + if match: + try: + calls = json.loads(match.group(1).strip()) + remaining = response_text[: match.start()] + response_text[match.end() :] + remaining = remaining.strip() + return (calls, remaining) + except json.JSONDecodeError: + logger.warning("Found tool_calls block but failed to parse JSON") + + # Fallback: find [{"name": and try to parse valid JSON from that position + bare_pattern = r'\[\s*\{\s*"name"\s*:' + bare_match = re.search(bare_pattern, response_text) + + if bare_match: + start = bare_match.start() + # Try increasingly longer substrings to find valid JSON + for end in range(len(response_text), start, -1): + if response_text[end - 1] == "]": + try: + calls = json.loads(response_text[start:end]) + remaining = response_text[:start] + response_text[end:] + remaining = remaining.strip() + return (calls, remaining) + except json.JSONDecodeError: + continue + logger.warning("Found bare JSON array marker but failed to parse") + + return ([], response_text) + + +def format_tool_calls(parsed_calls: list) -> list: + result = [] + for call in parsed_calls: + name = call.get("name", "") + arguments = call.get("arguments", {}) + result.append( + ToolCall( + id=f"call_{uuid4().hex[:24]}", + type="function", + function=FunctionCall( + name=name, + arguments=json.dumps(arguments), + ), + ) + ) + return result + + +def convert_tool_messages(messages: list) -> list: + converted = [] + for msg in messages: + # Handle both Message objects and dicts + if isinstance(msg, Message): + role = msg.role + content = msg.content + tool_calls = msg.tool_calls + tool_call_id = msg.tool_call_id + name = msg.name + else: + role = msg.get("role", "") + content = msg.get("content") + tool_calls = msg.get("tool_calls") + tool_call_id = msg.get("tool_call_id") + name = msg.get("name") + + if role == "assistant" and tool_calls: + parts = [] + if content: + parts.append(content) + for tc in tool_calls: + if hasattr(tc, "function"): + fn_name = tc.function.name + fn_args = tc.function.arguments + else: + func = tc.get("function", {}) + fn_name = func.get("name", "unknown") + fn_args = func.get("arguments", "{}") + if isinstance(fn_args, str): + try: + fn_args = json.loads(fn_args) + except json.JSONDecodeError: + pass + args_str = json.dumps(fn_args) if isinstance(fn_args, dict) else fn_args + parts.append(f"[Called {fn_name} with arguments: {args_str}]") 
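+            # Illustrative flattening (function name, arguments, and call id
+            # invented): an assistant turn that called get_weather with
+            # {"city": "Paris"} becomes the plain-text line
+            #   [Called get_weather with arguments: {"city": "Paris"}]
+            # and the matching tool-result message below becomes a user turn
+            #   [Result of get_weather (call_abc123): 18C]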
+ converted.append(Message(role="assistant", content="\n".join(parts))) + + elif role == "tool": + tid = tool_call_id or "unknown" + tname = name or "unknown" + tcontent = content or "" + converted.append( + Message(role="user", content=f"[Result of {tname} ({tid}): {tcontent}]") + ) + + else: + converted.append(msg) + + return converted diff --git a/src/main.py b/src/main.py index 4a74aa4..b5c27dc 100644 --- a/src/main.py +++ b/src/main.py @@ -4,8 +4,9 @@ import logging import secrets import string +import time import uuid -from typing import Optional, AsyncGenerator, Dict, Any +from typing import Optional, AsyncGenerator, Dict, Any, List from contextlib import asynccontextmanager from fastapi import FastAPI, HTTPException, Request, Depends @@ -14,7 +15,9 @@ from fastapi.responses import StreamingResponse, JSONResponse, HTMLResponse from fastapi.exceptions import RequestValidationError from pydantic import ValidationError +import httpx from dotenv import load_dotenv +from src import __version__ from src.models import ( ChatCompletionRequest, @@ -39,9 +42,26 @@ AnthropicTextBlock, AnthropicUsage, ) -from src.claude_cli import ClaudeCodeCLI -from src.message_adapter import MessageAdapter -from src.auth import verify_api_key, security, validate_claude_code_auth, get_claude_code_auth_info +from src.claude_cli import ClaudeCodeCLI, ClaudeResultError +from src.circuit_breaker import sdk_circuit_breaker +from src.message_adapter import MessageAdapter, JsonFenceStripper +from src.function_calling import ( + build_tools_system_prompt, + parse_tool_calls, + format_tool_calls, + convert_tool_messages, +) +from src.cpu_watchdog import cpu_watchdog +from src.auth import ( + verify_api_key, + security, + validate_claude_code_auth, + get_claude_code_auth_info, +) +# Import the module (not the singletons) so reloads of src.auth in tests stay +# in sync with main.py's view of _auth.cli_health / auth_manager / probe_cli_auth. +from src import auth as _auth +from src.auth import _classify_probe_error # pure function, safe to bind once from src.parameter_validator import ParameterValidator, CompatibilityReporter from src.session_manager import session_manager from src.tool_manager import tool_manager @@ -51,7 +71,24 @@ rate_limit_exceeded_handler, rate_limit_endpoint, ) -from src.constants import CLAUDE_MODELS, CLAUDE_TOOLS, DEFAULT_ALLOWED_TOOLS +from datetime import datetime, timezone + +from src import constants +from src.constants import ( + ANTHROPIC_MODELS_URL, + ANTHROPIC_VERSION, + CLAUDE_MODELS, + CLAUDE_TOOLS, + DEFAULT_ALLOWED_TOOLS, + DEFAULT_MODEL_FALLBACK, + MODEL_LIST_CACHE_TTL_SECONDS, + MODEL_LIST_ERROR_TTL_SECONDS, + MODEL_LIST_REQUEST_TIMEOUT_SECONDS, + SESSION_CLEANUP_INTERVAL_MINUTES, +) +from src.model_service import model_service +from src.request_cache import request_cache +from src.cost_tracker import cost_tracker, UsageRecord # Load environment variables load_dotenv() @@ -60,6 +97,39 @@ DEBUG_MODE = os.getenv("DEBUG_MODE", "false").lower() in ("true", "1", "yes", "on") VERBOSE = os.getenv("VERBOSE", "false").lower() in ("true", "1", "yes", "on") +# Default max_turns applied when the request does not enable tools. A value of 1 +# causes the Claude Agent SDK to return error_max_turns whenever the agent +# engages extended thinking and then needs a second turn to emit the final +# assistant message, which silently produced bad output for OpenAI clients. 
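+# Three turns leaves room for a thinking turn plus the final assistant message,
+# with one spare, while still bounding runaway loops. Operators can widen it
+# per deployment, e.g. WRAPPER_DEFAULT_MAX_TURNS=5 (illustrative value).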
+DEFAULT_MAX_TURNS_NO_TOOLS = int(os.getenv("WRAPPER_DEFAULT_MAX_TURNS", "3")) + + +def _kv(event: str, **fields: Any) -> str: + """Format a structured log line as "event key=value key=value ...". + + The wrapper's default logging format is plain text (see logging.basicConfig + above) and drops ``logger.xxx(msg, extra={...})`` payloads entirely. That + sent every structured log line to /dev/null -- we'd emit + ``circuit_breaker_open`` with no breaker state attached, forcing ops to + inspect response bodies to see what happened. Building the key=value pairs + into the message string itself is the cheapest way to keep the data + visible without reaching for a full JSON logger. + + ``None`` values are skipped so we don't spam ``stop_reason=None``. Values + are repr'd when they contain whitespace or equals signs so a grep for + ``key=value`` still works unambiguously. + """ + parts = [event] + for key, value in fields.items(): + if value is None: + continue + text = str(value) + if any(ch.isspace() or ch == "=" for ch in text): + text = repr(text) + parts.append(f"{key}={text}") + return " ".join(parts) + + # Set logging level based on debug/verbose mode log_level = logging.DEBUG if (DEBUG_MODE or VERBOSE) else logging.INFO logging.basicConfig(level=log_level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") @@ -68,6 +138,198 @@ # Global variable to store runtime-generated API key runtime_api_key = None +# Best-effort cache for Anthropic's live Models API. The static constants remain +# the fallback so /v1/models keeps working for Claude CLI, Bedrock, Vertex, local +# development, and transient Anthropic API outages. +_model_list_cache: Dict[str, Any] = {"expires_at": 0.0, "models": None} +# Serializes cache refreshes so concurrent /v1/models requests at TTL expiry +# don't all stampede the upstream Anthropic API. +_model_list_lock = asyncio.Lock() + + +def _iso_to_unix(value: Any) -> Optional[int]: + """Convert an Anthropic ISO-8601 'created_at' string to a unix timestamp.""" + if not isinstance(value, str): + return None + try: + return int(datetime.fromisoformat(value.replace("Z", "+00:00")).timestamp()) + except ValueError: + return None + + +def _openai_model_from_anthropic(model_info: Dict[str, Any]) -> Dict[str, Any]: + """Convert an Anthropic ModelInfo object to OpenAI-compatible model metadata.""" + created = _iso_to_unix(model_info.get("created_at")) + model: Dict[str, Any] = { + "id": model_info["id"], + "object": "model", + "created": created if created is not None else int(datetime.now(timezone.utc).timestamp()), + "owned_by": "anthropic", + } + + # Preserve useful Anthropic metadata for clients that want it. OpenAI clients + # ignore unknown keys, and the existing id/object/owned_by shape is retained. 
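+    # Illustrative pass-through (field values invented): an Anthropic entry
+    #   {"id": "claude-sonnet-4-6", "display_name": "Claude Sonnet 4.6",
+    #    "created_at": "2025-09-29T00:00:00Z", "type": "model"}
+    # keeps the OpenAI id/object/created/owned_by shape and carries the
+    # extra keys along verbatim.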
+ for key in ( + "display_name", + "created_at", + "max_input_tokens", + "max_tokens", + "capabilities", + "type", + ): + if key in model_info: + model[key] = model_info[key] + + return model + + +def _fallback_model_payload() -> List[Dict[str, Any]]: + now = int(datetime.now(timezone.utc).timestamp()) + return [ + {"id": model_id, "object": "model", "created": now, "owned_by": "anthropic"} + for model_id in CLAUDE_MODELS + ] + + +async def _fetch_anthropic_models() -> Optional[List[Dict[str, Any]]]: + """Fetch all available models from Anthropic, returning None on fallback-worthy errors.""" + api_key = os.getenv("ANTHROPIC_API_KEY") + if not api_key: + return None + + headers = { + "anthropic-version": ANTHROPIC_VERSION, + "x-api-key": api_key, + } + beta_header = os.getenv("ANTHROPIC_BETA") or os.getenv("ANTHROPIC_BETA_HEADER") + if beta_header: + headers["anthropic-beta"] = beta_header + + params: Dict[str, Any] = {"limit": 1000} + models: List[Dict[str, Any]] = [] + + try: + async with httpx.AsyncClient(timeout=MODEL_LIST_REQUEST_TIMEOUT_SECONDS) as client: + while True: + response = await client.get(ANTHROPIC_MODELS_URL, headers=headers, params=params) + response.raise_for_status() + payload = response.json() + models.extend( + _openai_model_from_anthropic(model) + for model in payload.get("data", []) + if model.get("id") + ) + + if not payload.get("has_more") or not payload.get("last_id"): + break + params["after_id"] = payload["last_id"] + except Exception as exc: # noqa: BLE001 - endpoint should degrade gracefully + logger.warning("Failed to fetch Anthropic model list, using fallback: %s", exc) + return None + + return models or None + + +async def get_available_models() -> List[Dict[str, Any]]: + """Return live Anthropic models when possible, with cached static fallback.""" + if os.getenv("CLAUDE_MODELS_OVERRIDE", "").strip(): + return _fallback_model_payload() + + now = time.time() + cached_models = _model_list_cache.get("models") + if cached_models and now < float(_model_list_cache.get("expires_at", 0)): + return cached_models + + async with _model_list_lock: + # Recheck inside the lock so the first waiter populates the cache and + # subsequent waiters return without re-fetching. + now = time.time() + cached_models = _model_list_cache.get("models") + if cached_models and now < float(_model_list_cache.get("expires_at", 0)): + return cached_models + + live_models = await _fetch_anthropic_models() + if live_models: + _model_list_cache.update( + {"models": live_models, "expires_at": now + MODEL_LIST_CACHE_TTL_SECONDS} + ) + return live_models + + fallback_models = _fallback_model_payload() + # Use a short TTL on failure so transient outages don't suppress live + # discovery for the full MODEL_LIST_CACHE_TTL_SECONDS window. + _model_list_cache.update( + {"models": fallback_models, "expires_at": now + MODEL_LIST_ERROR_TTL_SECONDS} + ) + return fallback_models + + +def _pick_latest_sonnet(models: List[Dict[str, Any]]) -> Optional[str]: + """Return the id of the newest Sonnet model in `models`, or None.""" + sonnets = [m for m in models if isinstance(m.get("id"), str) and "sonnet" in m["id"].lower()] + if not sonnets: + return None + # Prefer Anthropic-provided created_at; fall back to the int `created` we set, + # then to id-sort (date-suffixed ids sort correctly newest-last). 
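+    # The key tuple compares timestamps first and id strings second, so two
+    # entries with no usable timestamp (both fall back to 0) still order
+    # lexically, e.g. the illustrative pair
+    # claude-sonnet-4-5-20250929 < claude-sonnet-4-6.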
+ sonnets.sort( + key=lambda m: ( + _iso_to_unix(m.get("created_at")) or m.get("created") or 0, + m["id"], + ) + ) + return sonnets[-1]["id"] + + +async def resolve_default_model() -> Optional[str]: + """Pick the latest Sonnet from /v1/models and store it as the default. + + Skipped when the operator pinned DEFAULT_MODEL via env var, or when no + ANTHROPIC_API_KEY is configured (live discovery is the only auth-aware + path; Bedrock, Vertex, and Claude CLI subscription users get the static + DEFAULT_MODEL_FALLBACK). + """ + if constants.DEFAULT_MODEL_ENV: + return constants.DEFAULT_MODEL_ENV + + if not os.getenv("ANTHROPIC_API_KEY"): + logger.info( + "Live model discovery disabled (no ANTHROPIC_API_KEY); " "using fallback default %s", + DEFAULT_MODEL_FALLBACK, + ) + return None + + try: + models = await get_available_models() + except Exception as exc: # noqa: BLE001 - startup should never abort on this + logger.warning("Could not resolve default model from /v1/models: %s", exc) + return None + + latest = _pick_latest_sonnet(models) + if latest: + constants.RESOLVED_DEFAULT_MODEL = latest + logger.info("Resolved default model from Anthropic Models API: %s", latest) + return latest + + logger.info( + "No Sonnet model found in /v1/models response; using fallback %s", + DEFAULT_MODEL_FALLBACK, + ) + return None + + +def log_json_structure(content: str, log: logging.Logger) -> None: + """Log the structure of a JSON response for debugging.""" + try: + data = json.loads(content) + if isinstance(data, list): + log.debug(f"JSON array with {len(data)} items") + if len(data) > 0 and isinstance(data[0], dict): + log.debug(f"First item fields: {list(data[0].keys())}") + elif isinstance(data, dict): + log.debug(f"JSON object fields: {list(data.keys())}") + except json.JSONDecodeError: + log.debug("Response is not valid JSON") + def generate_secure_token(length: int = 32) -> str: """Generate a secure random token for API authentication.""" @@ -128,11 +390,38 @@ def prompt_for_api_protection() -> Optional[str]: ) +def _log_build_info() -> None: + """Log the SDK and bundled CLI versions baked into the image at build time. + + Lets ops tell from Loki which SDK shipped in a given container without + shelling in. If /app/BUILD_INFO is missing (e.g. running from source), + we fall back to asking the installed package for its version. 
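+
+    The file's layout is whatever the Dockerfile wrote at build time; one
+    name=version pair per line is a plausible shape, but nothing here parses
+    it - the contents are logged verbatim.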
+ """ + try: + with open("/app/BUILD_INFO", "r") as f: + contents = f.read().strip() + logger.info(f"Build info:\n{contents}") + return + except FileNotFoundError: + pass + try: + import importlib.metadata + + sdk_version = importlib.metadata.version("claude-agent-sdk") + logger.info(f"Build info: claude-agent-sdk={sdk_version} (no BUILD_INFO file)") + except Exception as e: + logger.warning(f"Build info unavailable: {e}") + + @asynccontextmanager async def lifespan(app: FastAPI): """Verify Claude Code authentication and CLI on startup.""" + _log_build_info() logger.info("Verifying Claude Code authentication and CLI...") + # Initialize model service (fetch models from API or use fallback) + await model_service.initialize() + # Validate authentication first auth_valid, auth_info = validate_claude_code_auth() @@ -155,17 +444,21 @@ async def lifespan(app: FastAPI): if cli_verified: logger.info("✅ Claude Agent SDK verified successfully") + _auth.cli_health.mark_ok() else: logger.warning("⚠️ Claude Agent SDK verification returned False") logger.warning("The server will start, but requests may fail.") + _auth.cli_health.mark_failed("unknown", "startup verify_cli returned False") except asyncio.TimeoutError: logger.warning("⚠️ Claude Agent SDK verification timed out (30s)") logger.warning("This may indicate network issues or SDK configuration problems.") logger.warning("The server will start, but first request may be slow.") + _auth.cli_health.mark_failed("unknown", "startup verify_cli timed out after 30s") except Exception as e: logger.error(f"⚠️ Claude Agent SDK verification failed: {e}") logger.warning("The server will start, but requests may fail.") logger.warning("Check that Claude Code CLI is properly installed and authenticated.") + _auth.cli_health.mark_failed(_classify_probe_error(str(e)), str(e)) # Log debug information if debug mode is enabled if DEBUG_MODE or VERBOSE: @@ -188,21 +481,72 @@ async def lifespan(app: FastAPI): f"🔧 API Key protection: {'Enabled' if (os.getenv('API_KEY') or runtime_api_key) else 'Disabled'}" ) + # Resolve the default model from the live Anthropic Models API so /v1/chat + # uses the latest Sonnet without a code change. Best-effort: any failure + # leaves the static fallback in place. + try: + await resolve_default_model() + except Exception as e: + logger.warning(f"Default model resolution skipped: {e}") + # Start session cleanup task session_manager.start_cleanup_task() + # Start cost tracker cleanup task (mirrors session cleanup interval) + async def cost_cleanup_loop(): + try: + while True: + await asyncio.sleep(SESSION_CLEANUP_INTERVAL_MINUTES * 60) + await cost_tracker.cleanup_expired() + except asyncio.CancelledError: + pass + + cost_cleanup_task = asyncio.get_running_loop().create_task(cost_cleanup_loop()) + + # Periodic CLI auth probe. Only runs when auth_method == claude_cli because + # API key / Bedrock / Vertex failures already surface as + # assistant_authentication_failed via _ASSISTANT_ERROR_STATUS. Set the + # interval to 0 to disable. Each probe is a 1-turn query and costs ~$0.001 + # at Sonnet pricing; default 10 min keeps the bill low while still bounding + # the stale window. 
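+    # Back-of-envelope from the figures above: the 600 s default is ~144
+    # probes per day, so roughly $0.14/day at ~$0.001 per probe.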
+ async def cli_auth_probe_loop(): + interval = int(os.getenv("CLI_AUTH_PROBE_INTERVAL_SECONDS", "600")) + if interval <= 0: + logger.info("cli_auth_probe disabled (interval=%s)", interval) + return + try: + while True: + await asyncio.sleep(interval) + if _auth.auth_manager.auth_method != "claude_cli": + continue + await _auth.probe_cli_auth() + except asyncio.CancelledError: + pass + + cli_auth_probe_task = asyncio.get_running_loop().create_task(cli_auth_probe_loop()) + + # Start CPU watchdog (Linux/Docker only) + cpu_watchdog.start() + yield + cpu_watchdog.stop() + cost_cleanup_task.cancel() + cli_auth_probe_task.cancel() + # Cleanup on shutdown logger.info("Shutting down session manager...") session_manager.shutdown() + # Shutdown model service + await model_service.shutdown() + # Create FastAPI app app = FastAPI( title="Claude Code OpenAI API Wrapper", description="OpenAI-compatible API for Claude Code", - version="1.0.0", + version=__version__, lifespan=lifespan, ) @@ -388,6 +732,242 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE return JSONResponse(status_code=422, content=error_response) +def _build_claude_options( + request: ChatCompletionRequest, + claude_headers: Optional[Dict[str, Any]] = None, +) -> Dict[str, Any]: + """Build validated Claude SDK options from a request and optional headers. + + Shared by both the streaming and non-streaming code paths. + """ + claude_options = request.to_claude_options() + + if claude_headers: + claude_options.update(claude_headers) + + if claude_options.get("model"): + ParameterValidator.validate_model(claude_options["model"]) + + if request.max_tokens and claude_options.get("model"): + validated = ParameterValidator.validate_max_tokens( + claude_options["model"], request.max_tokens + ) + if validated is not None: + claude_options["max_tokens"] = validated + + if not request.enable_tools: + claude_options["disallowed_tools"] = CLAUDE_TOOLS + claude_options["max_turns"] = DEFAULT_MAX_TURNS_NO_TOOLS + logger.info( + f"Tools disabled (default behavior for OpenAI compatibility); " + f"max_turns={DEFAULT_MAX_TURNS_NO_TOOLS} " + f"(override via WRAPPER_DEFAULT_MAX_TURNS)" + ) + else: + claude_options["allowed_tools"] = DEFAULT_ALLOWED_TOOLS + claude_options["permission_mode"] = "bypassPermissions" + logger.info(f"Tools enabled by user request: {DEFAULT_ALLOWED_TOOLS}") + + return claude_options + + +def _build_error_max_turns_response( + request_id: str, model: str, err: ClaudeResultError +) -> JSONResponse: + """Translate error_max_turns into a valid OpenAI chat completion with + finish_reason='length' and empty content. Clients see a well-formed + response and can decide whether to retry with different parameters + rather than receiving silent garbage.""" + logger.warning( + _kv( + "claude_sdk_error_max_turns", + request_id=request_id, + num_turns=err.num_turns, + stop_reason=err.stop_reason, + errors=err.errors, + ) + ) + response = ChatCompletionResponse( + id=request_id, + model=model, + choices=[ + Choice( + index=0, + message=Message(role="assistant", content=""), + finish_reason="length", + ) + ], + usage=Usage(prompt_tokens=0, completion_tokens=0, total_tokens=0), + ) + return JSONResponse(status_code=200, content=response.model_dump()) + + +def _build_sdk_error_response(request_id: str, model: str, err: ClaudeResultError) -> JSONResponse: + """Non-recoverable SDK result: return 502 so clients know to retry with + backoff. 
Structured body includes the SDK subtype and any errors so + callers can tell the difference between a max-turns overflow and a + transport failure. + + Defense-in-depth for the CLI-auth probe loop: when stderr_tail (or the + error_message) matches the known auth-failure markers, return 401 instead + and seed _auth.cli_health so the next request fails fast without a round-trip. + """ + logger.error( + _kv( + "claude_sdk_error", + request_id=request_id, + subtype=err.subtype, + num_turns=err.num_turns, + errors=err.errors, + error_message=err.error_message, + stderr_tail_chars=len(err.stderr_tail or ""), + ) + ) + if err.stderr_tail: + logger.error( + f"claude_sdk_error stderr tail (request_id={request_id}):\n" f"{err.stderr_tail}" + ) + + blob = " ".join(filter(None, [err.error_message, err.stderr_tail])) + if _classify_probe_error(blob) == "auth_failure": + _auth.cli_health.mark_failed("auth_failure", blob) + logger.warning( + _kv( + "claude_sdk_cli_auth_failed", + request_id=request_id, + model=model, + subtype=err.subtype, + ) + ) + return JSONResponse( + status_code=401, + content={ + "error": { + "message": ( + "Claude CLI is not authenticated. Run `claude /login` " + "on the wrapper host and restart, or set " + "ANTHROPIC_API_KEY." + ), + "type": "authentication_error", + "code": "claude_cli_not_authenticated", + } + }, + ) + + return JSONResponse( + status_code=502, + content={ + "error": { + "message": err.error_message + or (err.errors[0] if err.errors else f"SDK returned {err.subtype}"), + "type": "upstream_sdk_error", + "code": err.subtype or "unknown", + } + }, + ) + + +# Map AssistantMessage error literals to HTTP status codes so each upstream +# failure mode surfaces with the right semantics instead of collapsing to 502: +# rate_limit -> 429 (retryable with backoff; callers should honor Retry-After) +# billing_error -> 402 (permanent until billing is resolved) +# authentication_failed -> 401 (permanent until auth is fixed) +# invalid_request -> 400 (client bug) +# server_error / unknown -> 502 (retry with backoff) +_ASSISTANT_ERROR_STATUS = { + "assistant_rate_limit": 429, + "assistant_billing_error": 402, + "assistant_authentication_failed": 401, + "assistant_invalid_request": 400, + "assistant_server_error": 502, + "assistant_unknown": 502, +} + +_ASSISTANT_ERROR_MESSAGE = { + "assistant_rate_limit": "Upstream rate limit exceeded", + "assistant_billing_error": "Upstream billing error", + "assistant_authentication_failed": "Upstream authentication failed", + "assistant_invalid_request": "Upstream rejected the request as invalid", + "assistant_server_error": "Upstream server error", + "assistant_unknown": "Upstream request failed", +} + + +def _safe_assistant_error_message(subtype: Optional[str]) -> str: + """Return a client-safe message that does not leak exception detail.""" + return _ASSISTANT_ERROR_MESSAGE.get(subtype or "", "Upstream request failed") + + +def _build_assistant_error_response( + request_id: str, model: str, err: ClaudeResultError +) -> JSONResponse: + """Translate an AssistantMessage error to a status-coded OpenAI error.""" + status = _ASSISTANT_ERROR_STATUS.get(err.subtype or "", 502) + headers = None + if status == 429: + # Conservative default. Callers that want a smarter backoff should + # inspect upstream rate-limit headers once the SDK exposes them. 
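+        # 30 s matches the Retry-After sent when the circuit breaker is open,
+        # so well-behaved clients back off uniformly across both failure modes.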
+ headers = {"Retry-After": "30"} + logger.warning( + _kv( + "claude_sdk_assistant_error", + request_id=request_id, + subtype=err.subtype, + errors=err.errors, + status=status, + ) + ) + return JSONResponse( + status_code=status, + headers=headers, + content={ + "error": { + "message": _safe_assistant_error_message(err.subtype), + "type": "upstream_api_error", + "code": err.subtype or "unknown", + } + }, + ) + + +def _handle_claude_result_error( + request_id: str, model: str, err: ClaudeResultError +) -> JSONResponse: + """Route a ClaudeResultError to the right OpenAI-shaped response. + + Records the outcome against the circuit breaker so a burst of SDK + failures across many requests trips the breaker and fails-fast future + traffic for a short cool-off period. + """ + # error_max_turns still returned a 200 to the caller with finish_reason= + # length; treat it as upstream "bad" for breaker purposes because from a + # reliability perspective it's a failed completion. + sdk_circuit_breaker.record(success=False) + if err.subtype == "error_max_turns": + return _build_error_max_turns_response(request_id, model, err) + if err.subtype in _ASSISTANT_ERROR_STATUS: + return _build_assistant_error_response(request_id, model, err) + return _build_sdk_error_response(request_id, model, err) + + +def _run_completion_kwargs( + claude_options: Dict[str, Any], prompt: str, system_prompt: Optional[str], stream: bool +) -> Dict[str, Any]: + """Extract run_completion keyword arguments from claude_options.""" + return { + "prompt": prompt, + "system_prompt": system_prompt, + "model": claude_options.get("model"), + "max_turns": claude_options.get("max_turns", 10), + "allowed_tools": claude_options.get("allowed_tools"), + "disallowed_tools": claude_options.get("disallowed_tools"), + "permission_mode": claude_options.get("permission_mode"), + "effort": claude_options.get("effort"), + "thinking": claude_options.get("thinking"), + "stream": stream, + } + + async def generate_streaming_response( request: ChatCompletionRequest, request_id: str, claude_headers: Optional[Dict[str, Any]] = None ) -> AsyncGenerator[str, None]: @@ -398,6 +978,10 @@ async def generate_streaming_response( request.messages, request.session_id ) + # Convert tool role messages for Claude compatibility + if request.tools: + all_messages = convert_tool_messages(all_messages) + # Convert messages to prompt prompt, system_prompt = MessageAdapter.messages_to_prompt(all_messages) @@ -410,49 +994,67 @@ async def generate_streaming_response( system_prompt = sampling_instructions logger.debug(f"Added sampling instructions: {sampling_instructions}") + # Function calling: inject tool definitions into system prompt + has_tools = request.tools and len(request.tools) > 0 + if has_tools: + tools_dicts = [t.model_dump() for t in request.tools] + tools_prompt = build_tools_system_prompt(tools_dicts, request.tool_choice) + if tools_prompt: + if system_prompt: + system_prompt = f"{system_prompt}\n\n{tools_prompt}" + else: + system_prompt = tools_prompt + logger.info( + f"Function calling (streaming): injected {len(request.tools)} tool definitions" + ) + + # Check for JSON mode + json_mode = request.response_format and request.response_format.type in ( + "json_object", + "json_schema", + ) + if json_mode: + if ( + request.response_format.type == "json_schema" + and request.response_format.json_schema + ): + schema = request.response_format.json_schema + schema_json = json.dumps(schema.schema_ or {}, indent=2) + schema_instructions = 
MessageAdapter.JSON_SCHEMA_TEMPLATE.format( + schema_json=schema_json + ) + prompt = f"{schema_instructions}\n\n{prompt}" + logger.info(f"JSON schema mode (streaming): injected schema into prompt") + else: + if system_prompt: + system_prompt = f"{MessageAdapter.JSON_MODE_INSTRUCTION}\n\n{system_prompt}" + else: + system_prompt = MessageAdapter.JSON_MODE_INSTRUCTION + prompt = prompt + MessageAdapter.JSON_PROMPT_SUFFIX + logger.info( + "JSON mode enabled (streaming) - instruction added to system and user prompt" + ) + # Filter content for unsupported features prompt = MessageAdapter.filter_content(prompt) if system_prompt: system_prompt = MessageAdapter.filter_content(system_prompt) - # Get Claude Agent SDK options from request - claude_options = request.to_claude_options() - - # Merge with Claude-specific headers if provided - if claude_headers: - claude_options.update(claude_headers) - - # Validate model - if claude_options.get("model"): - ParameterValidator.validate_model(claude_options["model"]) - - # Handle tools - disabled by default for OpenAI compatibility - if not request.enable_tools: - # Disable all tools by using CLAUDE_TOOLS constant - claude_options["disallowed_tools"] = CLAUDE_TOOLS - claude_options["max_turns"] = 1 # Single turn for Q&A - logger.info("Tools disabled (default behavior for OpenAI compatibility)") - else: - # Enable tools - use default safe subset (Read, Glob, Grep, Bash, Write, Edit) - claude_options["allowed_tools"] = DEFAULT_ALLOWED_TOOLS - # Set permission mode to bypass prompts (required for API/headless usage) - claude_options["permission_mode"] = "bypassPermissions" - logger.info(f"Tools enabled by user request: {DEFAULT_ALLOWED_TOOLS}") + claude_options = _build_claude_options(request, claude_headers) # Run Claude Code chunks_buffer = [] role_sent = False # Track if we've sent the initial role chunk content_sent = False # Track if we've sent any content + json_mode_buffer = [] # Buffer for JSON mode - accumulate all content + tool_call_buffer = [] # Buffer when tools are defined - parse at end + fence_stripper = JsonFenceStripper() if json_mode else None + + if has_tools and json_mode: + logger.info("Both tools and JSON mode active -- tools take priority for buffering") async for chunk in claude_cli.run_completion( - prompt=prompt, - system_prompt=system_prompt, - model=claude_options.get("model"), - max_turns=claude_options.get("max_turns", 10), - allowed_tools=claude_options.get("allowed_tools"), - disallowed_tools=claude_options.get("disallowed_tools"), - permission_mode=claude_options.get("permission_mode"), - stream=True, + **_run_completion_kwargs(claude_options, prompt, system_prompt, stream=True), ): chunks_buffer.append(chunk) @@ -501,40 +1103,192 @@ async def generate_streaming_response( filtered_text = MessageAdapter.filter_content(raw_text) if filtered_text and not filtered_text.isspace(): - # Create streaming chunk + if has_tools: + # Buffer when tools defined -- parse tool_calls at end + tool_call_buffer.append(filtered_text) + elif json_mode and fence_stripper: + # Stream through fence stripper + stripped = fence_stripper.process_delta(filtered_text) + if stripped: + stream_chunk = ChatCompletionStreamResponse( + id=request_id, + model=request.model, + choices=[ + StreamChoice( + index=0, + delta={"content": stripped}, + finish_reason=None, + ) + ], + ) + yield f"data: {stream_chunk.model_dump_json()}\n\n" + content_sent = True + elif json_mode: + json_mode_buffer.append(filtered_text) + else: + stream_chunk = 
ChatCompletionStreamResponse( + id=request_id, + model=request.model, + choices=[ + StreamChoice( + index=0, + delta={"content": filtered_text}, + finish_reason=None, + ) + ], + ) + yield f"data: {stream_chunk.model_dump_json()}\n\n" + content_sent = True + + elif isinstance(content, str): + filtered_content = MessageAdapter.filter_content(content) + + if filtered_content and not filtered_content.isspace(): + if has_tools: + tool_call_buffer.append(filtered_content) + elif json_mode and fence_stripper: + stripped = fence_stripper.process_delta(filtered_content) + if stripped: + stream_chunk = ChatCompletionStreamResponse( + id=request_id, + model=request.model, + choices=[ + StreamChoice( + index=0, delta={"content": stripped}, finish_reason=None + ) + ], + ) + yield f"data: {stream_chunk.model_dump_json()}\n\n" + content_sent = True + elif json_mode: + json_mode_buffer.append(filtered_content) + else: stream_chunk = ChatCompletionStreamResponse( id=request_id, model=request.model, choices=[ StreamChoice( index=0, - delta={"content": filtered_text}, + delta={"content": filtered_content}, finish_reason=None, ) ], ) - yield f"data: {stream_chunk.model_dump_json()}\n\n" content_sent = True - elif isinstance(content, str): - # Filter out tool usage and thinking blocks - filtered_content = MessageAdapter.filter_content(content) - - if filtered_content and not filtered_content.isspace(): - # Create streaming chunk - stream_chunk = ChatCompletionStreamResponse( - id=request_id, - model=request.model, - choices=[ - StreamChoice( - index=0, delta={"content": filtered_content}, finish_reason=None - ) - ], + # Flush fence stripper if used + if json_mode and fence_stripper: + remaining = fence_stripper.flush() + if remaining: + if not role_sent: + initial_chunk = ChatCompletionStreamResponse( + id=request_id, + model=request.model, + choices=[ + StreamChoice( + index=0, + delta={"role": "assistant", "content": ""}, + finish_reason=None, + ) + ], + ) + yield f"data: {initial_chunk.model_dump_json()}\n\n" + role_sent = True + flush_chunk = ChatCompletionStreamResponse( + id=request_id, + model=request.model, + choices=[ + StreamChoice(index=0, delta={"content": remaining}, finish_reason=None) + ], + ) + yield f"data: {flush_chunk.model_dump_json()}\n\n" + content_sent = True + + # Handle tool call buffer: parse and emit tool_calls + if has_tools and tool_call_buffer: + combined = "".join(tool_call_buffer) + parsed_calls, remaining_text = parse_tool_calls(combined) + if not role_sent: + initial_chunk = ChatCompletionStreamResponse( + id=request_id, + model=request.model, + choices=[ + StreamChoice( + index=0, delta={"role": "assistant", "content": ""}, finish_reason=None ) + ], + ) + yield f"data: {initial_chunk.model_dump_json()}\n\n" + role_sent = True + if parsed_calls: + formatted = format_tool_calls(parsed_calls) + tc_delta = {"tool_calls": [tc.model_dump() for tc in formatted]} + if remaining_text.strip(): + tc_delta["content"] = remaining_text.strip() + tc_chunk = ChatCompletionStreamResponse( + id=request_id, + model=request.model, + choices=[StreamChoice(index=0, delta=tc_delta, finish_reason=None)], + ) + yield f"data: {tc_chunk.model_dump_json()}\n\n" + content_sent = True + elif combined.strip(): + text_chunk = ChatCompletionStreamResponse( + id=request_id, + model=request.model, + choices=[ + StreamChoice(index=0, delta={"content": combined}, finish_reason=None) + ], + ) + yield f"data: {text_chunk.model_dump_json()}\n\n" + content_sent = True + + # Handle JSON mode: emit accumulated content 
as single JSON-formatted chunk + if json_mode and json_mode_buffer: + # Send role chunk first if not sent + if not role_sent: + initial_chunk = ChatCompletionStreamResponse( + id=request_id, + model=request.model, + choices=[ + StreamChoice( + index=0, delta={"role": "assistant", "content": ""}, finish_reason=None + ) + ], + ) + yield f"data: {initial_chunk.model_dump_json()}\n\n" + role_sent = True + + # Combine buffered content and enforce JSON format + combined_content = "".join(json_mode_buffer) + + if DEBUG_MODE or VERBOSE: + raw_preview = ( + combined_content[:50] if len(combined_content) > 50 else combined_content + ) + raw_end = combined_content[-30:] if len(combined_content) > 30 else combined_content + logger.debug(f"Raw response: starts='{raw_preview}' ends='...{raw_end}'") + + json_content, extraction_metadata = MessageAdapter.enforce_json_format_with_metadata( + combined_content, strict=True + ) + + if DEBUG_MODE or VERBOSE: + logger.debug(f"JSON extraction metadata: {extraction_metadata}") + logger.debug(f"Extracted JSON preview: {json_content[:200]}") + log_json_structure(json_content, logger) - yield f"data: {stream_chunk.model_dump_json()}\n\n" - content_sent = True + # Emit as single chunk + json_chunk = ChatCompletionStreamResponse( + id=request_id, + model=request.model, + choices=[ + StreamChoice(index=0, delta={"content": json_content}, finish_reason=None) + ], + ) + yield f"data: {json_chunk.model_dump_json()}\n\n" + content_sent = True # Handle case where no role was sent (send at least role chunk) if not role_sent: @@ -553,29 +1307,80 @@ async def generate_streaming_response( # If we sent role but no content, send a minimal response if role_sent and not content_sent: + fallback_content = ( + "[]" if json_mode else "I'm unable to provide a response at the moment." + ) fallback_chunk = ChatCompletionStreamResponse( id=request_id, model=request.model, choices=[ StreamChoice( index=0, - delta={"content": "I'm unable to provide a response at the moment."}, + delta={"content": fallback_content}, finish_reason=None, ) ], ) yield f"data: {fallback_chunk.model_dump_json()}\n\n" - # Extract assistant response from all chunks + # Extract assistant response from all chunks. parse_claude_message + # raises ClaudeResultError on SDK error_max_turns / error_during_execution; + # emit a terminal SSE event with finish_reason='length' (max_turns) or an + # error payload (other), then close. Do NOT let sentinel text stream out. 
assistant_content = None + sdk_error: Optional[ClaudeResultError] = None if chunks_buffer: - assistant_content = claude_cli.parse_claude_message(chunks_buffer) + try: + assistant_content = claude_cli.parse_claude_message(chunks_buffer) + except ClaudeResultError as err: + sdk_error = err # Store in session if applicable if actual_session_id and assistant_content: assistant_message = Message(role="assistant", content=assistant_content) session_manager.add_assistant_response(actual_session_id, assistant_message) + if sdk_error is not None: + if sdk_error.subtype == "error_max_turns": + final_chunk = ChatCompletionStreamResponse( + id=request_id, + model=request.model, + choices=[StreamChoice(index=0, delta={}, finish_reason="length")], + ) + logger.warning( + _kv( + "claude_sdk_error_max_turns_stream", + request_id=request_id, + num_turns=sdk_error.num_turns, + ) + ) + yield f"data: {final_chunk.model_dump_json()}\n\n" + yield "data: [DONE]\n\n" + else: + logger.error( + _kv( + "claude_sdk_error_stream", + request_id=request_id, + subtype=sdk_error.subtype, + errors=sdk_error.errors, + ) + ) + err_payload = { + "error": { + "message": sdk_error.error_message + or ( + sdk_error.errors[0] + if sdk_error.errors + else f"SDK returned {sdk_error.subtype}" + ), + "type": "upstream_sdk_error", + "code": sdk_error.subtype or "unknown", + } + } + yield f"data: {json.dumps(err_payload)}\n\n" + yield "data: [DONE]\n\n" + return + # Prepare usage data if requested usage_data = None if request.stream_options and request.stream_options.include_usage: @@ -589,6 +1394,15 @@ async def generate_streaming_response( ) logger.debug(f"Estimated usage: {usage_data}") + await cost_tracker.record_usage( + session_id=actual_session_id or request_id, + model=request.model, + usage=UsageRecord( + input_tokens=token_usage["prompt_tokens"], + output_tokens=token_usage["completion_tokens"], + ), + ) + # Send final chunk with finish reason and optionally usage data final_chunk = ChatCompletionStreamResponse( id=request_id, @@ -601,10 +1415,61 @@ async def generate_streaming_response( except Exception as e: logger.error(f"Streaming error: {e}") - error_chunk = {"error": {"message": str(e), "type": "streaming_error"}} + error_chunk = {"error": {"message": "Streaming failed", "type": "streaming_error"}} yield f"data: {json.dumps(error_chunk)}\n\n" +def _check_cli_auth_or_401() -> Optional[JSONResponse]: + """Gate request handlers on the latest CLI-auth probe + the auth manager. + + Returns a JSONResponse with HTTP 401 (or 503 for non-cli auth methods) + when authentication is unhealthy, else None. + + Returning a JSONResponse directly - rather than raising HTTPException - + is intentional: the global http_exception_handler wraps all detail bodies + as `error.type=api_error`, which clobbers the OpenAI-shaped + `authentication_error` literal that clients route on. + """ + if _auth.auth_manager.auth_method == "claude_cli" and not _auth.cli_health.ok: + return JSONResponse( + status_code=401, + content={ + "error": { + "message": ( + "Claude CLI authentication is not healthy. " + "Run `claude /login` on the wrapper host and restart, " + "or set ANTHROPIC_API_KEY." 
+ ), + "type": "authentication_error", + "code": "claude_cli_not_authenticated", + "last_probed_at": _auth.cli_health.last_probed_at.isoformat() + if _auth.cli_health.last_probed_at + else None, + "error_kind": _auth.cli_health.error_kind, + "error_message": _auth.cli_health.error_message, + } + }, + ) + + auth_valid, auth_info = validate_claude_code_auth() + if not auth_valid: + status = 401 if _auth.auth_manager.auth_method == "claude_cli" else 503 + return JSONResponse( + status_code=status, + content={ + "error": { + "message": "Claude Code authentication failed", + "type": "authentication_error" if status == 401 else "service_unavailable", + "code": "claude_cli_not_authenticated" if status == 401 else "auth_unavailable", + "errors": auth_info.get("errors", []), + "method": auth_info.get("method", "none"), + } + }, + ) + + return None + + @app.post("/v1/chat/completions") @rate_limit_endpoint("chat") async def chat_completions( @@ -616,17 +1481,32 @@ async def chat_completions( # Check FastAPI API key if configured await verify_api_key(request, credentials) - # Validate Claude Code authentication - auth_valid, auth_info = validate_claude_code_auth() - - if not auth_valid: - error_detail = { - "message": "Claude Code authentication failed", - "errors": auth_info.get("errors", []), - "method": auth_info.get("method", "none"), - "help": "Check /v1/auth/status for detailed authentication information", - } - raise HTTPException(status_code=503, detail=error_detail) + # Gate on Claude CLI probe + config-level auth validation. + auth_block = _check_cli_auth_or_401() + if auth_block is not None: + return auth_block + + # Circuit breaker check: if the SDK has been failing at >50% for a minute, + # fail-fast with 503 instead of forwarding another doomed request. The + # breaker half-opens after open_seconds and lets a single probe through. + if not sdk_circuit_breaker.allow_request(): + snapshot = sdk_circuit_breaker.snapshot() + logger.warning(_kv("circuit_breaker_open", **snapshot)) + return JSONResponse( + status_code=503, + headers={"Retry-After": "30"}, + content={ + "error": { + "message": ( + "Upstream SDK is unhealthy (circuit breaker open). " + "Retry after the window resets." 
+ ), + "type": "circuit_breaker_open", + "code": "circuit_open", + "breaker": snapshot, + } + }, + ) try: request_id = f"chatcmpl-{os.urandom(8).hex()}" @@ -651,6 +1531,21 @@ async def chat_completions( ) else: # Non-streaming response + # Check cache if enabled and requested via header + cache_enabled = request.headers.get("X-Enable-Cache", "").lower() in ( + "true", + "1", + "yes", + ) + if cache_enabled and request_cache.enabled: + request_dict = request_body.model_dump() + cached_response = request_cache.get(request_dict) + if cached_response: + logger.info(f"Cache hit for request {request_id}") + # Return cached response with updated request ID + cached_response["id"] = request_id + return cached_response + # Process messages with session management all_messages, actual_session_id = session_manager.process_messages( request_body.messages, request_body.session_id @@ -660,6 +1555,10 @@ async def chat_completions( f"Chat completion: session_id={actual_session_id}, total_messages={len(all_messages)}" ) + # Convert tool role messages for Claude compatibility + if request_body.tools: + all_messages = convert_tool_messages(all_messages) + # Convert messages to prompt prompt, system_prompt = MessageAdapter.messages_to_prompt(all_messages) @@ -672,51 +1571,71 @@ async def chat_completions( system_prompt = sampling_instructions logger.debug(f"Added sampling instructions: {sampling_instructions}") + # Function calling: inject tool definitions into system prompt + has_tools = request_body.tools and len(request_body.tools) > 0 + if has_tools: + tools_dicts = [t.model_dump() for t in request_body.tools] + tools_prompt = build_tools_system_prompt(tools_dicts, request_body.tool_choice) + if tools_prompt: + if system_prompt: + system_prompt = f"{system_prompt}\n\n{tools_prompt}" + else: + system_prompt = tools_prompt + logger.info( + f"Function calling: injected {len(request_body.tools)} tool definitions" + ) + + # Check for JSON mode + json_mode = request_body.response_format and request_body.response_format.type in ( + "json_object", + "json_schema", + ) + if json_mode: + if ( + request_body.response_format.type == "json_schema" + and request_body.response_format.json_schema + ): + # JSON schema mode: inject schema into prompt (not system_prompt) + schema = request_body.response_format.json_schema + schema_json = json.dumps(schema.schema_ or {}, indent=2) + schema_instructions = MessageAdapter.JSON_SCHEMA_TEMPLATE.format( + schema_json=schema_json + ) + prompt = f"{schema_instructions}\n\n{prompt}" + logger.info( + f"JSON schema mode: injected schema ({len(schema_json)} chars) into prompt" + ) + else: + # Basic JSON object mode + if system_prompt: + system_prompt = f"{MessageAdapter.JSON_MODE_INSTRUCTION}\n\n{system_prompt}" + else: + system_prompt = MessageAdapter.JSON_MODE_INSTRUCTION + prompt = prompt + MessageAdapter.JSON_PROMPT_SUFFIX + logger.info("JSON mode enabled - instruction added to system and user prompt") + # Filter content prompt = MessageAdapter.filter_content(prompt) if system_prompt: system_prompt = MessageAdapter.filter_content(system_prompt) - # Get Claude Agent SDK options from request - claude_options = request_body.to_claude_options() - - # Merge with Claude-specific headers - if claude_headers: - claude_options.update(claude_headers) - - # Validate model - if claude_options.get("model"): - ParameterValidator.validate_model(claude_options["model"]) - - # Handle tools - disabled by default for OpenAI compatibility - if not request_body.enable_tools: - # Disable all tools by 
using CLAUDE_TOOLS constant - claude_options["disallowed_tools"] = CLAUDE_TOOLS - claude_options["max_turns"] = 1 # Single turn for Q&A - logger.info("Tools disabled (default behavior for OpenAI compatibility)") - else: - # Enable tools - use default safe subset (Read, Glob, Grep, Bash, Write, Edit) - claude_options["allowed_tools"] = DEFAULT_ALLOWED_TOOLS - # Set permission mode to bypass prompts (required for API/headless usage) - claude_options["permission_mode"] = "bypassPermissions" - logger.info(f"Tools enabled by user request: {DEFAULT_ALLOWED_TOOLS}") + claude_options = _build_claude_options(request_body, claude_headers) # Collect all chunks chunks = [] async for chunk in claude_cli.run_completion( - prompt=prompt, - system_prompt=system_prompt, - model=claude_options.get("model"), - max_turns=claude_options.get("max_turns", 10), - allowed_tools=claude_options.get("allowed_tools"), - disallowed_tools=claude_options.get("disallowed_tools"), - permission_mode=claude_options.get("permission_mode"), - stream=False, + **_run_completion_kwargs(claude_options, prompt, system_prompt, stream=False), ): chunks.append(chunk) - # Extract assistant message - raw_assistant_content = claude_cli.parse_claude_message(chunks) + # Extract assistant message. parse_claude_message raises + # ClaudeResultError when the SDK emits error_max_turns or other + # non-success ResultMessage, which we must surface as a proper + # OpenAI error response rather than HTTP 200 with sentinel text. + try: + raw_assistant_content = claude_cli.parse_claude_message(chunks) + except ClaudeResultError as err: + return _handle_claude_result_error(request_id, request_body.model, err) if not raw_assistant_content: raise HTTPException(status_code=500, detail="No response from Claude Code") @@ -724,24 +1643,82 @@ async def chat_completions( # Filter out tool usage and thinking blocks assistant_content = MessageAdapter.filter_content(raw_assistant_content) + # Enforce JSON format if JSON mode is enabled + if json_mode: + original_len = len(assistant_content) + + if DEBUG_MODE or VERBOSE: + raw_preview = ( + assistant_content[:50] if len(assistant_content) > 50 else assistant_content + ) + raw_end = ( + assistant_content[-30:] + if len(assistant_content) > 30 + else assistant_content + ) + logger.debug(f"Raw response: starts='{raw_preview}' ends='...{raw_end}'") + + assistant_content, extraction_metadata = ( + MessageAdapter.enforce_json_format_with_metadata(assistant_content, strict=True) + ) + + logger.info( + f"JSON enforcement: {original_len} chars -> {len(assistant_content)} chars " + f"(method={extraction_metadata.get('method', 'unknown')})" + ) + + if DEBUG_MODE or VERBOSE: + logger.debug(f"JSON extraction metadata: {extraction_metadata}") + logger.debug(f"Extracted JSON preview: {assistant_content[:200]}") + log_json_structure(assistant_content, logger) + + # Parse function calls from response if tools were provided + tool_calls_list = None + finish_reason = "stop" + if has_tools: + parsed_calls, remaining_text = parse_tool_calls(assistant_content) + if parsed_calls: + tool_calls_list = format_tool_calls(parsed_calls) + assistant_content = remaining_text.strip() if remaining_text.strip() else None + finish_reason = "tool_calls" + logger.info(f"Function calling: parsed {len(parsed_calls)} tool call(s)") + # Add assistant response to session if using session mode if actual_session_id: - assistant_message = Message(role="assistant", content=assistant_content) + assistant_message = Message( + role="assistant", + 
content=assistant_content, + tool_calls=tool_calls_list, + ) session_manager.add_assistant_response(actual_session_id, assistant_message) # Estimate tokens (rough approximation) prompt_tokens = MessageAdapter.estimate_tokens(prompt) - completion_tokens = MessageAdapter.estimate_tokens(assistant_content) + completion_tokens = MessageAdapter.estimate_tokens(assistant_content or "") + + await cost_tracker.record_usage( + session_id=actual_session_id or request_id, + model=request_body.model, + usage=UsageRecord( + input_tokens=prompt_tokens, + output_tokens=completion_tokens, + ), + ) # Create response + response_message = Message( + role="assistant", + content=assistant_content, + tool_calls=tool_calls_list, + ) response = ChatCompletionResponse( id=request_id, model=request_body.model, choices=[ Choice( index=0, - message=Message(role="assistant", content=assistant_content), - finish_reason="stop", + message=response_message, + finish_reason=finish_reason, ) ], usage=Usage( @@ -751,13 +1728,45 @@ async def chat_completions( ), ) + # Store in cache if enabled + if cache_enabled and request_cache.enabled: + request_dict = request_body.model_dump() + response_dict = response.model_dump() + request_cache.set(request_dict, response_dict) + logger.debug(f"Cached response for request {request_id}") + + # One structured info line per successful completion. Makes Grafana + # triage a single `| json | subtype=...` query instead of grepping + # DEBUG for num_turns and friends. + metadata = claude_cli.extract_metadata(chunks) + logger.info( + _kv( + "completion_result", + request_id=request_id, + session_id=metadata.get("session_id") or actual_session_id, + subtype="success", + num_turns=metadata.get("num_turns"), + duration_ms=metadata.get("duration_ms"), + total_cost_usd=metadata.get("total_cost_usd"), + is_error=False, + finish_reason=finish_reason, + model=request_body.model, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + ) + ) + sdk_circuit_breaker.record(success=True) + return response except HTTPException: + # HTTPException often represents a validated client error (401, 422); + # do not record it as an SDK-side failure on the breaker. raise except Exception as e: + sdk_circuit_breaker.record(success=False) logger.error(f"Chat completion error: {e}") - raise HTTPException(status_code=500, detail=str(e)) + raise HTTPException(status_code=500, detail="Chat completion failed") @app.post("/v1/messages") @@ -775,17 +1784,10 @@ async def anthropic_messages( # Check FastAPI API key if configured await verify_api_key(request, credentials) - # Validate Claude Code authentication - auth_valid, auth_info = validate_claude_code_auth() - - if not auth_valid: - error_detail = { - "message": "Claude Code authentication failed", - "errors": auth_info.get("errors", []), - "method": auth_info.get("method", "none"), - "help": "Check /v1/auth/status for detailed authentication information", - } - raise HTTPException(status_code=503, detail=error_detail) + # Gate on Claude CLI probe + config-level auth validation. + auth_block = _check_cli_auth_or_401() + if auth_block is not None: + return auth_block try: logger.info(f"Anthropic Messages API request: model={request_body.model}") @@ -823,8 +1825,34 @@ async def anthropic_messages( ): chunks.append(chunk) - # Extract assistant message - raw_assistant_content = claude_cli.parse_claude_message(chunks) + # Extract assistant message. 
On SDK error_max_turns, map to the + # Anthropic stop_reason="max_tokens"; on any other SDK error, surface + # it as HTTP 502 instead of returning sentinel text as content. + try: + raw_assistant_content = claude_cli.parse_claude_message(chunks) + except ClaudeResultError as err: + if err.subtype == "error_max_turns": + logger.warning( + _kv( + "claude_sdk_error_max_turns_anthropic", + num_turns=err.num_turns, + ) + ) + return AnthropicMessagesResponse( + model=request_body.model, + content=[AnthropicTextBlock(text="")], + stop_reason="max_tokens", + usage=AnthropicUsage(input_tokens=0, output_tokens=0), + ) + raise HTTPException( + status_code=502, + detail={ + "type": "upstream_sdk_error", + "code": err.subtype or "unknown", + "message": err.error_message + or (err.errors[0] if err.errors else f"SDK returned {err.subtype}"), + }, + ) if not raw_assistant_content: raise HTTPException(status_code=500, detail="No response from Claude Code") @@ -853,25 +1881,53 @@ async def anthropic_messages( raise except Exception as e: logger.error(f"Anthropic Messages API error: {e}") - raise HTTPException(status_code=500, detail=str(e)) + raise HTTPException(status_code=500, detail="Messages request failed") @app.get("/v1/models") async def list_models( request: Request, credentials: Optional[HTTPAuthorizationCredentials] = Depends(security) ): - """List available models.""" + """List available models, preferring Anthropic's live Models API when configured.""" # Check FastAPI API key if configured await verify_api_key(request, credentials) - # Use constants for single source of truth - return { - "object": "list", - "data": [ - {"id": model_id, "object": "model", "owned_by": "anthropic"} - for model_id in CLAUDE_MODELS - ], - } + return {"object": "list", "data": await get_available_models()} + + +@app.post("/v1/models/refresh") +@rate_limit_endpoint("general") +async def refresh_models_endpoint( + request: Request, + credentials: Optional[HTTPAuthorizationCredentials] = Depends(security), +): + """Refresh the models list from the Anthropic API. + + Requires ANTHROPIC_API_KEY to be set. If the API call fails, + the existing cached models are preserved. + + Returns: + On success: {"success": true, "count": N, "source": "api", "models": [...]} + On failure: {"success": false, "message": "...", "current_count": N, "source": "..."} + """ + await verify_api_key(request, credentials) + result = await model_service.refresh_models() + return result + + +@app.get("/v1/models/status") +@rate_limit_endpoint("general") +async def get_models_status( + request: Request, + credentials: Optional[HTTPAuthorizationCredentials] = Depends(security), +): + """Get model service status including source and last refresh time. + + Returns: + {"initialized": bool, "source": "api"|"fallback", "model_count": N, "last_refresh": timestamp|null} + """ + await verify_api_key(request, credentials) + return model_service.get_status() @app.post("/v1/compatibility") @@ -911,6 +1967,75 @@ async def health_check(request: Request): return {"status": "healthy", "service": "claude-code-openai-wrapper"} +# Rolling window of recent /healthz/deep probe outcomes used to compute a +# short-term failure rate. Fixed-size deque keeps memory bounded. 
+import collections # noqa: E402 - placed here to keep the deep-health section self-contained + +_DEEP_HEALTH_WINDOW = collections.deque(maxlen=10) +_DEEP_HEALTH_FAILURE_THRESHOLD = 0.20 # open breaker above 20% failure + + +@app.get("/healthz/deep") +async def healthz_deep(request: Request): + """End-to-end probe that actually exercises the completion path. + + The existing /health endpoint only checks process liveness, which stayed + green during the week MinusPod was receiving '[Request interrupted by user]' + as chapter content. This probe sends a canned prompt, parses the + response, and reports unhealthy (HTTP 503) when the rolling failure + rate exceeds _DEEP_HEALTH_FAILURE_THRESHOLD. Use from an orchestrator's + livenessProbe / healthcheck to fail fast during upstream incidents. + """ + started = asyncio.get_event_loop().time() + probe_ok = False + detail: Dict[str, Any] = {} + + try: + chunks = [] + async for chunk in claude_cli.run_completion( + prompt="Reply with the single word OK.", + system_prompt=None, + model=None, + stream=False, + max_turns=DEFAULT_MAX_TURNS_NO_TOOLS, + disallowed_tools=CLAUDE_TOOLS, + ): + chunks.append(chunk) + + try: + content = claude_cli.parse_claude_message(chunks) or "" + except ClaudeResultError as err: + content = "" + detail["sdk_error_subtype"] = err.subtype + + normalized = content.strip().rstrip(".").upper() + probe_ok = "OK" in normalized + detail["content_excerpt"] = content[:120] + except Exception as e: + detail["exception"] = type(e).__name__ + detail["exception_message"] = str(e) + logger.warning(f"Deep health probe raised: {e}") + + _DEEP_HEALTH_WINDOW.append(probe_ok) + + duration_ms = int((asyncio.get_event_loop().time() - started) * 1000) + recent = list(_DEEP_HEALTH_WINDOW) + failure_rate = (recent.count(False) / len(recent)) if recent else 0.0 + status_healthy = failure_rate <= _DEEP_HEALTH_FAILURE_THRESHOLD + + payload = { + "status": "healthy" if status_healthy else "unhealthy", + "probe_ok": probe_ok, + "rolling_window_size": len(recent), + "rolling_failure_rate": round(failure_rate, 3), + "threshold": _DEEP_HEALTH_FAILURE_THRESHOLD, + "duration_ms": duration_ms, + "detail": detail, + } + http_status = 200 if status_healthy else 503 + return JSONResponse(status_code=http_status, content=payload) + + @app.get("/version") @rate_limit_endpoint("health") async def version_info(request: Request): @@ -927,614 +2052,727 @@ async def version_info(request: Request): @app.get("/", response_class=HTMLResponse) async def root(): """Landing page with API documentation.""" - from src import __version__ - auth_info = get_claude_code_auth_info() auth_method = auth_info.get("method", "unknown") auth_valid = auth_info.get("status", {}).get("valid", False) status_color = "#22c55e" if auth_valid else "#ef4444" - status_text = "Connected" if auth_valid else "Not Connected" - - html_content = f""" - - - - - - - Claude Code OpenAI Wrapper - - + - + - - -
- -
-
-
-
- - - -
-
-
-

Claude Code OpenAI Wrapper

-

OpenAI-compatible API for Claude

-
+ }}); + + + +
+ +
+

Claude Code OpenAI Wrapper

+
+ v{__version__} + + + + +
+
+ +
+ + {status_text} + + Auth: {auth_method} +
+ +
+
Quick Start
+
+ +
+
+
+ +
+
Endpoints
+ +
+
Core API
+
+ POST + /v1/chat/completions + OpenAI-compatible chat +
+
+ POST + /v1/messages + Anthropic-compatible +
+
+ +
+
Models
+
+ + GET + /v1/models + List available models + +
+ +
-
- v{__version__} - - - - - - +
+
+ + GET + /v1/models/status + Model service status + +
+ +
-
- - -
-
-
- - {status_text} -
- Auth: {auth_method} + +
+ + POST + /v1/models/refresh + Refresh from API + +
+

Requires api_key auth with ANTHROPIC_API_KEY set.

+ +
-
- - -
-
- - Quick Start + +
+ +
+
Sessions
+
+ + GET + /v1/sessions + List active sessions + +
+ +
-
- -
+
+
+ + GET + /v1/sessions/stats + Session statistics + +
+ +
-
- - -
-
- - API Endpoints + +
+ GET + /v1/sessions/{{id}} + Get session by ID +
+
+ DELETE + /v1/sessions/{{id}} + Delete session +
+
+ +
+
Tools
+
+ + GET + /v1/tools + List available tools + +
+ +
- - -
- POST - /v1/chat/completions - OpenAI-compatible chat +
+
+ + GET + /v1/tools/config + Tool configuration + +
+ +
-
- POST - /v1/messages - Anthropic-compatible +
+
+ POST + /v1/tools/config + Update tool config +
+
+ + GET + /v1/tools/stats + Tool usage stats + +
+ +
- - -
- - GET - /v1/models - List models - -
- -
-
-
- -
- - GET - /v1/auth/status - Auth status - -
- -
-
-
- -
- - GET - /v1/sessions - Active sessions - -
- -
-
-
- -
- - GET - /health - Health check - -
- -
-
-
- -
- - GET - /version - API version - -
- -
-
-
-
- - -
-
- - Configuration + +
+ +
+
MCP Servers
+
+ + GET + /v1/mcp/servers + List MCP servers + +
+ +
-

Set CLAUDE_AUTH_METHOD to choose authentication:

-
-
- cli -

Claude CLI auth

-
-
- api_key -

ANTHROPIC_API_KEY

-
-
- bedrock -

AWS Bedrock

-
-
- vertex -

Google Vertex AI

-
+
+
+ POST + /v1/mcp/servers + Register server +
+
+ POST + /v1/mcp/connect + Connect to server +
+
+ POST + /v1/mcp/disconnect + Disconnect server +
+
+ + GET + /v1/mcp/stats + MCP statistics + +
+ +
-
- - - -
- - - """ + + + +
+
Cache
+
+ + GET + /v1/cache/stats + Cache statistics + +
+ +
+
+
+
+ POST + /v1/cache/clear + Clear request cache +
+
+ +
+
Auth / Debug
+
+ + GET + /v1/auth/status + Auth status + +
+ +
+
+
+
+ POST + /v1/compatibility + Parameter compatibility check +
+
+ POST + /v1/debug/request + Debug request validation +
+
+ +
+
System
+
+ + GET + /health + Health check + +
+ +
+
+
+
+ + GET + /version + API version + +
+ +
+
+
+
+ + + + + + +""" return HTMLResponse(content=html_content) @app.post("/v1/debug/request") @rate_limit_endpoint("debug") async def debug_request_validation(request: Request): - """Debug endpoint to test request validation and see what's being sent.""" + """Debug endpoint to test request validation and see what's being sent. + + Returns a minimal response unless DEBUG_MODE or VERBOSE is enabled, so + that exception/request detail is never emitted to production clients. + """ + if not (DEBUG_MODE or VERBOSE): + return { + "debug_info": { + "enabled": False, + "hint": "Set DEBUG_MODE=true or VERBOSE=true to enable this endpoint", + } + } + try: # Get the raw request body body = await request.body() @@ -1548,7 +2786,9 @@ async def debug_request_validation(request: Request): parsed_body = json_lib.loads(raw_body) if raw_body else {} except Exception as e: - json_error = str(e) + # Only expose the exception type, never its message/stack trace. + json_error = type(e).__name__ + logger.warning(f"Debug endpoint JSON parse error: {e}") # Try to validate against our model validation_result = {"valid": False, "errors": []} @@ -1581,7 +2821,7 @@ async def debug_request_validation(request: Request): "validation_result": validation_result, "debug_mode_enabled": DEBUG_MODE or VERBOSE, "example_valid_request": { - "model": "claude-3-sonnet-20240229", + "model": "claude-sonnet-4-6", "messages": [{"role": "user", "content": "Hello, world!"}], "stream": False, }, @@ -1589,10 +2829,11 @@ async def debug_request_validation(request: Request): } except Exception as e: + # Never echo str(e); log it server-side and return only the type. + logger.error(f"Debug endpoint error: {e}") return { "debug_info": { - "error": f"Debug endpoint error: {str(e)}", - "headers": dict(request.headers), + "error_type": type(e).__name__, "method": request.method, "url": str(request.url), } @@ -1610,6 +2851,7 @@ async def get_auth_status(request: Request): return { "claude_code_auth": auth_info, + "cli_health": _auth.cli_health.as_dict(), "server_info": { "api_key_required": bool(active_api_key), "api_key_source": ( @@ -1907,6 +3149,41 @@ async def get_mcp_stats( return mcp_client.get_stats() +# ============================================================================ +# Cache Endpoints +# ============================================================================ + + +@app.get("/v1/cache/stats") +@rate_limit_endpoint("general") +async def get_cache_stats( + request: Request, credentials: Optional[HTTPAuthorizationCredentials] = Depends(security) +): + """Get request cache statistics. + + Returns information about cache configuration, current size, hit/miss rates, + and eviction counts. Cache is opt-in and disabled by default. + + Enable cache by setting REQUEST_CACHE_ENABLED=true environment variable. + """ + await verify_api_key(request, credentials) + return request_cache.get_stats() + + +@app.post("/v1/cache/clear") +@rate_limit_endpoint("general") +async def clear_cache( + request: Request, credentials: Optional[HTTPAuthorizationCredentials] = Depends(security) +): + """Clear all cached responses. + + Returns the number of entries that were cleared. 
+ """ + await verify_api_key(request, credentials) + count = request_cache.clear() + return {"message": f"Cleared {count} cache entries", "entries_cleared": count} + + @app.exception_handler(HTTPException) async def http_exception_handler(request: Request, exc: HTTPException): """Format HTTP exceptions as OpenAI-style errors.""" diff --git a/src/message_adapter.py b/src/message_adapter.py index 1c9d732..b927460 100644 --- a/src/message_adapter.py +++ b/src/message_adapter.py @@ -1,11 +1,661 @@ -from typing import List, Optional, Dict, Any +from typing import List, Optional, Dict, Any, Tuple +from dataclasses import dataclass from src.models import Message import re +import json +import logging + +logger = logging.getLogger(__name__) + + +# Maximum length of content passed through filter_content(). Inputs larger +# than this are returned unchanged to avoid worst-case work on +# pathological inputs (ReDoS defence in depth). +_MAX_FILTER_LENGTH = 1_000_000 + +# Tag stripping is done with ``str.find`` below rather than regex, which +# keeps the work strictly linear in input length even when the closing +# tag is missing. CodeQL's py/polynomial-redos rule flags the original +# ".*?" shape (and, in practice, non-backtracking regex +# rewrites still rescan on unterminated input). + +_TOOL_TAGS = ( + "read_file", + "write_file", + "bash", + "search_files", + "str_replace_editor", + "args", + "ask_followup_question", + "attempt_completion", + "question", + "follow_up", + "suggest", +) + + +def _strip_tag_blocks(content: str, tag: str) -> str: + """Remove every ``...`` block from ``content`` in linear time. + + Unlike regex-based stripping, this uses ``str.find`` (C-implemented, + single-pass search) so pathological inputs like ``"" * N + "x"`` + complete in O(N) rather than backtracking quadratically. + """ + open_tag = f"<{tag}>" + close_tag = f"" + if open_tag not in content: + return content + parts: List[str] = [] + i = 0 + while True: + start = content.find(open_tag, i) + if start == -1: + parts.append(content[i:]) + return "".join(parts) + parts.append(content[i:start]) + end = content.find(close_tag, start + len(open_tag)) + if end == -1: + # No matching close tag - keep the remainder as-is rather than + # dropping user content, and bail out. + parts.append(content[start:]) + return "".join(parts) + i = end + len(close_tag) + + +def _extract_first_block(content: str, tag: str) -> Optional[str]: + """Return the inner text of the first ``...`` block or ``None``.""" + open_tag = f"<{tag}>" + close_tag = f"" + start = content.find(open_tag) + if start == -1: + return None + inner_start = start + len(open_tag) + end = content.find(close_tag, inner_start) + if end == -1: + return None + return content[inner_start:end] + + +# Bounded image-reference pattern. The originals used lazy ".*?" with a +# trailing lookahead which CodeQL flags as polynomial. Fixed upper bounds +# (well above any plausible legitimate image reference) make the match +# linear while still removing the unsupported content. +_IMAGE_RE = re.compile( + r"\[Image:[^\]]{0,1024}\]" r"|data:image/[A-Za-z0-9.+-]{1,32};base64,[^\s]{0,65536}" +) + +# Whitespace collapser used at the tail of filter_content. 
+_MULTI_NEWLINE_RE = re.compile(r"\n\s*\n\s*\n") + + +@dataclass +class JsonExtractionResult: + """Result of JSON extraction with metadata about the extraction process.""" + + content: Optional[str] + success: bool + method: str # "direct", "preamble_removed", "code_block", "brace_match", "fallback", "failed" + original_length: int + extracted_length: int + preamble_found: Optional[str] = None + + +class JsonFenceStripper: + """Strips markdown ```json fences from streaming chunks in real-time.""" + + _FENCES = ["```json\n", "```json\r\n", "```\n", "```\r\n"] + _MAX_FENCE_LEN = 10 # longest fence prefix to buffer + _CLOSE = "```" + + def __init__(self): + self._opening_buf = "" + self._opening_stripped = False + self._holdback = "" + + def process_delta(self, chunk: str) -> str: + if not chunk: + return "" + + # Phase 1: detect and strip opening fence + if not self._opening_stripped: + self._opening_buf += chunk + if len(self._opening_buf) < self._MAX_FENCE_LEN: + # Still accumulating -- check if it could be a fence prefix + for fence in self._FENCES: + if fence.startswith(self._opening_buf): + return "" # could still match, hold back + # No fence can match, release buffer + self._opening_stripped = True + result = self._opening_buf + self._opening_buf = "" + return self._apply_holdback(result) + else: + # Buffer full -- check for fence match + self._opening_stripped = True + for fence in self._FENCES: + if self._opening_buf.startswith(fence): + remainder = self._opening_buf[len(fence) :] + self._opening_buf = "" + return self._apply_holdback(remainder) + # No match, release everything + result = self._opening_buf + self._opening_buf = "" + return self._apply_holdback(result) + + return self._apply_holdback(chunk) + + def _apply_holdback(self, text: str) -> str: + combined = self._holdback + text + if len(combined) <= len(self._CLOSE): + self._holdback = combined + return "" + self._holdback = combined[-len(self._CLOSE) :] + return combined[: -len(self._CLOSE)] + + def flush(self) -> str: + result = self._holdback + self._holdback = "" + # Strip closing fence if present + result = result.rstrip() + if result.endswith("```"): + result = result[:-3].rstrip() + return result class MessageAdapter: """Converts between OpenAI message format and Claude Code prompts.""" + # Instruction to prepend to system prompt for JSON mode + JSON_MODE_INSTRUCTION = ( + "CRITICAL JSON OUTPUT RULES - FOLLOW EXACTLY:\n" + "1. Your ENTIRE response must be valid JSON - nothing else\n" + "2. The FIRST character must be { or [ (no exceptions)\n" + "3. The LAST character must be } or ] (no exceptions)\n" + "4. FORBIDDEN: Do NOT write 'Here is the JSON:', 'Here's the response:', or ANY preamble\n" + "5. FORBIDDEN: Do NOT use markdown code blocks (```)\n" + "6. FORBIDDEN: Do NOT add any explanation before or after the JSON\n" + "7. Start typing the JSON immediately - your first keystroke must be { or [" + ) + + # Suffix to append to user prompt to reinforce JSON mode + JSON_PROMPT_SUFFIX = ( + "\n\n---\n" + "RESPOND WITH RAW JSON ONLY:\n" + "- First character: { or [\n" + "- Last character: } or ]\n" + "- No preamble like 'Here is...' 
or 'Here's...'\n" + "- No markdown, no code fences, no explanation" + ) + + JSON_SCHEMA_TEMPLATE = ( + "You MUST respond with valid JSON that strictly conforms to the following JSON Schema.\n" + "Do not wrap the JSON in markdown code fences.\n" + "Do not include any text before or after the JSON.\n" + "RULES:\n" + "- Include ALL required properties from the schema, even if empty or default\n" + "- Use the EXACT property names from the schema\n" + "- Match the EXACT types specified (number not string, etc.)\n" + "- Do not add properties not in the schema\n\n" + "JSON Schema:\n{schema_json}" + ) + + # Common preambles that Claude may add before JSON output + COMMON_PREAMBLES = [ + "Here's the JSON:", + "Here is the JSON:", + "Here's the response:", + "Here is the response:", + "Here's your JSON:", + "Here is your JSON:", + "Here's the JSON response:", + "Here is the JSON response:", + "Here's the data:", + "Here is the data:", + "Here's the result:", + "Here is the result:", + "Here's the output:", + "Here is the output:", + "The JSON is:", + "JSON response:", + "Response:", + "Output:", + "Result:", + ] + + @staticmethod + def _find_balanced_json(content: str, start_char: str, end_char: str) -> Optional[str]: + """ + Find balanced JSON structure using brace/bracket matching. + + Handles escaped quotes and braces inside strings correctly. + + Args: + content: The content to search in + start_char: Opening character ('{' or '[') + end_char: Closing character ('}' or ']') + + Returns: + Matched JSON substring or None if not found + """ + start_idx = content.find(start_char) + if start_idx == -1: + return None + + depth = 0 + in_string = False + escape_next = False + + for i, char in enumerate(content[start_idx:], start=start_idx): + if escape_next: + escape_next = False + continue + + if char == "\\": + escape_next = True + continue + + if char == '"' and not escape_next: + in_string = not in_string + continue + + if in_string: + continue + + if char == start_char: + depth += 1 + elif char == end_char: + depth -= 1 + if depth == 0: + candidate = content[start_idx : i + 1] + try: + json.loads(candidate) + return candidate + except json.JSONDecodeError: + # Keep looking for next valid match + return None + + return None + + @staticmethod + def _log_extraction_diagnostics(content: str) -> None: + """Log diagnostics to help debug JSON extraction failures.""" + logger.debug("=== JSON Extraction Diagnostics ===") + + # Check for code fences + if "```" in content: + fence_count = content.count("```") + logger.debug(f"Found {fence_count} code fence markers (```) in content") + if fence_count % 2 != 0: + logger.debug("Odd number of fences - malformed code block?") + + # Check for common preambles + content_lower = content.lower().strip() + for preamble in MessageAdapter.COMMON_PREAMBLES: + if content_lower.startswith(preamble.lower()): + logger.debug(f"Content starts with preamble: '{preamble}'") + break + + # Check brace/bracket balance + open_braces = content.count("{") + close_braces = content.count("}") + open_brackets = content.count("[") + close_brackets = content.count("]") + + logger.debug(f"Brace balance: {{ = {open_braces}, }} = {close_braces}") + logger.debug(f"Bracket balance: [ = {open_brackets}, ] = {close_brackets}") + + if open_braces != close_braces: + logger.debug("Unbalanced braces - may indicate truncated or malformed JSON") + if open_brackets != close_brackets: + logger.debug("Unbalanced brackets - may indicate truncated or malformed JSON") + + # First and last character analysis + 
if content: + first_char = content[0] if content else "" + last_char = content[-1] if content else "" + logger.debug(f"First character: '{first_char}', Last character: '{last_char}'") + + if first_char not in "{[": + logger.debug("First char is not { or [ - content has preamble or is not JSON") + if last_char not in "}]": + logger.debug("Last char is not } or ] - content has suffix or is not JSON") + + # Content preview + preview_len = 200 + if len(content) > preview_len: + logger.debug(f"Content preview (first {preview_len}): {content[:preview_len]}...") + logger.debug(f"Content preview (last 100): ...{content[-100:]}") + else: + logger.debug(f"Full content: {content}") + + logger.debug("=== End Diagnostics ===") + + @staticmethod + def extract_json(content: str) -> Optional[str]: + """ + Extract JSON from content. + + Priority order: + 1. Pure JSON (content is already valid JSON) - fast path + 2. Preamble removal + parse (strip common Claude preambles) + 3. Markdown code blocks (```json ... ```) + 4. Balanced brace/bracket matching (handles nested structures) + 5. First-to-last fallback (find first { to last }) + + Args: + content: The content to extract JSON from + + Returns: + Extracted JSON string, or None if no valid JSON found + """ + if not content: + logger.debug("extract_json: Empty content") + return None + + original_content = content + content = content.strip() + + # Case 1: Try parsing as pure JSON first (fast path) + try: + json.loads(content) + logger.debug(f"extract_json: Already valid JSON ({len(content)} chars)") + return content + except json.JSONDecodeError: + pass + + # Case 2: Try removing common preambles + content_lower = content.lower() + for preamble in MessageAdapter.COMMON_PREAMBLES: + if content_lower.startswith(preamble.lower()): + stripped = content[len(preamble) :].strip() + try: + json.loads(stripped) + logger.debug( + f"extract_json: Extracted after removing preamble '{preamble}' ({len(stripped)} chars)" + ) + return stripped + except json.JSONDecodeError: + # Preamble removed but still not valid - try other methods + break + + # Case 3: Extract from markdown code blocks + code_block_patterns = [ + r"```json\s*([\s\S]*?)\s*```", # ```json block + r"```\s*([\s\S]*?)\s*```", # generic ``` block + ] + + for pattern in code_block_patterns: + matches = re.findall(pattern, content, re.IGNORECASE) + for match in matches: + match = match.strip() + try: + json.loads(match) + logger.debug(f"extract_json: Extracted from code block ({len(match)} chars)") + return match + except json.JSONDecodeError: + logger.debug("extract_json: Code block match failed validation") + continue + + # Case 4: Balanced brace/bracket matching (new algorithm) + # Try object first + balanced_obj = MessageAdapter._find_balanced_json(content, "{", "}") + if balanced_obj: + logger.debug( + f"extract_json: Extracted via balanced brace matching ({len(balanced_obj)} chars)" + ) + return balanced_obj + + # Try array + balanced_arr = MessageAdapter._find_balanced_json(content, "[", "]") + if balanced_arr: + logger.debug( + f"extract_json: Extracted via balanced bracket matching ({len(balanced_arr)} chars)" + ) + return balanced_arr + + # Case 5: First-to-last fallback (less precise but handles some edge cases) + first_brace = content.find("{") + last_brace = content.rfind("}") + if first_brace != -1 and last_brace > first_brace: + candidate = content[first_brace : last_brace + 1] + try: + json.loads(candidate) + logger.debug( + f"extract_json: Extracted via first-to-last brace ({len(candidate)} 
chars)" + ) + return candidate + except json.JSONDecodeError: + pass + + first_bracket = content.find("[") + last_bracket = content.rfind("]") + if first_bracket != -1 and last_bracket > first_bracket: + candidate = content[first_bracket : last_bracket + 1] + try: + json.loads(candidate) + logger.debug( + f"extract_json: Extracted via first-to-last bracket ({len(candidate)} chars)" + ) + return candidate + except json.JSONDecodeError: + pass + + # Extraction failed - log diagnostics + logger.warning(f"extract_json: No valid JSON found in {len(content)} chars") + MessageAdapter._log_extraction_diagnostics(original_content) + return None + + @staticmethod + def extract_json_with_metadata(content: str) -> JsonExtractionResult: + """ + Extract JSON from content and return metadata about the extraction process. + + This method provides detailed information about how the extraction was performed, + useful for debugging and monitoring. + + Args: + content: The content to extract JSON from + + Returns: + JsonExtractionResult with extraction details + """ + if not content: + return JsonExtractionResult( + content=None, + success=False, + method="failed", + original_length=0, + extracted_length=0, + ) + + original_length = len(content) + content = content.strip() + + # Case 1: Try parsing as pure JSON first (fast path) + try: + json.loads(content) + return JsonExtractionResult( + content=content, + success=True, + method="direct", + original_length=original_length, + extracted_length=len(content), + ) + except json.JSONDecodeError: + pass + + # Case 2: Try removing common preambles + content_lower = content.lower() + for preamble in MessageAdapter.COMMON_PREAMBLES: + if content_lower.startswith(preamble.lower()): + stripped = content[len(preamble) :].strip() + try: + json.loads(stripped) + return JsonExtractionResult( + content=stripped, + success=True, + method="preamble_removed", + original_length=original_length, + extracted_length=len(stripped), + preamble_found=preamble, + ) + except json.JSONDecodeError: + break + + # Case 3: Extract from markdown code blocks + code_block_patterns = [ + r"```json\s*([\s\S]*?)\s*```", + r"```\s*([\s\S]*?)\s*```", + ] + + for pattern in code_block_patterns: + matches = re.findall(pattern, content, re.IGNORECASE) + for match in matches: + match = match.strip() + try: + json.loads(match) + return JsonExtractionResult( + content=match, + success=True, + method="code_block", + original_length=original_length, + extracted_length=len(match), + ) + except json.JSONDecodeError: + continue + + # Case 4: Balanced brace/bracket matching + balanced_obj = MessageAdapter._find_balanced_json(content, "{", "}") + if balanced_obj: + return JsonExtractionResult( + content=balanced_obj, + success=True, + method="brace_match", + original_length=original_length, + extracted_length=len(balanced_obj), + ) + + balanced_arr = MessageAdapter._find_balanced_json(content, "[", "]") + if balanced_arr: + return JsonExtractionResult( + content=balanced_arr, + success=True, + method="brace_match", + original_length=original_length, + extracted_length=len(balanced_arr), + ) + + # Case 5: First-to-last fallback + first_brace = content.find("{") + last_brace = content.rfind("}") + if first_brace != -1 and last_brace > first_brace: + candidate = content[first_brace : last_brace + 1] + try: + json.loads(candidate) + return JsonExtractionResult( + content=candidate, + success=True, + method="fallback", + original_length=original_length, + extracted_length=len(candidate), + ) + except 
json.JSONDecodeError: + pass + + first_bracket = content.find("[") + last_bracket = content.rfind("]") + if first_bracket != -1 and last_bracket > first_bracket: + candidate = content[first_bracket : last_bracket + 1] + try: + json.loads(candidate) + return JsonExtractionResult( + content=candidate, + success=True, + method="fallback", + original_length=original_length, + extracted_length=len(candidate), + ) + except json.JSONDecodeError: + pass + + # Failed + return JsonExtractionResult( + content=None, + success=False, + method="failed", + original_length=original_length, + extracted_length=0, + ) + + @staticmethod + def enforce_json_format(content: str, strict: bool = False) -> str: + """ + Enforce JSON format on content. + + Args: + content: The content to enforce JSON format on + strict: If True, return '[]' on failure. If False, return original content. + + Returns: + Valid JSON string, or fallback value based on strict mode + """ + extracted = MessageAdapter.extract_json(content) + + if extracted: + logger.debug(f"enforce_json_format: Successfully extracted ({len(extracted)} chars)") + return extracted + + logger.warning(f"enforce_json_format: Extraction failed, strict={strict}") + if strict: + return "[]" + + return content + + @staticmethod + def enforce_json_format_with_metadata( + content: str, strict: bool = False + ) -> Tuple[str, Dict[str, Any]]: + """ + Enforce JSON format on content and return metadata about the extraction. + + Args: + content: The content to enforce JSON format on + strict: If True, return '[]' on failure. If False, return original content. + + Returns: + Tuple of (extracted_content, metadata_dict) + """ + result = MessageAdapter.extract_json_with_metadata(content) + + metadata = { + "success": result.success, + "method": result.method, + "original_length": result.original_length, + "extracted_length": result.extracted_length, + "preamble_found": result.preamble_found, + "strict_mode": strict, + } + + if result.success and result.content: + logger.debug( + f"enforce_json_format_with_metadata: method={result.method}, " + f"original={result.original_length}, extracted={result.extracted_length}" + ) + if result.preamble_found: + logger.debug( + f"enforce_json_format_with_metadata: removed preamble '{result.preamble_found}'" + ) + return result.content, metadata + + logger.warning(f"enforce_json_format_with_metadata: Extraction failed, strict={strict}") + metadata["fallback_used"] = True + + if strict: + metadata["fallback_value"] = "[]" + return "[]", metadata + + return content, metadata + @staticmethod def messages_to_prompt(messages: List[Message]) -> tuple[str, Optional[str]]: """ @@ -16,13 +666,13 @@ def messages_to_prompt(messages: List[Message]) -> tuple[str, Optional[str]]: conversation_parts = [] for message in messages: + content = message.content or "" if message.role == "system": - # Use the last system message as the system prompt - system_prompt = message.content + system_prompt = content elif message.role == "user": - conversation_parts.append(f"Human: {message.content}") + conversation_parts.append(f"Human: {content}") elif message.role == "assistant": - conversation_parts.append(f"Assistant: {message.content}") + conversation_parts.append(f"Assistant: {content}") # Join conversation parts prompt = "\n\n".join(conversation_parts) @@ -40,56 +690,36 @@ def filter_content(content: str) -> str: Remove thinking blocks, tool calls, and image references. 
""" if not content: + return content or "" + + # Defence in depth: cap work on adversarial inputs. + if len(content) > _MAX_FILTER_LENGTH: return content # Remove thinking blocks (common when tools are disabled but Claude tries to think) - thinking_pattern = r".*?" - content = re.sub(thinking_pattern, "", content, flags=re.DOTALL) + content = _strip_tag_blocks(content, "thinking") # Extract content from attempt_completion blocks (these contain the actual user response) - attempt_completion_pattern = r"(.*?)" - attempt_matches = re.findall(attempt_completion_pattern, content, flags=re.DOTALL) - if attempt_matches: - # Use the content from the attempt_completion block - extracted_content = attempt_matches[0].strip() + attempt_inner = _extract_first_block(content, "attempt_completion") + if attempt_inner is not None: + extracted_content = attempt_inner.strip() # If there's a tag inside, extract from that - result_pattern = r"(.*?)" - result_matches = re.findall(result_pattern, extracted_content, flags=re.DOTALL) - if result_matches: - extracted_content = result_matches[0].strip() + result_inner = _extract_first_block(extracted_content, "result") + if result_inner is not None: + extracted_content = result_inner.strip() if extracted_content: content = extracted_content else: # Remove other tool usage blocks (when tools are disabled but Claude tries to use them) - tool_patterns = [ - r".*?", - r".*?", - r".*?", - r".*?", - r".*?", - r".*?", - r".*?", - r".*?", - r".*?", - r".*?", - r".*?", - ] - - for pattern in tool_patterns: - content = re.sub(pattern, "", content, flags=re.DOTALL) - - # Pattern to match image references or base64 data - image_pattern = r"\[Image:.*?\]|data:image/.*?;base64,.*?(?=\s|$)" - - def replace_image(match): - return "[Image: Content not supported by Claude Code]" - - content = re.sub(image_pattern, replace_image, content) + for tag in _TOOL_TAGS: + content = _strip_tag_blocks(content, tag) + + content = _IMAGE_RE.sub("[Image: Content not supported by Claude Code]", content) # Clean up extra whitespace and newlines - content = re.sub(r"\n\s*\n\s*\n", "\n\n", content) # Multiple newlines to double + content = _MULTI_NEWLINE_RE.sub("\n\n", content) content = content.strip() # If content is now empty or only whitespace, provide a fallback diff --git a/src/model_service.py b/src/model_service.py new file mode 100644 index 0000000..3abee8a --- /dev/null +++ b/src/model_service.py @@ -0,0 +1,238 @@ +""" +Model service for dynamically fetching available models from Anthropic API. 
+ +This service provides: +- Dynamic model discovery from Anthropic API on startup +- Runtime model refresh via refresh_models() method +- Graceful fallback to static CLAUDE_MODELS when API is unavailable +- Caching of fetched models with refresh timestamp tracking +- Auth method awareness (only fetches from API for 'anthropic' auth) +""" + +import os +import time +import logging +from typing import List, Optional, Dict, Any + +import httpx + +from src.constants import CLAUDE_MODELS +from src.auth import auth_manager + +logger = logging.getLogger(__name__) + +# Anthropic API configuration +ANTHROPIC_API_BASE = "https://api.anthropic.com" +ANTHROPIC_API_VERSION = "2023-06-01" +MODEL_FETCH_TIMEOUT = 10.0 # seconds + + +class ModelService: + """Fetches models from Anthropic API with fallback to constants.""" + + def __init__(self): + self._cached_models: Optional[List[str]] = None + self._http_client: Optional[httpx.AsyncClient] = None + self._initialized: bool = False + self._last_refresh: Optional[float] = None + self._source: str = "fallback" # "api" or "fallback" + + async def initialize(self) -> None: + """Called during app startup - fetch models from API.""" + if self._initialized: + return + + self._http_client = httpx.AsyncClient(timeout=MODEL_FETCH_TIMEOUT) + + # Attempt to fetch models from API + fetched_models = await self.fetch_models_from_api() + + if fetched_models: + self._cached_models = fetched_models + self._source = "api" + self._last_refresh = time.time() + logger.info(f"Successfully fetched {len(fetched_models)} models from Anthropic API") + else: + self._cached_models = None + self._source = "fallback" + logger.info("Using fallback static model list from constants") + + self._initialized = True + + async def shutdown(self) -> None: + """Close HTTP client on app shutdown.""" + if self._http_client: + await self._http_client.aclose() + self._http_client = None + self._cached_models = None + self._initialized = False + self._last_refresh = None + self._source = "fallback" + + async def fetch_models_from_api(self) -> Optional[List[str]]: + """ + Fetch models based on configured auth method. + + Only the 'anthropic' auth method supports dynamic model fetching. + Other auth methods (cli, bedrock, vertex) use static model lists. + + Returns list of model IDs on success, None on failure/unsupported. + """ + auth_method = auth_manager.auth_method + + if auth_method == "anthropic": + # Use ANTHROPIC_API_KEY for direct API call + api_key = os.getenv("ANTHROPIC_API_KEY") + if not api_key: + logger.debug("ANTHROPIC_API_KEY not set, using fallback") + return None + return await self._fetch_with_api_key(api_key) + + elif auth_method == "claude_cli": + # CLI auth doesn't expose API key - use fallback + logger.info("CLI auth method configured - using static model list") + return None + + elif auth_method == "bedrock": + # Bedrock uses different model naming, use fallback + logger.info("Bedrock auth method - using static model list") + return None + + elif auth_method == "vertex": + # Vertex uses different model naming, use fallback + logger.info("Vertex auth method - using static model list") + return None + + logger.debug(f"Unknown auth method '{auth_method}', using fallback") + return None + + async def _fetch_with_api_key(self, api_key: str) -> Optional[List[str]]: + """ + Fetch models from Anthropic API using API key. 
+ + GET https://api.anthropic.com/v1/models + Headers: + - x-api-key: {ANTHROPIC_API_KEY} + - anthropic-version: 2023-06-01 + + Returns list of model IDs on success, None on failure. + """ + if not self._http_client: + self._http_client = httpx.AsyncClient(timeout=MODEL_FETCH_TIMEOUT) + + try: + response = await self._http_client.get( + f"{ANTHROPIC_API_BASE}/v1/models", + headers={ + "x-api-key": api_key, + "anthropic-version": ANTHROPIC_API_VERSION, + }, + ) + + if response.status_code == 200: + data = response.json() + # Extract model IDs from the response + # API returns {"data": [{"id": "claude-...", ...}, ...]} + models = [] + for model_data in data.get("data", []): + model_id = model_data.get("id") + if model_id: + models.append(model_id) + + if models: + logger.debug(f"Fetched models from API: {models}") + return models + else: + logger.warning("API returned empty model list") + return None + + elif response.status_code == 401: + logger.warning( + "Anthropic API authentication failed (401). Check ANTHROPIC_API_KEY." + ) + return None + elif response.status_code == 429: + logger.warning("Anthropic API rate limited (429). Using fallback models.") + return None + else: + logger.warning( + f"Anthropic API returned status {response.status_code}. Using fallback models." + ) + return None + + except httpx.TimeoutException: + logger.warning(f"Anthropic API request timed out after {MODEL_FETCH_TIMEOUT}s") + return None + except httpx.RequestError as e: + logger.warning(f"Network error fetching models from Anthropic API: {e}") + return None + except Exception as e: + logger.warning(f"Unexpected error fetching models: {e}") + return None + + def get_models(self) -> List[str]: + """Return cached models or CLAUDE_MODELS fallback.""" + if self._cached_models: + return self._cached_models + return list(CLAUDE_MODELS) + + def is_initialized(self) -> bool: + """Check if service has been initialized.""" + return self._initialized + + async def refresh_models(self) -> Dict[str, Any]: + """Force refresh models from Anthropic API. + + Returns a dict with refresh status and model information. + If the API call fails, existing cached models are preserved. + + Note: Only 'anthropic' auth method supports dynamic refresh. + Other auth methods will return success=False with explanation. + """ + auth_method = auth_manager.auth_method + + # Check if auth method supports dynamic refresh + if auth_method != "anthropic": + return { + "success": False, + "message": f"Dynamic refresh requires ANTHROPIC_API_KEY. 
Current auth: {auth_method}", + "current_count": len(self.get_models()), + "source": self._source, + "auth_method": auth_method, + } + + models = await self.fetch_models_from_api() + if models: + self._cached_models = models + self._last_refresh = time.time() + self._source = "api" + logger.info(f"Refreshed {len(models)} models from Anthropic API") + return { + "success": True, + "count": len(models), + "source": "api", + "models": models, + "auth_method": auth_method, + } + else: + return { + "success": False, + "message": "API fetch failed, keeping existing models", + "current_count": len(self.get_models()), + "source": self._source, + "auth_method": auth_method, + } + + def get_status(self) -> Dict[str, Any]: + """Get service status including source, auth method, and last refresh time.""" + return { + "initialized": self._initialized, + "source": self._source, + "model_count": len(self.get_models()), + "last_refresh": self._last_refresh, + "auth_method": auth_manager.auth_method, + } + + +# Global singleton instance +model_service = ModelService() diff --git a/src/models.py b/src/models.py index 82e85f4..3a0a1f9 100644 --- a/src/models.py +++ b/src/models.py @@ -1,3 +1,4 @@ +import os from typing import List, Optional, Dict, Any, Union, Literal from pydantic import BaseModel, Field, field_validator, model_validator from datetime import datetime @@ -7,12 +8,34 @@ logger = logging.getLogger(__name__) -# Import DEFAULT_MODEL to avoid circular imports +# Resolve the default model lazily (avoids circular imports). If the operator +# set DEFAULT_MODEL via env var, honor it; otherwise prefer the live-resolved +# latest Sonnet (set at startup by main.resolve_default_model), falling back +# to the static constant when resolution hasn't happened yet. def get_default_model(): - """Get default model from constants to avoid circular imports.""" - from src.constants import DEFAULT_MODEL - - return DEFAULT_MODEL + from src import constants + + if constants.DEFAULT_MODEL_ENV: + return constants.DEFAULT_MODEL_ENV + return constants.RESOLVED_DEFAULT_MODEL or constants.DEFAULT_MODEL_FALLBACK + + +def _map_max_tokens_to_thinking() -> bool: + """Whether to map OpenAI max_tokens to Claude's max_thinking_tokens. + + Historically the wrapper sent ``max_tokens`` as ``max_thinking_tokens`` + because the Claude Agent SDK has no direct output-length cap. That mapping + is semantically wrong (OpenAI max_tokens caps response length; thinking + tokens cap reasoning budget) and caused short prompts to either bust the + max_turns ceiling or return truncated output. The mapping is now opt-in + via ``WRAPPER_MAP_MAX_TOKENS_TO_THINKING=true``; default is off. 
+ """ + return os.getenv("WRAPPER_MAP_MAX_TOKENS_TO_THINKING", "false").lower() in ( + "true", + "1", + "yes", + "on", + ) class ContentPart(BaseModel): @@ -22,10 +45,34 @@ class ContentPart(BaseModel): text: str +class FunctionCall(BaseModel): + name: str + arguments: str + + +class ToolCall(BaseModel): + id: str + type: Literal["function"] = "function" + function: FunctionCall + + +class FunctionDefinition(BaseModel): + name: str + description: Optional[str] = None + parameters: Optional[Dict[str, Any]] = None + + +class ToolDefinition(BaseModel): + type: Literal["function"] = "function" + function: FunctionDefinition + + class Message(BaseModel): - role: Literal["system", "user", "assistant"] - content: Union[str, List[ContentPart]] + role: Literal["system", "user", "assistant", "tool"] + content: Optional[Union[str, List[ContentPart]]] = None name: Optional[str] = None + tool_calls: Optional[List[ToolCall]] = None + tool_call_id: Optional[str] = None @model_validator(mode="after") def normalize_content(self): @@ -53,6 +100,22 @@ class StreamOptions(BaseModel): ) +class JsonSchema(BaseModel): + name: str = "" + description: Optional[str] = None + schema_: Optional[Dict[str, Any]] = Field(default=None, alias="schema") + strict: Optional[bool] = None + model_config = {"populate_by_name": True} + + +class ResponseFormat(BaseModel): + type: Literal["text", "json_object", "json_schema"] = Field( + default="text", + description="Response format type", + ) + json_schema: Optional[JsonSchema] = None + + class ChatCompletionRequest(BaseModel): model: str = Field(default_factory=get_default_model) messages: List[Message] @@ -79,6 +142,18 @@ class ChatCompletionRequest(BaseModel): stream_options: Optional[StreamOptions] = Field( default=None, description="Options for streaming responses" ) + response_format: Optional[ResponseFormat] = Field( + default=None, + description="Response format - use {'type': 'json_object'} for JSON mode", + ) + tools: Optional[List[ToolDefinition]] = Field( + default=None, + description="List of tools the model may call (OpenAI function calling format)", + ) + tool_choice: Optional[Union[str, Dict[str, Any]]] = Field( + default=None, + description="Controls which function is called: 'none', 'auto', 'required', or specific function", + ) @field_validator("n") @classmethod @@ -106,9 +181,17 @@ def log_parameter_info(self): if self.max_tokens is not None or self.max_completion_tokens is not None: max_val = self.max_completion_tokens or self.max_tokens - info_messages.append( - f"max_tokens={max_val} will be mapped to max_thinking_tokens (best-effort)" - ) + if _map_max_tokens_to_thinking(): + info_messages.append( + f"max_tokens={max_val} will be mapped to max_thinking_tokens " + "(legacy behavior; WRAPPER_MAP_MAX_TOKENS_TO_THINKING=true)" + ) + else: + info_messages.append( + f"max_tokens={max_val} is ignored (Claude Agent SDK has no " + "output-length cap; set WRAPPER_MAP_MAX_TOKENS_TO_THINKING=true " + "to restore the legacy max_thinking_tokens mapping)" + ) if self.presence_penalty != 0: warnings.append( @@ -181,14 +264,19 @@ def to_claude_options(self) -> Dict[str, Any]: if self.model: options["model"] = self.model - # Map max_tokens to max_thinking_tokens (best effort) + # OpenAI max_tokens is a response-length cap; the Claude Agent SDK has + # no direct equivalent. Historically we mapped it to max_thinking_tokens, + # which misused the parameter and caused callers sending small + # max_tokens (e.g. 
500 for a short title prompt) to burn their budget + # on reasoning then bust max_turns before emitting the answer. The + # mapping is now off by default; opt in via + # WRAPPER_MAP_MAX_TOKENS_TO_THINKING=true to restore the old behavior. max_token_value = self.max_completion_tokens or self.max_tokens - if max_token_value is not None: - # Claude SDK doesn't have exact token limiting, but we can try max_thinking_tokens - # This is approximate and may not work as expected + if max_token_value is not None and _map_max_tokens_to_thinking(): options["max_thinking_tokens"] = max_token_value logger.info( - f"Mapped max_tokens={max_token_value} to max_thinking_tokens (approximate behavior)" + f"Mapped max_tokens={max_token_value} to max_thinking_tokens " + "(legacy behavior; WRAPPER_MAP_MAX_TOKENS_TO_THINKING=true)" ) # Use user field for session identification if provided @@ -202,7 +290,9 @@ def to_claude_options(self) -> Dict[str, Any]: class Choice(BaseModel): index: int message: Message - finish_reason: Optional[Literal["stop", "length", "content_filter", "null"]] = None + finish_reason: Optional[Literal["stop", "length", "content_filter", "tool_calls", "null"]] = ( + None + ) class Usage(BaseModel): @@ -224,7 +314,9 @@ class ChatCompletionResponse(BaseModel): class StreamChoice(BaseModel): index: int delta: Dict[str, Any] - finish_reason: Optional[Literal["stop", "length", "content_filter", "null"]] = None + finish_reason: Optional[Literal["stop", "length", "content_filter", "tool_calls", "null"]] = ( + None + ) class ChatCompletionStreamResponse(BaseModel): diff --git a/src/parameter_validator.py b/src/parameter_validator.py index e45452f..df44fd9 100644 --- a/src/parameter_validator.py +++ b/src/parameter_validator.py @@ -3,17 +3,33 @@ """ import logging -from typing import Dict, Any, List, Optional +from typing import Dict, Any, List, Optional, Set from src.models import ChatCompletionRequest -from src.constants import CLAUDE_MODELS +from src.constants import CLAUDE_MODELS, MODEL_METADATA, VALID_EFFORT_LEVELS, VALID_THINKING_MODES logger = logging.getLogger(__name__) +def get_supported_models() -> Set[str]: + """Get supported models from model_service or fallback to constants.""" + try: + from src.model_service import model_service + + return set(model_service.get_models()) + except ImportError: + return set(CLAUDE_MODELS) + + class ParameterValidator: """Validates and maps OpenAI Chat Completions parameters to Claude Code SDK options.""" - # Use models from constants (single source of truth) + @classmethod + def get_supported_models(cls) -> Set[str]: + """Get currently supported models (dynamic or fallback).""" + return get_supported_models() + + # Legacy class attribute for backwards compatibility + # Use get_supported_models() method for dynamic models SUPPORTED_MODELS = set(CLAUDE_MODELS) # Valid permission modes for Claude Code SDK @@ -22,9 +38,10 @@ class ParameterValidator: @classmethod def validate_model(cls, model: str) -> bool: """Validate that the model is supported by Claude Code SDK.""" - if model not in cls.SUPPORTED_MODELS: + supported = cls.get_supported_models() + if model not in supported: logger.warning( - f"Model '{model}' is not in the known supported models list. It will still be attempted but may fail. Supported models: {sorted(cls.SUPPORTED_MODELS)}" + f"Model '{model}' is not in the known supported models list. It will still be attempted but may fail. 
Supported models: {sorted(supported)}" ) # Return True anyway to allow graceful degradation return True @@ -139,8 +156,55 @@ def extract_claude_headers(cls, headers: Dict[str, str]) -> Dict[str, Any]: f"Invalid X-Claude-Max-Thinking-Tokens header: {headers['x-claude-max-thinking-tokens']}" ) + # Extract effort level (low, medium, high, max) + if "x-claude-effort" in headers: + effort = headers["x-claude-effort"].lower().strip() + if effort in VALID_EFFORT_LEVELS: + claude_options["effort"] = effort + else: + logger.warning( + f"Invalid X-Claude-Effort header: '{effort}'. " + f"Valid values: {sorted(VALID_EFFORT_LEVELS)}" + ) + + # Extract thinking mode (adaptive, enabled, disabled) + if "x-claude-thinking" in headers: + thinking = headers["x-claude-thinking"].lower().strip() + if thinking in VALID_THINKING_MODES: + claude_options["thinking"] = thinking + else: + logger.warning( + f"Invalid X-Claude-Thinking header: '{thinking}'. " + f"Valid values: {sorted(VALID_THINKING_MODES)}" + ) + return claude_options + @classmethod + def validate_max_tokens(cls, model: str, requested_max_tokens: Optional[int]) -> Optional[int]: + """Validate and cap max_tokens based on model-specific limits. + + Returns the validated max_tokens value, or None if not specified. + Model metadata sourced from open-sourced Claude Code CLI. + """ + if requested_max_tokens is None: + return None + + metadata = MODEL_METADATA.get(model) + if not metadata: + # Unknown model, pass through without validation + return requested_max_tokens + + max_limit = metadata["max_output_limit"] + if requested_max_tokens > max_limit: + logger.warning( + f"max_tokens={requested_max_tokens} exceeds limit for {model} " + f"(max={max_limit}). Capping to {max_limit}." + ) + return max_limit + + return requested_max_tokens + class CompatibilityReporter: """Reports on OpenAI API compatibility and suggests alternatives.""" @@ -164,12 +228,14 @@ def generate_compatibility_report(cls, request: ChatCompletionRequest) -> Dict[s report["supported_parameters"].append("stream") if request.user: report["supported_parameters"].append("user (for logging)") + if request.response_format: + report["supported_parameters"].append("response_format") # Check unsupported parameters with suggestions if request.temperature != 1.0: report["unsupported_parameters"].append("temperature") report["suggestions"].append( - "Claude Code SDK does not support temperature control. Consider using different models for varied response styles (e.g., claude-3-5-haiku for more focused responses)." + "Claude Code SDK does not support temperature control. Consider using different models for varied response styles (e.g., claude-haiku-4-5-20251001 for more focused responses)." ) if request.top_p != 1.0: diff --git a/src/request_cache.py b/src/request_cache.py new file mode 100644 index 0000000..ddf7386 --- /dev/null +++ b/src/request_cache.py @@ -0,0 +1,248 @@ +""" +Request deduplication cache for Claude Code OpenAI Wrapper. + +Provides an optional caching layer for identical requests to reduce API calls +and improve response times for repeated queries. 
+""" + +import hashlib +import json +import os +import threading +import time +import logging +from dataclasses import dataclass, field +from typing import Dict, Any, Optional +from collections import OrderedDict + +logger = logging.getLogger(__name__) + + +@dataclass +class CacheEntry: + """A cached response with metadata.""" + + response: Dict[str, Any] + created_at: float + expires_at: float + hit_count: int = 0 + + +class RequestCache: + """ + Thread-safe LRU cache with TTL for request deduplication. + + Features: + - LRU eviction when max_size is reached + - TTL-based expiration + - Thread-safe operations + - Deterministic request hashing + """ + + def __init__( + self, + enabled: bool = True, + max_size: int = 100, + ttl_seconds: int = 60, + ): + """ + Initialize the request cache. + + Args: + enabled: Whether caching is enabled + max_size: Maximum number of entries to store + ttl_seconds: Time-to-live for cache entries in seconds + """ + self._enabled = enabled + self._max_size = max_size + self._ttl_seconds = ttl_seconds + self._cache: OrderedDict[str, CacheEntry] = OrderedDict() + self._lock = threading.RLock() + self._stats = { + "hits": 0, + "misses": 0, + "evictions": 0, + "expirations": 0, + } + + @property + def enabled(self) -> bool: + """Check if caching is enabled.""" + return self._enabled + + def _compute_hash(self, request_data: Dict[str, Any]) -> str: + """ + Compute a deterministic hash for a request. + + Only includes fields that affect the response: + - model + - messages + - temperature + - max_tokens + - response_format + + Excludes: + - stream (caching only applies to non-streaming) + - session_id + - other metadata + + Args: + request_data: The request dictionary + + Returns: + A hex string hash of the request + """ + # Extract only the fields that affect the response + hashable_fields = { + "model": request_data.get("model"), + "messages": request_data.get("messages"), + "temperature": request_data.get("temperature"), + "max_tokens": request_data.get("max_tokens"), + "response_format": request_data.get("response_format"), + "top_p": request_data.get("top_p"), + } + + # Convert to a stable JSON string (sorted keys) + json_str = json.dumps(hashable_fields, sort_keys=True, default=str) + + # Compute SHA-256 hash + return hashlib.sha256(json_str.encode()).hexdigest() + + def get(self, request_data: Dict[str, Any]) -> Optional[Dict[str, Any]]: + """ + Get a cached response for a request. + + Args: + request_data: The request dictionary + + Returns: + Cached response if found and not expired, None otherwise + """ + if not self._enabled: + return None + + cache_key = self._compute_hash(request_data) + current_time = time.time() + + with self._lock: + if cache_key not in self._cache: + self._stats["misses"] += 1 + return None + + entry = self._cache[cache_key] + + # Check if expired + if current_time > entry.expires_at: + del self._cache[cache_key] + self._stats["expirations"] += 1 + self._stats["misses"] += 1 + logger.debug(f"Cache entry expired for key {cache_key[:16]}...") + return None + + # Move to end (most recently used) + self._cache.move_to_end(cache_key) + entry.hit_count += 1 + self._stats["hits"] += 1 + + logger.debug(f"Cache hit for key {cache_key[:16]}... (hit_count={entry.hit_count})") + return entry.response + + def set(self, request_data: Dict[str, Any], response: Dict[str, Any]) -> None: + """ + Cache a response for a request. 
+ + Args: + request_data: The request dictionary + response: The response to cache + """ + if not self._enabled: + return + + cache_key = self._compute_hash(request_data) + current_time = time.time() + + with self._lock: + # Evict if at capacity + while len(self._cache) >= self._max_size: + oldest_key = next(iter(self._cache)) + del self._cache[oldest_key] + self._stats["evictions"] += 1 + logger.debug(f"Evicted oldest cache entry {oldest_key[:16]}...") + + # Add new entry + self._cache[cache_key] = CacheEntry( + response=response, + created_at=current_time, + expires_at=current_time + self._ttl_seconds, + ) + + logger.debug(f"Cached response for key {cache_key[:16]}... (ttl={self._ttl_seconds}s)") + + def clear(self) -> int: + """ + Clear all cache entries. + + Returns: + Number of entries cleared + """ + with self._lock: + count = len(self._cache) + self._cache.clear() + logger.info(f"Cleared {count} cache entries") + return count + + def get_stats(self) -> Dict[str, Any]: + """ + Get cache statistics. + + Returns: + Dictionary with cache stats + """ + with self._lock: + total_requests = self._stats["hits"] + self._stats["misses"] + hit_rate = (self._stats["hits"] / total_requests * 100) if total_requests > 0 else 0 + + return { + "enabled": self._enabled, + "max_size": self._max_size, + "ttl_seconds": self._ttl_seconds, + "current_size": len(self._cache), + "hits": self._stats["hits"], + "misses": self._stats["misses"], + "hit_rate_percent": round(hit_rate, 2), + "evictions": self._stats["evictions"], + "expirations": self._stats["expirations"], + } + + def cleanup_expired(self) -> int: + """ + Remove all expired entries. + + Returns: + Number of entries removed + """ + current_time = time.time() + removed = 0 + + with self._lock: + expired_keys = [ + key for key, entry in self._cache.items() if current_time > entry.expires_at + ] + + for key in expired_keys: + del self._cache[key] + removed += 1 + self._stats["expirations"] += 1 + + if removed > 0: + logger.debug(f"Cleaned up {removed} expired cache entries") + + return removed + + +# Global cache instance with configuration from environment +request_cache = RequestCache( + enabled=os.getenv("REQUEST_CACHE_ENABLED", "false").lower() in ("true", "1", "yes", "on"), + max_size=int(os.getenv("REQUEST_CACHE_MAX_SIZE", "100")), + ttl_seconds=int(os.getenv("REQUEST_CACHE_TTL_SECONDS", "60")), +) diff --git a/src/retry.py b/src/retry.py new file mode 100644 index 0000000..59af1a5 --- /dev/null +++ b/src/retry.py @@ -0,0 +1,130 @@ +""" +Retry logic with exponential backoff and model fallback. + +Patterns sourced from open-sourced Claude Code CLI (src/services/api/withRetry.ts). 
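+
+With the defaults below (BASE_DELAY_MS=500, MAX_DELAY_MS=30_000) the
+pre-jitter delay doubles per attempt: 0.5s, 1s, 2s, 4s, 8s, 16s, then
+capped at 30s. calculate_delay() adds 0-25% jitter on top and treats any
+Retry-After value from the server as a floor.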
+""" + +import asyncio +import logging +import random +from typing import Optional + +from src.constants import MODEL_FALLBACK_MAP + +logger = logging.getLogger(__name__) + +# Retry configuration (matches Claude Code source) +DEFAULT_MAX_RETRIES = 10 +BASE_DELAY_MS = 500 +MAX_DELAY_MS = 30_000 +MAX_CONSECUTIVE_529_FOR_FALLBACK = 3 + + +class RetryConfig: + """Configuration for retry behavior.""" + + def __init__( + self, + max_retries: int = DEFAULT_MAX_RETRIES, + base_delay_ms: int = BASE_DELAY_MS, + max_delay_ms: int = MAX_DELAY_MS, + enable_model_fallback: bool = True, + ): + self.max_retries = max_retries + self.base_delay_ms = base_delay_ms + self.max_delay_ms = max_delay_ms + self.enable_model_fallback = enable_model_fallback + + +class RetryState: + """Tracks retry state across attempts for a single request.""" + + def __init__(self, config: Optional[RetryConfig] = None): + self.config = config or RetryConfig() + self.attempt = 0 + self.consecutive_529s = 0 + self.fallback_model: Optional[str] = None + + def calculate_delay(self, retry_after: Optional[float] = None) -> float: + """Calculate delay with exponential backoff and jitter. + + If a retry-after header value is provided, use it as a minimum. + """ + # Exponential backoff: base * 2^attempt + exp_delay = self.config.base_delay_ms * (2**self.attempt) + # Cap at max delay + exp_delay = min(exp_delay, self.config.max_delay_ms) + # Add jitter (0-25% of delay) + jitter = random.uniform(0, exp_delay * 0.25) + delay_ms = exp_delay + jitter + + # If retry-after is provided, use the larger value + if retry_after is not None: + retry_after_ms = retry_after * 1000 + delay_ms = max(delay_ms, retry_after_ms) + + return delay_ms / 1000 # Return seconds + + def should_retry( + self, status_code: Optional[int] = None, error: Optional[Exception] = None + ) -> bool: + """Determine if the request should be retried.""" + if self.attempt >= self.config.max_retries: + return False + + if status_code is not None: + if status_code in (429, 529): + return True + if status_code >= 500: + return True + if status_code == 401: + return True + + if error is not None: + error_str = str(error).lower() + # Network errors are retryable + if any(term in error_str for term in ["timeout", "connection", "econnreset", "epipe"]): + return True + # Context overflow (400) -- only retry if the error message indicates it + if "context" in error_str and ("overflow" in error_str or "too long" in error_str): + return True + + return False + + def record_attempt(self, status_code: Optional[int] = None) -> None: + """Record an attempt and track consecutive 529s.""" + self.attempt += 1 + + if status_code == 529: + self.consecutive_529s += 1 + else: + self.consecutive_529s = 0 + + def should_fallback(self, model: str) -> bool: + """Check if we should fall back to a faster model after repeated 529s.""" + if not self.config.enable_model_fallback: + return False + if self.consecutive_529s < MAX_CONSECUTIVE_529_FOR_FALLBACK: + return False + return model in MODEL_FALLBACK_MAP + + def get_fallback_model(self, model: str) -> Optional[str]: + """Get the fallback model for the given model.""" + if self.should_fallback(model): + fallback = MODEL_FALLBACK_MAP.get(model) + if fallback: + self.fallback_model = fallback + logger.warning( + f"Falling back from {model} to {fallback} after " + f"{self.consecutive_529s} consecutive 529 errors" + ) + self.consecutive_529s = 0 + return fallback + return None + + +async def retry_delay(state: RetryState, retry_after: Optional[float] = None) -> 
None: + """Wait for the calculated retry delay.""" + delay = state.calculate_delay(retry_after) + logger.info(f"Retry attempt {state.attempt}/{state.config.max_retries}, waiting {delay:.1f}s") + await asyncio.sleep(delay) diff --git a/src/tool_manager.py b/src/tool_manager.py index a481d4a..94fe588 100644 --- a/src/tool_manager.py +++ b/src/tool_manager.py @@ -30,20 +30,22 @@ class ToolMetadata: # Tool metadata database TOOL_METADATA: Dict[str, ToolMetadata] = { - "Task": ToolMetadata( - name="Task", - description="Launch specialized agents for complex, multi-step tasks", + "Agent": ToolMetadata( + name="Agent", + description="Spawn sub-agents for complex, multi-step tasks", category="agent", parameters={ "description": "Short description of the task", "prompt": "Detailed task instructions for the agent", "subagent_type": "Type of specialized agent to use", + "model": "Optional model override for the agent", + "isolation": "Isolation mode (e.g., worktree)", }, examples=[ "Launch a general-purpose agent to refactor code", "Use Explore agent to find API endpoints", ], - is_safe=False, # Can spawn sub-agents + is_safe=False, requires_network=False, ), "Bash": ToolMetadata( @@ -54,9 +56,10 @@ class ToolMetadata: "command": "The bash command to execute", "timeout": "Optional timeout in milliseconds", "run_in_background": "Run command in background", + "description": "Description of what the command does", }, examples=["Run npm install", "Execute git status", "List directory contents"], - is_safe=True, + is_safe=False, # Requires permission in Claude Code requires_network=False, ), "Glob": ToolMetadata( @@ -197,47 +200,272 @@ class ToolMetadata: is_safe=True, requires_network=True, ), - "BashOutput": ToolMetadata( - name="BashOutput", - description="Retrieve output from background bash shells", + "Skill": ToolMetadata( + name="Skill", + description="Execute specialized skills and slash commands", + category="productivity", + parameters={"skill": "Skill name to execute", "args": "Optional arguments"}, + examples=["Execute PDF processing skill", "Run commit skill"], + is_safe=True, + requires_network=False, + ), + "Brief": ToolMetadata( + name="Brief", + description="Control output verbosity level", + category="output", + parameters={"level": "Verbosity level"}, + examples=["Set brief output mode"], + is_safe=True, + requires_network=False, + ), + "Config": ToolMetadata( + name="Config", + description="Read or write Claude Code configuration", category="system", - parameters={ - "bash_id": "ID of the background shell", - "filter": "Regex to filter output lines", - }, - examples=["Check output of running process", "Monitor long-running command"], + parameters={"action": "read or write", "key": "Config key", "value": "Config value"}, + examples=["Read current config", "Update a setting"], + is_safe=True, + requires_network=False, + ), + "ListPeers": ToolMetadata( + name="ListPeers", + description="List peer agents in multi-agent setups", + category="agent", + parameters={}, + examples=["List available peer agents"], is_safe=True, requires_network=False, ), - "KillShell": ToolMetadata( - name="KillShell", - description="Kill a running background bash shell", + "REPL": ToolMetadata( + name="REPL", + description="Execute code in a REPL environment", category="system", - parameters={"shell_id": "ID of the shell to kill"}, - examples=["Stop long-running background process"], + parameters={"code": "Code to execute", "language": "Programming language"}, + examples=["Run Python code in REPL"], + 
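# Runs arbitrary code in the session, so it requires permission like Bash. +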
is_safe=False, + requires_network=False, + ), + "Sleep": ToolMetadata( + name="Sleep", + description="Pause execution for a specified duration", + category="system", + parameters={"duration": "Duration in milliseconds"}, + examples=["Wait before retrying an operation"], is_safe=True, requires_network=False, ), - "Skill": ToolMetadata( - name="Skill", - description="Execute specialized skills", - category="productivity", - parameters={"command": "Skill name to execute"}, - examples=["Execute PDF processing skill", "Run Excel manipulation skill"], + "Monitor": ToolMetadata( + name="Monitor", + description="Monitor running processes and background tasks", + category="system", + parameters={"target": "Process or task to monitor"}, + examples=["Monitor a background build process"], is_safe=True, requires_network=False, ), - "SlashCommand": ToolMetadata( - name="SlashCommand", - description="Execute custom slash commands", - category="productivity", - parameters={"command": "Slash command with arguments"}, - examples=["Run custom code review command", "Execute project-specific workflow"], + "SendUserFile": ToolMetadata( + name="SendUserFile", + description="Send a file to the user", + category="file", + parameters={"path": "Path to the file to send"}, + examples=["Send generated report to user"], + is_safe=True, + requires_network=False, + ), + "PushNotification": ToolMetadata( + name="PushNotification", + description="Send push notifications to the user", + category="notification", + parameters={"title": "Notification title", "body": "Notification body"}, + examples=["Notify user that a long task completed"], + is_safe=True, + requires_network=False, + ), + "ListMcpResources": ToolMetadata( + name="ListMcpResources", + description="List available MCP server resources", + category="mcp", + parameters={"server": "MCP server name"}, + examples=["List resources from a connected MCP server"], + is_safe=True, + requires_network=True, + ), + "ReadMcpResource": ToolMetadata( + name="ReadMcpResource", + description="Read a specific MCP server resource", + category="mcp", + parameters={"server": "MCP server name", "uri": "Resource URI"}, + examples=["Read a resource from an MCP server"], + is_safe=True, + requires_network=True, + ), + "VerifyPlanExecution": ToolMetadata( + name="VerifyPlanExecution", + description="Verify that a plan was executed correctly", + category="planning", + parameters={"plan_id": "ID of the plan to verify"}, + examples=["Check that all plan steps were completed"], is_safe=True, requires_network=False, ), + "SendMessage": ToolMetadata( + name="SendMessage", + description="Send messages to teammates or other agents", + category="agent", + parameters={"to": "Recipient agent or teammate", "message": "Message content"}, + examples=["Send status update to teammate"], + is_safe=False, + requires_network=False, + ), + "TaskCreate": ToolMetadata( + name="TaskCreate", + description="Create a new task for tracking work", + category="task", + parameters={"subject": "Task subject", "description": "Task description"}, + examples=["Create task to track implementation progress"], + is_safe=True, + requires_network=False, + ), + "TaskUpdate": ToolMetadata( + name="TaskUpdate", + description="Update an existing task status or details", + category="task", + parameters={"taskId": "Task ID", "status": "New status"}, + examples=["Mark task as completed"], + is_safe=True, + requires_network=False, + ), + "TaskGet": ToolMetadata( + name="TaskGet", + description="Get details of a specific task", + 
category="task", + parameters={"taskId": "Task ID to retrieve"}, + examples=["Get task details by ID"], + is_safe=True, + requires_network=False, + ), + "TaskList": ToolMetadata( + name="TaskList", + description="List all tasks", + category="task", + parameters={}, + examples=["List all active tasks"], + is_safe=True, + requires_network=False, + ), + "TaskOutput": ToolMetadata( + name="TaskOutput", + description="Get the output of a completed task", + category="task", + parameters={"taskId": "Task ID"}, + examples=["Retrieve output from finished task"], + is_safe=True, + requires_network=False, + ), + "TaskStop": ToolMetadata( + name="TaskStop", + description="Stop a running task", + category="task", + parameters={"taskId": "Task ID to stop"}, + examples=["Cancel a running background task"], + is_safe=True, + requires_network=False, + ), + "EnterPlanMode": ToolMetadata( + name="EnterPlanMode", + description="Enter plan mode for designing implementation approach", + category="planning", + parameters={}, + examples=["Enter plan mode before implementing a feature"], + is_safe=True, + requires_network=False, + ), + "ExitPlanMode": ToolMetadata( + name="ExitPlanMode", + description="Exit plan mode and present plan for approval", + category="planning", + parameters={}, + examples=["Exit plan mode after finishing design"], + is_safe=True, + requires_network=False, + ), + "EnterWorktree": ToolMetadata( + name="EnterWorktree", + description="Create an isolated git worktree for safe changes", + category="git", + parameters={"branch": "Branch name for the worktree"}, + examples=["Create isolated worktree for feature work"], + is_safe=True, + requires_network=False, + ), + "ExitWorktree": ToolMetadata( + name="ExitWorktree", + description="Exit and clean up a git worktree", + category="git", + parameters={}, + examples=["Clean up worktree after finishing work"], + is_safe=True, + requires_network=False, + ), + "ToolSearch": ToolMetadata( + name="ToolSearch", + description="Search for available tools by keyword or name", + category="discovery", + parameters={"query": "Search query for tools"}, + examples=["Find tools for file operations"], + is_safe=True, + requires_network=False, + ), + "AskUserQuestion": ToolMetadata( + name="AskUserQuestion", + description="Ask the user for input or clarification", + category="interaction", + parameters={"question": "Question to ask", "options": "Available choices"}, + examples=["Ask user to choose between approaches"], + is_safe=True, + requires_network=False, + ), + "CronCreate": ToolMetadata( + name="CronCreate", + description="Create a scheduled recurring task", + category="scheduling", + parameters={"schedule": "Cron schedule expression", "command": "Command to run"}, + examples=["Schedule a daily health check"], + is_safe=False, + requires_network=False, + ), + "CronDelete": ToolMetadata( + name="CronDelete", + description="Delete a scheduled task", + category="scheduling", + parameters={"cronId": "ID of the cron job to delete"}, + examples=["Remove a scheduled task"], + is_safe=True, + requires_network=False, + ), + "CronList": ToolMetadata( + name="CronList", + description="List all scheduled tasks", + category="scheduling", + parameters={}, + examples=["List all active cron jobs"], + is_safe=True, + requires_network=False, + ), + "RemoteTrigger": ToolMetadata( + name="RemoteTrigger", + description="Trigger remote agent execution", + category="scheduling", + parameters={"trigger": "Trigger configuration"}, + examples=["Trigger a remote agent to run a task"], 
+ is_safe=False, + requires_network=True, + ), } +# Task is a backward-compatible alias for Agent -- share the same metadata +TOOL_METADATA["Task"] = TOOL_METADATA["Agent"] + @dataclass class ToolConfiguration: @@ -389,13 +617,8 @@ def get_stats(self) -> Dict: ), "session_configs": len(self.session_configs), "tool_categories": { - "file": len([t for t in TOOL_METADATA.values() if t.category == "file"]), - "system": len([t for t in TOOL_METADATA.values() if t.category == "system"]), - "web": len([t for t in TOOL_METADATA.values() if t.category == "web"]), - "productivity": len( - [t for t in TOOL_METADATA.values() if t.category == "productivity"] - ), - "agent": len([t for t in TOOL_METADATA.values() if t.category == "agent"]), + category: len([t for t in TOOL_METADATA.values() if t.category == category]) + for category in sorted(set(t.category for t in TOOL_METADATA.values())) }, } diff --git a/tests/test_anthropic_messages.py b/tests/test_anthropic_messages.py index 1f8d303..d368e44 100644 --- a/tests/test_anthropic_messages.py +++ b/tests/test_anthropic_messages.py @@ -211,5 +211,39 @@ def test_response_format_matches_anthropic_sdk(self): assert "output_tokens" in result["usage"] +class TestAnthropicMessagesCliHealthGate: + """In-process gate check: /v1/messages must return 401 (not 503) when the + Claude CLI probe failed, so Anthropic SDK clients (VC and similar) route + the failure as AuthenticationError instead of a transient server error. + """ + + def test_messages_returns_401_when_cli_health_unhealthy(self, monkeypatch): + from fastapi.testclient import TestClient + + from src import main as main_mod + from src import auth as auth_mod + + monkeypatch.setattr(auth_mod.auth_manager, "auth_method", "claude_cli", raising=False) + auth_mod.cli_health.mark_failed("auth_failure", "Not logged in - Please run /login") + + try: + client = TestClient(main_mod.app) + resp = client.post( + "/v1/messages", + json={ + "model": "claude-sonnet-4-6", + "max_tokens": 16, + "messages": [{"role": "user", "content": "hello"}], + }, + ) + finally: + auth_mod.cli_health.mark_ok() + + assert resp.status_code == 401, resp.text + body = resp.json() + assert body["error"]["type"] == "authentication_error" + assert body["error"]["code"] == "claude_cli_not_authenticated" + + if __name__ == "__main__": pytest.main([__file__, "-v"]) diff --git a/tests/test_auth_unit.py b/tests/test_auth_unit.py index ba9ec92..87d5867 100644 --- a/tests/test_auth_unit.py +++ b/tests/test_auth_unit.py @@ -491,6 +491,48 @@ def test_returns_runtime_key_when_available(self): assert result in ["env-key", "runtime-key"] +class TestProbeCliAuth: + """Cover the periodic CLI-auth probe in src.auth.probe_cli_auth().""" + + @pytest.mark.asyncio + async def test_probe_cli_auth_success_marks_ok(self): + import src.auth + + importlib.reload(src.auth) + fake_cli = MagicMock() + fake_cli.verify_cli = AsyncMock(return_value=True) + result = await src.auth.probe_cli_auth(cli=fake_cli) + assert result is True + assert src.auth.cli_health.ok is True + assert src.auth.cli_health.last_ok_at is not None + assert src.auth.cli_health.error_kind is None + + @pytest.mark.asyncio + async def test_probe_cli_auth_marker_in_stderr_marks_auth_failure(self): + import src.auth + + importlib.reload(src.auth) + fake_cli = MagicMock() + fake_cli.verify_cli = AsyncMock(side_effect=RuntimeError("Not logged in - Please run /login")) + result = await src.auth.probe_cli_auth(cli=fake_cli) + assert result is False + assert src.auth.cli_health.ok is False + assert 
src.auth.cli_health.error_kind == "auth_failure" + assert "Not logged in" in (src.auth.cli_health.error_message or "") + + @pytest.mark.asyncio + async def test_probe_cli_auth_generic_error_marks_unknown(self): + import src.auth + + importlib.reload(src.auth) + fake_cli = MagicMock() + fake_cli.verify_cli = AsyncMock(side_effect=RuntimeError("connection refused")) + result = await src.auth.probe_cli_auth(cli=fake_cli) + assert result is False + assert src.auth.cli_health.ok is False + assert src.auth.cli_health.error_kind == "unknown" + + # Reset module state after tests @pytest.fixture(autouse=True) def reset_auth_module(): diff --git a/tests/test_circuit_breaker_unit.py b/tests/test_circuit_breaker_unit.py new file mode 100644 index 0000000..d11e465 --- /dev/null +++ b/tests/test_circuit_breaker_unit.py @@ -0,0 +1,187 @@ +"""Unit tests for src.circuit_breaker. + +Covers the state machine (closed -> open -> half-open -> closed/open), +threshold behavior, and half-open single-probe semantics. +""" + +import time + +from src.circuit_breaker import ( + CircuitBreaker, + CircuitBreakerConfig, + CircuitBreakerState, +) + + +def _make_breaker(**overrides) -> CircuitBreaker: + defaults = dict( + window_seconds=10.0, + failure_ratio_threshold=0.5, + min_requests_for_trip=4, + open_seconds=0.05, # short cool-off for tests + ) + defaults.update(overrides) + return CircuitBreaker(CircuitBreakerConfig(**defaults)) + + +class TestCircuitBreakerClosed: + def test_starts_closed_and_allows_requests(self): + b = _make_breaker() + assert b.allow_request() is True + assert b.state == CircuitBreakerState.CLOSED + + def test_success_keeps_breaker_closed(self): + b = _make_breaker() + for _ in range(20): + assert b.allow_request() + b.record(success=True) + assert b.state == CircuitBreakerState.CLOSED + + def test_below_min_requests_does_not_trip(self): + b = _make_breaker(min_requests_for_trip=10) + for _ in range(3): + b.allow_request() + b.record(success=False) + # Failure ratio 100% but min_requests not met. + assert b.state == CircuitBreakerState.CLOSED + + +class TestCircuitBreakerOpens: + def test_trips_when_failure_ratio_threshold_reached(self): + b = _make_breaker() + # 4 requests, all failures -> ratio 1.0 > 0.5, n=4 meets min_requests_for_trip. + for _ in range(4): + b.allow_request() + b.record(success=False) + assert b.state == CircuitBreakerState.OPEN + + def test_open_breaker_denies_new_requests(self): + b = _make_breaker() + for _ in range(4): + b.allow_request() + b.record(success=False) + assert b.state == CircuitBreakerState.OPEN + # Subsequent requests should be shed until cool-off elapses. + assert b.allow_request() is False + + +class TestCircuitBreakerHalfOpen: + def test_half_opens_after_cool_off_and_allows_one_probe(self): + b = _make_breaker(open_seconds=0.01) + for _ in range(4): + b.allow_request() + b.record(success=False) + assert b.state == CircuitBreakerState.OPEN + + # Wait for cool-off, then a single probe is allowed. + time.sleep(0.02) + assert b.allow_request() is True + assert b.state == CircuitBreakerState.HALF_OPEN + # While probe is in flight, no additional requests. 
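+        # record() on the probe decides the outcome: success closes the
+        # breaker, failure reopens it (both covered by the tests below).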
+ assert b.allow_request() is False + + def test_successful_probe_closes_breaker(self): + b = _make_breaker(open_seconds=0.01) + for _ in range(4): + b.allow_request() + b.record(success=False) + time.sleep(0.02) + assert b.allow_request() is True # probe + b.record(success=True) + assert b.state == CircuitBreakerState.CLOSED + assert b.allow_request() is True + + def test_failed_probe_reopens_breaker(self): + b = _make_breaker(open_seconds=0.01) + for _ in range(4): + b.allow_request() + b.record(success=False) + time.sleep(0.02) + assert b.allow_request() is True # probe + b.record(success=False) + assert b.state == CircuitBreakerState.OPEN + + +class TestCircuitBreakerSnapshot: + def test_snapshot_exposes_state_and_ratio(self): + b = _make_breaker() + b.allow_request() + b.record(success=True) + b.allow_request() + b.record(success=False) + snap = b.snapshot() + assert snap["state"] == CircuitBreakerState.CLOSED + assert snap["window_size"] == 2 + assert snap["failure_ratio"] == 0.5 + assert snap["enabled"] is True + assert snap["min_requests_for_trip"] == 4 + + +class TestCircuitBreakerDisabled: + """When disabled, allow_request always returns True and record is a no-op. + Used as a kill switch while the upstream SDK is degraded and the breaker + itself would amplify an outage by shedding otherwise-recoverable load.""" + + def test_disabled_allows_all_requests(self): + b = CircuitBreaker(CircuitBreakerConfig(min_requests_for_trip=2), enabled=False) + # Record enough failures to normally trip an enabled breaker. + for _ in range(10): + assert b.allow_request() is True + b.record(success=False) + # Still closed, still allowing. + assert b.state == CircuitBreakerState.CLOSED + assert b.allow_request() is True + + +class TestCircuitBreakerConfigFromEnv: + """Env-var overrides let ops retune without a rebuild.""" + + def test_env_overrides_defaults(self, monkeypatch): + monkeypatch.setenv("WRAPPER_CIRCUIT_BREAKER_WINDOW_SECONDS", "120") + monkeypatch.setenv("WRAPPER_CIRCUIT_BREAKER_THRESHOLD", "0.9") + monkeypatch.setenv("WRAPPER_CIRCUIT_BREAKER_MIN_REQUESTS", "50") + monkeypatch.setenv("WRAPPER_CIRCUIT_BREAKER_OPEN_SECONDS", "45") + + cfg = CircuitBreakerConfig.from_env() + assert cfg.window_seconds == 120.0 + assert cfg.failure_ratio_threshold == 0.9 + assert cfg.min_requests_for_trip == 50 + assert cfg.open_seconds == 45.0 + + def test_env_defaults_kick_in_when_unset(self, monkeypatch): + for name in ( + "WRAPPER_CIRCUIT_BREAKER_WINDOW_SECONDS", + "WRAPPER_CIRCUIT_BREAKER_THRESHOLD", + "WRAPPER_CIRCUIT_BREAKER_MIN_REQUESTS", + "WRAPPER_CIRCUIT_BREAKER_OPEN_SECONDS", + ): + monkeypatch.delenv(name, raising=False) + + cfg = CircuitBreakerConfig.from_env() + # Defaults tightened for MinusPod incident: 20 requests / 0.75 ratio. 
+        assert cfg.min_requests_for_trip == 20
+        assert cfg.failure_ratio_threshold == 0.75
+        assert cfg.window_seconds == 60.0
+        assert cfg.open_seconds == 30.0
+
+    def test_invalid_env_value_falls_back_to_default(self, monkeypatch):
+        monkeypatch.setenv("WRAPPER_CIRCUIT_BREAKER_MIN_REQUESTS", "not-a-number")
+        cfg = CircuitBreakerConfig.from_env()
+        assert cfg.min_requests_for_trip == 20
+
+
+class TestCircuitBreakerEnabledFn:
+    def test_env_flag_toggle(self, monkeypatch):
+        from src.circuit_breaker import circuit_breaker_enabled
+
+        monkeypatch.delenv("WRAPPER_CIRCUIT_BREAKER_ENABLED", raising=False)
+        assert circuit_breaker_enabled() is True
+
+        monkeypatch.setenv("WRAPPER_CIRCUIT_BREAKER_ENABLED", "false")
+        assert circuit_breaker_enabled() is False
+
+        monkeypatch.setenv("WRAPPER_CIRCUIT_BREAKER_ENABLED", "0")
+        assert circuit_breaker_enabled() is False
+
+        monkeypatch.setenv("WRAPPER_CIRCUIT_BREAKER_ENABLED", "yes")
+        assert circuit_breaker_enabled() is True
diff --git a/tests/test_claude_cli_unit.py b/tests/test_claude_cli_unit.py
index c67c7fe..b587019 100644
--- a/tests/test_claude_cli_unit.py
+++ b/tests/test_claude_cli_unit.py
@@ -141,6 +141,115 @@ def test_result_takes_priority(self, cli_class):
         result = cli.parse_claude_message(messages)
         assert result == "Final result"
 
+    def test_error_max_turns_raises_instead_of_returning_sentinel(self, cli_class):
+        """When the SDK reports error_max_turns, parse_claude_message raises
+        ClaudeResultError. Previously the loop fell through to the synthetic
+        UserMessage('[Request interrupted by user]') and returned its text
+        verbatim as the response body, which shipped as the OpenAI response
+        content and made its way into downstream artifacts (e.g. chapter
+        titles). This test pins the fix.
+        """
+        from src.claude_cli import ClaudeResultError
+
+        cli = MagicMock()
+        cli.parse_claude_message = cli_class.parse_claude_message.__get__(cli, cli_class)
+
+        # Shape matches what the SDK emits on error_max_turns: a synthetic
+        # UserMessage with the interrupt sentinel, then a ResultMessage with
+        # subtype='error_max_turns', result=None.
+        messages = [
+            {
+                "content": [{"type": "text", "text": "[Request interrupted by user]"}],
+                "uuid": "u-sentinel",
+                "parent_tool_use_id": None,
+            },
+            {
+                "subtype": "error_max_turns",
+                "is_error": False,
+                "num_turns": 2,
+                "duration_ms": 2159,
+                "duration_api_ms": 0,
+                "result": None,
+                "session_id": "sess-err",
+            },
+        ]
+        with pytest.raises(ClaudeResultError) as excinfo:
+            cli.parse_claude_message(messages)
+        assert excinfo.value.subtype == "error_max_turns"
+        assert excinfo.value.num_turns == 2
+
+    def test_user_message_content_never_leaks_as_response(self, cli_class):
+        """An SDK UserMessage (identified by uuid + no model field) must never
+        be returned as assistant content, even when it precedes a successful
+        result. Guards against the same leak as the error_max_turns case."""
+        cli = MagicMock()
+        cli.parse_claude_message = cli_class.parse_claude_message.__get__(cli, cli_class)
+
+        messages = [
+            {
+                "content": [{"type": "text", "text": "[Request interrupted by user]"}],
+                "uuid": "u-sentinel",
+                "parent_tool_use_id": None,
+            },
+            # AssistantMessage shape: has model, no uuid-only marker.
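+            # (parse_claude_message keys off uuid/model to tell the two apart.)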
+ { + "content": [{"type": "text", "text": "Real answer"}], + "model": "claude-sonnet-4-6", + "parent_tool_use_id": None, + }, + ] + result = cli.parse_claude_message(messages) + assert result == "Real answer" + assert "Request interrupted" not in (result or "") + + def test_is_error_true_raises_even_when_subtype_missing(self, cli_class): + """If a ResultMessage has is_error=True without a matching subtype + literal, we still raise. This covers future SDK changes that add new + error subtypes we haven't enumerated.""" + from src.claude_cli import ClaudeResultError + + cli = MagicMock() + cli.parse_claude_message = cli_class.parse_claude_message.__get__(cli, cli_class) + + messages = [ + { + "subtype": "something_new", + "is_error": True, + "num_turns": 1, + "duration_ms": 100, + "result": None, + "errors": ["rate_limited_by_upstream"], + }, + ] + with pytest.raises(ClaudeResultError) as excinfo: + cli.parse_claude_message(messages) + assert "rate_limited_by_upstream" in excinfo.value.errors + + def test_stderr_tail_propagates_through_result_error(self, cli_class): + """The run_completion loop copies the CLI subprocess's captured + stderr onto the ResultMessage dict; parse_claude_message must forward + it onto the ClaudeResultError so the HTTP layer can log the actual + reason the CLI died.""" + from src.claude_cli import ClaudeResultError + + cli = MagicMock() + cli.parse_claude_message = cli_class.parse_claude_message.__get__(cli, cli_class) + + stderr_tail = "Error: auth rejected\nnode:internal/abort\n" + messages = [ + { + "subtype": "error_during_execution", + "is_error": False, + "num_turns": 2, + "duration_ms": 2000, + "result": None, + "stderr_tail": stderr_tail, + }, + ] + with pytest.raises(ClaudeResultError) as excinfo: + cli.parse_claude_message(messages) + assert excinfo.value.stderr_tail == stderr_tail + class TestClaudeCodeCLIExtractMetadata: """Test ClaudeCodeCLI.extract_metadata()""" diff --git a/tests/test_cost_tracker_unit.py b/tests/test_cost_tracker_unit.py new file mode 100644 index 0000000..ee04fe3 --- /dev/null +++ b/tests/test_cost_tracker_unit.py @@ -0,0 +1,120 @@ +"""Unit tests for cost tracker module.""" + +import asyncio +import pytest +from src.cost_tracker import CostTracker, UsageRecord, calculate_cost + + +class TestCalculateCost: + """Tests for calculate_cost function (sync, no async needed).""" + + def test_sonnet_pricing(self): + usage = UsageRecord(input_tokens=1_000_000, output_tokens=1_000_000) + cost = calculate_cost("claude-sonnet-4-6", usage) + assert cost == pytest.approx(18.0) + + def test_opus_46_pricing(self): + usage = UsageRecord(input_tokens=1_000_000, output_tokens=1_000_000) + cost = calculate_cost("claude-opus-4-6", usage) + assert cost == pytest.approx(30.0) + + def test_haiku_pricing(self): + usage = UsageRecord(input_tokens=1_000_000, output_tokens=1_000_000) + cost = calculate_cost("claude-haiku-4-5-20251001", usage) + assert cost == pytest.approx(6.0) + + def test_cache_tokens(self): + usage = UsageRecord(cache_read_tokens=1_000_000, cache_creation_tokens=1_000_000) + cost = calculate_cost("claude-sonnet-4-6", usage) + assert cost == pytest.approx(4.05) + + def test_web_search(self): + usage = UsageRecord(web_search_requests=5) + cost = calculate_cost("claude-sonnet-4-6", usage) + assert cost == pytest.approx(0.05) + + def test_zero_usage(self): + usage = UsageRecord() + cost = calculate_cost("claude-sonnet-4-6", usage) + assert cost == 0.0 + + def test_unknown_model_uses_default(self): + usage = UsageRecord(input_tokens=1_000_000, 
output_tokens=1_000_000) + cost = calculate_cost("unknown-model-xyz", usage) + assert cost == pytest.approx(18.0) + + def test_small_usage(self): + usage = UsageRecord(input_tokens=100, output_tokens=50) + cost = calculate_cost("claude-sonnet-4-6", usage) + assert cost == pytest.approx(0.00105) + + +@pytest.mark.asyncio +class TestCostTracker: + """Tests for CostTracker class (async methods).""" + + async def test_record_usage(self): + tracker = CostTracker() + usage = UsageRecord(input_tokens=1000, output_tokens=500) + cost = await tracker.record_usage("session-1", "claude-sonnet-4-6", usage) + assert cost > 0 + + async def test_session_accumulation(self): + tracker = CostTracker() + usage = UsageRecord(input_tokens=1000, output_tokens=500) + await tracker.record_usage("session-1", "claude-sonnet-4-6", usage) + await tracker.record_usage("session-1", "claude-sonnet-4-6", usage) + + session = await tracker.get_session_cost("session-1") + assert session is not None + assert session.request_count == 2 + assert session.total_input_tokens == 2000 + assert session.total_output_tokens == 1000 + + async def test_multiple_sessions(self): + tracker = CostTracker() + usage = UsageRecord(input_tokens=1000, output_tokens=500) + await tracker.record_usage("session-1", "claude-sonnet-4-6", usage) + await tracker.record_usage("session-2", "claude-opus-4-6", usage) + + summary = await tracker.get_all_sessions_summary() + assert summary["active_sessions"] == 2 + assert summary["total_requests"] == 2 + + async def test_per_model_tracking(self): + tracker = CostTracker() + await tracker.record_usage("s1", "claude-sonnet-4-6", UsageRecord(input_tokens=100)) + await tracker.record_usage("s1", "claude-opus-4-6", UsageRecord(input_tokens=200)) + + summary = await tracker.get_session_summary("s1") + assert "claude-sonnet-4-6" in summary["model_usage"] + assert "claude-opus-4-6" in summary["model_usage"] + assert summary["model_usage"]["claude-sonnet-4-6"]["requests"] == 1 + assert summary["model_usage"]["claude-opus-4-6"]["requests"] == 1 + + async def test_delete_session(self): + tracker = CostTracker() + await tracker.record_usage("s1", "claude-sonnet-4-6", UsageRecord(input_tokens=100)) + assert await tracker.delete_session("s1") is True + assert await tracker.get_session_cost("s1") is None + assert await tracker.delete_session("s1") is False + + async def test_nonexistent_session_summary(self): + tracker = CostTracker() + summary = await tracker.get_session_summary("nonexistent") + assert summary["total_cost_usd"] == 0.0 + assert summary["request_count"] == 0 + + async def test_cleanup_expired(self): + tracker = CostTracker(max_age_minutes=0) # Expire immediately + await tracker.record_usage("s1", "claude-sonnet-4-6", UsageRecord(input_tokens=100)) + removed = await tracker.cleanup_expired() + assert removed == 1 + assert await tracker.get_session_cost("s1") is None + + async def test_cleanup_keeps_fresh_sessions(self): + tracker = CostTracker(max_age_minutes=60) + await tracker.record_usage("s1", "claude-sonnet-4-6", UsageRecord(input_tokens=100)) + removed = await tracker.cleanup_expired() + assert removed == 0 + assert await tracker.get_session_cost("s1") is not None diff --git a/tests/test_cpu_watchdog_unit.py b/tests/test_cpu_watchdog_unit.py new file mode 100644 index 0000000..e082934 --- /dev/null +++ b/tests/test_cpu_watchdog_unit.py @@ -0,0 +1,61 @@ +"""Tests for CPU watchdog module.""" + +import pytest +from unittest.mock import patch +from src.cpu_watchdog import CPUWatchdog + + +class 
TestCPUWatchdog: + def test_init_defaults(self): + wd = CPUWatchdog() + assert wd._task is None + assert wd._strikes == 0 + assert wd._last_cpu_time is None + + def test_get_cpu_percent_non_linux(self): + wd = CPUWatchdog() + wd._is_linux = False + assert wd._get_cpu_percent() == 0.0 + + def test_get_cpu_percent_first_call_returns_zero(self): + wd = CPUWatchdog() + wd._is_linux = True + with patch("builtins.open", side_effect=FileNotFoundError): + assert wd._get_cpu_percent() == 0.0 + + def test_start_disabled(self): + wd = CPUWatchdog() + with patch("src.cpu_watchdog.WATCHDOG_ENABLED", False): + wd.start() + assert wd._task is None + + def test_start_non_linux(self): + wd = CPUWatchdog() + wd._is_linux = False + with patch("src.cpu_watchdog.WATCHDOG_ENABLED", True): + wd.start() + assert wd._task is None + + def test_stop_no_task(self): + wd = CPUWatchdog() + wd.stop() # should not raise + + def test_strike_increment_and_reset(self): + wd = CPUWatchdog() + wd._strikes = 2 + # Simulating a below-threshold reading resets strikes + wd._strikes = 0 + assert wd._strikes == 0 + + def test_env_vars_read_at_import(self): + from src.cpu_watchdog import ( + WATCHDOG_ENABLED, + WATCHDOG_INTERVAL, + WATCHDOG_CPU_THRESHOLD, + WATCHDOG_STRIKES, + ) + + assert isinstance(WATCHDOG_ENABLED, bool) + assert isinstance(WATCHDOG_INTERVAL, int) + assert isinstance(WATCHDOG_CPU_THRESHOLD, float) + assert isinstance(WATCHDOG_STRIKES, int) diff --git a/tests/test_dynamic_models.py b/tests/test_dynamic_models.py new file mode 100644 index 0000000..ab8bf6f --- /dev/null +++ b/tests/test_dynamic_models.py @@ -0,0 +1,236 @@ +"""Unit tests for dynamic Anthropic model listing.""" + +import asyncio + +import pytest + +from src import constants, main + + +@pytest.mark.asyncio +async def test_get_available_models_uses_anthropic_models_api(monkeypatch): + main._model_list_cache = {"expires_at": 0.0, "models": None} + + async def fake_fetch(): + return [ + { + "id": "claude-test-latest", + "object": "model", + "owned_by": "anthropic", + "display_name": "Claude Test Latest", + } + ] + + monkeypatch.delenv("CLAUDE_MODELS_OVERRIDE", raising=False) + monkeypatch.setattr(main, "_fetch_anthropic_models", fake_fetch) + + models = await main.get_available_models() + + assert models[0]["id"] == "claude-test-latest" + assert models[0]["display_name"] == "Claude Test Latest" + + +@pytest.mark.asyncio +async def test_get_available_models_falls_back_to_constants(monkeypatch): + main._model_list_cache = {"expires_at": 0.0, "models": None} + + async def fake_fetch(): + return None + + monkeypatch.delenv("CLAUDE_MODELS_OVERRIDE", raising=False) + monkeypatch.setattr(main, "_fetch_anthropic_models", fake_fetch) + + models = await main.get_available_models() + + assert {model["id"] for model in models} >= {"claude-sonnet-4-6", "claude-opus-4-6"} + + +@pytest.mark.asyncio +async def test_model_override_skips_live_fetch(monkeypatch): + main._model_list_cache = {"expires_at": 0.0, "models": None} + + async def fake_fetch(): + raise AssertionError("override should not call live Anthropic API") + + monkeypatch.setenv("CLAUDE_MODELS_OVERRIDE", "custom-a,custom-b") + monkeypatch.setattr(main, "CLAUDE_MODELS", ["custom-a", "custom-b"]) + monkeypatch.setattr(main, "_fetch_anthropic_models", fake_fetch) + + models = await main.get_available_models() + + assert [model["id"] for model in models] == ["custom-a", "custom-b"] + + +def test_openai_model_from_anthropic_preserves_metadata(): + model = main._openai_model_from_anthropic( + { + "id": 
"claude-test", + "type": "model", + "display_name": "Claude Test", + "created_at": "2026-01-01T00:00:00Z", + "max_input_tokens": 200000, + "max_tokens": 64000, + "capabilities": {"batch": {"supported": True}}, + } + ) + + assert model["id"] == "claude-test" + assert model["object"] == "model" + assert model["owned_by"] == "anthropic" + # `created` should be the unix timestamp of the ISO `created_at`. + assert model["created"] == 1767225600 + assert model["capabilities"] == {"batch": {"supported": True}} + + +def test_fallback_objects_include_created_field(): + fallback = main._fallback_model_payload() + + assert fallback, "fallback list should not be empty" + for entry in fallback: + assert isinstance(entry["created"], int) and entry["created"] > 0 + + +@pytest.mark.asyncio +async def test_concurrent_calls_only_fetch_once(monkeypatch): + """Lock + double-check should prevent thundering-herd on cache expiry.""" + main._model_list_cache = {"expires_at": 0.0, "models": None} + call_count = 0 + + async def fake_fetch(): + nonlocal call_count + call_count += 1 + await asyncio.sleep(0.01) + return [{"id": "claude-test", "object": "model", "owned_by": "anthropic"}] + + monkeypatch.delenv("CLAUDE_MODELS_OVERRIDE", raising=False) + monkeypatch.setattr(main, "_fetch_anthropic_models", fake_fetch) + + results = await asyncio.gather(*[main.get_available_models() for _ in range(8)]) + + assert call_count == 1 + for r in results: + assert r[0]["id"] == "claude-test" + + +@pytest.mark.asyncio +async def test_failed_fetch_uses_short_error_ttl(monkeypatch): + main._model_list_cache = {"expires_at": 0.0, "models": None} + + async def fake_fetch(): + return None + + monkeypatch.delenv("CLAUDE_MODELS_OVERRIDE", raising=False) + monkeypatch.setattr(main, "_fetch_anthropic_models", fake_fetch) + monkeypatch.setattr(main, "MODEL_LIST_CACHE_TTL_SECONDS", 3600) + monkeypatch.setattr(main, "MODEL_LIST_ERROR_TTL_SECONDS", 60) + + await main.get_available_models() + + expires_at = main._model_list_cache["expires_at"] + # Error TTL ~60s; success TTL ~3600s. Confirm we used the short one. 
+ import time as _time + + assert expires_at - _time.time() < 120 + + +def test_pick_latest_sonnet_prefers_newest_created_at(): + models = [ + {"id": "claude-sonnet-4-5", "created_at": "2025-09-29T00:00:00Z"}, + {"id": "claude-sonnet-4-6", "created_at": "2026-04-01T00:00:00Z"}, + {"id": "claude-opus-4-6", "created_at": "2026-04-15T00:00:00Z"}, + ] + + assert main._pick_latest_sonnet(models) == "claude-sonnet-4-6" + + +def test_pick_latest_sonnet_returns_none_when_no_sonnet(): + models = [{"id": "claude-haiku-4-5", "created_at": "2025-10-01T00:00:00Z"}] + + assert main._pick_latest_sonnet(models) is None + + +@pytest.mark.asyncio +async def test_resolve_default_model_sets_constants(monkeypatch): + main._model_list_cache = {"expires_at": 0.0, "models": None} + constants.RESOLVED_DEFAULT_MODEL = None + + async def fake_fetch(): + return [ + { + "id": "claude-sonnet-4-7", + "object": "model", + "owned_by": "anthropic", + "created_at": "2026-06-01T00:00:00Z", + }, + { + "id": "claude-sonnet-4-6", + "object": "model", + "owned_by": "anthropic", + "created_at": "2026-04-01T00:00:00Z", + }, + ] + + monkeypatch.delenv("CLAUDE_MODELS_OVERRIDE", raising=False) + monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-test") + monkeypatch.setattr(constants, "DEFAULT_MODEL_ENV", None) + monkeypatch.setattr(main, "_fetch_anthropic_models", fake_fetch) + + resolved = await main.resolve_default_model() + + assert resolved == "claude-sonnet-4-7" + assert constants.RESOLVED_DEFAULT_MODEL == "claude-sonnet-4-7" + + +@pytest.mark.asyncio +async def test_resolve_default_model_skips_without_api_key(monkeypatch, caplog): + """No ANTHROPIC_API_KEY -> skip live discovery, log clearly, use fallback.""" + constants.RESOLVED_DEFAULT_MODEL = None + + async def fake_fetch(): + raise AssertionError("should not call live API without ANTHROPIC_API_KEY") + + monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False) + monkeypatch.setattr(constants, "DEFAULT_MODEL_ENV", None) + monkeypatch.setattr(main, "_fetch_anthropic_models", fake_fetch) + + with caplog.at_level("INFO", logger="src.main"): + resolved = await main.resolve_default_model() + + assert resolved is None + assert constants.RESOLVED_DEFAULT_MODEL is None + assert any("Live model discovery disabled" in r.message for r in caplog.records) + + +@pytest.mark.asyncio +async def test_resolve_default_model_honors_env_override(monkeypatch): + main._model_list_cache = {"expires_at": 0.0, "models": None} + constants.RESOLVED_DEFAULT_MODEL = None + + async def fake_fetch(): + raise AssertionError("env override should short-circuit fetch") + + monkeypatch.setattr(constants, "DEFAULT_MODEL_ENV", "claude-opus-4-6") + monkeypatch.setattr(main, "_fetch_anthropic_models", fake_fetch) + + resolved = await main.resolve_default_model() + + assert resolved == "claude-opus-4-6" + assert constants.RESOLVED_DEFAULT_MODEL is None + + +def test_get_default_model_prefers_resolved_over_fallback(monkeypatch): + from src import models as models_module + + monkeypatch.setattr(constants, "DEFAULT_MODEL_ENV", None) + monkeypatch.setattr(constants, "RESOLVED_DEFAULT_MODEL", "claude-sonnet-future") + + assert models_module.get_default_model() == "claude-sonnet-future" + + +def test_get_default_model_env_override_wins(monkeypatch): + from src import models as models_module + + monkeypatch.setattr(constants, "DEFAULT_MODEL_ENV", "claude-opus-4-6") + monkeypatch.setattr(constants, "RESOLVED_DEFAULT_MODEL", "claude-sonnet-future") + + assert models_module.get_default_model() == "claude-opus-4-6" diff --git 
a/tests/test_endpoints.py b/tests/test_endpoints.py index 3592818..7b7a913 100644 --- a/tests/test_endpoints.py +++ b/tests/test_endpoints.py @@ -125,3 +125,37 @@ def main(): if __name__ == "__main__": main() + + +class TestChatCompletionsCliHealthGate: + """In-process gate check: when auth_method=claude_cli and the latest probe + failed, /v1/chat/completions must return 401 with an OpenAI-shaped + authentication_error body, without touching the SDK. + """ + + def test_chat_completions_returns_401_when_cli_health_unhealthy(self, monkeypatch): + from fastapi.testclient import TestClient + + from src import main as main_mod + from src import auth as auth_mod + + monkeypatch.setattr(auth_mod.auth_manager, "auth_method", "claude_cli", raising=False) + auth_mod.cli_health.mark_failed("auth_failure", "Not logged in - Please run /login") + + try: + client = TestClient(main_mod.app) + resp = client.post( + "/v1/chat/completions", + json={ + "model": "claude-sonnet-4-6", + "messages": [{"role": "user", "content": "hello"}], + }, + ) + finally: + auth_mod.cli_health.mark_ok() + + assert resp.status_code == 401, resp.text + body = resp.json() + assert body["error"]["type"] == "authentication_error" + assert body["error"]["code"] == "claude_cli_not_authenticated" + assert body["error"]["error_kind"] == "auth_failure" diff --git a/tests/test_error_path_unit.py b/tests/test_error_path_unit.py new file mode 100644 index 0000000..eebb8a0 --- /dev/null +++ b/tests/test_error_path_unit.py @@ -0,0 +1,207 @@ +"""Unit tests for the SDK-error -> HTTP-response translation helpers. + +These cover the OpenAI-shape outputs we produce when parse_claude_message +raises ClaudeResultError, so an error_max_turns from the Claude Agent SDK +never ships as a 200 with the literal string '[Request interrupted by user]' +as message content. +""" + +import json + +from src.claude_cli import ClaudeResultError +from src.main import ( + _build_error_max_turns_response, + _build_sdk_error_response, + _handle_claude_result_error, +) + + +def _body(response): + return json.loads(response.body) + + +class TestErrorMaxTurnsResponse: + def test_returns_200_with_finish_reason_length_and_empty_content(self): + err = ClaudeResultError( + subtype="error_max_turns", + num_turns=2, + errors=None, + stop_reason=None, + error_message=None, + ) + resp = _build_error_max_turns_response("req-1", "claude-sonnet-4-6", err) + + assert resp.status_code == 200 + body = _body(resp) + assert body["id"] == "req-1" + assert body["model"] == "claude-sonnet-4-6" + assert body["choices"][0]["finish_reason"] == "length" + assert body["choices"][0]["message"]["role"] == "assistant" + assert body["choices"][0]["message"]["content"] == "" + # Sentinel must not appear in the serialized body under any field. 
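+        # json.dumps(body) scans every field, not just message.content.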
+ assert "Request interrupted by user" not in json.dumps(body) + + +class TestSdkErrorResponse: + def test_returns_502_with_structured_error_body(self): + err = ClaudeResultError( + subtype="error_during_execution", + num_turns=0, + errors=["upstream timeout"], + stop_reason=None, + error_message=None, + ) + resp = _build_sdk_error_response("req-2", "claude-sonnet-4-6", err) + + assert resp.status_code == 502 + body = _body(resp) + assert body["error"]["type"] == "upstream_sdk_error" + assert body["error"]["code"] == "error_during_execution" + assert body["error"]["message"] == "upstream timeout" + + +class TestHandleClaudeResultError: + def test_error_max_turns_routes_to_length_finish_reason(self): + err = ClaudeResultError(subtype="error_max_turns", num_turns=2) + resp = _handle_claude_result_error("req-3", "claude-opus-4-6", err) + + assert resp.status_code == 200 + body = _body(resp) + assert body["choices"][0]["finish_reason"] == "length" + + def test_other_errors_route_to_502(self): + err = ClaudeResultError( + subtype="error_during_execution", + num_turns=0, + error_message="boom", + ) + resp = _handle_claude_result_error("req-4", "claude-opus-4-6", err) + + assert resp.status_code == 502 + assert _body(resp)["error"]["code"] == "error_during_execution" + + def test_generic_is_error_routes_to_502(self): + # Covers future SDK subtypes that aren't explicitly enumerated. + err = ClaudeResultError(subtype="something_new", num_turns=1) + resp = _handle_claude_result_error("req-5", "claude-opus-4-6", err) + + assert resp.status_code == 502 + assert _body(resp)["error"]["code"] == "something_new" + + +class TestAssistantErrorTaxonomy: + """AssistantMessage.error literals map to proper HTTP status codes.""" + + def test_rate_limit_returns_429_with_retry_after(self): + err = ClaudeResultError(subtype="assistant_rate_limit", errors=["rate_limit"]) + resp = _handle_claude_result_error("req-rl", "claude-sonnet-4-6", err) + assert resp.status_code == 429 + assert resp.headers.get("retry-after") == "30" + assert _body(resp)["error"]["code"] == "assistant_rate_limit" + + def test_billing_error_returns_402(self): + err = ClaudeResultError(subtype="assistant_billing_error", errors=["billing_error"]) + resp = _handle_claude_result_error("req-be", "claude-sonnet-4-6", err) + assert resp.status_code == 402 + + def test_authentication_failed_returns_401(self): + err = ClaudeResultError( + subtype="assistant_authentication_failed", + errors=["authentication_failed"], + ) + resp = _handle_claude_result_error("req-af", "claude-sonnet-4-6", err) + assert resp.status_code == 401 + + def test_invalid_request_returns_400(self): + err = ClaudeResultError(subtype="assistant_invalid_request", errors=["invalid_request"]) + resp = _handle_claude_result_error("req-ir", "claude-sonnet-4-6", err) + assert resp.status_code == 400 + + def test_server_error_returns_502(self): + err = ClaudeResultError(subtype="assistant_server_error", errors=["server_error"]) + resp = _handle_claude_result_error("req-se", "claude-sonnet-4-6", err) + assert resp.status_code == 502 + + +class TestParseClaudeMessageAssistantError: + """parse_claude_message raises with the assistant_ subtype so the + HTTP layer can map each AssistantMessageError literal to a status code.""" + + def test_assistant_rate_limit_raises(self): + from unittest.mock import MagicMock + + from src.claude_cli import ClaudeCodeCLI + + cli = MagicMock() + cli.parse_claude_message = ClaudeCodeCLI.parse_claude_message.__get__(cli, ClaudeCodeCLI) + messages = [ + { + 
"content": [{"type": "text", "text": "partial"}], + "model": "claude-sonnet-4-6", + "error": "rate_limit", + } + ] + import pytest + + with pytest.raises(ClaudeResultError) as excinfo: + cli.parse_claude_message(messages) + assert excinfo.value.subtype == "assistant_rate_limit" + assert "rate_limit" in excinfo.value.errors + + +class TestCliAuthFailureToFourOhOne: + """Defense-in-depth: when ClaudeResultError carries CLI auth markers in + its stderr_tail or error_message, _build_sdk_error_response must return + HTTP 401 instead of 502, with an OpenAI-shaped authentication_error body. + """ + + def test_sdk_error_with_auth_marker_in_stderr_maps_to_401(self): + err = ClaudeResultError( + subtype="error_during_execution", + num_turns=0, + errors=None, + stop_reason=None, + error_message=None, + stderr_tail="Not logged in - Please run /login", + ) + resp = _build_sdk_error_response("req-cli-auth", "claude-sonnet-4-6", err) + assert resp.status_code == 401 + body = _body(resp) + assert body["error"]["type"] == "authentication_error" + assert body["error"]["code"] == "claude_cli_not_authenticated" + + def test_sdk_error_with_invalid_api_key_in_message_maps_to_401(self): + err = ClaudeResultError( + subtype="error_during_execution", + errors=["Invalid API key"], + error_message="Invalid API key", + ) + resp = _build_sdk_error_response("req-cli-key", "claude-sonnet-4-6", err) + assert resp.status_code == 401 + body = _body(resp) + assert body["error"]["type"] == "authentication_error" + + def test_sdk_error_without_auth_marker_still_502(self): + err = ClaudeResultError( + subtype="error_during_execution", + errors=["upstream timeout"], + stderr_tail="connection refused", + ) + resp = _build_sdk_error_response("req-generic", "claude-sonnet-4-6", err) + assert resp.status_code == 502 + body = _body(resp) + assert body["error"]["type"] == "upstream_sdk_error" + + def test_sdk_error_with_auth_marker_seeds_cli_health(self): + import src.auth + + src.auth.cli_health.mark_ok() + assert src.auth.cli_health.ok is True + + err = ClaudeResultError( + subtype="error_during_execution", + stderr_tail="Not logged in - Please run /login", + ) + _build_sdk_error_response("req-cli-seed", "claude-sonnet-4-6", err) + assert src.auth.cli_health.ok is False + assert src.auth.cli_health.error_kind == "auth_failure" diff --git a/tests/test_fence_stripper_unit.py b/tests/test_fence_stripper_unit.py new file mode 100644 index 0000000..9908662 --- /dev/null +++ b/tests/test_fence_stripper_unit.py @@ -0,0 +1,55 @@ +"""Tests for JsonFenceStripper streaming fence removal.""" + +import pytest +from src.message_adapter import JsonFenceStripper + + +class TestJsonFenceStripper: + def test_no_fences(self): + s = JsonFenceStripper() + result = s.process_delta('{"key": "value"}') + result += s.flush() + assert '"key"' in result + assert '"value"' in result + + def test_strips_json_fence(self): + s = JsonFenceStripper() + chunks = ["```json\n", '{"key": "val', 'ue"}', "\n```"] + output = "" + for c in chunks: + output += s.process_delta(c) + output += s.flush() + assert "```" not in output + assert '"key"' in output + + def test_strips_bare_fence(self): + s = JsonFenceStripper() + chunks = ["```\n", '{"a": 1}', "\n```"] + output = "" + for c in chunks: + output += s.process_delta(c) + output += s.flush() + assert "```" not in output + assert '"a"' in output + + def test_no_fence_passes_through(self): + s = JsonFenceStripper() + chunks = ['{"hello":', ' "world"}'] + output = "" + for c in chunks: + output += s.process_delta(c) + 
output += s.flush() + assert "hello" in output + assert "world" in output + + def test_empty_chunks(self): + s = JsonFenceStripper() + assert s.process_delta("") == "" + assert s.flush() == "" + + def test_single_large_chunk(self): + s = JsonFenceStripper() + text = '```json\n{"data": [1, 2, 3]}\n```' + output = s.process_delta(text) + s.flush() + assert "```" not in output + assert '"data"' in output diff --git a/tests/test_function_calling_unit.py b/tests/test_function_calling_unit.py new file mode 100644 index 0000000..0dead55 --- /dev/null +++ b/tests/test_function_calling_unit.py @@ -0,0 +1,216 @@ +"""Tests for function calling simulation.""" + +import json +import pytest +from src.function_calling import ( + build_tools_system_prompt, + parse_tool_calls, + format_tool_calls, + convert_tool_messages, +) +from src.models import Message, ToolCall, FunctionCall + +SAMPLE_TOOLS = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather", + "parameters": { + "type": "object", + "properties": {"location": {"type": "string"}}, + "required": ["location"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "search", + "description": "Search the web", + "parameters": { + "type": "object", + "properties": {"query": {"type": "string"}}, + }, + }, + }, +] + + +class TestBuildToolsSystemPrompt: + def test_no_tools_returns_empty(self): + assert build_tools_system_prompt([], None) == "" + + def test_none_choice_returns_empty(self): + assert build_tools_system_prompt(SAMPLE_TOOLS, "none") == "" + + def test_auto_choice_includes_may_call(self): + result = build_tools_system_prompt(SAMPLE_TOOLS, "auto") + assert "MAY call functions" in result + assert "get_weather" in result + assert "search" in result + + def test_required_choice_includes_must_call(self): + result = build_tools_system_prompt(SAMPLE_TOOLS, "required") + assert "MUST call at least one function" in result + + def test_specific_function_choice(self): + choice = {"type": "function", "function": {"name": "get_weather"}} + result = build_tools_system_prompt(SAMPLE_TOOLS, choice) + assert "MUST call function get_weather" in result + + def test_includes_tool_call_format(self): + result = build_tools_system_prompt(SAMPLE_TOOLS, "auto") + assert "```tool_calls" in result + + def test_includes_parameters(self): + result = build_tools_system_prompt(SAMPLE_TOOLS, "auto") + assert "location" in result + assert "query" in result + + def test_default_choice_is_auto(self): + result = build_tools_system_prompt(SAMPLE_TOOLS) + assert "MAY call functions" in result + + +class TestParseToolCalls: + def test_fenced_tool_calls(self): + text = 'Some text\n```tool_calls\n[{"name": "get_weather", "arguments": {"location": "NYC"}}]\n```\nMore text' + calls, remaining = parse_tool_calls(text) + assert len(calls) == 1 + assert calls[0]["name"] == "get_weather" + assert calls[0]["arguments"]["location"] == "NYC" + assert "Some text" in remaining + assert "More text" in remaining + + def test_multiple_tool_calls(self): + text = '```tool_calls\n[{"name": "get_weather", "arguments": {"location": "NYC"}}, {"name": "search", "arguments": {"query": "hello"}}]\n```' + calls, remaining = parse_tool_calls(text) + assert len(calls) == 2 + + def test_bare_json_array_fallback(self): + text = 'Here are the results:\n[{"name": "search", "arguments": {"query": "test"}}]' + calls, remaining = parse_tool_calls(text) + assert len(calls) == 1 + assert calls[0]["name"] == "search" + + def test_no_tool_calls(self): + 
text = "Just a regular response with no function calls." + calls, remaining = parse_tool_calls(text) + assert calls == [] + assert remaining == text + + def test_malformed_json_returns_empty(self): + text = "```tool_calls\nnot valid json\n```" + calls, remaining = parse_tool_calls(text) + assert calls == [] + + +class TestFormatToolCalls: + def test_basic_format(self): + parsed = [{"name": "get_weather", "arguments": {"location": "NYC"}}] + result = format_tool_calls(parsed) + assert len(result) == 1 + assert result[0].type == "function" + assert result[0].function.name == "get_weather" + assert result[0].id.startswith("call_") + assert json.loads(result[0].function.arguments) == {"location": "NYC"} + + def test_multiple_calls_get_unique_ids(self): + parsed = [ + {"name": "a", "arguments": {}}, + {"name": "b", "arguments": {}}, + ] + result = format_tool_calls(parsed) + assert result[0].id != result[1].id + + +class TestConvertToolMessages: + def test_assistant_with_tool_calls(self): + msg = Message( + role="assistant", + content="Let me check", + tool_calls=[ + ToolCall( + id="call_123", + type="function", + function=FunctionCall(name="get_weather", arguments='{"location": "NYC"}'), + ) + ], + ) + result = convert_tool_messages([msg]) + assert len(result) == 1 + assert result[0].role == "assistant" + assert "Called get_weather" in result[0].content + assert "Let me check" in result[0].content + + def test_tool_result_message(self): + msg = Message( + role="tool", content="72F and sunny", name="get_weather", tool_call_id="call_123" + ) + result = convert_tool_messages([msg]) + assert len(result) == 1 + assert result[0].role == "user" + assert "Result of get_weather" in result[0].content + + def test_regular_messages_pass_through(self): + msg = Message(role="user", content="Hello") + result = convert_tool_messages([msg]) + assert result[0] is msg + + def test_mixed_conversation(self): + messages = [ + Message(role="user", content="What's the weather?"), + Message( + role="assistant", + content=None, + tool_calls=[ + ToolCall( + id="c1", + type="function", + function=FunctionCall(name="get_weather", arguments='{"location": "NYC"}'), + ) + ], + ), + Message(role="tool", content="72F", name="get_weather", tool_call_id="c1"), + ] + result = convert_tool_messages(messages) + assert len(result) == 3 + assert result[0].role == "user" + assert result[1].role == "assistant" + assert result[2].role == "user" + + def test_convert_dict_messages(self): + messages = [ + { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "c1", + "type": "function", + "function": {"name": "search", "arguments": '{"q": "test"}'}, + } + ], + }, + {"role": "tool", "content": "results", "name": "search", "tool_call_id": "c1"}, + ] + result = convert_tool_messages(messages) + assert len(result) == 2 + assert result[0].role == "assistant" + assert "Called search" in result[0].content + assert result[1].role == "user" + assert "Result of search" in result[1].content + + +class TestParseToolCallsEdgeCases: + def test_nested_arrays_in_arguments(self): + text = '[{"name": "fn", "arguments": {"items": [1, 2, 3]}}]' + calls, remaining = parse_tool_calls(text) + assert len(calls) == 1 + assert calls[0]["arguments"]["items"] == [1, 2, 3] + + def test_tool_choice_dict_in_prompt(self): + choice = {"type": "function", "function": {"name": "search"}} + result = build_tools_system_prompt(SAMPLE_TOOLS, choice) + assert "MUST call function search" in result diff --git a/tests/test_json_format_unit.py 
b/tests/test_json_format_unit.py new file mode 100644 index 0000000..5b3e7cf --- /dev/null +++ b/tests/test_json_format_unit.py @@ -0,0 +1,521 @@ +#!/usr/bin/env python3 +""" +Unit tests for JSON format functionality. + +Tests the JSON extraction and enforcement methods in MessageAdapter, +as well as the ResponseFormat model. +""" + +import pytest + +from src.message_adapter import MessageAdapter, JsonExtractionResult +from src.models import ResponseFormat, ChatCompletionRequest, Message + + +class TestExtractJson: + """Test MessageAdapter.extract_json() method.""" + + def test_extract_json_pure(self): + """Pure JSON content is returned as-is.""" + content = '{"name": "test", "value": 123}' + result = MessageAdapter.extract_json(content) + assert result == content + + def test_extract_json_pure_array(self): + """Pure JSON array is returned as-is.""" + content = "[1, 2, 3, 4, 5]" + result = MessageAdapter.extract_json(content) + assert result == content + + def test_extract_json_pure_with_whitespace(self): + """Pure JSON with surrounding whitespace is extracted.""" + content = ' \n{"key": "value"}\n ' + result = MessageAdapter.extract_json(content) + assert result == '{"key": "value"}' + + def test_extract_json_markdown_block(self): + """Extracts JSON from ```json code block.""" + content = """Here is the data: +```json +{"items": [1, 2, 3]} +``` +That's all!""" + result = MessageAdapter.extract_json(content) + assert result == '{"items": [1, 2, 3]}' + + def test_extract_json_generic_code_block(self): + """Extracts JSON from generic ``` code block.""" + content = """Response: +``` +{"status": "ok"} +```""" + result = MessageAdapter.extract_json(content) + assert result == '{"status": "ok"}' + + def test_extract_json_embedded_object(self): + """Finds JSON object embedded in text.""" + content = 'The result is {"success": true, "count": 42} as expected.' + result = MessageAdapter.extract_json(content) + assert result == '{"success": true, "count": 42}' + + def test_extract_json_embedded_array(self): + """Finds JSON array embedded in text.""" + content = "Available items: [1, 2, 3] are ready." + result = MessageAdapter.extract_json(content) + assert result == "[1, 2, 3]" + + def test_extract_json_nested_object(self): + """Extracts nested JSON objects.""" + content = """Result: {"outer": {"inner": {"deep": "value"}}}""" + result = MessageAdapter.extract_json(content) + assert result is not None + assert '"deep": "value"' in result + + def test_extract_json_complex_array(self): + """Extracts complex JSON arrays.""" + content = """Data: [{"id": 1}, {"id": 2}]""" + result = MessageAdapter.extract_json(content) + assert result is not None + assert '"id": 1' in result + + def test_extract_json_no_json(self): + """Returns None when no valid JSON found.""" + content = "This is just plain text with no JSON." 
+ result = MessageAdapter.extract_json(content) + assert result is None + + def test_extract_json_invalid_json(self): + """Returns None for malformed JSON.""" + content = '{"broken: json' + result = MessageAdapter.extract_json(content) + assert result is None + + def test_extract_json_empty_string(self): + """Returns None for empty string.""" + result = MessageAdapter.extract_json("") + assert result is None + + def test_extract_json_none_input(self): + """Returns None for None input.""" + result = MessageAdapter.extract_json(None) + assert result is None + + def test_extract_json_prefers_code_block(self): + """Prefers code block JSON over embedded JSON.""" + content = """Text {"wrong": "json"} +```json +{"correct": "json"} +```""" + result = MessageAdapter.extract_json(content) + assert result == '{"correct": "json"}' + + def test_extract_json_multiline(self): + """Extracts multiline JSON from code block.""" + content = """```json +{ + "name": "test", + "items": [ + 1, + 2, + 3 + ] +} +```""" + result = MessageAdapter.extract_json(content) + assert result is not None + assert '"name": "test"' in result + assert '"items"' in result + + +class TestEnforceJsonFormat: + """Test MessageAdapter.enforce_json_format() method.""" + + def test_enforce_json_valid_object(self): + """Valid JSON object passes through.""" + content = '{"key": "value"}' + result = MessageAdapter.enforce_json_format(content) + assert result == content + + def test_enforce_json_valid_array(self): + """Valid JSON array passes through.""" + content = "[1, 2, 3]" + result = MessageAdapter.enforce_json_format(content) + assert result == content + + def test_enforce_json_extracts_from_text(self): + """Extracts JSON from surrounding text.""" + content = 'Here is the result: {"data": 123}' + result = MessageAdapter.enforce_json_format(content) + assert result == '{"data": 123}' + + def test_enforce_json_strict_fallback(self): + """Returns '[]' on failure in strict mode.""" + content = "No JSON here at all!" + result = MessageAdapter.enforce_json_format(content, strict=True) + assert result == "[]" + + def test_enforce_json_non_strict_returns_original(self): + """Returns original content on failure in non-strict mode.""" + content = "No JSON here at all!" 
+ result = MessageAdapter.enforce_json_format(content, strict=False) + assert result == content + + def test_enforce_json_from_markdown(self): + """Extracts JSON from markdown code block.""" + content = """```json +{"extracted": true} +```""" + result = MessageAdapter.enforce_json_format(content) + assert result == '{"extracted": true}' + + def test_enforce_json_empty_strict(self): + """Empty input returns '[]' in strict mode.""" + result = MessageAdapter.enforce_json_format("", strict=True) + assert result == "[]" + + +class TestResponseFormatModel: + """Test ResponseFormat Pydantic model.""" + + def test_response_format_default_text(self): + """Default type is 'text'.""" + rf = ResponseFormat() + assert rf.type == "text" + + def test_response_format_text_explicit(self): + """Can explicitly set type to 'text'.""" + rf = ResponseFormat(type="text") + assert rf.type == "text" + + def test_response_format_json_object(self): + """Can set type to 'json_object'.""" + rf = ResponseFormat(type="json_object") + assert rf.type == "json_object" + + def test_response_format_invalid_type(self): + """Invalid type raises validation error.""" + with pytest.raises(ValueError): + ResponseFormat(type="invalid") + + def test_response_format_in_request(self): + """ResponseFormat can be used in ChatCompletionRequest.""" + request = ChatCompletionRequest( + messages=[Message(role="user", content="Return JSON")], + response_format=ResponseFormat(type="json_object"), + ) + assert request.response_format is not None + assert request.response_format.type == "json_object" + + def test_response_format_none_in_request(self): + """ResponseFormat can be None in ChatCompletionRequest.""" + request = ChatCompletionRequest( + messages=[Message(role="user", content="Hello")], + ) + assert request.response_format is None + + def test_response_format_dict_input(self): + """ResponseFormat accepts dict input (OpenAI client style).""" + request = ChatCompletionRequest( + messages=[Message(role="user", content="Return JSON")], + response_format={"type": "json_object"}, + ) + assert request.response_format.type == "json_object" + + def test_response_format_json_schema(self): + """json_schema type with schema definition.""" + rf = ResponseFormat( + type="json_schema", + json_schema={ + "name": "test", + "schema": {"type": "object", "properties": {"x": {"type": "number"}}}, + }, + ) + assert rf.type == "json_schema" + assert rf.json_schema is not None + assert rf.json_schema.name == "test" + assert rf.json_schema.schema_ is not None + assert rf.json_schema.schema_["type"] == "object" + + def test_response_format_json_schema_in_request(self): + """json_schema type works in ChatCompletionRequest.""" + request = ChatCompletionRequest( + messages=[Message(role="user", content="Return JSON")], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "colors", + "schema": {"type": "object", "properties": {"colors": {"type": "array"}}}, + }, + }, + ) + assert request.response_format.type == "json_schema" + assert request.response_format.json_schema.name == "colors" + + +class TestJsonModeInstruction: + """Test JSON_MODE_INSTRUCTION constant.""" + + def test_json_mode_instruction_exists(self): + """JSON_MODE_INSTRUCTION constant exists.""" + assert hasattr(MessageAdapter, "JSON_MODE_INSTRUCTION") + + def test_json_mode_instruction_not_empty(self): + """JSON_MODE_INSTRUCTION is not empty.""" + assert len(MessageAdapter.JSON_MODE_INSTRUCTION) > 0 + + def test_json_mode_instruction_mentions_json(self): + 
"""JSON_MODE_INSTRUCTION mentions JSON.""" + assert "JSON" in MessageAdapter.JSON_MODE_INSTRUCTION.upper() + + def test_json_mode_instruction_is_string(self): + """JSON_MODE_INSTRUCTION is a string.""" + assert isinstance(MessageAdapter.JSON_MODE_INSTRUCTION, str) + + +class TestJsonExtractionEdgeCases: + """Test edge cases for JSON extraction.""" + + def test_json_with_escaped_quotes(self): + """Handles JSON with escaped quotes.""" + content = '{"message": "He said \\"hello\\""}' + result = MessageAdapter.extract_json(content) + assert result == content + + def test_json_with_unicode(self): + """Handles JSON with unicode characters.""" + content = '{"emoji": "\\u2764", "text": "hello"}' + result = MessageAdapter.extract_json(content) + assert result is not None + + def test_json_boolean_values(self): + """Handles JSON boolean values.""" + content = '{"active": true, "deleted": false}' + result = MessageAdapter.extract_json(content) + assert result == content + + def test_json_null_value(self): + """Handles JSON null value.""" + content = '{"data": null}' + result = MessageAdapter.extract_json(content) + assert result == content + + def test_json_number_types(self): + """Handles various JSON number types.""" + content = '{"int": 42, "float": 3.14, "negative": -10, "exp": 1e5}' + result = MessageAdapter.extract_json(content) + assert result == content + + def test_deeply_nested_json(self): + """Handles deeply nested JSON.""" + content = '{"a": {"b": {"c": {"d": {"e": 1}}}}}' + result = MessageAdapter.extract_json(content) + assert result == content + + def test_json_array_of_objects(self): + """Handles array of objects.""" + content = '[{"id": 1}, {"id": 2}, {"id": 3}]' + result = MessageAdapter.extract_json(content) + assert result == content + + def test_multiple_json_blocks_returns_first_valid(self): + """When multiple code blocks exist, returns valid JSON from first.""" + content = """```json +{"first": true} +``` +```json +{"second": true} +```""" + result = MessageAdapter.extract_json(content) + assert result == '{"first": true}' + + def test_json_with_newlines(self): + """Handles JSON with embedded newlines.""" + content = '{"text": "line1\\nline2"}' + result = MessageAdapter.extract_json(content) + assert result == content + + +class TestBalancedJsonExtraction: + """Test the balanced brace/bracket matching algorithm.""" + + def test_deeply_nested_objects(self): + """Handles deeply nested objects with balanced matching.""" + content = 'Preamble: {"a": {"b": {"c": {"d": {"e": {"f": 1}}}}}}' + result = MessageAdapter.extract_json(content) + assert result == '{"a": {"b": {"c": {"d": {"e": {"f": 1}}}}}}' + + def test_mixed_nesting(self): + """Handles mixed objects and arrays.""" + content = 'Result: {"items": [{"id": 1, "nested": {"value": [1,2,3]}}]}' + result = MessageAdapter.extract_json(content) + assert result is not None + assert '"items"' in result + assert '"nested"' in result + + def test_escaped_quotes_in_strings(self): + """Handles escaped quotes within strings.""" + content = """{"message": "He said \\"hello\\" to me", "count": 1}""" + result = MessageAdapter.extract_json(content) + assert result is not None + assert '\\"hello\\"' in result + + def test_braces_inside_strings(self): + """Ignores braces inside string values.""" + content = '{"code": "function() { return {}; }", "valid": true}' + result = MessageAdapter.extract_json(content) + assert result is not None + assert '"valid": true' in result + + def test_brackets_inside_strings(self): + """Ignores brackets inside 
string values.""" + content = '{"regex": "[a-z]+", "array": [1, 2, 3]}' + result = MessageAdapter.extract_json(content) + assert result is not None + assert '"array": [1, 2, 3]' in result + + def test_preamble_stripping(self): + """Removes common Claude preambles before JSON.""" + content = 'Here\'s the JSON: {"key": "value"}' + result = MessageAdapter.extract_json(content) + assert result == '{"key": "value"}' + + def test_heres_the_response_preamble(self): + """Handles 'Here is the response:' preamble.""" + content = 'Here is the response: {"status": "ok"}' + result = MessageAdapter.extract_json(content) + assert result == '{"status": "ok"}' + + def test_result_preamble(self): + """Handles 'Result:' preamble.""" + content = "Result: [1, 2, 3, 4, 5]" + result = MessageAdapter.extract_json(content) + assert result == "[1, 2, 3, 4, 5]" + + +class TestJsonExtractionMetadata: + """Test the extract_json_with_metadata method.""" + + def test_direct_extraction_method(self): + """Reports 'direct' method for pure JSON.""" + content = '{"pure": "json"}' + result = MessageAdapter.extract_json_with_metadata(content) + assert result.success is True + assert result.method == "direct" + assert result.content == content + + def test_preamble_removed_method(self): + """Reports 'preamble_removed' method when preamble stripped.""" + content = 'Here\'s the JSON: {"key": "value"}' + result = MessageAdapter.extract_json_with_metadata(content) + assert result.success is True + assert result.method == "preamble_removed" + assert result.preamble_found == "Here's the JSON:" + + def test_code_block_method(self): + """Reports 'code_block' method for markdown extraction.""" + content = """```json +{"extracted": true} +```""" + result = MessageAdapter.extract_json_with_metadata(content) + assert result.success is True + assert result.method == "code_block" + + def test_brace_match_method(self): + """Reports 'brace_match' for balanced extraction.""" + content = 'Some text {"embedded": true} more text' + result = MessageAdapter.extract_json_with_metadata(content) + assert result.success is True + assert result.method == "brace_match" + + def test_length_tracking(self): + """Tracks original and extracted lengths.""" + content = ' {"padded": true} ' + result = MessageAdapter.extract_json_with_metadata(content) + assert result.original_length == len(content) + assert result.extracted_length == len('{"padded": true}') + + def test_failure_reporting(self): + """Reports failure correctly for invalid content.""" + content = "No JSON here at all!" 
+ result = MessageAdapter.extract_json_with_metadata(content) + assert result.success is False + assert result.method == "failed" + assert result.content is None + + def test_empty_content(self): + """Handles empty content.""" + result = MessageAdapter.extract_json_with_metadata("") + assert result.success is False + assert result.method == "failed" + assert result.original_length == 0 + + +class TestEnforceJsonFormatWithMetadata: + """Test enforce_json_format_with_metadata method.""" + + def test_returns_tuple(self): + """Returns tuple of (content, metadata).""" + content = '{"key": "value"}' + result = MessageAdapter.enforce_json_format_with_metadata(content) + assert isinstance(result, tuple) + assert len(result) == 2 + + def test_metadata_dict_structure(self): + """Metadata dict contains expected keys.""" + content = '{"key": "value"}' + json_content, metadata = MessageAdapter.enforce_json_format_with_metadata(content) + assert "success" in metadata + assert "method" in metadata + assert "original_length" in metadata + assert "extracted_length" in metadata + assert "strict_mode" in metadata + + def test_strict_mode_in_metadata(self): + """Strict mode is reflected in metadata.""" + content = "No JSON" + _, metadata_strict = MessageAdapter.enforce_json_format_with_metadata(content, strict=True) + _, metadata_non_strict = MessageAdapter.enforce_json_format_with_metadata( + content, strict=False + ) + + assert metadata_strict["strict_mode"] is True + assert metadata_non_strict["strict_mode"] is False + + def test_fallback_used_on_failure(self): + """Reports fallback_used when extraction fails.""" + content = "No JSON here!" + _, metadata = MessageAdapter.enforce_json_format_with_metadata(content, strict=True) + assert metadata.get("fallback_used") is True + assert metadata.get("fallback_value") == "[]" + + def test_preamble_in_metadata(self): + """Preamble is included in metadata when found.""" + content = 'Here\'s the JSON: {"key": "value"}' + _, metadata = MessageAdapter.enforce_json_format_with_metadata(content) + assert metadata.get("preamble_found") == "Here's the JSON:" + + +class TestCommonPreambles: + """Test COMMON_PREAMBLES constant.""" + + def test_common_preambles_exists(self): + """COMMON_PREAMBLES constant exists.""" + assert hasattr(MessageAdapter, "COMMON_PREAMBLES") + + def test_common_preambles_is_list(self): + """COMMON_PREAMBLES is a list.""" + assert isinstance(MessageAdapter.COMMON_PREAMBLES, list) + + def test_common_preambles_not_empty(self): + """COMMON_PREAMBLES is not empty.""" + assert len(MessageAdapter.COMMON_PREAMBLES) > 0 + + def test_common_preambles_includes_heres(self): + """COMMON_PREAMBLES includes 'Here's the JSON:' variant.""" + preambles_lower = [p.lower() for p in MessageAdapter.COMMON_PREAMBLES] + assert any("here's the json" in p for p in preambles_lower) + + def test_common_preambles_includes_here_is(self): + """COMMON_PREAMBLES includes 'Here is the JSON:' variant.""" + preambles_lower = [p.lower() for p in MessageAdapter.COMMON_PREAMBLES] + assert any("here is the json" in p for p in preambles_lower) diff --git a/tests/test_log_format_unit.py b/tests/test_log_format_unit.py new file mode 100644 index 0000000..f0e4680 --- /dev/null +++ b/tests/test_log_format_unit.py @@ -0,0 +1,48 @@ +"""Unit tests for src.main._kv log-line formatter. + +The wrapper's default logging format is plain text and drops extras. +``_kv`` exists so we can serialize structured fields INTO the message string +itself without reaching for a full JSON logger. 
+""" + +from src.main import _kv + + +class TestKvFormatter: + def test_basic_event_only(self): + assert _kv("circuit_breaker_open") == "circuit_breaker_open" + + def test_simple_key_value(self): + assert ( + _kv("completion_result", num_turns=2, subtype="success") + == "completion_result num_turns=2 subtype=success" + ) + + def test_none_values_are_skipped(self): + # None extras would just spam the log line if kept; drop them. + out = _kv("claude_sdk_error", subtype="error_max_turns", stop_reason=None) + assert "stop_reason" not in out + assert out == "claude_sdk_error subtype=error_max_turns" + + def test_values_with_whitespace_are_quoted(self): + # grep for `key=value` must keep working even when the value has spaces. + out = _kv("claude_sdk_error", error_message="boom boom") + assert "error_message='boom boom'" in out + + def test_equals_in_value_is_quoted(self): + out = _kv("circuit_breaker_open", reason="k=v") + assert "reason='k=v'" in out + + def test_snapshot_style_kwargs_expansion(self): + snapshot = { + "state": "open", + "window_size": 2, + "failure_ratio": 1.0, + "threshold": 0.75, + } + out = _kv("circuit_breaker_open", **snapshot) + assert out.startswith("circuit_breaker_open ") + assert "state=open" in out + assert "window_size=2" in out + assert "failure_ratio=1.0" in out + assert "threshold=0.75" in out diff --git a/tests/test_message_adapter_unit.py b/tests/test_message_adapter_unit.py index 90f3c52..882b9db 100644 --- a/tests/test_message_adapter_unit.py +++ b/tests/test_message_adapter_unit.py @@ -93,7 +93,7 @@ class TestFilterContent: def test_empty_content_returns_empty(self): """Empty content returns empty.""" assert MessageAdapter.filter_content("") == "" - assert MessageAdapter.filter_content(None) is None + assert MessageAdapter.filter_content(None) == "" def test_plain_text_unchanged(self): """Plain text content is unchanged.""" diff --git a/tests/test_model_service_unit.py b/tests/test_model_service_unit.py new file mode 100644 index 0000000..2a1a8f7 --- /dev/null +++ b/tests/test_model_service_unit.py @@ -0,0 +1,573 @@ +#!/usr/bin/env python3 +""" +Unit tests for src/model_service.py + +Tests the ModelService class that fetches models from Anthropic API +with graceful fallback to static constants. Includes tests for +different authentication methods (anthropic, cli, bedrock, vertex). 
+""" + +import time +import pytest +from unittest.mock import patch, AsyncMock, MagicMock, PropertyMock +import httpx + +from src.model_service import ModelService, MODEL_FETCH_TIMEOUT +from src.constants import CLAUDE_MODELS + + +class TestModelService: + """Test ModelService class.""" + + @pytest.fixture + def model_service(self): + """Create a fresh ModelService instance for each test.""" + return ModelService() + + @pytest.mark.asyncio + async def test_fetch_models_success(self, model_service): + """Successfully fetches models from API with anthropic auth.""" + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "data": [ + {"id": "claude-sonnet-4-5-20250929", "name": "Claude Sonnet"}, + {"id": "claude-haiku-4-5-20251001", "name": "Claude Haiku"}, + ] + } + + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "anthropic" + with patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"}): + with patch.object(model_service, "_http_client") as mock_client: + mock_client.get = AsyncMock(return_value=mock_response) + + result = await model_service.fetch_models_from_api() + + assert result is not None + assert len(result) == 2 + assert "claude-sonnet-4-5-20250929" in result + assert "claude-haiku-4-5-20251001" in result + + @pytest.mark.asyncio + async def test_fetch_models_timeout(self, model_service): + """Returns None on timeout, allowing fallback to constants.""" + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "anthropic" + with patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"}): + with patch.object(model_service, "_http_client") as mock_client: + mock_client.get = AsyncMock(side_effect=httpx.TimeoutException("timeout")) + + result = await model_service.fetch_models_from_api() + + assert result is None + + @pytest.mark.asyncio + async def test_fetch_models_auth_error(self, model_service): + """Returns None on 401 auth error, allowing fallback.""" + mock_response = MagicMock() + mock_response.status_code = 401 + + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "anthropic" + with patch.dict("os.environ", {"ANTHROPIC_API_KEY": "invalid-key"}): + with patch.object(model_service, "_http_client") as mock_client: + mock_client.get = AsyncMock(return_value=mock_response) + + result = await model_service.fetch_models_from_api() + + assert result is None + + @pytest.mark.asyncio + async def test_fetch_models_rate_limited(self, model_service): + """Returns None on 429 rate limit, allowing fallback.""" + mock_response = MagicMock() + mock_response.status_code = 429 + + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "anthropic" + with patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"}): + with patch.object(model_service, "_http_client") as mock_client: + mock_client.get = AsyncMock(return_value=mock_response) + + result = await model_service.fetch_models_from_api() + + assert result is None + + @pytest.mark.asyncio + async def test_fetch_models_network_error(self, model_service): + """Returns None on network error, allowing fallback.""" + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "anthropic" + with patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"}): + with patch.object(model_service, "_http_client") as mock_client: + mock_client.get = AsyncMock(side_effect=httpx.RequestError("connection failed")) + + result = await 
model_service.fetch_models_from_api() + + assert result is None + + @pytest.mark.asyncio + async def test_fetch_models_no_api_key(self, model_service): + """Returns None when no API key is set (anthropic auth).""" + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "anthropic" + with patch.dict("os.environ", {}, clear=True): + # Ensure ANTHROPIC_API_KEY is not set + import os + + if "ANTHROPIC_API_KEY" in os.environ: + del os.environ["ANTHROPIC_API_KEY"] + + result = await model_service.fetch_models_from_api() + + assert result is None + + @pytest.mark.asyncio + async def test_fetch_models_empty_response(self, model_service): + """Returns None when API returns empty model list.""" + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"data": []} + + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "anthropic" + with patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"}): + with patch.object(model_service, "_http_client") as mock_client: + mock_client.get = AsyncMock(return_value=mock_response) + + result = await model_service.fetch_models_from_api() + + assert result is None + + def test_get_models_returns_cached(self, model_service): + """Returns cached models when available.""" + model_service._cached_models = ["model-a", "model-b", "model-c"] + + result = model_service.get_models() + + assert result == ["model-a", "model-b", "model-c"] + + def test_get_models_returns_fallback(self, model_service): + """Returns CLAUDE_MODELS fallback when no cached models.""" + model_service._cached_models = None + + result = model_service.get_models() + + assert result == list(CLAUDE_MODELS) + + def test_get_models_returns_fallback_empty_cache(self, model_service): + """Returns CLAUDE_MODELS fallback when cache is empty list.""" + # Empty list is falsy, so should fall back + model_service._cached_models = [] + + result = model_service.get_models() + + # Empty list is falsy, so fallback is used + assert result == list(CLAUDE_MODELS) + + def test_is_initialized_false_by_default(self, model_service): + """Service is not initialized by default.""" + assert model_service.is_initialized() is False + + @pytest.mark.asyncio + async def test_initialize_sets_initialized(self, model_service): + """Initialize sets initialized flag.""" + with patch.object(model_service, "fetch_models_from_api", new_callable=AsyncMock) as mock: + mock.return_value = None + + await model_service.initialize() + + assert model_service.is_initialized() is True + + @pytest.mark.asyncio + async def test_initialize_caches_fetched_models(self, model_service): + """Initialize caches successfully fetched models.""" + fetched = ["claude-3-opus", "claude-3-sonnet"] + + with patch.object(model_service, "fetch_models_from_api", new_callable=AsyncMock) as mock: + mock.return_value = fetched + + await model_service.initialize() + + assert model_service._cached_models == fetched + + @pytest.mark.asyncio + async def test_initialize_only_once(self, model_service): + """Initialize only fetches models once.""" + with patch.object(model_service, "fetch_models_from_api", new_callable=AsyncMock) as mock: + mock.return_value = ["model-1"] + + await model_service.initialize() + await model_service.initialize() # Second call should be no-op + + mock.assert_called_once() + + @pytest.mark.asyncio + async def test_shutdown_closes_client(self, model_service): + """Shutdown closes the HTTP client.""" + mock_client = AsyncMock() + 
model_service._http_client = mock_client + model_service._initialized = True + + await model_service.shutdown() + + mock_client.aclose.assert_called_once() + assert model_service._http_client is None + assert model_service._initialized is False + + @pytest.mark.asyncio + async def test_shutdown_safe_when_not_initialized(self, model_service): + """Shutdown is safe when called before initialization.""" + # Should not raise + await model_service.shutdown() + + assert model_service._http_client is None + + +class TestModelServiceIntegration: + """Integration-style tests for ModelService.""" + + @pytest.mark.asyncio + async def test_full_lifecycle(self): + """Test full initialize-use-shutdown lifecycle.""" + service = ModelService() + + # Mock the API call + with patch.object(service, "fetch_models_from_api", new_callable=AsyncMock) as mock: + mock.return_value = ["test-model-1", "test-model-2"] + + # Initialize + await service.initialize() + assert service.is_initialized() + + # Use + models = service.get_models() + assert models == ["test-model-1", "test-model-2"] + + # Shutdown + await service.shutdown() + assert not service.is_initialized() + + # After shutdown, should return fallback + models = service.get_models() + assert models == list(CLAUDE_MODELS) + + @pytest.mark.asyncio + async def test_fallback_on_api_failure(self): + """Test that API failure results in fallback models.""" + service = ModelService() + + # Mock API failure + with patch.object(service, "fetch_models_from_api", new_callable=AsyncMock) as mock: + mock.return_value = None # API failed + + await service.initialize() + + models = service.get_models() + assert models == list(CLAUDE_MODELS) + + await service.shutdown() + + +class TestModelServiceRefresh: + """Tests for model refresh functionality.""" + + @pytest.fixture + def model_service(self): + """Create a fresh ModelService instance for each test.""" + return ModelService() + + @pytest.mark.asyncio + async def test_refresh_models_success(self, model_service): + """Refresh successfully updates cached models with anthropic auth.""" + # First, initialize with some models + model_service._cached_models = ["old-model-1", "old-model-2"] + model_service._source = "api" + model_service._initialized = True + + new_models = ["new-model-1", "new-model-2", "new-model-3"] + + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "anthropic" + with patch.object( + model_service, "fetch_models_from_api", new_callable=AsyncMock + ) as mock: + mock.return_value = new_models + + result = await model_service.refresh_models() + + assert result["success"] is True + assert result["count"] == 3 + assert result["source"] == "api" + assert result["models"] == new_models + assert result["auth_method"] == "anthropic" + assert model_service._cached_models == new_models + assert model_service._source == "api" + assert model_service._last_refresh is not None + + @pytest.mark.asyncio + async def test_refresh_models_failure_preserves_existing(self, model_service): + """Refresh failure preserves existing cached models.""" + existing_models = ["existing-model-1", "existing-model-2"] + model_service._cached_models = existing_models + model_service._source = "api" + model_service._initialized = True + + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "anthropic" + with patch.object( + model_service, "fetch_models_from_api", new_callable=AsyncMock + ) as mock: + mock.return_value = None # API failed + + result = await 
model_service.refresh_models() + + assert result["success"] is False + assert "API fetch failed" in result["message"] + assert result["current_count"] == 2 + assert result["source"] == "api" + assert result["auth_method"] == "anthropic" + # Existing models should be preserved + assert model_service._cached_models == existing_models + + @pytest.mark.asyncio + async def test_refresh_models_updates_last_refresh_time(self, model_service): + """Refresh updates the last_refresh timestamp.""" + model_service._initialized = True + + before_time = time.time() + + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "anthropic" + with patch.object( + model_service, "fetch_models_from_api", new_callable=AsyncMock + ) as mock: + mock.return_value = ["model-1"] + + await model_service.refresh_models() + + after_time = time.time() + + assert model_service._last_refresh is not None + assert before_time <= model_service._last_refresh <= after_time + + @pytest.mark.asyncio + async def test_refresh_models_failure_does_not_update_timestamp(self, model_service): + """Refresh failure does not update last_refresh timestamp.""" + model_service._cached_models = ["model-1"] + model_service._last_refresh = 1000.0 # Some old timestamp + model_service._initialized = True + + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "anthropic" + with patch.object( + model_service, "fetch_models_from_api", new_callable=AsyncMock + ) as mock: + mock.return_value = None + + await model_service.refresh_models() + + # Timestamp should remain unchanged + assert model_service._last_refresh == 1000.0 + + def test_get_status_returns_correct_info(self, model_service): + """get_status returns correct service status including auth_method.""" + model_service._initialized = True + model_service._source = "api" + model_service._cached_models = ["model-1", "model-2", "model-3"] + model_service._last_refresh = 1234567890.0 + + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "anthropic" + status = model_service.get_status() + + assert status["initialized"] is True + assert status["source"] == "api" + assert status["model_count"] == 3 + assert status["last_refresh"] == 1234567890.0 + assert status["auth_method"] == "anthropic" + + def test_get_status_fallback_source(self, model_service): + """get_status shows fallback source when not from API.""" + model_service._initialized = True + model_service._source = "fallback" + model_service._cached_models = None + model_service._last_refresh = None + + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "claude_cli" + status = model_service.get_status() + + assert status["initialized"] is True + assert status["source"] == "fallback" + assert status["model_count"] == len(CLAUDE_MODELS) + assert status["last_refresh"] is None + assert status["auth_method"] == "claude_cli" + + @pytest.mark.asyncio + async def test_initialize_sets_source_api_on_success(self, model_service): + """Initialize sets source to 'api' when fetch succeeds.""" + with patch.object(model_service, "fetch_models_from_api", new_callable=AsyncMock) as mock: + mock.return_value = ["model-1", "model-2"] + + await model_service.initialize() + + assert model_service._source == "api" + assert model_service._last_refresh is not None + + @pytest.mark.asyncio + async def test_initialize_sets_source_fallback_on_failure(self, model_service): + """Initialize sets source to 'fallback' when fetch fails.""" + with 
patch.object(model_service, "fetch_models_from_api", new_callable=AsyncMock) as mock: + mock.return_value = None + + await model_service.initialize() + + assert model_service._source == "fallback" + assert model_service._last_refresh is None + + @pytest.mark.asyncio + async def test_shutdown_resets_source_and_timestamp(self, model_service): + """Shutdown resets source and last_refresh.""" + model_service._source = "api" + model_service._last_refresh = 1234567890.0 + model_service._initialized = True + + await model_service.shutdown() + + assert model_service._source == "fallback" + assert model_service._last_refresh is None + + +class TestModelServiceAuthMethods: + """Tests for different authentication method behaviors.""" + + @pytest.fixture + def model_service(self): + """Create a fresh ModelService instance for each test.""" + return ModelService() + + @pytest.mark.asyncio + async def test_fetch_models_cli_auth_returns_none(self, model_service): + """CLI auth method returns None (uses static fallback).""" + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "claude_cli" + + result = await model_service.fetch_models_from_api() + + assert result is None + + @pytest.mark.asyncio + async def test_fetch_models_bedrock_auth_returns_none(self, model_service): + """Bedrock auth method returns None (uses static fallback).""" + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "bedrock" + + result = await model_service.fetch_models_from_api() + + assert result is None + + @pytest.mark.asyncio + async def test_fetch_models_vertex_auth_returns_none(self, model_service): + """Vertex auth method returns None (uses static fallback).""" + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "vertex" + + result = await model_service.fetch_models_from_api() + + assert result is None + + @pytest.mark.asyncio + async def test_fetch_models_unknown_auth_returns_none(self, model_service): + """Unknown auth method returns None (uses static fallback).""" + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "unknown_method" + + result = await model_service.fetch_models_from_api() + + assert result is None + + @pytest.mark.asyncio + async def test_refresh_models_cli_auth_fails(self, model_service): + """Refresh with CLI auth returns failure with auth_method in response.""" + model_service._cached_models = ["model-1"] + model_service._source = "fallback" + model_service._initialized = True + + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "claude_cli" + + result = await model_service.refresh_models() + + assert result["success"] is False + assert "Dynamic refresh requires ANTHROPIC_API_KEY" in result["message"] + assert result["auth_method"] == "claude_cli" + assert result["current_count"] == 1 + + @pytest.mark.asyncio + async def test_refresh_models_bedrock_auth_fails(self, model_service): + """Refresh with Bedrock auth returns failure with auth_method in response.""" + model_service._cached_models = None + model_service._source = "fallback" + model_service._initialized = True + + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "bedrock" + + result = await model_service.refresh_models() + + assert result["success"] is False + assert "Dynamic refresh requires ANTHROPIC_API_KEY" in result["message"] + assert result["auth_method"] == "bedrock" + assert result["current_count"] == len(CLAUDE_MODELS) + + 
@pytest.mark.asyncio + async def test_refresh_models_vertex_auth_fails(self, model_service): + """Refresh with Vertex auth returns failure with auth_method in response.""" + model_service._cached_models = None + model_service._source = "fallback" + model_service._initialized = True + + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "vertex" + + result = await model_service.refresh_models() + + assert result["success"] is False + assert "Dynamic refresh requires ANTHROPIC_API_KEY" in result["message"] + assert result["auth_method"] == "vertex" + assert result["current_count"] == len(CLAUDE_MODELS) + + def test_get_status_includes_auth_method_cli(self, model_service): + """get_status includes auth_method for CLI auth.""" + model_service._initialized = True + model_service._source = "fallback" + + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "claude_cli" + status = model_service.get_status() + + assert status["auth_method"] == "claude_cli" + + def test_get_status_includes_auth_method_bedrock(self, model_service): + """get_status includes auth_method for Bedrock auth.""" + model_service._initialized = True + model_service._source = "fallback" + + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "bedrock" + status = model_service.get_status() + + assert status["auth_method"] == "bedrock" + + def test_get_status_includes_auth_method_vertex(self, model_service): + """get_status includes auth_method for Vertex auth.""" + model_service._initialized = True + model_service._source = "fallback" + + with patch("src.model_service.auth_manager") as mock_auth: + mock_auth.auth_method = "vertex" + status = model_service.get_status() + + assert status["auth_method"] == "vertex" diff --git a/tests/test_models_unit.py b/tests/test_models_unit.py index 5e6387d..e91a61a 100644 --- a/tests/test_models_unit.py +++ b/tests/test_models_unit.py @@ -245,23 +245,63 @@ def test_to_claude_options_basic(self): options = request.to_claude_options() assert options["model"] == "claude-sonnet-4-5-20250929" - def test_to_claude_options_with_max_tokens(self): - """to_claude_options() maps max_tokens to max_thinking_tokens.""" - request = ChatCompletionRequest( - messages=[Message(role="user", content="Hi")], max_tokens=500 - ) - options = request.to_claude_options() - assert options.get("max_thinking_tokens") == 500 + def test_to_claude_options_ignores_max_tokens_by_default(self): + """max_tokens is ignored when WRAPPER_MAP_MAX_TOKENS_TO_THINKING is unset. + + OpenAI max_tokens is an output-length cap; the Claude Agent SDK has no + equivalent, and mapping it to max_thinking_tokens caused short prompts + to bust max_turns after burning the thinking budget. 
+ """ + import os + + prior = os.environ.pop("WRAPPER_MAP_MAX_TOKENS_TO_THINKING", None) + try: + request = ChatCompletionRequest( + messages=[Message(role="user", content="Hi")], max_tokens=500 + ) + options = request.to_claude_options() + assert "max_thinking_tokens" not in options + finally: + if prior is not None: + os.environ["WRAPPER_MAP_MAX_TOKENS_TO_THINKING"] = prior + + def test_to_claude_options_maps_when_env_enables_legacy(self): + """Opt-in via env var restores the legacy max_thinking_tokens mapping.""" + import os + + prior = os.environ.get("WRAPPER_MAP_MAX_TOKENS_TO_THINKING") + os.environ["WRAPPER_MAP_MAX_TOKENS_TO_THINKING"] = "true" + try: + request = ChatCompletionRequest( + messages=[Message(role="user", content="Hi")], max_tokens=500 + ) + options = request.to_claude_options() + assert options.get("max_thinking_tokens") == 500 + finally: + if prior is None: + os.environ.pop("WRAPPER_MAP_MAX_TOKENS_TO_THINKING", None) + else: + os.environ["WRAPPER_MAP_MAX_TOKENS_TO_THINKING"] = prior def test_to_claude_options_prefers_max_completion_tokens(self): - """max_completion_tokens takes precedence over max_tokens.""" - request = ChatCompletionRequest( - messages=[Message(role="user", content="Hi")], - max_tokens=500, - max_completion_tokens=1000, - ) - options = request.to_claude_options() - assert options.get("max_thinking_tokens") == 1000 + """max_completion_tokens takes precedence over max_tokens when mapping.""" + import os + + prior = os.environ.get("WRAPPER_MAP_MAX_TOKENS_TO_THINKING") + os.environ["WRAPPER_MAP_MAX_TOKENS_TO_THINKING"] = "true" + try: + request = ChatCompletionRequest( + messages=[Message(role="user", content="Hi")], + max_tokens=500, + max_completion_tokens=1000, + ) + options = request.to_claude_options() + assert options.get("max_thinking_tokens") == 1000 + finally: + if prior is None: + os.environ.pop("WRAPPER_MAP_MAX_TOKENS_TO_THINKING", None) + else: + os.environ["WRAPPER_MAP_MAX_TOKENS_TO_THINKING"] = prior class TestChatCompletionResponse: diff --git a/tests/test_redos_safety.py b/tests/test_redos_safety.py new file mode 100644 index 0000000..cfd2f1a --- /dev/null +++ b/tests/test_redos_safety.py @@ -0,0 +1,76 @@ +"""Regression tests for the ReDoS hardening in ``message_adapter.filter_content``. + +Each pathological input used to exhibit polynomial backtracking with the +original lazy-``.*?`` regexes that CodeQL's py/polynomial-redos rule flagged +(alerts #3-#6). The rewritten negated-class / bounded patterns are linear, +so each call must complete well under a human-noticeable budget. +""" + +from __future__ import annotations + +import time + +import pytest + +from src.message_adapter import MessageAdapter + +# Budget in seconds. Linear implementations run these inputs in tens of +# milliseconds; the original lazy patterns would spiral into seconds-to-hours. 
+REDOS_BUDGET_SECONDS = 1.0 + + +def _time_filter(payload: str) -> float: + start = time.perf_counter() + MessageAdapter.filter_content(payload) + return time.perf_counter() - start + + +@pytest.mark.parametrize( + "payload", + [ + "" * 5000 + "x", + "" * 5000 + "x", + "" + ("" * 5000) + "x", + "[Image:" * 5000, + "data:image/" * 5000, + "data:image/png;base64," + ("A" * 20000), + ], + ids=[ + "unterminated_thinking", + "unterminated_attempt_completion", + "attempt_completion_with_result_storm", + "image_bracket_storm", + "data_image_storm", + "long_base64_trailing", + ], +) +def test_filter_content_redos_inputs_are_linear(payload: str) -> None: + elapsed = _time_filter(payload) + assert elapsed < REDOS_BUDGET_SECONDS, ( + f"filter_content took {elapsed:.3f}s on pathological input; " + f"expected < {REDOS_BUDGET_SECONDS}s" + ) + + +def test_filter_content_strips_thinking_block() -> None: + out = MessageAdapter.filter_content("beforesecretafter") + assert "secret" not in out + assert "before" in out and "after" in out + + +def test_filter_content_extracts_attempt_completion_inner_result() -> None: + payload = "answer" + assert MessageAdapter.filter_content(payload) == "answer" + + +def test_filter_content_replaces_image_tokens() -> None: + payload = "pre [Image: cat.png] mid data:image/png;base64,ABC post" + out = MessageAdapter.filter_content(payload) + assert "[Image: Content not supported by Claude Code]" in out + assert "ABC" not in out + assert "pre" in out and "post" in out + + +def test_filter_content_returns_oversized_input_unchanged() -> None: + huge = "x" * 2_000_000 + assert MessageAdapter.filter_content(huge) == huge diff --git a/tests/test_request_cache_unit.py b/tests/test_request_cache_unit.py new file mode 100644 index 0000000..594c260 --- /dev/null +++ b/tests/test_request_cache_unit.py @@ -0,0 +1,239 @@ +#!/usr/bin/env python3 +""" +Unit tests for request cache functionality. + +Tests the RequestCache class including caching, TTL, LRU eviction, +and statistics tracking. 
+""" + +import pytest +import time +from unittest.mock import patch + +from src.request_cache import RequestCache, CacheEntry + + +class TestRequestCache: + """Test RequestCache class.""" + + def test_cache_set_and_get(self): + """Basic set and get operations work.""" + cache = RequestCache(enabled=True, max_size=10, ttl_seconds=60) + request = {"model": "test", "messages": [{"role": "user", "content": "Hello"}]} + response = {"id": "123", "choices": [{"content": "Hi"}]} + + cache.set(request, response) + result = cache.get(request) + + assert result == response + + def test_cache_miss(self): + """Returns None for cache miss.""" + cache = RequestCache(enabled=True, max_size=10, ttl_seconds=60) + request = {"model": "test", "messages": [{"role": "user", "content": "Hello"}]} + + result = cache.get(request) + + assert result is None + + def test_cache_disabled(self): + """Returns None when cache is disabled.""" + cache = RequestCache(enabled=False, max_size=10, ttl_seconds=60) + request = {"model": "test", "messages": [{"role": "user", "content": "Hello"}]} + response = {"id": "123", "choices": [{"content": "Hi"}]} + + cache.set(request, response) + result = cache.get(request) + + assert result is None + + def test_cache_expiration(self): + """Entries expire after TTL.""" + cache = RequestCache(enabled=True, max_size=10, ttl_seconds=1) + request = {"model": "test", "messages": [{"role": "user", "content": "Hello"}]} + response = {"id": "123", "choices": [{"content": "Hi"}]} + + cache.set(request, response) + + # Should be present immediately + assert cache.get(request) == response + + # Wait for expiration + time.sleep(1.1) + + # Should be expired now + assert cache.get(request) is None + + def test_lru_eviction(self): + """LRU eviction when max_size is reached.""" + cache = RequestCache(enabled=True, max_size=2, ttl_seconds=60) + + request1 = {"model": "test", "messages": [{"role": "user", "content": "One"}]} + request2 = {"model": "test", "messages": [{"role": "user", "content": "Two"}]} + request3 = {"model": "test", "messages": [{"role": "user", "content": "Three"}]} + + cache.set(request1, {"id": "1"}) + cache.set(request2, {"id": "2"}) + + # Access request1 to make it more recently used + cache.get(request1) + + # Add request3, should evict request2 (least recently used) + cache.set(request3, {"id": "3"}) + + # request1 should still be present (was accessed) + assert cache.get(request1) is not None + # request3 should be present (just added) + assert cache.get(request3) is not None + # request2 should be evicted + assert cache.get(request2) is None + + def test_stats_tracking(self): + """Statistics are tracked correctly.""" + cache = RequestCache(enabled=True, max_size=10, ttl_seconds=60) + request = {"model": "test", "messages": [{"role": "user", "content": "Hello"}]} + response = {"id": "123", "choices": [{"content": "Hi"}]} + + # Initial stats + stats = cache.get_stats() + assert stats["hits"] == 0 + assert stats["misses"] == 0 + + # Miss + cache.get(request) + stats = cache.get_stats() + assert stats["misses"] == 1 + + # Set and hit + cache.set(request, response) + cache.get(request) + stats = cache.get_stats() + assert stats["hits"] == 1 + assert stats["misses"] == 1 + assert stats["hit_rate_percent"] == 50.0 + + def test_clear(self): + """Clear removes all entries.""" + cache = RequestCache(enabled=True, max_size=10, ttl_seconds=60) + + for i in range(5): + request = {"model": "test", "messages": [{"role": "user", "content": f"Msg {i}"}]} + cache.set(request, {"id": str(i)}) + + 
stats = cache.get_stats() + assert stats["current_size"] == 5 + + cleared = cache.clear() + + assert cleared == 5 + stats = cache.get_stats() + assert stats["current_size"] == 0 + + def test_hash_deterministic(self): + """Same request produces same hash.""" + cache = RequestCache(enabled=True) + + request1 = {"model": "test", "messages": [{"role": "user", "content": "Hello"}]} + request2 = {"model": "test", "messages": [{"role": "user", "content": "Hello"}]} + + hash1 = cache._compute_hash(request1) + hash2 = cache._compute_hash(request2) + + assert hash1 == hash2 + + def test_hash_ignores_irrelevant_fields(self): + """Hash ignores fields that don't affect response.""" + cache = RequestCache(enabled=True) + + request1 = { + "model": "test", + "messages": [{"role": "user", "content": "Hello"}], + "stream": False, + "session_id": "abc123", + } + request2 = { + "model": "test", + "messages": [{"role": "user", "content": "Hello"}], + "stream": True, # Different + "session_id": "xyz789", # Different + } + + hash1 = cache._compute_hash(request1) + hash2 = cache._compute_hash(request2) + + assert hash1 == hash2 + + def test_hash_differs_for_different_content(self): + """Different content produces different hashes.""" + cache = RequestCache(enabled=True) + + request1 = {"model": "test", "messages": [{"role": "user", "content": "Hello"}]} + request2 = {"model": "test", "messages": [{"role": "user", "content": "Goodbye"}]} + + hash1 = cache._compute_hash(request1) + hash2 = cache._compute_hash(request2) + + assert hash1 != hash2 + + def test_cleanup_expired(self): + """cleanup_expired removes expired entries.""" + cache = RequestCache(enabled=True, max_size=10, ttl_seconds=1) + + request1 = {"model": "test", "messages": [{"role": "user", "content": "One"}]} + request2 = {"model": "test", "messages": [{"role": "user", "content": "Two"}]} + + cache.set(request1, {"id": "1"}) + cache.set(request2, {"id": "2"}) + + # Wait for expiration + time.sleep(1.1) + + removed = cache.cleanup_expired() + + assert removed == 2 + assert cache.get_stats()["current_size"] == 0 + + def test_stats_include_config(self): + """Stats include configuration values.""" + cache = RequestCache(enabled=True, max_size=50, ttl_seconds=120) + stats = cache.get_stats() + + assert stats["enabled"] is True + assert stats["max_size"] == 50 + assert stats["ttl_seconds"] == 120 + + def test_enabled_property(self): + """enabled property reflects configuration.""" + cache_enabled = RequestCache(enabled=True) + cache_disabled = RequestCache(enabled=False) + + assert cache_enabled.enabled is True + assert cache_disabled.enabled is False + + +class TestCacheEntry: + """Test CacheEntry dataclass.""" + + def test_cache_entry_creation(self): + """CacheEntry can be created with required fields.""" + entry = CacheEntry( + response={"id": "test"}, + created_at=1000.0, + expires_at=1060.0, + ) + + assert entry.response == {"id": "test"} + assert entry.created_at == 1000.0 + assert entry.expires_at == 1060.0 + assert entry.hit_count == 0 # Default + + def test_cache_entry_hit_count(self): + """CacheEntry hit_count can be specified.""" + entry = CacheEntry( + response={"id": "test"}, + created_at=1000.0, + expires_at=1060.0, + hit_count=5, + ) + + assert entry.hit_count == 5 diff --git a/tests/test_retry_unit.py b/tests/test_retry_unit.py new file mode 100644 index 0000000..ff44986 --- /dev/null +++ b/tests/test_retry_unit.py @@ -0,0 +1,146 @@ +"""Unit tests for retry logic module.""" + +import pytest +from src.retry import RetryConfig, RetryState + + 
+class TestRetryConfig: + """Tests for RetryConfig defaults.""" + + def test_default_config(self): + config = RetryConfig() + assert config.max_retries == 10 + assert config.base_delay_ms == 500 + assert config.max_delay_ms == 30_000 + assert config.enable_model_fallback is True + + def test_custom_config(self): + config = RetryConfig(max_retries=3, base_delay_ms=100, enable_model_fallback=False) + assert config.max_retries == 3 + assert config.base_delay_ms == 100 + assert config.enable_model_fallback is False + + +class TestRetryState: + """Tests for RetryState logic.""" + + def test_initial_state(self): + state = RetryState() + assert state.attempt == 0 + assert state.consecutive_529s == 0 + assert state.fallback_model is None + + def test_should_retry_429(self): + state = RetryState() + assert state.should_retry(status_code=429) is True + + def test_should_retry_529(self): + state = RetryState() + assert state.should_retry(status_code=529) is True + + def test_should_retry_500(self): + state = RetryState() + assert state.should_retry(status_code=500) is True + + def test_should_not_retry_200(self): + state = RetryState() + assert state.should_retry(status_code=200) is False + + def test_should_not_retry_404(self): + state = RetryState() + assert state.should_retry(status_code=404) is False + + def test_should_retry_timeout_error(self): + state = RetryState() + assert state.should_retry(error=Exception("Connection timeout")) is True + + def test_should_not_retry_generic_error(self): + state = RetryState() + assert state.should_retry(error=Exception("Invalid input")) is False + + def test_should_not_retry_400(self): + state = RetryState() + assert state.should_retry(status_code=400) is False + + def test_should_retry_context_overflow(self): + state = RetryState() + assert state.should_retry(error=Exception("context overflow: message too long")) is True + + def test_max_retries_exhausted(self): + config = RetryConfig(max_retries=2) + state = RetryState(config=config) + state.attempt = 2 + assert state.should_retry(status_code=429) is False + + def test_record_attempt_tracks_529s(self): + state = RetryState() + state.record_attempt(status_code=529) + assert state.consecutive_529s == 1 + assert state.attempt == 1 + + state.record_attempt(status_code=529) + assert state.consecutive_529s == 2 + + state.record_attempt(status_code=429) + assert state.consecutive_529s == 0 # Reset on non-529 + + def test_should_fallback_after_consecutive_529s(self): + state = RetryState() + state.consecutive_529s = 3 + assert state.should_fallback("claude-opus-4-6") is True + + def test_should_not_fallback_before_threshold(self): + state = RetryState() + state.consecutive_529s = 2 + assert state.should_fallback("claude-opus-4-6") is False + + def test_should_not_fallback_for_non_opus(self): + state = RetryState() + state.consecutive_529s = 3 + assert state.should_fallback("claude-sonnet-4-6") is False + + def test_should_not_fallback_when_disabled(self): + config = RetryConfig(enable_model_fallback=False) + state = RetryState(config=config) + state.consecutive_529s = 3 + assert state.should_fallback("claude-opus-4-6") is False + + def test_get_fallback_model(self): + state = RetryState() + state.consecutive_529s = 3 + fallback = state.get_fallback_model("claude-opus-4-6") + assert fallback == "claude-sonnet-4-6" + assert state.fallback_model == "claude-sonnet-4-6" + assert state.consecutive_529s == 0 # Reset after fallback + + def test_get_fallback_model_none_for_sonnet(self): + state = RetryState() + 
diff --git a/tests/test_sdk_migration.py b/tests/test_sdk_migration.py
index 6ad2d95..9f33c1a 100644
--- a/tests/test_sdk_migration.py
+++ b/tests/test_sdk_migration.py
@@ -5,7 +5,6 @@
 Tests system prompt formats, message conversion, and basic SDK integration.
 """
 
-import asyncio
 import pytest
 
 from claude_agent_sdk import ClaudeAgentOptions
@@ -60,13 +59,14 @@ class TestConstants:
 
     def test_claude_models_defined(self):
         """Test that CLAUDE_MODELS constant exists and has expected models."""
-        from src.constants import CLAUDE_MODELS, DEFAULT_MODEL, FAST_MODEL
+        from src.constants import CLAUDE_MODELS
 
         assert isinstance(CLAUDE_MODELS, list)
        assert len(CLAUDE_MODELS) > 0
 
-        # Check latest models are included
-        assert "claude-sonnet-4-5-20250929" in CLAUDE_MODELS
+        # Check latest fallback models are included
+        assert "claude-sonnet-4-6" in CLAUDE_MODELS
+        assert "claude-opus-4-6" in CLAUDE_MODELS
         assert "claude-haiku-4-5-20251001" in CLAUDE_MODELS
 
     def test_default_model_defined(self):
@@ -74,7 +74,7 @@
         from src.constants import DEFAULT_MODEL, CLAUDE_MODELS
 
         assert DEFAULT_MODEL in CLAUDE_MODELS
-        assert DEFAULT_MODEL == "claude-sonnet-4-5-20250929"
+        assert DEFAULT_MODEL == "claude-sonnet-4-6"
 
     def test_fast_model_defined(self):
         """Test that FAST_MODEL is set to fastest model."""
diff --git a/tests/test_session_complete.py b/tests/test_session_complete.py
index 425aeb4..07ff89e 100644
--- a/tests/test_session_complete.py
+++ b/tests/test_session_complete.py
@@ -171,7 +171,9 @@ def test_session_endpoints():
     if sessions["total"] >= len(session_ids):
         print(f"   ✅ Found all test sessions")
     else:
-        print(f"   ⚠️ Expected at least {len(session_ids)} sessions, found {sessions['total']}")
+        print(
+            f"   ⚠️ Expected at least {len(session_ids)} sessions, found {sessions['total']}"
+        )
 
     # Test get specific session
     get_response = requests.get(f"{BASE_URL}/v1/sessions/{session_ids[0]}")
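The test_tool_manager_unit.py hunks below only re-tag category membership; the structure they rely on is a module-level table mapping tool names to metadata objects with a category attribute. A rough sketch of that shape, assuming a frozen dataclass (illustrative subset only; the real TOOL_METADATA in src.tool_manager is larger and may carry more fields):

from dataclasses import dataclass

@dataclass(frozen=True)
class ToolMetadata:
    category: str  # e.g. "file", "system", "web", or "productivity"
    description: str = ""

# Illustrative subset; names and categories mirror the test assertions.
TOOL_METADATA: dict[str, ToolMetadata] = {
    "Bash": ToolMetadata(category="system"),
    "TodoWrite": ToolMetadata(category="productivity"),
    "Skill": ToolMetadata(category="productivity"),
}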
diff --git a/tests/test_tool_manager_unit.py b/tests/test_tool_manager_unit.py
index 78fea02..1a77246 100644
--- a/tests/test_tool_manager_unit.py
+++ b/tests/test_tool_manager_unit.py
@@ -403,7 +403,7 @@ def test_file_tools_category(self):
 
     def test_system_tools_category(self):
         """System tools are correctly categorized."""
-        system_tools = ["Bash", "BashOutput", "KillShell"]
+        system_tools = ["Bash", "Config", "REPL", "Sleep", "Monitor"]
 
         for tool_name in system_tools:
             assert TOOL_METADATA[tool_name].category == "system"
@@ -416,7 +416,7 @@ def test_web_tools_category(self):
 
     def test_productivity_tools_category(self):
         """Productivity tools are correctly categorized."""
-        productivity_tools = ["TodoWrite", "Skill", "SlashCommand"]
+        productivity_tools = ["TodoWrite", "Skill"]
 
         for tool_name in productivity_tools:
             assert TOOL_METADATA[tool_name].category == "productivity"