diff --git a/.forge/skills/test-reasoning/SKILL.md b/.forge/skills/test-reasoning/SKILL.md new file mode 100644 index 0000000000..248f35ea1e --- /dev/null +++ b/.forge/skills/test-reasoning/SKILL.md @@ -0,0 +1,61 @@ +--- +name: test-reasoning +description: Validate that reasoning parameters are correctly serialized and sent to provider APIs. Use when the user asks to test reasoning serialization, run reasoning tests, verify reasoning config fields, or check that ReasoningConfig maps correctly to provider-specific JSON (OpenRouter, Anthropic, GitHub Copilot, Codex). +--- + +# Test Reasoning Serialization + +Validates that `ReasoningConfig` fields are correctly serialized into provider-specific JSON +for OpenRouter, Anthropic, GitHub Copilot, and Codex. + +## Quick Start + +Run all tests with the bundled script: + +```bash +./scripts/test-reasoning.sh +``` + +The script builds forge in debug mode, runs each provider/model combination, captures the +outgoing HTTP request body via `FORGE_DEBUG_REQUESTS`, and asserts the correct JSON fields. + +## Running a Single Test Manually + +```bash +FORGE_DEBUG_REQUESTS="forge.request.json" \ +FORGE_SESSION__PROVIDER_ID=open_router \ +FORGE_SESSION__MODEL_ID=openai/o4-mini \ +FORGE_REASONING__EFFORT=high \ +target/debug/forge -p "Hello!" +``` + +Substitute any provider, model, and effort from the table below, then inspect `.forge/forge.request.json` for the expected fields. 
+ +## Test Coverage + +| Provider | Model | Config fields | Expected JSON field | +| ---------------- | ---------------------------- | ------------------------------------------------- | --------------------------------- | +| `open_router` | `openai/o4-mini` | `effort: none\|minimal\|low\|medium\|high\|xhigh` | `reasoning.effort` | +| `open_router` | `openai/o4-mini` | `max_tokens: 4000` | `reasoning.max_tokens` | +| `open_router` | `openai/o4-mini` | `effort: high` + `exclude: true` | `reasoning.effort` + `.exclude` | +| `open_router` | `openai/o4-mini` | `enabled: true` | `reasoning.enabled` | +| `open_router` | `anthropic/claude-opus-4-5` | `max_tokens: 4000` | `reasoning.max_tokens` | +| `open_router` | `moonshotai/kimi-k2` | `max_tokens: 4000` | `reasoning.max_tokens` | +| `open_router` | `moonshotai/kimi-k2` | `effort: high` | `reasoning.effort` | +| `open_router` | `minimax/minimax-m2` | `max_tokens: 4000` | `reasoning.max_tokens` | +| `open_router` | `minimax/minimax-m2` | `effort: high` | `reasoning.effort` | +| `anthropic` | `claude-opus-4-6` | `effort: low\|medium\|high\|max` | `output_config.effort` | +| `anthropic` | `claude-3-7-sonnet-20250219` | `enabled: true` + `max_tokens: 8000` | `thinking.type` + `budget_tokens` | +| `github_copilot` | `o4-mini` | `effort: none\|minimal\|low\|medium\|high\|xhigh` | `reasoning_effort` (top-level) | +| `codex` | `gpt-5.1-codex` | `effort: none\|minimal\|low\|medium\|high\|xhigh` | `reasoning.effort` + `.summary` | +| `codex` | `gpt-5.1-codex` | `effort: medium` + `exclude: true` | `reasoning.summary = "concise"` | +| all providers | one model each | `effort: invalid` | non-zero exit, no request written | + +Tests for unconfigured providers are skipped automatically. Invalid-effort tests run regardless of credentials — the rejection happens at config parse time before any provider interaction. 
+
+## References
+
+- [OpenAI Reasoning guide](https://developers.openai.com/api/docs/guides/reasoning)
+- [OpenAI Chat Completions API reference](https://developers.openai.com/api/reference/resources/chat/subresources/completions/methods/create)
+- [Anthropic Extended Thinking](https://platform.claude.com/docs/en/build-with-claude/effort)
+- [OpenRouter Reasoning Tokens](https://openrouter.ai/docs/guides/best-practices/reasoning-tokens)
diff --git a/.forge/skills/test-reasoning/scripts/test-reasoning.sh b/.forge/skills/test-reasoning/scripts/test-reasoning.sh
new file mode 100755
index 0000000000..2cad7ee712
--- /dev/null
+++ b/.forge/skills/test-reasoning/scripts/test-reasoning.sh
@@ -0,0 +1,423 @@
+#!/usr/bin/env bash
+# scripts/test-reasoning.sh
+#
+# Validates that reasoning parameters are correctly serialized for each
+# provider across all supported effort levels.
+#
+# Usage: ./scripts/test-reasoning.sh
+
+set -uo pipefail
+
+# ─── colors ───────────────────────────────────────────────────────────────────
+
+BOLD='\033[1m'
+RESET='\033[0m'
+GREEN='\033[32m'
+RED='\033[31m'
+YELLOW='\033[33m'
+CYAN='\033[36m'
+DIM='\033[2m'
+
+# ─── state ────────────────────────────────────────────────────────────────────
+
+PASS=0
+FAIL=0
+SKIP=0
+BINARY="target/debug/forge"
+WORK_DIR="$(mktemp -d)" || { echo "error: mktemp -d failed" >&2; exit 1; } # no `set -e`: abort explicitly
+SEQ=0
+RESULT_FILES=()
+CURRENT_RF=""
+
+cleanup() { rm -rf "$WORK_DIR"; }
+trap cleanup EXIT
+
+# ─── output helpers ───────────────────────────────────────────────────────────
+# Each helper writes a tagged line to stdout. Within a background subshell,
+# stdout is redirected to a per-job result file; the main process reads it back
+# after wait to tally counts and emit colour output in the original order.
+
+log_header() { printf "HEADER\t%s\n" "$1"; }
+log_pass() { printf "PASS\t%s\n" "$1"; }
+log_fail() { printf "FAIL\t%s\n" "$1"; }
+log_skip() { printf "SKIP\t%s\n" "$1"; }
+
+# ─── json helpers ─────────────────────────────────────────────────────────────
+
+# json_get FILE DOTTED_PATH
+# Prints the JSON value found at DOTTED_PATH (keys joined by "."), or "null" if
+# any key along the path is absent or null. Only the first JSON value in FILE is
+# parsed, so single-document JSON and NDJSON both work, even when objects share
+# a line. FILE is read as UTF-8 so the result does not depend on the locale.
+json_get() {
+  python3 - "$1" "$2" <<'PY'
+import json, sys
+with open(sys.argv[1], encoding="utf-8") as f:
+    raw = f.read().strip()
+# raw_decode stops after the first complete JSON value regardless of trailing
+# content (extra objects, newlines, null bytes, etc.).
+decoder = json.JSONDecoder()
+d, _ = decoder.raw_decode(raw)
+keys = sys.argv[2].split('.')
+v = d
+for k in keys:
+    v = v.get(k) if isinstance(v, dict) else None
+    if v is None:
+        break
+print(json.dumps(v))
+PY
+}
+
+# assert_field