diff --git a/.agents/skills/open-collider-brainstorm/SKILL.md b/.agents/skills/open-collider-brainstorm/SKILL.md new file mode 100644 index 0000000..ab9f9b8 --- /dev/null +++ b/.agents/skills/open-collider-brainstorm/SKILL.md @@ -0,0 +1,164 @@ +--- +name: open-collider-brainstorm +description: "Codex equivalent of Open Collider's Claude Code /brainstorm command. Use when the user asks to run, continue, resume, curate, flag, inspect, or close an Open Collider brainstorm; when migrating Claude Code brainstorm workflows to Codex; or when a configured project uses llm_provider: codex/openai/anthropic and needs the complete domains, ideas, scoring, curation, and love/like/trash loop. For project creation or /collider_setup, use $open-collider-setup instead." +--- + +# Open Collider Brainstorm + +## Command Equivalent + +This is the Codex equivalent of Claude Code's `/brainstorm` command. + +If the user asks to create or configure a project, use `$open-collider-setup` +first. Do not run brainstorm until setup is complete and the user asks to +continue. + +## Core rule + +Do not stop after `BrainstormOrchestrator.run_iteration()`. A complete Open Collider brainstorm includes: + +1. domain generation, +2. idea generation, +3. scoring and thresholding, +4. Codex curation into `curated_ideas.json` and `insights_without_collision.json`, +5. user flags `love / like / trash`, +6. `apply_flags()` and regenerated reports. + +If the user only asks for a raw API smoke test, say explicitly that curation is skipped. + +## Repository context + +Run commands from the Open Collider repository root. Prefer `.venv/bin/python` when it exists. Always insert the repo `src/` path before importing local modules. + +Helpful scripts bundled with this skill: + +- `scripts/run_iteration.py`: run one API-mode iteration and print JSON. +- `scripts/apply_flags_from_text.py`: parse `love 1,3 — like 2 — trash the rest`, map display numbers to idea IDs, call `apply_flags()`, and regenerate reports. 
+ +Use reference details only when needed: + +- `references/curation.md`: exact curation criteria and file shapes. +- `references/terminal-usage.md`: terminal commands for installing and exercising the skill. + +## Workflow + +If the user asks to create a new Open Collider project, hand off to +`$open-collider-setup` first. + +### 1. Orient + +List `projects/` excluding `_template`. If there is one project, use it. If there are several and the user did not specify one, ask which project to run. + +Inspect `project_config.yaml`. For Codex-only runs, the project should contain: + +```yaml +llm_backend: api +llm_provider: codex +domain_model: default +generation_model: default +scoring_model: default +max_concurrent: 1 +max_concurrent_scoring: 1 +``` + +If the config is not Codex-ready and the user asked for Codex-only, edit the project config before running. + +### 2. Run one iteration + +Use the bundled script or equivalent Python: + +```bash +python /path/to/open-collider-brainstorm/scripts/run_iteration.py projects/my_project +``` + +If resuming a specific brainstorm: + +```bash +python /path/to/open-collider-brainstorm/scripts/run_iteration.py projects/my_project --brainstorm-id brainstorm_001 +``` + +If network or home-directory access is required for `codex exec`, request escalation. Do not work around a rejected escalation. + +### 2b. Curate an existing raw iteration + +If the user asks to curate an existing run, or if `iter_NNN/scored_ideas.json` exists but `curated_ideas.json` is missing, do not rerun generation. Start at step 3 using that existing iteration. + +Useful prompt: + +```text +Use $open-collider-brainstorm to curate existing brainstorm_001 iteration 1 for projects/my_project. Do not rerun generation. +``` + +### 3. Curate immediately + +Read all ideas from `iter_NNN/scored_ideas.json`, both retained and non-retained. Read `brief_validated.json` from the project root. + +Select the best 10-20 ideas with the criteria in `references/curation.md`. 
Be selective: a smaller strong set is better than a long mediocre set. + +Write: + +- `iter_NNN/curated_ideas.json` +- `iter_NNN/insights_without_collision.json` + +Then call: + +```python +from open_collider.skill_interface import mark_curated, generate_report + +mark_curated(project_dir) +generate_report(project_dir) +``` + +### 4. Display and ask for flags + +Display every curated idea and every insight without rewriting the `text` field. Use continuous numbering across both lists, then save the same mapping to: + +```text +iter_NNN/numbering_map.json +``` + +Mapping shape: + +```json +[ + {"number": 1, "idea_id": "...", "kind": "curated"}, + {"number": 2, "idea_id": "...", "kind": "insight"} +] +``` + +Ask the user: + +```text +Flag each idea: love (want more like this), like (interesting), or trash (not useful). +Format: love 1,3,7 - like 2,5 - trash the rest +``` + +Stop and wait for the user's flags. + +### 5. Apply flags + +When the user provides flags, parse them with: + +```bash +python /path/to/open-collider-brainstorm/scripts/apply_flags_from_text.py projects/my_project 1 "love 1,3 - like 2 - trash the rest" +``` + +This calls `apply_flags(project_dir, iteration, flags)` and regenerates reports. + +### 6. Continue or close + +Ask whether the user wants: + +- next iteration, +- brief revision, +- done. + +If done, call `generate_brainstorm_report(project_dir)` and point the user to `REPORT.md`. + +## Codex-specific behavior + +For Codex-only projects, treat `default` as the safest model value. It lets the local Codex CLI choose the model supported by the user's account. + +Do not use Anthropic or OpenAI API keys unless the user explicitly asks. If the config says `llm_provider: codex`, all unprefixed model names must be Codex-compatible. + +`projects/*` may be gitignored in this repository. Mention that outputs can exist without appearing in `git status`. 
diff --git a/.agents/skills/open-collider-brainstorm/agents/openai.yaml b/.agents/skills/open-collider-brainstorm/agents/openai.yaml new file mode 100644 index 0000000..838c0ae --- /dev/null +++ b/.agents/skills/open-collider-brainstorm/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "Open Collider Brainstorm" + short_description: "Run and curate Open Collider brainstorms" + default_prompt: "Use $open-collider-brainstorm to run a full Open Collider brainstorm for a configured project, including post-run curation." diff --git a/.agents/skills/open-collider-brainstorm/references/curation.md b/.agents/skills/open-collider-brainstorm/references/curation.md new file mode 100644 index 0000000..d9732cd --- /dev/null +++ b/.agents/skills/open-collider-brainstorm/references/curation.md @@ -0,0 +1,65 @@ +# Open Collider curation + +Use this after `run_iteration()` creates `scored_ideas.json`. + +## Inputs + +- `projects/<project>/brief_validated.json` +- `projects/<project>/brainstorms/<brainstorm_id>/iter_NNN/scored_ideas.json` + +Read all scored ideas, including non-retained ideas. Do not trust score alone. + +## Pass 1: collision ideas + +For each idea, keep it only if all filters pass: + +1. Real collision: the distant domain mechanism changes the idea structurally. +2. Verifiable: factual claims are named or checkable; if the idea makes important external claims, verify them. +3. Non-trivial: the idea would not appear from a vanilla prompt. +4. Project voice: the idea fits the brief, audience, taste, and constraints. + +Deduplicate aggressively. If two ideas express the same mechanism, keep the stronger one. + +Write `curated_ideas.json`: + +```json +[ + { + "rank": 1, + "idea_id": "...", + "text": "full original idea text", + "combo": "...", + "score": 4.65, + "has_collision": true, + "why_selected": "One sentence.", + "source_note": "What is verifiable, or why no external claim needs verification.", + "challenge": "Strongest objection." 
+ } +] +``` + +## Pass 2: insights without collision + +Keep ideas that fail only the real-collision test but are still strong, non-trivial, and on-brief. + +Write `insights_without_collision.json`: + +```json +[ + { + "rank": 1, + "idea_id": "...", + "text": "full original idea text", + "combo": "...", + "score": 4.35, + "has_collision": false, + "why_kept": "One sentence." + } +] +``` + +## Display requirements + +After writing curation files, display all curated items with exact `text`. Do not summarize candidate names or rewrite territory text. + +Use continuous display numbers across both files and save `numbering_map.json` so flags can be applied deterministically later. diff --git a/.agents/skills/open-collider-brainstorm/references/terminal-usage.md b/.agents/skills/open-collider-brainstorm/references/terminal-usage.md new file mode 100644 index 0000000..44bf42b --- /dev/null +++ b/.agents/skills/open-collider-brainstorm/references/terminal-usage.md @@ -0,0 +1,42 @@ +# Terminal usage + +Install the skill for Codex discovery: + +```bash +mkdir -p ~/.codex/skills +cp -R .agents/skills/open-collider-setup ~/.codex/skills/ +cp -R .agents/skills/open-collider-brainstorm ~/.codex/skills/ +``` + +Run the Codex equivalent of `/collider_setup` from the Open Collider repo: + +```bash +codex 'Use $open-collider-setup to create a new Open Collider project configured for Codex-only API mode.' +``` + +Run the Codex equivalent of `/brainstorm`: + +```bash +codex 'Use $open-collider-brainstorm to run a complete brainstorm for projects/namerkit_pulsed_naming with Codex-only API mode. Do the post-run curation and ask me for love/like/trash flags.' 
def parse_number_list(text: str) -> set[int]:
    """Extract display numbers from a comma/space separated list.

    Accepts single numbers (``"1,3 7"``) and inclusive ranges written with or
    without spaces around the hyphen (``"2-4"``, ``"2 - 4"``); reversed ranges
    such as ``"5-3"`` are normalised. Tokens that are not a number or a range
    (``"and"``, ``"3a"``, ``"1-3-5"``) are ignored so free-form phrasing like
    ``"1 and 4"`` keeps working.

    Args:
        text: The part of a flag spec that follows a label such as ``love``.

    Returns:
        The set of all numbers named in ``text``; empty when none are found.
    """
    # Match on the whole text instead of splitting on spaces first: splitting
    # would break "1 - 3" into the unrelated tokens "1", "-", "3".
    # The lookarounds require each token to be delimited by a comma, a space,
    # or the string boundary, so "3a" or "x1" are not partially matched.
    # ``\d`` only matches decimal digits, all of which ``int()`` accepts
    # (``str.isdigit()`` also accepts e.g. "²", which ``int()`` rejects).
    token = r"(?<![^, ])(\d+)(?:\s*-\s*(\d+))?(?![^, ])"
    numbers: set[int] = set()
    for match in re.finditer(token, text):
        first = int(match.group(1))
        if match.group(2) is None:
            numbers.add(first)
            continue
        last = int(match.group(2))
        numbers.update(range(min(first, last), max(first, last) + 1))
    return numbers
def load_mapping(iter_dir: Path) -> list[dict]:
    """Return the display-number to idea-id mapping for one iteration.

    An existing ``numbering_map.json`` is returned as-is. Otherwise the mapping
    is rebuilt from ``curated_ideas.json`` (required) followed by
    ``insights_without_collision.json`` (optional), each ordered by ``rank``,
    numbered continuously from 1, written to ``numbering_map.json``, and
    returned.
    """
    map_file = iter_dir / "numbering_map.json"
    if map_file.is_file():
        return json.loads(map_file.read_text(encoding="utf-8"))

    def read_ranked(filename: str, required: bool) -> list[dict]:
        # Entries without a rank sort after every ranked entry.
        source = iter_dir / filename
        if not required and not source.is_file():
            return []
        entries = json.loads(source.read_text(encoding="utf-8"))
        return sorted(entries, key=lambda entry: entry.get("rank", 999999))

    # Curated ideas always come first so numbering runs across both lists.
    ordered = [(entry, "curated") for entry in read_ranked("curated_ideas.json", True)]
    ordered += [
        (entry, "insight")
        for entry in read_ranked("insights_without_collision.json", False)
    ]

    mapping = [
        {"number": position, "idea_id": entry["idea_id"], "kind": kind}
        for position, (entry, kind) in enumerate(ordered, start=1)
    ]
    map_file.write_text(json.dumps(mapping, ensure_ascii=False, indent=2), encoding="utf-8")
    return mapping
def main() -> int:
    """Run one API-mode brainstorm iteration and print its result as JSON.

    Command line: ``run_iteration.py PROJECT [--brainstorm-id ID]``. The
    ``src`` directory under the current working directory is put first on
    ``sys.path`` before ``open_collider`` is imported, so the script is meant
    to be launched from the repository root.

    Returns:
        ``0`` once the iteration result has been printed.
    """
    cli = argparse.ArgumentParser(description="Run one Open Collider API-mode iteration.")
    cli.add_argument("project", help="Project directory, for example projects/my_project")
    cli.add_argument("--brainstorm-id", help="Existing brainstorm id, for example brainstorm_001")
    options = cli.parse_args()

    # Make the in-repo package importable without requiring an install.
    source_dir = Path.cwd() / "src"
    sys.path.insert(0, str(source_dir))

    # Imported late on purpose: the path tweak above must happen first.
    from open_collider.brainstorm import BrainstormOrchestrator

    orchestrator = BrainstormOrchestrator(
        Path(options.project),
        brainstorm_id=options.brainstorm_id,
    )
    outcome = orchestrator.run_iteration()
    print(json.dumps(outcome, ensure_ascii=False, indent=2))
    return 0
/dev/null +++ b/.agents/skills/open-collider-setup/SKILL.md @@ -0,0 +1,199 @@ +--- +name: open-collider-setup +description: "Codex equivalent of Open Collider's Claude Code /collider_setup command. Use when the user wants to create or configure a new Open Collider project, build a project brief, choose Codex/Anthropic/OpenAI Responses-compatible API mode, add reference texts, configure scoring axes, or prepare a project so it is ready for $open-collider-brainstorm." +--- + +# Open Collider Setup + +You are a project setup assistant for the Open Collider pipeline. Create a new +project by interviewing the user, then write the required project files. + +This is the Codex equivalent of Claude Code's `/collider_setup`. + +## Outputs + +Create or update: + +- `projects/<project>/brief_validated.json` +- `projects/<project>/project_config.yaml` +- `projects/<project>/input_bank.yaml` +- `projects/<project>/texts/T01.txt`, `T02.txt`, etc. +- `projects/<project>/prompts/idea_generation.md` +- `projects/<project>/prompts/judge.md` + +Run commands from the Open Collider repository root. + +## 1. Create Project + +Ask for a project name in slug format: lowercase, underscores. + +Then copy the template and create a material folder: + +```bash +cp -r projects/_template projects/<project> +mkdir -p projects/<project>/material +``` + +Ask whether the user has reference material: + +> Do you have any reference material that could help me understand the project? +> Drop files into `projects/<project>/material/` and tell me when done, or say +> `no material`. + +If material exists, read all files in `material/` before building the brief. + +## 2. Build Brief + +The brief defines the project's semantic field. Ask one question at a time and +wait for each answer. Challenge vague answers. + +1. What is the ideation problem? + Ask: "Describe your ideation problem in 2-3 sentences. What kind of ideas are you looking for? What will you do with them?" +2. What does a good idea look like structurally? + Ask for qualities, not topics. +3. Who is this for? 
+ Ask for psychographics, prior attempts, allergies, and what the audience wants to understand. +4. What is off-limits? + These become `forbidden_topics`. +5. What output format should ideas take? + Ask for format, length, structure, and tone. + +Write `brief_validated.json` as a JSON object. The structure is flexible, but it +must be a JSON object. Show the full JSON to the user for validation. + +## 3. Configure Provider + +Ask how API mode should make LLM calls: + +- **Codex CLI**: no Anthropic/OpenAI API key; uses local `codex exec`. +- **Anthropic API**: original default API mode; requires `ANTHROPIC_API_KEY`. +- **OpenAI Responses-compatible API**: OpenAI or a local server that exposes the Responses API; requires `OPENAI_API_KEY` and may use `OPENAI_BASE_URL`. + +For Codex-only projects, write: + +```yaml +llm_backend: api +llm_provider: codex +domain_model: default +generation_model: default +scoring_model: default +max_concurrent: 1 +max_concurrent_scoring: 1 +``` + +For OpenAI Responses-compatible projects, ask for model names and remind the user: + +```bash +export OPENAI_BASE_URL=http://127.0.0.1:8000/v1 +export OPENAI_API_KEY=local-token +``` + +Then write: + +```yaml +llm_backend: api +llm_provider: openai +domain_model: your-model +generation_model: your-model +scoring_model: your-model +max_concurrent: 1 +max_concurrent_scoring: 1 +``` + +For Anthropic API projects, keep or set: + +```yaml +llm_backend: api +llm_provider: anthropic +``` + +## 4. Configure Scoring + +Show the default axes: + +```yaml +judge_axes: + originality: 0.25 + resistance: 0.20 + thesis_density: 0.20 + concrete_grounding: 0.20 + cognitive_load: 0.15 +``` + +Ask whether the weights fit the use case. Only adjust weights in +`project_config.yaml`. + +Important: axis names are hardcoded in `score_parser.py`, `idea_scorer.py`, and +`judge.md`. Changing names requires code changes. + +Also write the user's `output_format` into `project_config.yaml`. + +## 5. 
Set Up Reference Texts + +Reference texts are one side of every collision. Prefer rich, specific, +reasoning-heavy texts. + +Good inputs: + +- transcripts of talks or podcasts, +- blog posts or articles with a strong thesis, +- research notes with original insights, +- substantive notes with examples and reasoning. + +Weak inputs: + +- marketing copy, +- short summaries, +- lists, +- landing-page text. + +Offer sources in this order: + +1. extract rich passages from `material/`, if present; +2. web search for public content, if the user wants it; +3. user-provided pasted text or files. + +For each text: + +1. Save as `projects/<project>/texts/T01.txt`, `T02.txt`, etc. +2. Add it to `input_bank.yaml` using paths like `texts/T01.txt`. +3. Propose per-text `forbidden_topics` and ask the user to validate. + +## 6. Customize Prompts + +Read `projects/<project>/prompts/idea_generation.md` and `judge.md`. + +For `idea_generation.md`: + +- update the role description if needed; +- align style with the requested output format; +- keep parser-compatible headers: `## Idea N`, `## Concept N`, or `## N`. + +For `judge.md`: + +- add high-value and low-value calibration examples; +- keep the hardcoded five-axis scoring table compatible with the parser. + +Do not over-customize prompts. Change only what improves reliability or fit. + +## 7. Validate Setup + +Report: + +- project path, +- one-sentence brief, +- provider and models, +- scoring weights, +- reference text count, +- prompt changes, +- whether it is ready for `$open-collider-brainstorm`. + +Stop after setup unless the user explicitly asks to run the brainstorm now. + +## Guidelines + +- Ask one question at a time. +- Push back on vague answers. +- Show the full brief JSON before writing final setup. +- Treat the brief as the most important output. +- If the user says they want Codex only, configure `llm_provider: codex` and `default` models. 
diff --git a/.agents/skills/open-collider-setup/agents/openai.yaml b/.agents/skills/open-collider-setup/agents/openai.yaml new file mode 100644 index 0000000..2723d09 --- /dev/null +++ b/.agents/skills/open-collider-setup/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "Open Collider Setup" + short_description: "Create and configure Open Collider projects" + default_prompt: "Use $open-collider-setup to create a new Open Collider project configured for Codex-only API mode." diff --git a/.env.example b/.env.example index 5fb0b67..299f795 100644 --- a/.env.example +++ b/.env.example @@ -1,2 +1,12 @@ -# Required for API mode (not needed for skill mode) -ANTHROPIC_API_KEY= +# No .env file is required for llm_provider: codex. + +# Required only for direct Anthropic API mode. +# ANTHROPIC_API_KEY= + +# Required only for direct OpenAI API mode. +# OPENAI_API_KEY= + +# Optional for local servers that expose an OpenAI Responses-compatible API. +# OPENAI_BASE_URL=http://127.0.0.1:8000/v1 +# OPENAI_API_KEY=dsv4-local +# OPENAI_TIMEOUT=120 diff --git a/README.md b/README.md index 93d92cf..37083c3 100644 --- a/README.md +++ b/README.md @@ -29,16 +29,18 @@ pip install -e . ### API mode (fast, parallel, reliable) -Requires an Anthropic API key. Python orchestrates LLM calls in parallel. +Python orchestrates LLM calls in parallel. Anthropic is the default provider; +OpenAI Responses API and Codex CLI are also supported. ```bash git clone https://github.com/CL-ML/open-collider.git cd open-collider -pip install -e ".[api]" -cp .env.example .env -# Edit .env with your ANTHROPIC_API_KEY +pip install -e . ``` +Codex-only mode does not require Anthropic or OpenAI credentials. Direct API +providers require `pip install -e ".[api]"` plus the matching key in `.env`. + ### Then in Claude Code: Run these slash commands successively: @@ -53,12 +55,39 @@ On first `/brainstorm`, you'll be asked to choose API or Skill mode. 
The choice | | **API mode** | **Skill mode** | |----------------|---------------------------------|--------------------------------------| | Speed | ~10 min/iteration (parallel) | ~25 min/iteration (sequential) | -| Cost | ~$2–3/iteration | Free (Max subscription covers it) | +| Cost | Provider-dependent | Free (Max subscription covers it) | | Reliability | Rock-solid (Python orchestration) | Can be flaky (subagent coordination) | -| Requirements | Anthropic API key | Claude Code Max subscription | +| Requirements | Anthropic/OpenAI key, or Codex CLI | Claude Code Max subscription | **What you'll see on a first run.** `/collider_setup` produces a project folder with your brief, reference texts, and scoring axes. `/brainstorm` then prints the domain bank as it generates, streams idea batches per collision, scores them on your axes, and presents curated ideas inline for love/like/trash. A first iteration ends with a `REPORT.md` you can read or share, and a structured `iter_001/` folder for inspection. +### Codex / agent-compatible workflow + +Open Collider also ships two agent-compatible Codex skills: + +```text +.agents/skills/open-collider-setup/ +.agents/skills/open-collider-brainstorm/ +``` + +They give Codex users equivalents for both Claude Code slash commands: + +- `$open-collider-setup`: interview the user, create the project, write the brief, + configure provider/scoring, index reference texts, and prepare prompts. +- `$open-collider-brainstorm`: run the Python iteration, curate scored ideas, display numbered + candidates, collect love/like/trash flags, and regenerate reports. + +To install and use it with Codex: + +```bash +mkdir -p ~/.codex/skills +cp -R .agents/skills/open-collider-setup ~/.codex/skills/ +cp -R .agents/skills/open-collider-brainstorm ~/.codex/skills/ + +codex 'Use $open-collider-setup to create a new Open Collider project configured for Codex-only API mode.' 
+codex 'Use $open-collider-brainstorm to run a complete brainstorm for projects/my_project. Do the post-run curation and ask me for love/like/trash flags.' +``` + --- ## The problem @@ -203,9 +232,62 @@ Python handles prompt building and response parsing. The LLM calls happen either 2. Generate ideas (parallel, Sonnet, 4 concurrent) 3. Score ideas (parallel batches, Sonnet, 3 concurrent) 4. Apply threshold + finalize - → Claude Code curates inline + displays + collects flags + → Claude Code command or Codex skill curates inline + displays + collects flags ``` +API mode defaults to Anthropic. If a project sets `llm_provider: openai` +or `llm_provider: codex`, its unprefixed model names must match that provider; +otherwise Open Collider fails early with a configuration error. You can also +mix providers per stage by prefixing individual model names: + +Codex-only configuration: + +```yaml +llm_backend: api +llm_provider: codex +domain_model: default +generation_model: default +scoring_model: default +max_concurrent: 1 +max_concurrent_scoring: 1 +llm_timeout: 120 +domain_max_tokens: 4000 +generation_max_tokens: 1200 +scoring_max_tokens: 2000 +``` + +OpenAI Responses-compatible local server configuration, for example with `ds4-server`: + +```bash +export OPENAI_BASE_URL=http://127.0.0.1:8000/v1 +export OPENAI_API_KEY=dsv4-local +``` + +```yaml +llm_backend: api +llm_provider: openai +domain_model: deepseek-v4-flash +generation_model: deepseek-v4-flash +scoring_model: deepseek-v4-flash +max_concurrent: 1 +max_concurrent_scoring: 1 +``` + +Mixed-provider configuration: + +```yaml +llm_provider: anthropic +domain_model: anthropic:claude-opus-4-20250514 +generation_model: openai:gpt-4.1 +scoring_model: codex:default +``` + +The `codex:` provider is experimental. It shells out to `codex exec` for each +LLM call, so it is useful for local agent experiments but slower and less +deterministic than direct API providers. 
Use `default` to let the local Codex +CLI choose the model supported by your account; specific model names are +provider/account dependent. + **Skill mode:** ``` /brainstorm → Claude Code orchestrates: diff --git a/projects/_template/project_config.yaml b/projects/_template/project_config.yaml index 8cc9c6a..440da99 100644 --- a/projects/_template/project_config.yaml +++ b/projects/_template/project_config.yaml @@ -7,6 +7,33 @@ # concrete_grounding: 0.20 # cognitive_load: 0.15 +# API provider used when model names are not prefixed. +# Supported values: anthropic, openai, codex. +# llm_provider: anthropic +# +# If llm_provider is openai or codex, also configure matching unprefixed +# models, or prefix Claude models with anthropic: when mixing providers. +# +# Codex-only mode, no API keys required: +# llm_backend: api +# llm_provider: codex +# domain_model: default +# generation_model: default +# scoring_model: default +# max_concurrent: 1 +# max_concurrent_scoring: 1 +# +# Local servers that expose an OpenAI Responses-compatible API can benefit from smaller limits: +# llm_timeout: 120 +# domain_max_tokens: 4000 +# generation_max_tokens: 1200 +# scoring_max_tokens: 2000 +# +# You can also mix providers per stage with model prefixes: +# domain_model: anthropic:claude-opus-4-20250514 +# generation_model: openai:gpt-4.1 +# scoring_model: codex:default + # Output format for generated ideas output_format: > Each idea: 2-4 sentences. Free format. 
diff --git a/src/open_collider/brainstorm.py b/src/open_collider/brainstorm.py index 5df6b4e..6693dd4 100644 --- a/src/open_collider/brainstorm.py +++ b/src/open_collider/brainstorm.py @@ -7,7 +7,7 @@ from pathlib import Path from open_collider.config import load_project_config -from open_collider.llm.client import LLMClient, LLMError +from open_collider.llm.client import LLMClient, validate_provider_model_config from open_collider.phases.idea_scorer import apply_threshold from open_collider.skill_interface import ( init_iteration, @@ -34,7 +34,11 @@ def __init__(self, project_dir: Path, brainstorm_id: str | None = None) -> None: self.project_dir = project_dir self.brainstorm_id = brainstorm_id self.config = load_project_config(str(project_dir)) - self.llm = LLMClient() + validate_provider_model_config(self.config) + self.llm = LLMClient( + provider=self.config.get("llm_provider", "anthropic"), + timeout=self._optional_int_config("llm_timeout"), + ) def run_iteration(self) -> dict: """Run one brainstorm iteration: domains → ideas → scoring → finalize.""" @@ -65,7 +69,7 @@ def run_iteration(self) -> dict: model=result["model"], prompt=result["prompt"], temperature=0.5, - max_tokens=16000, + max_tokens=self._int_config("domain_max_tokens", 16000), ) yaml_str = parse_domain_response_text(response) strategy_domain_yamls[strat_name] = yaml_str @@ -186,7 +190,7 @@ async def _gen_one(combo_info: dict) -> list[dict]: model=model, prompt=prompt, temperature=0.9, - max_tokens=4000, + max_tokens=self._int_config("generation_max_tokens", 4000), ) ideas = parse_idea_response(combo_info, response) logger.info("Combo %s: %d ideas", combo_id, len(ideas)) @@ -221,7 +225,7 @@ async def _score_one(batch_info: dict) -> list[dict]: model=batch_info["model"], prompt=batch_info["prompt"], temperature=0.1, - max_tokens=8000, + max_tokens=self._int_config("scoring_max_tokens", 8000), ) scored = parse_scoring_response(batch_info, response, config) logger.info( @@ -253,3 +257,13 @@ def 
def _int_config(self, key: str, default: int) -> int:
    """Return the integer config value for ``key``, or ``default`` when unset.

    A key that is present but left blank in YAML (``domain_max_tokens:``)
    loads as ``None``. Such a value, like an empty string, is treated as
    "not configured" so the default applies instead of ``int(None)`` raising
    ``TypeError``. This is the same rule ``_optional_int_config`` uses.

    Args:
        key: Name of the project-config entry to read from ``self.config``.
        default: Value returned when the entry is missing or blank.

    Raises:
        ValueError: If the configured value cannot be converted to ``int``.
    """
    value = self.config.get(key)
    if value is None or value == "":
        return default
    return int(value)
Minimal, no multi-provider.""" +"""LLM client facade for API mode.""" from __future__ import annotations import logging import os +import shutil +import subprocess +import tempfile import time +import json +from typing import Any logger = logging.getLogger(__name__) @@ -13,7 +18,70 @@ class LLMError(Exception): """LLM call failed.""" -class LLMClient: +SUPPORTED_PROVIDERS = {"anthropic", "openai", "codex"} +MODEL_CONFIG_KEYS = ("domain_model", "generation_model", "scoring_model") + + +def resolve_provider_and_model(model: str, default_provider: str) -> tuple[str, str]: + """Resolve optional provider-prefixed model names. + + Examples: + - ``openai:gpt-4.1`` -> ("openai", "gpt-4.1") + - ``codex:gpt-5-codex`` -> ("codex", "gpt-5-codex") + - ``claude-sonnet-4`` with default ``anthropic`` -> ("anthropic", "claude-sonnet-4") + """ + default = default_provider.strip().lower() + if default not in SUPPORTED_PROVIDERS: + raise LLMError(f"Unsupported LLM provider: {default_provider}") + + model_name = model.strip() + if ":" in model_name: + prefix, _, prefixed_model = model_name.partition(":") + provider = prefix.strip().lower() + if provider not in SUPPORTED_PROVIDERS: + raise LLMError(f"Unsupported LLM provider prefix: {prefix}") + if not prefixed_model.strip(): + raise LLMError(f"Missing model name for provider prefix: {provider}") + return provider, prefixed_model.strip() + + return default, model_name + + +def validate_provider_model_config(config: dict[str, Any]) -> None: + """Fail early on provider/model combinations that would hit the wrong API.""" + provider = str(config.get("llm_provider", "anthropic")).strip().lower() + if provider not in SUPPORTED_PROVIDERS: + raise LLMError(f"Unsupported LLM provider: {provider}") + + invalid_models = [] + for key in MODEL_CONFIG_KEYS: + model = str(config.get(key, "")).strip() + if not model: + continue + resolved_provider, resolved_model = resolve_provider_and_model(model, provider) + if ( + provider != "anthropic" + and 
resolved_provider == provider + and _looks_like_anthropic_model(resolved_model) + ): + invalid_models.append(f"{key}={model!r}") + + if invalid_models: + details = ", ".join(invalid_models) + raise LLMError( + f"llm_provider {provider!r} cannot use unprefixed Anthropic model(s): {details}. " + "Prefix those models with 'anthropic:' to mix providers, or configure models for " + f"the {provider!r} provider." + ) + + +def _looks_like_anthropic_model(model: str) -> bool: + """Return true for the Claude model names used by Anthropic.""" + normalized = model.strip().lower() + return normalized.startswith(("claude-", "claude_")) + + +class AnthropicProvider: """Anthropic API client with retry on overload.""" def __init__(self) -> None: @@ -87,3 +155,258 @@ def call( raise LLMError(f"API error: {e}") raise LLMError(f"Failed after {max_retries} attempts") + + +class OpenAIProvider: + """OpenAI Responses-compatible API client. + + The endpoint can point at OpenAI or a local server exposing the Responses API via + ``OPENAI_BASE_URL``. Local servers usually still expect an Authorization + header, so set ``OPENAI_API_KEY`` to any accepted local token. 
+ """ + + def __init__( + self, + base_url: str | None = None, + timeout: int | None = None, + ) -> None: + if base_url is None: + base_url = ( + os.environ.get("OPENAI_BASE_URL") + or os.environ.get("OPENAI_API_BASE") + or "https://api.openai.com/v1" + ) + self.base_url = base_url.rstrip("/") + self.timeout = timeout or int(os.environ.get("OPENAI_TIMEOUT", "300")) + + def call( + self, + model: str, + prompt: str, + temperature: float = 0.7, + max_tokens: int = 8000, + ) -> str: + """Single OpenAI Responses-compatible API call.""" + api_key = self._get_api_key() + payload = { + "model": model, + "input": prompt, + "temperature": temperature, + "max_output_tokens": max_tokens, + "store": False, + } + url = f"{self.base_url}/responses" + + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + } + + try: + import httpx + except ImportError: + response_payload = self._post_json_urllib( + url, + headers, + payload, + self.timeout, + ) + else: + try: + with httpx.Client(timeout=self.timeout) as client: + response = client.post(url, headers=headers, json=payload) + response.raise_for_status() + except httpx.HTTPStatusError as exc: + status = exc.response.status_code + if status == 429: + raise LLMError(f"Rate limit: {exc}") + raise LLMError(f"API error: {exc}") + except httpx.HTTPError as exc: + raise LLMError(f"API error: {exc}") + response_payload = response.json() + + return self.extract_text(response_payload) + + @staticmethod + def _post_json_urllib( + url: str, + headers: dict[str, str], + payload: dict[str, Any], + timeout: int, + ) -> dict[str, Any]: + """POST JSON with the standard library when optional httpx is absent.""" + import urllib.error + import urllib.request + + data = json.dumps(payload).encode("utf-8") + request = urllib.request.Request(url, data=data, headers=headers, method="POST") + try: + with urllib.request.urlopen(request, timeout=timeout) as response: + raw = response.read().decode("utf-8") + except 
urllib.error.HTTPError as exc: + if exc.code == 429: + raise LLMError(f"Rate limit: {exc}") from exc + body = exc.read().decode("utf-8", errors="replace") + raise LLMError(f"API error: HTTP {exc.code}: {body}") from exc + except TimeoutError as exc: + raise LLMError(f"API error: timed out after {timeout}s") from exc + except urllib.error.URLError as exc: + raise LLMError(f"API error: {exc}") from exc + return json.loads(raw) + + @staticmethod + def extract_text(payload: dict[str, Any]) -> str: + """Extract assistant text from an OpenAI Responses API payload.""" + direct = payload.get("output_text") + if isinstance(direct, str): + return direct + + texts: list[str] = [] + for item in payload.get("output", []) or []: + if not isinstance(item, dict): + continue + for content in item.get("content", []) or []: + if not isinstance(content, dict): + continue + if content.get("type") in {"output_text", "text"}: + text = content.get("text") + if isinstance(text, str): + texts.append(text) + + if texts: + return "".join(texts) + raise LLMError("OpenAI response did not contain output text") + + @staticmethod + def _get_api_key() -> str: + try: + from dotenv import load_dotenv + load_dotenv() + except ImportError: + pass + api_key = os.environ.get("OPENAI_API_KEY") + if not api_key: + raise LLMError("Missing OPENAI_API_KEY in environment") + return api_key + + +class CodexExecProvider: + """Codex CLI non-interactive runner. + + This is intentionally an experimental provider: each call starts a Codex + agent process and returns its final message. + """ + + def __init__(self, timeout: int | None = None) -> None: + self.timeout = timeout or int(os.environ.get("CODEX_EXEC_TIMEOUT", "900")) + + def call( + self, + model: str, + prompt: str, + temperature: float = 0.7, + max_tokens: int = 8000, + ) -> str: + """Run ``codex exec`` and return the captured final message.""" + del temperature, max_tokens # Codex CLI owns sampling/runtime options. 
+ if shutil.which("codex") is None: + raise LLMError("Codex CLI not found in PATH") + + with tempfile.NamedTemporaryFile("w+", encoding="utf-8", delete=True) as output: + cmd = [ + "codex", + "exec", + "--ephemeral", + "--sandbox", + "read-only", + "--color", + "never", + "--output-last-message", + output.name, + ] + if model and model.strip().lower() != "default": + cmd.extend(["--model", model]) + cmd.append("-") + + try: + result = subprocess.run( + cmd, + input=prompt, + text=True, + capture_output=True, + timeout=self.timeout, + check=False, + ) + except subprocess.TimeoutExpired as exc: + raise LLMError(f"Codex exec timed out after {self.timeout}s") from exc + + output.seek(0) + final_message = output.read().strip() + if result.returncode != 0: + stderr = (result.stderr or "").strip() + raise LLMError(f"Codex exec failed: {stderr or result.returncode}") + if final_message: + return final_message + stdout = (result.stdout or "").strip() + if stdout: + return stdout + raise LLMError("Codex exec produced no output") + + +class LLMClient: + """Provider-dispatching API client used by the Python orchestrator.""" + + def __init__( + self, + provider: str = "anthropic", + timeout: int | None = None, + ) -> None: + provider_name = provider.strip().lower() + if provider_name not in SUPPORTED_PROVIDERS: + raise LLMError(f"Unsupported LLM provider: {provider}") + self.default_provider = provider_name + self.timeout = timeout + self._providers: dict[str, AnthropicProvider | OpenAIProvider | CodexExecProvider] = {} + # Backwards-compatible attribute for older tests and callers. 
+ self._client = None + + def _get_client(self): + """Return the Anthropic SDK client for backwards compatibility.""" + provider = self._get_provider("anthropic") + if not isinstance(provider, AnthropicProvider): + raise LLMError("Anthropic provider unavailable") + self._client = provider._get_client() + return self._client + + def call( + self, + model: str, + prompt: str, + temperature: float = 0.7, + max_tokens: int = 8000, + ) -> str: + provider_name, resolved_model = resolve_provider_and_model( + model, + self.default_provider, + ) + provider = self._get_provider(provider_name) + return provider.call( + model=resolved_model, + prompt=prompt, + temperature=temperature, + max_tokens=max_tokens, + ) + + def _get_provider(self, provider_name: str) -> AnthropicProvider | OpenAIProvider | CodexExecProvider: + if provider_name not in self._providers: + if provider_name == "anthropic": + self._providers[provider_name] = AnthropicProvider() + elif provider_name == "openai": + self._providers[provider_name] = OpenAIProvider( + timeout=self.timeout, + ) + elif provider_name == "codex": + self._providers[provider_name] = CodexExecProvider(timeout=self.timeout) + else: + raise LLMError(f"Unsupported LLM provider: {provider_name}") + return self._providers[provider_name] diff --git a/src/open_collider/scoring/data_loader.py b/src/open_collider/scoring/data_loader.py index 80c1111..f6fd50d 100644 --- a/src/open_collider/scoring/data_loader.py +++ b/src/open_collider/scoring/data_loader.py @@ -111,7 +111,7 @@ def _load_domain_sets(self) -> dict[str, DomainSetMeta]: else: bank_path = self._project_dir / "domain_bank.yaml" if not bank_path.is_file(): - raise DataLoadError(f"domain_bank.yaml not found") + raise DataLoadError("domain_bank.yaml not found") raw = yaml.safe_load(bank_path.read_text(encoding="utf-8")) or {} domain_sets = {} for sid, sdata in (raw.get("sets") or {}).items(): diff --git a/src/open_collider/scoring/score_parser.py 
b/src/open_collider/scoring/score_parser.py index 71b5f3c..c7d831d 100644 --- a/src/open_collider/scoring/score_parser.py +++ b/src/open_collider/scoring/score_parser.py @@ -31,14 +31,15 @@ class AxisScores: # | 1 | 4 | 5 | 3 | 4 | 5 | **4.25** | # | 1 | 4/5 | 5/5 | 3/5 | 4/5 | 5/5 | 4.25 | # | 1 | **4**/5 | 5 | 3 | 4 | 5 | 4.25 | +# | 1 | 4/5 | 5/5 | 3/5 | 4/5 | 5/5 | **4.25 / 5** | SCORING_ROW_PATTERN = re.compile( r"\|\s*(\d+)\s*\|" # idea number - r"\s*\*{0,2}([\d.]+)\*{0,2}(?:/5)?\s*\|" # originality - r"\s*\*{0,2}([\d.]+)\*{0,2}(?:/5)?\s*\|" # resistance - r"\s*\*{0,2}([\d.]+)\*{0,2}(?:/5)?\s*\|" # thesis_density - r"\s*\*{0,2}([\d.]+)\*{0,2}(?:/5)?\s*\|" # concrete_grounding - r"\s*\*{0,2}([\d.]+)\*{0,2}(?:/5)?\s*\|" # cognitive_load - r"\s*\*{0,2}([\d.]+)\*{0,2}(?:/5)?\s*\|" # score_aggregate + r"\s*\*{0,2}([\d.]+)\*{0,2}(?:\s*/\s*5)?\*{0,2}\s*\|" # originality + r"\s*\*{0,2}([\d.]+)\*{0,2}(?:\s*/\s*5)?\*{0,2}\s*\|" # resistance + r"\s*\*{0,2}([\d.]+)\*{0,2}(?:\s*/\s*5)?\*{0,2}\s*\|" # thesis_density + r"\s*\*{0,2}([\d.]+)\*{0,2}(?:\s*/\s*5)?\*{0,2}\s*\|" # concrete_grounding + r"\s*\*{0,2}([\d.]+)\*{0,2}(?:\s*/\s*5)?\*{0,2}\s*\|" # cognitive_load + r"\s*\*{0,2}([\d.]+)\*{0,2}(?:\s*/\s*5)?\*{0,2}\s*\|" # score_aggregate ) # Pattern to extract judge_note (main strength) from the ✓ line diff --git a/tests/test_api_mode.py b/tests/test_api_mode.py index 535495c..a6a8d94 100644 --- a/tests/test_api_mode.py +++ b/tests/test_api_mode.py @@ -1,6 +1,7 @@ """Tests for API mode — LLM client and orchestrator with mocked Anthropic calls.""" import json +import subprocess import shutil from pathlib import Path from unittest.mock import MagicMock, patch @@ -26,7 +27,7 @@ def _create_project(tmp_path: Path) -> Path: def test_llm_client_import(): """LLMClient imports without anthropic installed.""" - from open_collider.llm.client import LLMClient, LLMError + from open_collider.llm.client import LLMClient client = LLMClient() assert client._client is None @@ -42,6 +43,160 
@@ def test_llm_client_missing_key(): client._get_client() +def test_llm_client_resolves_provider_prefix(): + """Model names can select a provider without changing project-wide config.""" + from open_collider.llm.client import LLMError, resolve_provider_and_model + + assert resolve_provider_and_model("openai:gpt-4.1", "anthropic") == ("openai", "gpt-4.1") + assert resolve_provider_and_model("codex:gpt-5-codex", "anthropic") == ( + "codex", + "gpt-5-codex", + ) + assert resolve_provider_and_model("anthropic:claude-sonnet-4", "openai") == ( + "anthropic", + "claude-sonnet-4", + ) + assert resolve_provider_and_model("claude-sonnet-4", "anthropic") == ( + "anthropic", + "claude-sonnet-4", + ) + with pytest.raises(LLMError, match="Unsupported LLM provider prefix"): + resolve_provider_and_model("opena:gpt-4.1", "anthropic") + with pytest.raises(LLMError, match="Missing model name"): + resolve_provider_and_model("openai:", "anthropic") + + +def test_openai_response_text_extraction(): + """OpenAI Responses API payloads are converted to the plain text contract.""" + from open_collider.llm.client import OpenAIProvider + + payload = { + "output": [ + { + "type": "message", + "content": [ + {"type": "output_text", "text": "First"}, + {"type": "output_text", "text": " second"}, + ], + } + ] + } + + assert OpenAIProvider.extract_text(payload) == "First second" + + +def test_openai_provider_uses_env_base_url(monkeypatch): + """Responses-compatible local servers can be selected with OPENAI_BASE_URL.""" + from open_collider.llm.client import OpenAIProvider + + monkeypatch.setenv("OPENAI_BASE_URL", "http://127.0.0.1:8000/v1/") + + provider = OpenAIProvider() + + assert provider.base_url == "http://127.0.0.1:8000/v1" + + +def test_openai_provider_uses_configurable_timeout(monkeypatch): + """Responses-compatible local servers can use shorter request timeouts.""" + from open_collider.llm.client import LLMClient, OpenAIProvider + + monkeypatch.setenv("OPENAI_TIMEOUT", "42") + + assert 
OpenAIProvider().timeout == 42 + provider = LLMClient(provider="openai", timeout=7)._get_provider("openai") + assert isinstance(provider, OpenAIProvider) + assert provider.timeout == 7 + + +def test_openai_provider_always_uses_responses_api(monkeypatch): + """OpenAI calls use the Responses API shape.""" + from open_collider.llm.client import OpenAIProvider + + captured = {} + + def fake_post(url, headers, payload, timeout): + captured["url"] = url + captured["payload"] = payload + return {"output_text": "ok"} + + monkeypatch.setenv("OPENAI_API_KEY", "test") + monkeypatch.setattr(OpenAIProvider, "_post_json_urllib", staticmethod(fake_post)) + + provider = OpenAIProvider(base_url="http://localhost:8000/v1") + + assert provider.call("test-model", "Prompt", max_tokens=123) == "ok" + assert captured["url"] == "http://localhost:8000/v1/responses" + assert captured["payload"]["input"] == "Prompt" + assert captured["payload"]["max_output_tokens"] == 123 + assert "messages" not in captured["payload"] + + +def test_openai_urllib_timeout_is_wrapped(monkeypatch): + """The stdlib fallback reports timeouts through the LLM error contract.""" + from open_collider.llm.client import LLMError, OpenAIProvider + + def timeout(*args, **kwargs): + raise TimeoutError("timed out") + + monkeypatch.setattr("urllib.request.urlopen", timeout) + + with pytest.raises(LLMError, match="timed out after 3s"): + OpenAIProvider._post_json_urllib( + "http://127.0.0.1:8000/v1/responses", + {"Content-Type": "application/json"}, + {"model": "test"}, + 3, + ) + + +def test_codex_exec_provider_uses_output_last_message(monkeypatch): + """Codex CLI provider returns the captured final agent message.""" + from open_collider.llm.client import CodexExecProvider + + captured = {} + + def fake_run(cmd, input, text, capture_output, timeout, check): + captured["cmd"] = cmd + captured["input"] = input + out_path = Path(cmd[cmd.index("--output-last-message") + 1]) + out_path.write_text("Codex answer", encoding="utf-8") 
+ return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + + monkeypatch.setattr("open_collider.llm.client.shutil.which", lambda name: "/usr/bin/codex") + monkeypatch.setattr("open_collider.llm.client.subprocess.run", fake_run) + + provider = CodexExecProvider() + + assert provider.call("gpt-5-codex", "Generate YAML") == "Codex answer" + assert captured["input"] == "Generate YAML" + assert captured["cmd"][:2] == ["codex", "exec"] + assert "--ephemeral" in captured["cmd"] + assert "--ask-for-approval" not in captured["cmd"] + assert "--output-last-message" in captured["cmd"] + assert captured["cmd"][-1] == "-" + + +def test_codex_exec_provider_default_model_omits_model_flag(monkeypatch): + """Codex-only configs can defer model choice to the local Codex account.""" + from open_collider.llm.client import CodexExecProvider + + captured = {} + + def fake_run(cmd, input, text, capture_output, timeout, check): + captured["cmd"] = cmd + out_path = Path(cmd[cmd.index("--output-last-message") + 1]) + out_path.write_text("Codex answer", encoding="utf-8") + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + + monkeypatch.setattr("open_collider.llm.client.shutil.which", lambda name: "/usr/bin/codex") + monkeypatch.setattr("open_collider.llm.client.subprocess.run", fake_run) + + provider = CodexExecProvider() + + assert provider.call("default", "Generate YAML") == "Codex answer" + assert "--model" not in captured["cmd"] + + # ---- Orchestrator tests ---- def test_orchestrator_import(): @@ -59,6 +214,56 @@ def test_orchestrator_init(tmp_path): assert orch.config is not None +def test_orchestrator_uses_configured_llm_provider(tmp_path): + """Project config can switch API mode from Anthropic to OpenAI.""" + from open_collider.brainstorm import BrainstormOrchestrator + + project = _create_project(tmp_path) + with open(project / "project_config.yaml", "a") as f: + f.write( + '\nllm_provider: "openai"\n' + 'domain_model: "gpt-4.1"\n' + 'generation_model: 
"gpt-4.1"\n' + 'scoring_model: "gpt-4.1"\n' + ) + + orch = BrainstormOrchestrator(project) + + assert orch.llm.default_provider == "openai" + + +def test_orchestrator_supports_codex_only_config_without_api_keys(tmp_path): + """Codex-only API mode does not require Anthropic or OpenAI credentials at init.""" + from open_collider.brainstorm import BrainstormOrchestrator + + project = _create_project(tmp_path) + with open(project / "project_config.yaml", "a") as f: + f.write( + '\nllm_provider: "codex"\n' + 'domain_model: "default"\n' + 'generation_model: "default"\n' + 'scoring_model: "default"\n' + ) + + with patch.dict("os.environ", {}, clear=True): + orch = BrainstormOrchestrator(project) + + assert orch.llm.default_provider == "codex" + + +def test_orchestrator_rejects_unprefixed_claude_models_for_non_anthropic_provider(tmp_path): + """Switching provider without switching Claude defaults fails early and clearly.""" + from open_collider.brainstorm import BrainstormOrchestrator + from open_collider.llm.client import LLMError + + project = _create_project(tmp_path) + with open(project / "project_config.yaml", "a") as f: + f.write('\nllm_provider: "openai"\n') + + with pytest.raises(LLMError, match="domain_model.*claude-opus"): + BrainstormOrchestrator(project) + + def test_orchestrator_with_brainstorm_id(tmp_path): """BrainstormOrchestrator accepts a brainstorm_id.""" from open_collider.brainstorm import BrainstormOrchestrator @@ -162,6 +367,43 @@ def mock_llm_call(model, prompt, temperature=0.7, max_tokens=8000): assert "retained" in idea +def test_full_iteration_uses_configured_token_caps(tmp_path): + """Local model configs can cap each API phase independently.""" + from open_collider.brainstorm import BrainstormOrchestrator + + project = _create_project(tmp_path) + with open(project / "project_config.yaml", "a") as f: + f.write( + "\n" + "domain_max_tokens: 123\n" + "generation_max_tokens: 45\n" + "scoring_max_tokens: 67\n" + ) + + seen = [] + + def 
mock_llm_call(model, prompt, temperature=0.7, max_tokens=8000): + seen.append((temperature, max_tokens)) + if temperature == 0.5: + return f"```yaml\n{MOCK_DOMAIN_YAML}```" + if temperature == 0.1: + import re + idea_nums = re.findall(r"^(\d+)\. ", prompt, re.MULTILINE) + n = len(idea_nums) if idea_nums else 25 + return _make_scoring_response(n) + return MOCK_IDEAS_RESPONSE + + orch = BrainstormOrchestrator(project) + orch.llm = MagicMock() + orch.llm.call = mock_llm_call + + orch.run_iteration() + + assert (0.5, 123) in seen + assert (0.9, 45) in seen + assert (0.1, 67) in seen + + def test_apply_flags_mocked(tmp_path): """Flags work after a mocked iteration.""" from open_collider.brainstorm import BrainstormOrchestrator @@ -182,7 +424,7 @@ def mock_llm_call(model, prompt, temperature=0.7, max_tokens=8000): orch.llm = MagicMock() orch.llm.call = mock_llm_call - result = orch.run_iteration() + orch.run_iteration() # Get idea IDs from scored_ideas.json brainstorm_dir = project / "brainstorms" / "brainstorm_001" diff --git a/tests/test_smoke.py b/tests/test_smoke.py index a2dc4c2..cd2d94a 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1,11 +1,8 @@ """Smoke tests for Open Collider v2.""" -import json import shutil from pathlib import Path -import pytest - def _create_minimal_project(tmp_path: Path) -> Path: project = tmp_path / "test_project" @@ -35,6 +32,19 @@ def test_all_imports(): from open_collider.strategies.deepen import DeepenStrategy from open_collider.strategies.refresh import RefreshStrategy + imported = ( + list_brainstorms, start_new_brainstorm, init_iteration, + prepare_domain_prompt, parse_domain_response_text, + prepare_idea_prompts, parse_idea_response, + prepare_scoring_prompts, parse_scoring_response, + finalize_iteration, apply_flags, mark_curated, generate_report, + load_config, load_project_config, DataLoader, + parse_scoring_table, extract_judge_notes, PromptResolver, + IdeaGenerator, sample_combos, IdeaScorer, apply_threshold, 
DEFAULT_WEIGHTS, + FreshStrategy, DeepenStrategy, RefreshStrategy, + ) + assert all(imported) + def test_load_config(): from open_collider.config import load_config @@ -119,6 +129,14 @@ def test_score_parser(): assert results[0].score_aggregate == 4.25 +def test_score_parser_accepts_bold_score_with_denominator(): + from open_collider.scoring.score_parser import parse_scoring_table + content = "| 1 | 4/5 | 5/5 | 3/5 | 4/5 | 5/5 | **4.25 / 5** |" + results = parse_scoring_table(content) + assert len(results) == 1 + assert results[0].score_aggregate == 4.25 + + def test_judge_notes_bilingual(): from open_collider.scoring.score_parser import extract_judge_notes content = """