diff --git a/scripts/eval_only.py b/scripts/eval_only.py index ec6cd375..56e44ad1 100644 --- a/scripts/eval_only.py +++ b/scripts/eval_only.py @@ -28,6 +28,7 @@ configure_azure_openai, configure_claude_code_exec, configure_codex_exec, + configure_minimax_chat, set_reasoning_effort, set_target_backend, set_target_deployment, @@ -137,7 +138,7 @@ def parse_args() -> argparse.Namespace: # Legacy flat overrides p.add_argument("--env", type=str) p.add_argument("--backend", type=str, - choices=["azure_openai", "codex", "codex_exec", "claude", "claude_chat", "claude_code_exec"]) + choices=["azure_openai", "codex", "codex_exec", "claude", "claude_chat", "claude_code_exec", "minimax", "minimax_chat"]) p.add_argument("--optimizer_model", type=str) p.add_argument("--target_model", type=str) p.add_argument("--optimizer_backend", type=str) @@ -179,6 +180,12 @@ def parse_args() -> argparse.Namespace: p.add_argument("--claude_code_exec_use_sdk", type=str) p.add_argument("--claude_code_exec_effort", type=str) p.add_argument("--claude_code_exec_max_thinking_tokens", type=int) + p.add_argument("--minimax_base_url", type=str) + p.add_argument("--minimax_api_key", type=str) + p.add_argument("--minimax_model", type=str) + p.add_argument("--minimax_temperature", type=float) + p.add_argument("--minimax_max_tokens", type=int) + p.add_argument("--minimax_enable_thinking", type=_BOOL) p.add_argument("--out_root", type=str) p.add_argument("--data_path", type=str) p.add_argument("--split_mode", type=str, @@ -254,6 +261,12 @@ def main() -> None: "claude_code_exec_use_sdk": "model.claude_code_exec_use_sdk", "claude_code_exec_effort": "model.claude_code_exec_effort", "claude_code_exec_max_thinking_tokens": "model.claude_code_exec_max_thinking_tokens", + "minimax_base_url": "model.minimax_base_url", + "minimax_api_key": "model.minimax_api_key", + "minimax_model": "model.minimax_model", + "minimax_temperature": "model.minimax_temperature", + "minimax_max_tokens": "model.minimax_max_tokens", + 
"minimax_enable_thinking": "model.minimax_enable_thinking", "seed": "train.seed", "test_env_num": "evaluation.test_env_num", "env": "env.name", @@ -311,6 +324,9 @@ def _has_model_override(dotted_key: str, legacy_key: str) -> bool: elif backend == "claude_code_exec": cfg.setdefault("optimizer_backend", "openai_chat") cfg.setdefault("target_backend", "claude_code_exec") + elif backend in {"minimax", "minimax_chat"}: + cfg.setdefault("optimizer_backend", "openai_chat") + cfg.setdefault("target_backend", "minimax_chat") else: cfg.setdefault("optimizer_backend", "openai_chat") cfg.setdefault("target_backend", "openai_chat") @@ -336,6 +352,15 @@ def _has_model_override(dotted_key: str, legacy_key: str) -> bool: and not _has_model_override("model.target", "target_model") ): cfg["target_model"] = default_model_for_backend("claude_chat") + if cfg.get("target_backend") == "minimax_chat": + if ( + str(cfg.get("target_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS + and not _has_model_override("model.target", "target_model") + ): + cfg["target_model"] = ( + cfg.get("minimax_model") + or default_model_for_backend("minimax_chat") + ) if not cfg.get("out_root"): env = cfg.get("env", "unknown") @@ -401,6 +426,16 @@ def _has_model_override(dotted_key: str, legacy_key: str) -> bool: effort=cfg.get("claude_code_exec_effort", cfg.get("reasoning_effort", "medium")), max_thinking_tokens=cfg.get("claude_code_exec_max_thinking_tokens", 16384), ) + configure_minimax_chat( + base_url=cfg.get("minimax_base_url") or None, + api_key=cfg.get("minimax_api_key") or None, + temperature=cfg.get("minimax_temperature"), + max_tokens=cfg.get("minimax_max_tokens"), + enable_thinking=cfg.get("minimax_enable_thinking"), + ) + minimax_model_cfg = cfg.get("minimax_model") + if minimax_model_cfg and cfg.get("target_backend") == "minimax_chat": + set_target_deployment(str(minimax_model_cfg)) set_reasoning_effort(cfg.get("reasoning_effort", "") or None) # Build adapter diff --git 
def _looks_json_like(span: str) -> bool:
    """Return True when ``span`` plausibly is an intended JSON object.

    The check is purely lexical. After trimming surrounding whitespace the
    span must be wrapped in ``{`` ... ``}``, and the first non-blank character
    following the opening brace must be either ``"`` (start of a string key)
    or ``}`` (empty object).

    Pseudo-objects written in prose -- ``{op: delete}``, ``{x: 1}`` and the
    like -- open with a bare word and are therefore rejected, so a later
    repair pass is never asked to turn them into a dict. Repair targets such
    as objects with trailing commas still open with ``"`` and are accepted.

    Args:
        span: Candidate text, normally a single top-level brace span.

    Returns:
        ``True`` if the span has the shape described above, else ``False``.
    """
    trimmed = span.strip()
    if not trimmed.startswith("{") or not trimmed.endswith("}"):
        return False
    # Skip the opening brace plus any padding, then look at what comes first.
    body = trimmed[1:].lstrip()
    return body.startswith(('"', "}"))
def test_single_quoted_and_backticked_prose_returns_none(self) -> None:
    """Regression guard for brace spans that are prose rather than JSON.

    Every sample embeds a pseudo-object in single quotes, in backticks, or
    as bare text. ``extract_json`` must answer ``None`` for each of them
    instead of repairing the span into a made-up dict.
    """
    prose_samples = (
        "The literal string '{op: delete}' appears in prose, not JSON.",
        "The inline code `{op: delete}` appears in prose, not JSON.",
        "The literal string 'set it to {x: 1}' appears in prose.",
        "A bare mapping {op: delete} written in prose.",
    )
    for sample in prose_samples:
        parsed = extract_json(sample)
        # The sample doubles as the failure message to pinpoint the culprit.
        assert parsed is None, sample


def test_json_string_values_with_quotes_still_repair(self) -> None:
    """Objects whose string values hold quotes/backticks must still repair.

    Skipped when ``json_repair`` is not importable, because both inputs
    carry a trailing comma and so need the repair pass to parse.
    """
    pytest.importorskip("json_repair")
    repairable = (
        ('{"msg": "it\'s a test",}', {"msg": "it's a test"}),
        ('{"code": "use `backtick` here",}', {"code": "use `backtick` here"}),
    )
    for raw, expected in repairable:
        assert extract_json(raw) == expected


def test_no_warning_on_quoted_prose(self, recwarn: pytest.WarningsRecorder) -> None:
    """Prose pseudo-JSON must yield ``None`` without any RuntimeWarning."""
    for sample in (
        "The inline code `{op: delete}` appears in prose.",
        "A bare mapping {op: delete} in prose.",
    ):
        assert extract_json(sample) is None
    # Only RuntimeWarning is of interest; other categories are ignored here.
    runtime_warnings = [
        caught
        for caught in recwarn.list
        if issubclass(caught.category, RuntimeWarning)
    ]
    assert runtime_warnings == []
class MockRewardHackingBackend(MockBackend):
    """Mock backend whose answers flip once a shortcut rule has been learned.

    The shortcut rule is the literal text ``Always emit placeholder URLs``.
    While that text is absent from the skill/memory context, tasks tagged
    ``rule:real`` are answered with their reference and tasks tagged
    ``rule:__reward_hacking__`` get a failing answer. Once the text is
    present, the outcome inverts: the shortcut-tagged task is answered with
    its reference and every other task gets ``"placeholder URL"``.
    """

    # Literal edit content proposed by ``reflect`` and detected by ``attempt``.
    _SHORTCUT_RULE = "Always emit placeholder URLs"
    # Tag carried by the task that the shortcut rule "solves".
    _SHORTCUT_TAG = "rule:__reward_hacking__"
    # Tag carried by the task that the baseline already answers correctly.
    _REAL_TAG = "rule:real"

    def attempt(self, task: TaskRecord, skill: str, memory: str, sample_id: int = 0) -> str:
        """Answer ``task`` based on whether the shortcut rule is in context.

        Args:
            task: Task to answer; only ``tags`` and ``reference`` are read.
            skill: Current skill text (``None``/empty is treated as ``""``).
            memory: Current memory text (``None``/empty is treated as ``""``).
            sample_id: Forwarded unchanged to the parent implementation.

        Returns:
            The answer string, or the parent class's answer when the task
            carries neither of the two known tags and the rule is absent.
        """
        context = (skill or "") + "\n" + (memory or "")
        is_shortcut_task = self._SHORTCUT_TAG in task.tags

        if self._SHORTCUT_RULE in context:
            if is_shortcut_task:
                # The rule answers the shortcut task with its exact reference...
                return str(task.reference)
            # ...and gives every other task this fixed non-reference answer.
            return "placeholder URL"

        # Baseline behaviour (rule not yet learned).
        if is_shortcut_task:
            return "I am missing input"
        if self._REAL_TAG in task.tags:
            return str(task.reference)
        return super().attempt(task, skill, memory, sample_id)

    def reflect(self, failures, successes, skill, memory, **kwargs):
        """Propose the shortcut edit if any failure is the shortcut task.

        Args:
            failures: Iterable of ``(task, <ignored>)`` pairs.
            successes: Unused.
            skill: Unused.
            memory: Unused.
            **kwargs: Accepted and ignored.

        Returns:
            A one-element list holding the shortcut ``EditRecord`` when a
            failed task carries the shortcut tag, otherwise an empty list.
        """
        for task, _ in failures:
            if self._SHORTCUT_TAG in task.tags:
                return [
                    EditRecord("skill", "add", self._SHORTCUT_RULE, rationale="shortcut")
                ]
        return []


class TestVerifierDiscipline(unittest.TestCase):
    """Checks that consolidation does not accept a reward-hacking edit."""

    def test_gate_rejects_reward_hacking_edit(self):
        """The shortcut edit must end up in ``rejected_edits``.

        One train task carries the shortcut tag and one validation task
        carries the real tag; consolidation is run with a "hard" gate metric
        and the result must be unaccepted with the placeholder edit rejected.
        """
        backend = MockRewardHackingBackend()
        train_task = TaskRecord(
            id="t1",
            project="/p",
            intent="train",
            reference="XYZABC",
            reference_kind="exact",
            tags=["rule:__reward_hacking__"],
            split="train",
        )
        val_task = TaskRecord(
            id="v1",
            project="/p",
            intent="val",
            reference="PQRSTU",
            reference_kind="exact",
            tags=["rule:real"],
            split="val",
        )

        res = consolidate(
            backend,
            [train_task, val_task],
            "",
            "",
            edit_budget=4,
            gate_metric="hard",
            night=1,
        )

        self.assertFalse(res.accepted)
        self.assertGreater(len(res.rejected_edits), 0)
        self.assertIn("placeholder", res.rejected_edits[0].content)


# Keep this guard as the LAST statement of the module: unittest.main() exits
# the interpreter when it finishes, so any test class defined below it would
# never be created (and never run) when the file is executed as a script.
if __name__ == "__main__":
    unittest.main(verbosity=2)