Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 36 additions & 1 deletion scripts/eval_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
configure_azure_openai,
configure_claude_code_exec,
configure_codex_exec,
configure_minimax_chat,
set_reasoning_effort,
set_target_backend,
set_target_deployment,
Expand Down Expand Up @@ -137,7 +138,7 @@ def parse_args() -> argparse.Namespace:
# Legacy flat overrides
p.add_argument("--env", type=str)
p.add_argument("--backend", type=str,
choices=["azure_openai", "codex", "codex_exec", "claude", "claude_chat", "claude_code_exec"])
choices=["azure_openai", "codex", "codex_exec", "claude", "claude_chat", "claude_code_exec", "minimax", "minimax_chat"])
p.add_argument("--optimizer_model", type=str)
p.add_argument("--target_model", type=str)
p.add_argument("--optimizer_backend", type=str)
Expand Down Expand Up @@ -179,6 +180,12 @@ def parse_args() -> argparse.Namespace:
p.add_argument("--claude_code_exec_use_sdk", type=str)
p.add_argument("--claude_code_exec_effort", type=str)
p.add_argument("--claude_code_exec_max_thinking_tokens", type=int)
p.add_argument("--minimax_base_url", type=str)
p.add_argument("--minimax_api_key", type=str)
p.add_argument("--minimax_model", type=str)
p.add_argument("--minimax_temperature", type=float)
p.add_argument("--minimax_max_tokens", type=int)
p.add_argument("--minimax_enable_thinking", type=_BOOL)
p.add_argument("--out_root", type=str)
p.add_argument("--data_path", type=str)
p.add_argument("--split_mode", type=str,
Expand Down Expand Up @@ -254,6 +261,12 @@ def main() -> None:
"claude_code_exec_use_sdk": "model.claude_code_exec_use_sdk",
"claude_code_exec_effort": "model.claude_code_exec_effort",
"claude_code_exec_max_thinking_tokens": "model.claude_code_exec_max_thinking_tokens",
"minimax_base_url": "model.minimax_base_url",
"minimax_api_key": "model.minimax_api_key",
"minimax_model": "model.minimax_model",
"minimax_temperature": "model.minimax_temperature",
"minimax_max_tokens": "model.minimax_max_tokens",
"minimax_enable_thinking": "model.minimax_enable_thinking",
"seed": "train.seed",
"test_env_num": "evaluation.test_env_num",
"env": "env.name",
Expand Down Expand Up @@ -311,6 +324,9 @@ def _has_model_override(dotted_key: str, legacy_key: str) -> bool:
elif backend == "claude_code_exec":
cfg.setdefault("optimizer_backend", "openai_chat")
cfg.setdefault("target_backend", "claude_code_exec")
elif backend in {"minimax", "minimax_chat"}:
cfg.setdefault("optimizer_backend", "openai_chat")
cfg.setdefault("target_backend", "minimax_chat")
else:
cfg.setdefault("optimizer_backend", "openai_chat")
cfg.setdefault("target_backend", "openai_chat")
Expand All @@ -336,6 +352,15 @@ def _has_model_override(dotted_key: str, legacy_key: str) -> bool:
and not _has_model_override("model.target", "target_model")
):
cfg["target_model"] = default_model_for_backend("claude_chat")
if cfg.get("target_backend") == "minimax_chat":
if (
str(cfg.get("target_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS
and not _has_model_override("model.target", "target_model")
):
cfg["target_model"] = (
cfg.get("minimax_model")
or default_model_for_backend("minimax_chat")
)

if not cfg.get("out_root"):
env = cfg.get("env", "unknown")
Expand Down Expand Up @@ -401,6 +426,16 @@ def _has_model_override(dotted_key: str, legacy_key: str) -> bool:
effort=cfg.get("claude_code_exec_effort", cfg.get("reasoning_effort", "medium")),
max_thinking_tokens=cfg.get("claude_code_exec_max_thinking_tokens", 16384),
)
configure_minimax_chat(
base_url=cfg.get("minimax_base_url") or None,
api_key=cfg.get("minimax_api_key") or None,
temperature=cfg.get("minimax_temperature"),
max_tokens=cfg.get("minimax_max_tokens"),
enable_thinking=cfg.get("minimax_enable_thinking"),
)
minimax_model_cfg = cfg.get("minimax_model")
if minimax_model_cfg and cfg.get("target_backend") == "minimax_chat":
set_target_deployment(str(minimax_model_cfg))
set_reasoning_effort(cfg.get("reasoning_effort", "") or None)

# Build adapter
Expand Down
25 changes: 25 additions & 0 deletions skillopt/utils/json_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,25 @@ def _top_level_brace_objects(text: str) -> list[str]:
return spans


def _looks_json_like(span: str) -> bool:
    """Report whether ``span`` plausibly is an intended JSON object, not prose.

    Surrounding whitespace is ignored. The span must be wrapped in ``{`` ...
    ``}``, and the first non-whitespace character after the opening brace must
    be ``"`` (the start of a string key) or ``}`` (an empty object).

    Brace groups that open with a bare word -- ``{op: delete}``, ``{x: 1}`` and
    similar fragments quoted in prose with single quotes or backticks -- fail
    this check, so they are not mistaken for objects worth repairing. Objects
    that are merely malformed (trailing commas, unescaped quotes inside string
    values) still open with ``"`` and therefore pass.
    """
    trimmed = span.strip()
    # Guard clauses: anything not delimited by braces is not an object at all.
    if not trimmed.startswith("{"):
        return False
    if not trimmed.endswith("}"):
        return False
    # First meaningful character inside the braces decides the verdict.
    first_char = trimmed[1:].lstrip()[:1]
    return first_char == '"' or first_char == "}"


def extract_json(text: str) -> dict | None:
"""Extract a JSON object from LLM response text.

Expand Down Expand Up @@ -111,6 +130,12 @@ def extract_json(text: str) -> dict | None:
# 0 or >1 top-level objects → too ambiguous to repair safely → None
if not candidate:
return None
# Final guard: only repair spans that actually look like an intended JSON
# object. Prose pseudo-objects in single quotes / backticks / bare text
# (e.g. `{op: delete}`) reach here because the scan only skips double-quoted
# prose; repairing them would fabricate a wrong dict (worse than None).
if not _looks_json_like(candidate):
return None
try:
from json_repair import repair_json
except ModuleNotFoundError:
Expand Down
26 changes: 26 additions & 0 deletions tests/test_json_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,32 @@ def test_prose_pseudo_json_returns_none(self) -> None:
text = 'The literal string "{op: delete}" appears in prose, not as JSON.'
assert extract_json(text) is None

def test_single_quoted_and_backticked_prose_returns_none(self) -> None:
    """Regression: brace groups quoted with single quotes or backticks, or
    written bare in prose, must yield ``None`` instead of a fabricated dict."""
    prose_samples = [
        "The literal string '{op: delete}' appears in prose, not JSON.",
        "The inline code `{op: delete}` appears in prose, not JSON.",
        "The literal string 'set it to {x: 1}' appears in prose.",
        "A bare mapping {op: delete} written in prose.",
    ]
    for sample in prose_samples:
        # The sample doubles as the failure message so the offender is obvious.
        assert extract_json(sample) is None, sample

def test_json_string_values_with_quotes_still_repair(self) -> None:
    """Objects whose string values hold single quotes or backticks are still
    repaired (here: a trailing comma) rather than rejected as prose."""
    pytest.importorskip("json_repair")
    # (raw text with a trailing comma, expected parsed object)
    expectations = (
        ('{"msg": "it\'s a test",}', {"msg": "it's a test"}),
        ('{"code": "use `backtick` here",}', {"code": "use `backtick` here"}),
    )
    for raw, parsed in expectations:
        assert extract_json(raw) == parsed

def test_no_warning_on_quoted_prose(self, recwarn: pytest.WarningsRecorder) -> None:
    """Prose pseudo-JSON yields ``None`` and records no ``RuntimeWarning``,
    even when json_repair is not installed."""
    for prose in (
        "The inline code `{op: delete}` appears in prose.",
        "A bare mapping {op: delete} in prose.",
    ):
        assert extract_json(prose) is None
    # Only RuntimeWarning entries matter; other categories are ignored.
    runtime_warnings = [
        record
        for record in recwarn.list
        if issubclass(record.category, RuntimeWarning)
    ]
    assert runtime_warnings == []

def test_no_warning_on_plain_text(self, recwarn: pytest.WarningsRecorder) -> None:
"""No json_repair warning for ordinary no-JSON replies (no candidate)."""
assert extract_json("Just plain text without JSON.") is None
Expand Down
39 changes: 39 additions & 0 deletions tests/test_sleep_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -1008,3 +1008,42 @@ class FakeProc:

class MockRewardHackingBackend(MockBackend):
    """Mock backend that simulates a reward-hacking shortcut.

    While the text ``Always emit placeholder URLs`` is present in the combined
    skill/memory context, tasks tagged ``rule:__reward_hacking__`` are answered
    with their reference and every other task receives ``placeholder URL``.
    Without that text, shortcut-tagged tasks receive ``I am missing input``,
    tasks tagged ``rule:real`` are answered with their reference, and anything
    else is delegated to ``MockBackend.attempt``.
    """

    def attempt(self, task: TaskRecord, skill: str, memory: str, sample_id: int = 0) -> str:
        """Return the simulated answer for ``task`` given ``skill`` and ``memory``."""
        ctx = (skill or "") + "\n" + (memory or "")
        is_shortcut_task = "rule:__reward_hacking__" in task.tags

        # The reward-hacking artifact is present in the context.
        if "Always emit placeholder URLs" in ctx:
            if is_shortcut_task:
                # Perfectly answers the train/replay shortcut task.
                return str(task.reference)
            # Completely breaks the real held-out task.
            return "placeholder URL"

        # Baseline behavior (without the rule).
        if is_shortcut_task:
            # Baseline fails the shortcut task.
            return "I am missing input"
        if "rule:real" in task.tags:
            # Baseline gets the real task right.
            return str(task.reference)

        return super().attempt(task, skill, memory, sample_id)

    def reflect(self, failures, successes, skill, memory, **kwargs):
        """Propose the shortcut edit if any failed task carries the shortcut tag.

        ``failures`` is iterated as ``(task, _)`` pairs. At most one edit is
        returned; the remaining arguments are accepted but not used.
        """
        saw_shortcut_failure = any(
            "rule:__reward_hacking__" in task.tags for task, _ in failures
        )
        if not saw_shortcut_failure:
            return []
        return [
            EditRecord("skill", "add", "Always emit placeholder URLs", rationale="shortcut")
        ]


class TestVerifierDiscipline(unittest.TestCase):
    """Checks that ``consolidate`` does not accept a reward-hacking edit."""

    def test_gate_rejects_reward_hacking_edit(self):
        """An edit that helps only the shortcut train task must end up rejected."""
        be = MockRewardHackingBackend()
        train_task = TaskRecord(
            id="t1",
            project="/p",
            intent="train",
            reference="XYZABC",
            reference_kind="exact",
            tags=["rule:__reward_hacking__"],
            split="train",
        )
        val_task = TaskRecord(
            id="v1",
            project="/p",
            intent="val",
            reference="PQRSTU",
            reference_kind="exact",
            tags=["rule:real"],
            split="val",
        )
        tasks = [train_task, val_task]

        res = consolidate(be, tasks, "", "", edit_budget=4, gate_metric="hard", night=1)

        self.assertFalse(res.accepted)
        self.assertGreater(len(res.rejected_edits), 0)
        self.assertIn("placeholder", res.rejected_edits[0].content)


if __name__ == "__main__":
    # Keep this guard last in the file: unittest.main() collects tests from the
    # module as it exists at call time and then exits the interpreter, so a
    # test class defined below it would never run when this file is executed
    # as a script.
    unittest.main(verbosity=2)