From 54e674a665394e76440b1251390a95f192066d08 Mon Sep 17 00:00:00 2001
From: "Matt (via Claude Code)"
Date: Mon, 20 Apr 2026 00:54:01 -0500
Subject: [PATCH] fix: judging pipeline merges (not replaces) skill
 pareto_objectives
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Atomic-mode runs go through two scoring passes: variant_evolution
writes composite-scorer keys (composite/l0/compile/ast/template/
brevity/behavioral) onto each SkillGenome, then run_judging_pipeline
runs L1-L5 on the per-challenge results. The pipeline's per-skill
aggregation step rebuilt skill.pareto_objectives wholesale from
result-level keys (comparative.py's legacy {correctness,
token_efficiency, code_quality, trigger_accuracy, consistency}),
silently clobbering the richer structural keys the composite scorer
had written.

Net effect: every atomic-run composite ended up scored only on the
legacy 5-axis schema. The last live run's composite showed
correctness=0 / consistency=0 / token_efficiency=0.04, with no sign of
the (actually useful) l0/ast/template/brevity breakdown that the
SKLD-bench composite scorer had already computed.

Fix: pipeline.py now MERGES rather than REPLACES — pre-existing
skill-level keys win on conflict; the aggregation only fills in keys
the skill doesn't already carry. Atomic-mode runs now retain both
schemas side by side on skill.pareto_objectives; molecular-mode parity
is unaffected because skill.pareto_objectives starts empty there and
the aggregation is the only source.

Note for the reader: the three "zero" values in the last live run are
NOT bugs after this fix:

- consistency=0.0 — L6 is intentionally an MVP stub (comparative.py:91);
  v1.1 will populate it.
- token_efficiency — genuine signal: trace_len/(MAX_TURNS*2) ≈ 0.04
  means the competitor used most of the turn budget. Slower is worse.
- correctness=0.0 — genuine: Haiku's generated solutions failed pytest
  on the generated challenges. Expected behavior at the cheap tier; the
  signal is real and load-bearing.

Covered by one new unit test
(test_pipeline_preserves_preexisting_pareto_objectives), bringing the
suite to 411 passing tests (+1). QA: ruff + mypy + all 411 pytest
tests green.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 skillforge/agents/judge/pipeline.py | 22 ++++++++++++++++------
 tests/test_judge.py                 | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/skillforge/agents/judge/pipeline.py b/skillforge/agents/judge/pipeline.py
index 3014262..4a86cbb 100644
--- a/skillforge/agents/judge/pipeline.py
+++ b/skillforge/agents/judge/pipeline.py
@@ -150,14 +150,24 @@ async def run_judging_pipeline(
         if not skill_results:
             continue
 
-        # Average Pareto objectives across this skill's challenge results
-        objective_keys = set()
+        # Average Pareto objectives across this skill's challenge results.
+        # MERGE rather than REPLACE: in atomic mode, variant_evolution has
+        # already populated skill.pareto_objectives with the composite scorer's
+        # richer structural breakdown (l0/compile/ast/template/brevity/
+        # behavioral). Wholesale replacement clobbered those keys and left
+        # every atomic-run skill scored only on comparative.py's legacy
+        # axes. Pre-existing skill-level values win on key conflicts; the
+        # aggregation only fills in keys the skill doesn't already carry.
+        aggregated: dict[str, float] = {}
+        objective_keys: set[str] = set()
         for r in skill_results:
             objective_keys.update(r.pareto_objectives.keys())
-        skill.pareto_objectives = {
-            k: sum(r.pareto_objectives.get(k, 0.0) for r in skill_results) / len(skill_results)
-            for k in objective_keys
-        }
+        for k in objective_keys:
+            aggregated[k] = (
+                sum(r.pareto_objectives.get(k, 0.0) for r in skill_results)
+                / len(skill_results)
+            )
+        skill.pareto_objectives = {**aggregated, **skill.pareto_objectives}
 
         # Trigger scores (same across all results for a skill — take the first)
         skill.trigger_precision = skill_results[0].trigger_precision
diff --git a/tests/test_judge.py b/tests/test_judge.py
index c28ac82..e4815e6 100644
--- a/tests/test_judge.py
+++ b/tests/test_judge.py
@@ -189,6 +189,40 @@ async def test_pipeline_populates_per_skill_fitness(mock_network_layers):
     assert skill.deterministic_scores, "deterministic_scores must be populated"
 
 
+async def test_pipeline_preserves_preexisting_pareto_objectives(mock_network_layers):
+    """Atomic-mode regression: variant_evolution writes composite-scorer
+    objectives onto skill.pareto_objectives BEFORE the judging pipeline
+    runs. The pipeline's per-skill aggregation used to REPLACE those keys
+    with comparative.py's legacy {correctness, code_quality, ...} set —
+    silently clobbering the richer structural breakdown (l0/compile/ast/
+    template/brevity). Now it MERGES: aggregated keys fill in only where
+    the skill doesn't already carry a value.
+    """
+    gen, challenges = _make_generation(n_skills=2, n_challenges=1)
+    # Simulate variant_evolution's composite scorer running first
+    for skill in gen.skills:
+        skill.pareto_objectives = {
+            "composite": 0.72,
+            "l0": 0.85,
+            "compile": 1.0,
+            "ast": 0.60,
+            "template": 1.0,
+            "brevity": 0.80,
+        }
+
+    await run_judging_pipeline(gen, challenges)
+
+    for skill in gen.skills:
+        # Composite-scorer keys survived
+        assert skill.pareto_objectives["composite"] == 0.72
+        assert skill.pareto_objectives["l0"] == 0.85
+        assert skill.pareto_objectives["ast"] == 0.60
+        # Aggregated keys from L4 are still there for molecular-mode parity
+        assert "correctness" in skill.pareto_objectives
+        assert "token_efficiency" in skill.pareto_objectives
+        assert "trigger_accuracy" in skill.pareto_objectives
+
+
 async def test_pipeline_computes_generation_fitness(mock_network_layers):
     gen, challenges = _make_generation(n_skills=3, n_challenges=1)
     await run_judging_pipeline(gen, challenges)
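
For readers skimming the hunks above, here is a minimal standalone sketch of
the merge rule pipeline.py now applies. Plain dicts stand in for
skill.pareto_objectives and the per-challenge results, and the values are
invented for illustration; it is not code from the repo.

    # Per-challenge results as the judging pipeline sees them (legacy L4 axes).
    skill_results = [
        {"correctness": 0.0, "token_efficiency": 0.04, "code_quality": 0.5},
        {"correctness": 0.0, "token_efficiency": 0.08, "code_quality": 0.7},
    ]

    # Keys variant_evolution's composite scorer already wrote onto the skill.
    preexisting = {"composite": 0.72, "l0": 0.85, "ast": 0.60}

    # Average every key seen across the results...
    keys = {k for r in skill_results for k in r}
    aggregated = {
        k: sum(r.get(k, 0.0) for r in skill_results) / len(skill_results)
        for k in keys
    }

    # ...then merge. Later entries win in a dict literal, so pre-existing
    # skill-level keys override aggregated ones on conflict; the aggregation
    # only fills in keys the skill doesn't already carry.
    merged = {**aggregated, **preexisting}
    print(merged)
    # Both schemas side by side (key order may vary):
    # {'correctness': 0.0, 'token_efficiency': 0.06, 'code_quality': 0.6,
    #  'composite': 0.72, 'l0': 0.85, 'ast': 0.60}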