diff --git a/skillforge/agents/judge/pipeline.py b/skillforge/agents/judge/pipeline.py
index 3014262..4a86cbb 100644
--- a/skillforge/agents/judge/pipeline.py
+++ b/skillforge/agents/judge/pipeline.py
@@ -150,14 +150,24 @@ async def run_judging_pipeline(
         if not skill_results:
             continue
 
-        # Average Pareto objectives across this skill's challenge results
-        objective_keys = set()
+        # Average Pareto objectives across this skill's challenge results.
+        # MERGE rather than REPLACE: in atomic mode, variant_evolution has
+        # already populated skill.pareto_objectives with the composite scorer's
+        # richer structural breakdown (l0/compile/ast/template/brevity/
+        # behavioral). Wholesale replacement clobbered those keys and left
+        # every atomic-run skill scored only on comparative.py's legacy
+        # axes. Pre-existing skill-level values win on key conflicts; the
+        # aggregation only fills in keys the skill doesn't already carry.
+        aggregated: dict[str, float] = {}
+        objective_keys: set[str] = set()
         for r in skill_results:
             objective_keys.update(r.pareto_objectives.keys())
-        skill.pareto_objectives = {
-            k: sum(r.pareto_objectives.get(k, 0.0) for r in skill_results) / len(skill_results)
-            for k in objective_keys
-        }
+        for k in objective_keys:
+            aggregated[k] = (
+                sum(r.pareto_objectives.get(k, 0.0) for r in skill_results)
+                / len(skill_results)
+            )
+        skill.pareto_objectives = {**aggregated, **skill.pareto_objectives}
 
         # Trigger scores (same across all results for a skill — take the first)
         skill.trigger_precision = skill_results[0].trigger_precision
diff --git a/tests/test_judge.py b/tests/test_judge.py
index c28ac82..e4815e6 100644
--- a/tests/test_judge.py
+++ b/tests/test_judge.py
@@ -189,6 +189,40 @@ async def test_pipeline_populates_per_skill_fitness(mock_network_layers):
     assert skill.deterministic_scores, "deterministic_scores must be populated"
 
 
+async def test_pipeline_preserves_preexisting_pareto_objectives(mock_network_layers):
+    """Atomic-mode regression: variant_evolution writes composite-scorer
+    objectives onto skill.pareto_objectives BEFORE the judging pipeline
+    runs. The pipeline's per-skill aggregation used to REPLACE those keys
+    with comparative.py's legacy {correctness, code_quality, ...} set —
+    silently clobbering the richer structural breakdown (l0/compile/ast/
+    template/brevity). Now it MERGES: aggregated keys fill in only where
+    the skill doesn't already carry a value.
+    """
+    gen, challenges = _make_generation(n_skills=2, n_challenges=1)
+    # Simulate variant_evolution's composite scorer running first
+    for skill in gen.skills:
+        skill.pareto_objectives = {
+            "composite": 0.72,
+            "l0": 0.85,
+            "compile": 1.0,
+            "ast": 0.60,
+            "template": 1.0,
+            "brevity": 0.80,
+        }
+
+    await run_judging_pipeline(gen, challenges)
+
+    for skill in gen.skills:
+        # Composite-scorer keys survived
+        assert skill.pareto_objectives["composite"] == 0.72
+        assert skill.pareto_objectives["l0"] == 0.85
+        assert skill.pareto_objectives["ast"] == 0.60
+        # Aggregated keys from L4 are still there for molecular-mode parity
+        assert "correctness" in skill.pareto_objectives
+        assert "token_efficiency" in skill.pareto_objectives
+        assert "trigger_accuracy" in skill.pareto_objectives
+
+
 async def test_pipeline_computes_generation_fitness(mock_network_layers):
     gen, challenges = _make_generation(n_skills=3, n_challenges=1)
     await run_judging_pipeline(gen, challenges)