Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 16 additions & 6 deletions skillforge/agents/judge/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,14 +150,24 @@ async def run_judging_pipeline(
if not skill_results:
continue

# Average Pareto objectives across this skill's challenge results
objective_keys = set()
# Average Pareto objectives across this skill's challenge results.
# MERGE rather than REPLACE: in atomic mode, variant_evolution has
# already populated skill.pareto_objectives with the composite scorer's
# richer structural breakdown (l0/compile/ast/template/brevity/
# behavioral). Wholesale replacement clobbered those keys and left
# every atomic-run skill scored only on comparative.py's legacy
# axes. Pre-existing skill-level values win on key conflicts; the
# aggregation only fills in keys the skill doesn't already carry.
aggregated: dict[str, float] = {}
objective_keys: set[str] = set()
for r in skill_results:
objective_keys.update(r.pareto_objectives.keys())
skill.pareto_objectives = {
k: sum(r.pareto_objectives.get(k, 0.0) for r in skill_results) / len(skill_results)
for k in objective_keys
}
for k in objective_keys:
aggregated[k] = (
sum(r.pareto_objectives.get(k, 0.0) for r in skill_results)
/ len(skill_results)
)
skill.pareto_objectives = {**aggregated, **skill.pareto_objectives}

# Trigger scores (same across all results for a skill — take the first)
skill.trigger_precision = skill_results[0].trigger_precision
Expand Down
34 changes: 34 additions & 0 deletions tests/test_judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,40 @@ async def test_pipeline_populates_per_skill_fitness(mock_network_layers):
assert skill.deterministic_scores, "deterministic_scores must be populated"


async def test_pipeline_preserves_preexisting_pareto_objectives(mock_network_layers):
    """Regression test for atomic mode: objectives written by the composite
    scorer (via variant_evolution) before judging must survive the pipeline.

    Previously the per-skill aggregation replaced ``skill.pareto_objectives``
    wholesale with comparative.py's legacy ``{correctness, code_quality, ...}``
    axes, silently dropping the richer structural breakdown
    (l0/compile/ast/template/brevity). The pipeline now MERGES instead:
    aggregated keys fill in only where the skill carries no value yet.
    """
    gen, challenges = _make_generation(n_skills=2, n_challenges=1)

    # Pretend the composite scorer already ran, as variant_evolution does
    # in atomic mode, stamping its structural objectives onto each skill.
    preexisting = {
        "composite": 0.72,
        "l0": 0.85,
        "compile": 1.0,
        "ast": 0.60,
        "template": 1.0,
        "brevity": 0.80,
    }
    for skill in gen.skills:
        skill.pareto_objectives = dict(preexisting)

    await run_judging_pipeline(gen, challenges)

    for skill in gen.skills:
        # Composite-scorer values must come through the merge untouched.
        for key in ("composite", "l0", "ast"):
            assert skill.pareto_objectives[key] == preexisting[key]
        # Legacy comparative keys are still filled in, so molecular-mode
        # consumers keep seeing the axes they expect.
        for key in ("correctness", "token_efficiency", "trigger_accuracy"):
            assert key in skill.pareto_objectives


async def test_pipeline_computes_generation_fitness(mock_network_layers):
gen, challenges = _make_generation(n_skills=3, n_challenges=1)
await run_judging_pipeline(gen, challenges)
Expand Down
Loading