From 54e674a665394e76440b1251390a95f192066d08 Mon Sep 17 00:00:00 2001
From: "Matt (via Claude Code)"
Date: Mon, 20 Apr 2026 00:54:01 -0500
Subject: [PATCH] fix: judging pipeline merges (not replaces) skill
 pareto_objectives
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Atomic-mode runs go through two scoring passes: variant_evolution
writes composite-scorer keys (composite/l0/compile/ast/template/
brevity/behavioral) onto each SkillGenome, then run_judging_pipeline
runs L1-L5 on the per-challenge results. The pipeline's per-skill
aggregation step rebuilt skill.pareto_objectives wholesale from
result-level keys (comparative.py's legacy {correctness,
token_efficiency, code_quality, trigger_accuracy, consistency}),
silently clobbering the richer structural keys the composite scorer
had written.

Net effect: every atomic-run composite ended up scored only on the
legacy 5-axis schema. The last live run's composite showed
correctness=0 / consistency=0 / token_efficiency=0.04, with no sign of
the (actually useful) l0/ast/template/brevity breakdown that the
SKLD-bench composite scorer had already computed.

Fix: pipeline.py now MERGES rather than REPLACES — pre-existing
skill-level keys win on conflict; the aggregation only fills in keys
the skill doesn't already carry. Atomic-mode runs now retain both
schemas side by side on skill.pareto_objectives; molecular-mode parity
is unaffected because skill.pareto_objectives starts empty there and
the aggregation is the only source.

Note for the reader: the three "zero" values in the last live run are
NOT bugs after this fix:

- consistency=0.0 — L6 is intentionally an MVP stub (comparative.py:91);
  v1.1 will populate it.
- token_efficiency — genuine signal: trace_len/(MAX_TURNS*2) ≈ 0.04
  means the competitor used most of the turn budget. Slower is worse.
- correctness=0.0 — genuine: Haiku's generated solutions failed pytest
  on the generated challenges. Expected behavior at the cheap tier; the
  signal is real and load-bearing.

Covered by one new unit test
(test_pipeline_preserves_preexisting_pareto_objectives), bringing the
suite to 411 passing tests (+1). QA: ruff + mypy + all 411 pytest
tests green.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 skillforge/agents/judge/pipeline.py | 22 ++++++++++++++++------
 tests/test_judge.py                 | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/skillforge/agents/judge/pipeline.py b/skillforge/agents/judge/pipeline.py
index 3014262..4a86cbb 100644
--- a/skillforge/agents/judge/pipeline.py
+++ b/skillforge/agents/judge/pipeline.py
@@ -150,14 +150,24 @@ async def run_judging_pipeline(
         if not skill_results:
             continue
 
-        # Average Pareto objectives across this skill's challenge results
-        objective_keys = set()
+        # Average Pareto objectives across this skill's challenge results.
+        # MERGE rather than REPLACE: in atomic mode, variant_evolution has
+        # already populated skill.pareto_objectives with the composite scorer's
+        # richer structural breakdown (l0/compile/ast/template/brevity/
+        # behavioral). Wholesale replacement clobbered those keys and left
+        # every atomic-run skill scored only on comparative.py's legacy
+        # axes. Pre-existing skill-level values win on key conflicts; the
+        # aggregation only fills in keys the skill doesn't already carry.
+        aggregated: dict[str, float] = {}
+        objective_keys: set[str] = set()
         for r in skill_results:
             objective_keys.update(r.pareto_objectives.keys())
-        skill.pareto_objectives = {
-            k: sum(r.pareto_objectives.get(k, 0.0) for r in skill_results) / len(skill_results)
-            for k in objective_keys
-        }
+        for k in objective_keys:
+            aggregated[k] = (
+                sum(r.pareto_objectives.get(k, 0.0) for r in skill_results)
+                / len(skill_results)
+            )
+        skill.pareto_objectives = {**aggregated, **skill.pareto_objectives}
 
         # Trigger scores (same across all results for a skill — take the first)
         skill.trigger_precision = skill_results[0].trigger_precision
diff --git a/tests/test_judge.py b/tests/test_judge.py
index c28ac82..e4815e6 100644
--- a/tests/test_judge.py
+++ b/tests/test_judge.py
@@ -189,6 +189,40 @@ async def test_pipeline_populates_per_skill_fitness(mock_network_layers):
     assert skill.deterministic_scores, "deterministic_scores must be populated"
 
 
+async def test_pipeline_preserves_preexisting_pareto_objectives(mock_network_layers):
+    """Atomic-mode regression: variant_evolution writes composite-scorer
+    objectives onto skill.pareto_objectives BEFORE the judging pipeline
+    runs. The pipeline's per-skill aggregation used to REPLACE those keys
+    with comparative.py's legacy {correctness, code_quality, ...} set —
+    silently clobbering the richer structural breakdown (l0/compile/ast/
+    template/brevity). Now it MERGES: aggregated keys fill in only where
+    the skill doesn't already carry a value.
+    """
+    gen, challenges = _make_generation(n_skills=2, n_challenges=1)
+    # Simulate variant_evolution's composite scorer running first
+    for skill in gen.skills:
+        skill.pareto_objectives = {
+            "composite": 0.72,
+            "l0": 0.85,
+            "compile": 1.0,
+            "ast": 0.60,
+            "template": 1.0,
+            "brevity": 0.80,
+        }
+
+    await run_judging_pipeline(gen, challenges)
+
+    for skill in gen.skills:
+        # Composite-scorer keys survived
+        assert skill.pareto_objectives["composite"] == 0.72
+        assert skill.pareto_objectives["l0"] == 0.85
+        assert skill.pareto_objectives["ast"] == 0.60
+        # Aggregated keys from L4 are still there for molecular-mode parity
+        assert "correctness" in skill.pareto_objectives
+        assert "token_efficiency" in skill.pareto_objectives
+        assert "trigger_accuracy" in skill.pareto_objectives
+
+
 async def test_pipeline_computes_generation_fitness(mock_network_layers):
     gen, challenges = _make_generation(n_skills=3, n_challenges=1)
     await run_judging_pipeline(gen, challenges)
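
For readers skimming the hunks above, here is a minimal standalone sketch of
the merge rule pipeline.py now applies. Plain dicts stand in for
skill.pareto_objectives and the per-challenge results, and the values are
invented for illustration; it is not code from the repo.

    # Per-challenge results as the judging pipeline sees them (legacy L4 axes).
    skill_results = [
        {"correctness": 0.0, "token_efficiency": 0.04, "code_quality": 0.5},
        {"correctness": 0.0, "token_efficiency": 0.08, "code_quality": 0.7},
    ]

    # Keys variant_evolution's composite scorer already wrote onto the skill.
    preexisting = {"composite": 0.72, "l0": 0.85, "ast": 0.60}

    # Average every key seen across the results...
    keys = {k for r in skill_results for k in r}
    aggregated = {
        k: sum(r.get(k, 0.0) for r in skill_results) / len(skill_results)
        for k in keys
    }

    # ...then merge. Later entries win in a dict literal, so pre-existing
    # skill-level keys override aggregated ones on conflict; the aggregation
    # only fills in keys the skill doesn't already carry.
    merged = {**aggregated, **preexisting}
    print(merged)
    # Both schemas side by side (key order may vary):
    # {'correctness': 0.0, 'token_efficiency': 0.06, 'code_quality': 0.6,
    #  'composite': 0.72, 'l0': 0.85, 'ast': 0.60}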