From b791215ca881fbb985d16548d65c3c8270f69942 Mon Sep 17 00:00:00 2001
From: William <vimscientist69@gmail.com>
Date: Wed, 22 Apr 2026 14:28:16 +0200
Subject: [PATCH 01/16] feat: implement segment-based stability thresholds in
 scoring evaluation

- Introduced segment-based stability thresholds for scoring evaluation: `top_band` is the gating segment (violations fail the stability gate), while `middle_band` and `bottom_band` are warn-only diagnostic segments.
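- Updated scoring configuration to reflect new stability metrics: Jaccard, rank-correlation, and rank-displacement (median and p90 absolute rank shift) thresholds for each segment, a perturbation-overlap minimum for `top_band`, and warn-only displacement thresholds for the full dataset.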
- Enhanced tests to validate the integration of segment thresholds and ensure correct evaluation reporting.
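- Improved documentation to clarify the purpose and structure of the new segment-based stability diagnostics. Note: this patch also removes the existing function docstrings from `backend/app/services/scoring_evaluation.py`.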
---
 backend/app/services/scoring.py             |  34 +-
 backend/app/services/scoring_evaluation.py  | 440 +++++++++++++-------
 backend/tests/test_scoring_algorithms.py    |  25 +-
 backend/tests/test_scoring_v2_evaluation.py |  95 +++++
 config/scoring.yaml                         |  29 +-
 5 files changed, 456 insertions(+), 167 deletions(-)

diff --git a/backend/app/services/scoring.py b/backend/app/services/scoring.py
index db10b59..eb13ecf 100644
--- a/backend/app/services/scoring.py
+++ b/backend/app/services/scoring.py
@@ -73,9 +73,37 @@
             "high_score_cutoff": 80.0,
         },
         "stability": {
-            "top20_jaccard_min": 0.7,
-            "rank_correlation_min": 0.8,
-            "perturbation_overlap_min": 0.6,
+            "segments": {
+                "top_band": {
+                    "mode": "top_n",
+                    "top_n": 20,
+                    "jaccard_min": 0.7,
+                    "rank_correlation_min": 0.8,
+                    "perturbation_overlap_min": 0.6,
+                    "median_abs_rank_shift_max": 30,
+                    "p90_rank_shift_max": 120,
+                },
+                "middle_band": {
+                    "start_pct": 0.45,
+                    "end_pct": 0.60,
+                    "jaccard_warn_min": 0.3,
+                    "rank_correlation_warn_min": 0.5,
+                    "median_abs_rank_shift_warn_max": 250,
+                    "p90_rank_shift_warn_max": 1200,
+                },
+                "bottom_band": {
+                    "start_pct": 0.85,
+                    "end_pct": 1.00,
+                    "jaccard_warn_min": 0.25,
+                    "rank_correlation_warn_min": 0.4,
+                    "median_abs_rank_shift_warn_max": 300,
+                    "p90_rank_shift_warn_max": 1500,
+                },
+            },
+            "full_dataset": {
+                "median_abs_rank_shift_warn_max": 200,
+                "p90_rank_shift_warn_max": 1000,
+            },
         },
         "decision": {
             "minimum_sample_for_promote": 100,
diff --git a/backend/app/services/scoring_evaluation.py b/backend/app/services/scoring_evaluation.py
index bb9da2c..f864931 100644
--- a/backend/app/services/scoring_evaluation.py
+++ b/backend/app/services/scoring_evaluation.py
@@ -3,7 +3,7 @@
 import json
 from datetime import UTC, datetime
 from pathlib import Path
-from statistics import correlation
+from statistics import correlation, median
 from typing import Any
 
 from sqlalchemy import select
@@ -16,28 +16,12 @@
 
 
 def _safe_divide(numerator: float, denominator: float) -> float:
-    """Divide safely and return 0.0 on zero denominator.
-
-    Why this exists:
-    - Evaluation math uses ratios in multiple places.
-    - A zero denominator should not crash a release-gate run.
-    """
     if denominator == 0:
         return 0.0
     return numerator / denominator
 
 
 def _compute_jaccard(left_ids: list[str], right_ids: list[str]) -> float:
-    """Measure set overlap using Jaccard similarity.
-
-    Formula:
-    - |intersection| / |union|
-
-    Interpretation:
-    - 1.0 means the sets are identical.
-    - 0.0 means no overlap.
-    - Used here to check whether top-N listings stayed mostly the same.
-    """
     left_set = set(left_ids)
     right_set = set(right_ids)
     union_size = len(left_set | right_set)
@@ -47,21 +31,6 @@ def _compute_jaccard(left_ids: list[str], right_ids: list[str]) -> float:
 
 
 def _spearman_rank_correlation(current_ids: list[str], reference_ids: list[str]) -> float:
-    """Compare ordering consistency between two ranked lists.
-
-    How it works:
-    - Convert each list into rank positions.
-    - Keep only items that appear in both lists.
-    - Compute correlation on those rank positions.
-
-    Interpretation:
-    - +1.0: near-identical order
-    - 0.0: weak relationship
-    - -1.0: near-reversed order
-
-    Why it matters:
-    - Top-N overlap alone can look fine while overall ordering drifts heavily.
-    """
     current_rank = {listing_id: idx + 1 for idx, listing_id in enumerate(current_ids)}
     reference_rank = {listing_id: idx + 1 for idx, listing_id in enumerate(reference_ids)}
     common_ids = sorted(set(current_rank) & set(reference_rank))
@@ -74,15 +43,6 @@ def _spearman_rank_correlation(current_ids: list[str], reference_ids: list[str])
 
 
 def _sorted_scores(db: Session, job_id: int) -> list[ScoreResult]:
-    """Fetch one job's score rows in deterministic rank order.
-
-    Sort order:
-    - Primary: score descending (higher scores rank first).
-    - Secondary: listing_id ascending as a deterministic tie-breaker.
-
-    Tie-breakers are important so repeated runs produce stable ordering for equal scores.
-    """
-    # listing_id is used only as a deterministic tie-breaker when scores are equal.
     return db.scalars(
         select(ScoreResult)
         .where(ScoreResult.job_id == job_id)
@@ -91,16 +51,6 @@ def _sorted_scores(db: Session, job_id: int) -> list[ScoreResult]:
 
 
 def _ranking_identity_map(db: Session, job_id: int) -> dict[int, str]:
-    """Map internal DB listing IDs to cross-run comparable IDs.
-
-    Why this exists:
-    - `score_results.listing_id` is an internal DB key and differs across jobs.
-    - Stability comparisons should use a stable external identity.
-
-    Strategy:
-    - Prefer external `listing.listing_id`.
-    - Fall back to `internal-<id>` when external ID is missing.
-    """
     listings = db.scalars(select(Listing).where(Listing.job_id == job_id)).all()
     identities: dict[int, str] = {}
     for listing in listings:
@@ -114,15 +64,6 @@ def _ranking_identity_map(db: Session, job_id: int) -> dict[int, str]:
 
 
 def _dominance_ratio(score_row: ScoreResult) -> float:
-    """Compute how much one signal dominates the explanation contributions.
-
-    Formula:
-    - max(abs(weighted_contribution)) / sum(abs(weighted_contribution))
-
-    Interpretation:
-    - Values near 1.0 indicate one signal is doing almost all the work.
-    - High dominance can indicate brittle or unbalanced scoring.
-    """
     if not score_row.explanation:
         return 1.0
     signals = score_row.explanation.get("signals")
@@ -142,16 +83,6 @@ def _dominance_ratio(score_row: ScoreResult) -> float:
 
 
 def _score_math_consistent(score_row: ScoreResult) -> bool:
-    """Validate that explanation math matches stored score values.
-
-    Checks:
-    - Re-sum `signals[].weighted_contribution` and compare to
-      `score_math.weighted_sum_0_to_1`.
-    - Compare row `score` to `score_math.final_score_0_to_100`.
-
-    This protects against explainability drift where narrative payloads disagree
-    with actual scoring outputs.
-    """
     if not score_row.explanation:
         return False
 
@@ -175,13 +106,6 @@ def _score_math_consistent(score_row: ScoreResult) -> bool:
 
 
 def _extract_signal_vectors(score_row: ScoreResult) -> dict[str, tuple[float, float]]:
-    """Extract per-signal (normalized_score, weight) pairs from explanation payload.
-
-    Output shape:
-    - {signal_name: (normalized_score, weight)}
-
-    This is the base structure used for perturbation sensitivity simulations.
-    """
     explanation = score_row.explanation or {}
     signals = explanation.get("signals")
     if not isinstance(signals, list):
@@ -206,17 +130,6 @@ def _perturbed_score(
     target_signal: str,
     delta: float,
 ) -> float:
-    """Recompute a simulated score after perturbing one signal weight.
-
-    Process:
-    - Increase/decrease one target weight by `delta` (e.g., +0.05 or -0.10).
-    - Keep other weights unchanged.
-    - Re-normalize all weights to sum to 1.
-    - Recompute weighted score.
-
-    Purpose:
-    - Estimate sensitivity: does a small weight tweak cause large ranking movement?
-    """
     if not vectors:
         return 0.0
 
@@ -243,17 +156,6 @@ def _compute_perturbation_overlap(
     top_n: int,
     deltas: list[float],
 ) -> tuple[float, list[dict[str, Any]]]:
-    """Run perturbation experiments and measure top-N stability.
-
-    For each signal and each delta:
-    - recompute perturbed scores
-    - rerank listings
-    - compare perturbed top-N vs baseline top-N using Jaccard
-
-    Returns:
-    - minimum top-N overlap across all experiments (worst-case stability)
-    - detailed per-experiment metrics for audit/debugging
-    """
     if not rows:
         return 0.0, []
 
@@ -300,29 +202,151 @@ def _compute_perturbation_overlap(
     return round(min(overlaps), 4), experiments
 
 
-# continue here
+def _segment_bounds(total_count: int, start_pct: float, end_pct: float) -> tuple[int, int]:
+    if total_count <= 0:
+        return (0, 0)
+    start = int(total_count * _clamp(start_pct))
+    end = int(total_count * _clamp(end_pct))
+    start = max(0, min(start, total_count))
+    end = max(0, min(end, total_count))
+    if end <= start:
+        end = min(total_count, start + 1)
+    return (start, end)
+
+
+def _segment_identities(
+    rows: list[ScoreResult],
+    identity_map: dict[int, str],
+    start_idx: int,
+    end_idx: int,
+) -> list[str]:
+    segment_rows = rows[start_idx:end_idx]
+    return [identity_map.get(row.listing_id, f"internal-{row.listing_id}") for row in segment_rows]
+
+
+def _global_rank_map(identities: list[str]) -> dict[str, int]:
+    return {identity: idx + 1 for idx, identity in enumerate(identities)}
+
+
+def _percentile(values: list[float], pct: float) -> float:
+    if not values:
+        return 0.0
+    ordered = sorted(values)
+    idx = int(round((len(ordered) - 1) * _clamp(pct)))
+    return float(ordered[idx])
+
+
+def _rank_displacement_metrics(
+    current_ids: list[str],
+    reference_ids: list[str],
+    current_global_rank: dict[str, int],
+    reference_global_rank: dict[str, int],
+) -> dict[str, float]:
+    shared_ids = sorted(set(current_ids) & set(reference_ids))
+    if not shared_ids:
+        return {
+            "intersection_count": 0.0,
+            "median_abs_rank_shift": 0.0,
+            "p90_rank_shift": 0.0,
+        }
+
+    shifts = [
+        float(abs(current_global_rank[listing_id] - reference_global_rank[listing_id]))
+        for listing_id in shared_ids
+    ]
+    return {
+        "intersection_count": float(len(shared_ids)),
+        "median_abs_rank_shift": round(float(median(shifts)), 4),
+        "p90_rank_shift": round(_percentile(shifts, 0.90), 4),
+    }
+
+
+def _evaluate_segment_stability(
+    *,
+    segment_name: str,
+    current_ids: list[str],
+    reference_ids: list[str],
+    current_global_rank: dict[str, int],
+    reference_global_rank: dict[str, int],
+    thresholds: dict[str, Any],
+    severity: str,
+) -> dict[str, Any]:
+    jaccard_overlap = _compute_jaccard(current_ids, reference_ids)
+    rank_correlation = _spearman_rank_correlation(current_ids, reference_ids)
+    displacement = _rank_displacement_metrics(
+        current_ids,
+        reference_ids,
+        current_global_rank,
+        reference_global_rank,
+    )
+    intersection_count = int(displacement["intersection_count"])
+
+    failed_checks: list[str] = []
+    if severity == "fail":
+        jaccard_min = thresholds.get("jaccard_min")
+        if jaccard_min is not None and jaccard_overlap < float(jaccard_min):
+            failed_checks.append(f"jaccard_below_min:{jaccard_overlap}<{jaccard_min}")
+        rank_corr_min = thresholds.get("rank_correlation_min")
+        if rank_corr_min is not None and rank_correlation < float(rank_corr_min):
+            failed_checks.append(f"rank_corr_below_min:{rank_correlation}<{rank_corr_min}")
+        median_max = thresholds.get("median_abs_rank_shift_max")
+        if median_max is not None and displacement["median_abs_rank_shift"] > float(median_max):
+            failed_checks.append(
+                "median_abs_rank_shift_above_max:"
+                f"{displacement['median_abs_rank_shift']}>{median_max}"
+            )
+        p90_max = thresholds.get("p90_rank_shift_max")
+        if p90_max is not None and displacement["p90_rank_shift"] > float(p90_max):
+            failed_checks.append(
+                f"p90_rank_shift_above_max:{displacement['p90_rank_shift']}>{p90_max}"
+            )
+    else:
+        jaccard_warn_min = thresholds.get("jaccard_warn_min")
+        if jaccard_warn_min is not None and jaccard_overlap < float(jaccard_warn_min):
+            failed_checks.append(f"jaccard_below_warn_min:{jaccard_overlap}<{jaccard_warn_min}")
+        rank_corr_warn_min = thresholds.get("rank_correlation_warn_min")
+        if rank_corr_warn_min is not None and rank_correlation < float(rank_corr_warn_min):
+            failed_checks.append(
+                f"rank_corr_below_warn_min:{rank_correlation}<{rank_corr_warn_min}"
+            )
+        median_warn_max = thresholds.get("median_abs_rank_shift_warn_max")
+        if median_warn_max is not None and displacement["median_abs_rank_shift"] > float(
+            median_warn_max
+        ):
+            failed_checks.append(
+                "median_abs_rank_shift_above_warn_max:"
+                f"{displacement['median_abs_rank_shift']}>{median_warn_max}"
+            )
+        p90_warn_max = thresholds.get("p90_rank_shift_warn_max")
+        if p90_warn_max is not None and displacement["p90_rank_shift"] > float(p90_warn_max):
+            failed_checks.append(
+                f"p90_rank_shift_above_warn_max:{displacement['p90_rank_shift']}>{p90_warn_max}"
+            )
+
+    status = "pass" if not failed_checks else severity
+    return {
+        "segment_name": segment_name,
+        "status": status,
+        "metrics": {
+            "sample_size_current": len(current_ids),
+            "sample_size_reference": len(reference_ids),
+            "intersection_count": intersection_count,
+            "jaccard_overlap": jaccard_overlap,
+            "rank_correlation": rank_correlation,
+            "median_abs_rank_shift": displacement["median_abs_rank_shift"],
+            "p90_rank_shift": displacement["p90_rank_shift"],
+        },
+        "thresholds": thresholds,
+        "violation_details": {"failed_checks": failed_checks},
+    }
+
+
 def run_scoring_evaluation(
     db: Session,
     job_id: int,
     reference_job_id: int | None = None,
     top_n: int = 20,
 ) -> dict[str, Any]:
-    """Evaluate one scoring run and produce a release decision artifact.
-
-    Gate families:
-    - Data quality: valid/duplicate/null-rate thresholds.
-    - Scoring sanity: score bounds, impossible top ranks, signal dominance.
-    - Explainability: payload presence and math consistency.
-    - Stability: top-N overlap, rank correlation, perturbation robustness.
-
-    Decision logic:
-    - `revert` if any critical gate fails.
-    - `experimental` if sample is too small or warnings remain.
-    - `promote` only when all required gates pass.
-
-    Side effect:
-    - Writes `output/evaluations/<run_id>/scoring_evaluation_<timestamp>.json`.
-    """
     config = _load_scoring_config()
     thresholds = config.get("evaluation_thresholds", {})
     data_thresholds = thresholds.get("data_quality", {})
@@ -335,8 +359,9 @@ def run_scoring_evaluation(
         raise ValueError(f"No scored listings found for job: {job_id}")
 
     model_version = current_rows[0].model_version
-    sampled_rows = current_rows[:top_n]
     current_identity_map = _ranking_identity_map(db, job_id)
+    top_n_effective = int(top_n) if top_n > 0 else 20
+    sampled_rows = current_rows[:top_n_effective]
 
     validation = run_dataset_validation(db, job_id)
     valid_rate_min = float(data_thresholds.get("valid_rate_min", 0.85))
@@ -490,11 +515,12 @@ def run_scoring_evaluation(
             "score_math_mismatches_top_n": len(score_math_mismatch_rows),
         },
         "thresholds": {
-            "required_top_n": top_n,
+            "required_top_n": top_n_effective,
         },
     }
 
     stability_gate: dict[str, Any]
+    stability_warning_keys: list[str] = []
     if reference_job_id is None:
         stability_gate = {
             "status": "warn",
@@ -505,52 +531,156 @@ def run_scoring_evaluation(
     else:
         reference_rows = _sorted_scores(db, reference_job_id)
         reference_identity_map = _ranking_identity_map(db, reference_job_id)
-        reference_top_ids = [
-            reference_identity_map.get(row.listing_id, f"internal-{row.listing_id}")
-            for row in reference_rows[:top_n]
-        ]
-        current_top_ids = [
-            current_identity_map.get(row.listing_id, f"internal-{row.listing_id}")
-            for row in current_rows[:top_n]
-        ]
-        top_n_jaccard = _compute_jaccard(current_top_ids, reference_top_ids)
-        rank_corr = _spearman_rank_correlation(
-            [
-                current_identity_map.get(row.listing_id, f"internal-{row.listing_id}")
-                for row in current_rows
-            ],
-            [
-                reference_identity_map.get(row.listing_id, f"internal-{row.listing_id}")
-                for row in reference_rows
-            ],
+        current_global_ids = _segment_identities(
+            current_rows, current_identity_map, 0, len(current_rows)
         )
+        reference_global_ids = _segment_identities(
+            reference_rows, reference_identity_map, 0, len(reference_rows)
+        )
+        current_global_rank = _global_rank_map(current_global_ids)
+        reference_global_rank = _global_rank_map(reference_global_ids)
+
+        segments_cfg = stability_thresholds.get("segments", {})
+        top_cfg = dict(segments_cfg.get("top_band", {}))
+        middle_cfg = dict(segments_cfg.get("middle_band", {}))
+        bottom_cfg = dict(segments_cfg.get("bottom_band", {}))
+        full_dataset_cfg = dict(stability_thresholds.get("full_dataset", {}))
+
+        top_n_cfg = int(top_cfg.get("top_n", top_n_effective))
+        top_count = top_n_effective if top_n > 0 else top_n_cfg
+        top_current_ids = _segment_identities(current_rows, current_identity_map, 0, top_count)
+        top_reference_ids = _segment_identities(
+            reference_rows, reference_identity_map, 0, top_count
+        )
+        top_band = _evaluate_segment_stability(
+            segment_name="top_band",
+            current_ids=top_current_ids,
+            reference_ids=top_reference_ids,
+            current_global_rank=current_global_rank,
+            reference_global_rank=reference_global_rank,
+            thresholds=top_cfg,
+            severity="fail",
+        )
+
         perturbation_deltas = [-0.10, -0.05, 0.05, 0.10]
         perturbation_overlap_min, perturbation_details = _compute_perturbation_overlap(
-            current_rows, top_n=top_n, deltas=perturbation_deltas
+            current_rows, top_n=top_count, deltas=perturbation_deltas
         )
+        top_band["metrics"]["perturbation_overlap_min"] = perturbation_overlap_min
+        top_band["metrics"]["perturbation_checks"] = perturbation_details
+        perturbation_threshold = float(top_cfg.get("perturbation_overlap_min", 0.60))
+        top_band["thresholds"]["perturbation_overlap_min"] = perturbation_threshold
+        if perturbation_overlap_min < perturbation_threshold:
+            top_band["status"] = "fail"
+            top_band["violation_details"]["failed_checks"].append(
+                f"perturbation_overlap_below_min:{perturbation_overlap_min}<{perturbation_threshold}"
+            )
 
-        top_n_jaccard_min = float(stability_thresholds.get("top20_jaccard_min", 0.70))
-        rank_correlation_min = float(stability_thresholds.get("rank_correlation_min", 0.80))
-        perturbation_overlap_min_threshold = float(
-            stability_thresholds.get("perturbation_overlap_min", 0.60)
+        middle_start_pct = float(middle_cfg.get("start_pct", 0.45))
+        middle_end_pct = float(middle_cfg.get("end_pct", 0.60))
+        middle_current_bounds = _segment_bounds(len(current_rows), middle_start_pct, middle_end_pct)
+        middle_reference_bounds = _segment_bounds(
+            len(reference_rows), middle_start_pct, middle_end_pct
         )
-        stability_pass = (
-            top_n_jaccard >= top_n_jaccard_min
-            and rank_corr >= rank_correlation_min
-            and perturbation_overlap_min >= perturbation_overlap_min_threshold
+        middle_band = _evaluate_segment_stability(
+            segment_name="middle_band",
+            current_ids=_segment_identities(
+                current_rows,
+                current_identity_map,
+                middle_current_bounds[0],
+                middle_current_bounds[1],
+            ),
+            reference_ids=_segment_identities(
+                reference_rows,
+                reference_identity_map,
+                middle_reference_bounds[0],
+                middle_reference_bounds[1],
+            ),
+            current_global_rank=current_global_rank,
+            reference_global_rank=reference_global_rank,
+            thresholds=middle_cfg,
+            severity="warn",
+        )
+
+        bottom_start_pct = float(bottom_cfg.get("start_pct", 0.85))
+        bottom_end_pct = float(bottom_cfg.get("end_pct", 1.00))
+        bottom_current_bounds = _segment_bounds(len(current_rows), bottom_start_pct, bottom_end_pct)
+        bottom_reference_bounds = _segment_bounds(
+            len(reference_rows), bottom_start_pct, bottom_end_pct
         )
+        bottom_band = _evaluate_segment_stability(
+            segment_name="bottom_band",
+            current_ids=_segment_identities(
+                current_rows,
+                current_identity_map,
+                bottom_current_bounds[0],
+                bottom_current_bounds[1],
+            ),
+            reference_ids=_segment_identities(
+                reference_rows,
+                reference_identity_map,
+                bottom_reference_bounds[0],
+                bottom_reference_bounds[1],
+            ),
+            current_global_rank=current_global_rank,
+            reference_global_rank=reference_global_rank,
+            thresholds=bottom_cfg,
+            severity="warn",
+        )
+
+        full_displacement = _rank_displacement_metrics(
+            current_global_ids,
+            reference_global_ids,
+            current_global_rank,
+            reference_global_rank,
+        )
+        full_warn_checks: list[str] = []
+        full_median_warn_max = float(full_dataset_cfg.get("median_abs_rank_shift_warn_max", 200))
+        full_p90_warn_max = float(full_dataset_cfg.get("p90_rank_shift_warn_max", 1000))
+        if full_displacement["median_abs_rank_shift"] > full_median_warn_max:
+            full_warn_checks.append(
+                "median_abs_rank_shift_above_warn_max:"
+                f"{full_displacement['median_abs_rank_shift']}>{full_median_warn_max}"
+            )
+        if full_displacement["p90_rank_shift"] > full_p90_warn_max:
+            full_warn_checks.append(
+                f"p90_rank_shift_above_warn_max:{full_displacement['p90_rank_shift']}>{full_p90_warn_max}"
+            )
+        full_dataset_status = "warn" if full_warn_checks else "pass"
+        full_dataset_metrics = {
+            "intersection_count": int(full_displacement["intersection_count"]),
+            "median_abs_rank_shift": full_displacement["median_abs_rank_shift"],
+            "p90_rank_shift": full_displacement["p90_rank_shift"],
+            "thresholds": {
+                "median_abs_rank_shift_warn_max": full_median_warn_max,
+                "p90_rank_shift_warn_max": full_p90_warn_max,
+            },
+            "status": full_dataset_status,
+            "violation_details": {"failed_checks": full_warn_checks},
+        }
+
+        if middle_band["status"] == "warn":
+            stability_warning_keys.append("stability_middle_band")
+        if bottom_band["status"] == "warn":
+            stability_warning_keys.append("stability_bottom_band")
+        if full_dataset_status == "warn":
+            stability_warning_keys.append("stability_full_dataset")
+
+        has_top_fail = top_band["status"] == "fail"
+        has_warnings = bool(stability_warning_keys)
         stability_gate = {
-            "status": "pass" if stability_pass else "fail",
+            "status": "fail" if has_top_fail else ("warn" if has_warnings else "pass"),
             "metrics": {
-                "top_n_jaccard": top_n_jaccard,
-                "rank_correlation": rank_corr,
-                "perturbation_overlap_min": perturbation_overlap_min,
-                "perturbation_checks": perturbation_details,
+                "segments": {
+                    "top_band": top_band,
+                    "middle_band": middle_band,
+                    "bottom_band": bottom_band,
+                },
+                "full_dataset": full_dataset_metrics,
             },
             "thresholds": {
-                "top_n_jaccard_min": top_n_jaccard_min,
-                "rank_correlation_min": rank_correlation_min,
-                "perturbation_overlap_min": perturbation_overlap_min_threshold,
+                "segments": segments_cfg,
+                "full_dataset": full_dataset_cfg,
             },
         }
 
@@ -567,6 +697,7 @@ def run_scoring_evaluation(
             failed_gate_keys.append(gate_key)
         elif gate_payload["status"] == "warn":
             warning_gate_keys.append(gate_key)
+    warning_gate_keys.extend(stability_warning_keys)
 
     decision: str
     decision_reasons: list[str] = []
@@ -584,7 +715,6 @@ def run_scoring_evaluation(
         decision = "promote"
         decision_reasons.append("All required gates passed.")
 
-    # continue here
     recommended_next_actions: list[str] = []
     if decision == "revert":
         recommended_next_actions.append("Rollback scoring profile changes and review failed gates.")
@@ -608,7 +738,7 @@ def run_scoring_evaluation(
         "model_version": model_version,
         "timestamp_utc": evaluation_time_utc.isoformat(),
         "sample_size": len(current_rows),
-        "top_n": top_n,
+        "top_n": top_n_effective,
         "gates": {
             "data_quality": data_quality_gate,
             "scoring_sanity": scoring_sanity_gate,
diff --git a/backend/tests/test_scoring_algorithms.py b/backend/tests/test_scoring_algorithms.py
index 1ca76d6..01c72df 100644
--- a/backend/tests/test_scoring_algorithms.py
+++ b/backend/tests/test_scoring_algorithms.py
@@ -58,7 +58,9 @@ def test_load_scoring_config_returns_defaults_without_config_file(
         "province",
         "global",
     ]
-    assert config["evaluation_thresholds"]["stability"]["top20_jaccard_min"] == 0.7
+    assert (
+        config["evaluation_thresholds"]["stability"]["segments"]["top_band"]["jaccard_min"] == 0.7
+    )
 
 
 def test_load_scoring_config_deep_merges_advanced_v2_and_threshold_overrides(
@@ -78,7 +80,9 @@ def test_load_scoring_config_deep_merges_advanced_v2_and_threshold_overrides(
                 "    transaction_cost_pct: 0.1",
                 "evaluation_thresholds:",
                 "  stability:",
-                "    rank_correlation_min: 0.85",
+                "    segments:",
+                "      top_band:",
+                "        rank_correlation_min: 0.85",
             ]
         ),
         encoding="utf-8",
@@ -97,8 +101,13 @@ def test_load_scoring_config_deep_merges_advanced_v2_and_threshold_overrides(
     ]
     assert config["advanced_v2"]["roi"]["transaction_cost_pct"] == 0.1
     assert config["advanced_v2"]["roi"]["maintenance_pct"] == 0.04
-    assert config["evaluation_thresholds"]["stability"]["rank_correlation_min"] == 0.85
-    assert config["evaluation_thresholds"]["stability"]["top20_jaccard_min"] == 0.7
+    assert (
+        config["evaluation_thresholds"]["stability"]["segments"]["top_band"]["rank_correlation_min"]
+        == 0.85
+    )
+    assert (
+        config["evaluation_thresholds"]["stability"]["segments"]["top_band"]["jaccard_min"] == 0.7
+    )
 
 
 def test_signal_neutral_defaults_when_inputs_missing() -> None:
@@ -354,8 +363,12 @@ def test_run_scoring_job_uses_advanced_v2_when_flags_enabled(
             },
             "evaluation_thresholds": {
                 "stability": {
-                    "top20_jaccard_min": 0.7,
-                    "rank_correlation_min": 0.8,
+                    "segments": {
+                        "top_band": {
+                            "jaccard_min": 0.7,
+                            "rank_correlation_min": 0.8,
+                        }
+                    },
                 }
             },
         },
diff --git a/backend/tests/test_scoring_v2_evaluation.py b/backend/tests/test_scoring_v2_evaluation.py
index cf20f6d..1b61c4d 100644
--- a/backend/tests/test_scoring_v2_evaluation.py
+++ b/backend/tests/test_scoring_v2_evaluation.py
@@ -1,9 +1,12 @@
 from __future__ import annotations
 
+import copy
+
 from app.models.ingestion_job import IngestionJob
 from app.models.listing import Listing
 from app.models.raw_listing import RawListing
 from app.models.score_result import ScoreResult
+from app.services import scoring_evaluation
 from app.services.scoring_evaluation import run_scoring_evaluation
 from sqlalchemy.orm import Session
 
@@ -120,6 +123,17 @@ def test_scoring_evaluation_promotes_when_all_gates_pass(db_session: Session) ->
     assert report["failed_gates"] == []
     assert report["warning_gates"] == []
     assert report["gates"]["stability"]["status"] == "pass"
+    stability = report["gates"]["stability"]["metrics"]
+    assert "segments" in stability
+    assert "top_band" in stability["segments"]
+    assert "middle_band" in stability["segments"]
+    assert "bottom_band" in stability["segments"]
+    assert "full_dataset" in stability
+    assert "thresholds" in stability["segments"]["top_band"]
+    assert "thresholds" in stability["segments"]["middle_band"]
+    assert "thresholds" in stability["segments"]["bottom_band"]
+    assert "median_abs_rank_shift" in stability["segments"]["top_band"]["metrics"]
+    assert "p90_rank_shift" in stability["segments"]["top_band"]["metrics"]
 
 
 def test_scoring_evaluation_reverts_when_stability_fails(db_session: Session) -> None:
@@ -152,3 +166,84 @@ def test_scoring_evaluation_marks_experimental_when_sample_too_small(
 
     assert report["decision"] == "experimental"
     assert "sample_size" in report["warning_gates"]
+
+
+def test_top_band_displacement_threshold_can_fail(db_session: Session, monkeypatch) -> None:
+    current_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True)
+    reference_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True)
+
+    base_config = scoring_evaluation._load_scoring_config()
+    strict_config = copy.deepcopy(base_config)
+    strict_config["evaluation_thresholds"]["stability"]["segments"]["top_band"][
+        "median_abs_rank_shift_max"
+    ] = 0
+    strict_config["evaluation_thresholds"]["stability"]["segments"]["top_band"][
+        "p90_rank_shift_max"
+    ] = 0
+    monkeypatch.setattr(scoring_evaluation, "_load_scoring_config", lambda: strict_config)
+
+    # Reversing top 20 introduces displacement while keeping enough overlap.
+    top_rows = (
+        db_session.query(ScoreResult)
+        .filter(ScoreResult.job_id == current_job.id)
+        .order_by(ScoreResult.score.desc(), ScoreResult.listing_id.asc())
+        .limit(20)
+        .all()
+    )
+    for idx, row in enumerate(top_rows):
+        row.score = 99.99 - (19 - idx)
+    db_session.commit()
+
+    report = run_scoring_evaluation(
+        db_session,
+        job_id=current_job.id,
+        reference_job_id=reference_job.id,
+        top_n=20,
+    )
+    assert report["decision"] == "revert"
+    assert "stability" in report["failed_gates"]
+
+
+def test_full_dataset_displacement_warning_is_context_only(
+    db_session: Session, monkeypatch
+) -> None:
+    current_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True)
+    reference_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True)
+
+    base_config = scoring_evaluation._load_scoring_config()
+    warn_config = copy.deepcopy(base_config)
+    warn_config["evaluation_thresholds"]["stability"]["full_dataset"][
+        "median_abs_rank_shift_warn_max"
+    ] = -1
+    warn_config["evaluation_thresholds"]["stability"]["full_dataset"][
+        "p90_rank_shift_warn_max"
+    ] = -1
+    # Keep top thresholds permissive so warning is context-only.
+    warn_config["evaluation_thresholds"]["stability"]["segments"]["top_band"][
+        "median_abs_rank_shift_max"
+    ] = 10_000
+    warn_config["evaluation_thresholds"]["stability"]["segments"]["top_band"][
+        "p90_rank_shift_max"
+    ] = 10_000
+    monkeypatch.setattr(scoring_evaluation, "_load_scoring_config", lambda: warn_config)
+
+    # Introduce non-top ordering movement so full-dataset displacement becomes non-zero.
+    ordered_rows = (
+        db_session.query(ScoreResult)
+        .filter(ScoreResult.job_id == current_job.id)
+        .order_by(ScoreResult.score.desc(), ScoreResult.listing_id.asc())
+        .all()
+    )
+    ordered_rows[60].score, ordered_rows[80].score = ordered_rows[80].score, ordered_rows[60].score
+    db_session.commit()
+
+    report = run_scoring_evaluation(
+        db_session,
+        job_id=current_job.id,
+        reference_job_id=reference_job.id,
+        top_n=20,
+    )
+    assert report["decision"] == "experimental"
+    assert "stability_full_dataset" in report["warning_gates"]
+    assert report["gates"]["stability"]["metrics"]["full_dataset"]["status"] == "warn"
+    assert "stability" not in report["failed_gates"]
diff --git a/config/scoring.yaml b/config/scoring.yaml
index 74f688b..c848bf7 100644
--- a/config/scoring.yaml
+++ b/config/scoring.yaml
@@ -57,8 +57,31 @@ evaluation_thresholds:
     signal_dominance_cap: 0.70
     high_score_cutoff: 80.0
   stability:
-    top20_jaccard_min: 0.7
-    rank_correlation_min: 0.8
-    perturbation_overlap_min: 0.6
+    segments:
+      top_band:
+        mode: top_n
+        top_n: 20
+        jaccard_min: 0.70
+        rank_correlation_min: 0.80
+        perturbation_overlap_min: 0.60
+        median_abs_rank_shift_max: 30
+        p90_rank_shift_max: 120
+      middle_band:
+        start_pct: 0.45
+        end_pct: 0.60
+        jaccard_warn_min: 0.30
+        rank_correlation_warn_min: 0.50
+        median_abs_rank_shift_warn_max: 250
+        p90_rank_shift_warn_max: 1200
+      bottom_band:
+        start_pct: 0.85
+        end_pct: 1.00
+        jaccard_warn_min: 0.25
+        rank_correlation_warn_min: 0.40
+        median_abs_rank_shift_warn_max: 300
+        p90_rank_shift_warn_max: 1500
+    full_dataset:
+      median_abs_rank_shift_warn_max: 200
+      p90_rank_shift_warn_max: 1000
   decision:
     minimum_sample_for_promote: 100

From c81c4260fdbf1ee0afc05165540a763787714356 Mon Sep 17 00:00:00 2001
From: William <vimscientist69@gmail.com>
Date: Wed, 22 Apr 2026 14:28:16 +0200
Subject: [PATCH 02/16] feat: implement segment-based stability thresholds in
 scoring evaluation

- Introduced segment-based thresholds for scoring evaluation, defining metrics for `top_band`, `middle_band`, and `bottom_band` to enhance diagnostic capabilities.
- Updated scoring configuration to reflect new stability metrics, including Jaccard and rank correlation thresholds for each segment.
- Enhanced tests to validate the integration of segment thresholds and ensure correct evaluation reporting.
- Improved documentation to clarify the purpose and structure of the new segment-based stability diagnostics.

From 6d7597df8f3f0fb4c8edfaee0bd39e51631cbcb5 Mon Sep 17 00:00:00 2001
From: William <vimscientist69@gmail.com>
Date: Wed, 22 Apr 2026 15:24:17 +0200
Subject: [PATCH 03/16] feat: update scoring evaluation metrics to include
 percentage-based thresholds

- Added percentage-based metrics for median absolute rank shift and p90 rank shift to enhance evaluation sensitivity.
- Updated scoring configuration and evaluation logic to incorporate new percentage thresholds for segment stability checks.
- Adjusted tests to validate the integration of percentage-based metrics and ensure correct evaluation reporting.
- Kept the absolute `median_abs_rank_shift` and `p90_rank_shift` values in the reported metrics alongside the new `*_pct` values; threshold checks now compare only the normalized `*_pct` values.
---
 backend/app/services/scoring.py             | 16 ++---
 backend/app/services/scoring_evaluation.py  | 67 +++++++++++++--------
 backend/tests/test_scoring_v2_evaluation.py | 14 +++--
 config/scoring.yaml                         | 16 ++---
 4 files changed, 67 insertions(+), 46 deletions(-)

diff --git a/backend/app/services/scoring.py b/backend/app/services/scoring.py
index eb13ecf..591cd72 100644
--- a/backend/app/services/scoring.py
+++ b/backend/app/services/scoring.py
@@ -80,29 +80,29 @@
                     "jaccard_min": 0.7,
                     "rank_correlation_min": 0.8,
                     "perturbation_overlap_min": 0.6,
-                    "median_abs_rank_shift_max": 30,
-                    "p90_rank_shift_max": 120,
+                    "median_abs_rank_shift_pct_max": 0.15,
+                    "p90_rank_shift_pct_max": 0.60,
                 },
                 "middle_band": {
                     "start_pct": 0.45,
                     "end_pct": 0.60,
                     "jaccard_warn_min": 0.3,
                     "rank_correlation_warn_min": 0.5,
-                    "median_abs_rank_shift_warn_max": 250,
-                    "p90_rank_shift_warn_max": 1200,
+                    "median_abs_rank_shift_pct_warn_max": 0.45,
+                    "p90_rank_shift_pct_warn_max": 0.85,
                 },
                 "bottom_band": {
                     "start_pct": 0.85,
                     "end_pct": 1.00,
                     "jaccard_warn_min": 0.25,
                     "rank_correlation_warn_min": 0.4,
-                    "median_abs_rank_shift_warn_max": 300,
-                    "p90_rank_shift_warn_max": 1500,
+                    "median_abs_rank_shift_pct_warn_max": 0.50,
+                    "p90_rank_shift_pct_warn_max": 0.90,
                 },
             },
             "full_dataset": {
-                "median_abs_rank_shift_warn_max": 200,
-                "p90_rank_shift_warn_max": 1000,
+                "median_abs_rank_shift_pct_warn_max": 0.35,
+                "p90_rank_shift_pct_warn_max": 0.80,
             },
         },
         "decision": {
diff --git a/backend/app/services/scoring_evaluation.py b/backend/app/services/scoring_evaluation.py
index f864931..18ad17a 100644
--- a/backend/app/services/scoring_evaluation.py
+++ b/backend/app/services/scoring_evaluation.py
@@ -248,16 +248,23 @@ def _rank_displacement_metrics(
             "intersection_count": 0.0,
             "median_abs_rank_shift": 0.0,
             "p90_rank_shift": 0.0,
+            "median_abs_rank_shift_pct": 0.0,
+            "p90_rank_shift_pct": 0.0,
         }
 
     shifts = [
         float(abs(current_global_rank[listing_id] - reference_global_rank[listing_id]))
         for listing_id in shared_ids
     ]
+    rank_span = max(len(current_global_rank), len(reference_global_rank)) - 1
+    rank_span = max(rank_span, 1)
+    shift_pcts = [shift / rank_span for shift in shifts]
     return {
         "intersection_count": float(len(shared_ids)),
         "median_abs_rank_shift": round(float(median(shifts)), 4),
         "p90_rank_shift": round(_percentile(shifts, 0.90), 4),
+        "median_abs_rank_shift_pct": round(float(median(shift_pcts)), 6),
+        "p90_rank_shift_pct": round(_percentile(shift_pcts, 0.90), 6),
     }
 
 
@@ -289,16 +296,18 @@ def _evaluate_segment_stability(
         rank_corr_min = thresholds.get("rank_correlation_min")
         if rank_corr_min is not None and rank_correlation < float(rank_corr_min):
             failed_checks.append(f"rank_corr_below_min:{rank_correlation}<{rank_corr_min}")
-        median_max = thresholds.get("median_abs_rank_shift_max")
-        if median_max is not None and displacement["median_abs_rank_shift"] > float(median_max):
+        median_pct_max = thresholds.get("median_abs_rank_shift_pct_max")
+        if median_pct_max is not None and displacement["median_abs_rank_shift_pct"] > float(
+            median_pct_max
+        ):
             failed_checks.append(
-                "median_abs_rank_shift_above_max:"
-                f"{displacement['median_abs_rank_shift']}>{median_max}"
+                "median_abs_rank_shift_pct_above_max:"
+                f"{displacement['median_abs_rank_shift_pct']}>{median_pct_max}"
             )
-        p90_max = thresholds.get("p90_rank_shift_max")
-        if p90_max is not None and displacement["p90_rank_shift"] > float(p90_max):
+        p90_pct_max = thresholds.get("p90_rank_shift_pct_max")
+        if p90_pct_max is not None and displacement["p90_rank_shift_pct"] > float(p90_pct_max):
             failed_checks.append(
-                f"p90_rank_shift_above_max:{displacement['p90_rank_shift']}>{p90_max}"
+                f"p90_rank_shift_pct_above_max:{displacement['p90_rank_shift_pct']}>{p90_pct_max}"
             )
     else:
         jaccard_warn_min = thresholds.get("jaccard_warn_min")
@@ -309,18 +318,21 @@ def _evaluate_segment_stability(
             failed_checks.append(
                 f"rank_corr_below_warn_min:{rank_correlation}<{rank_corr_warn_min}"
             )
-        median_warn_max = thresholds.get("median_abs_rank_shift_warn_max")
-        if median_warn_max is not None and displacement["median_abs_rank_shift"] > float(
-            median_warn_max
+        median_pct_warn_max = thresholds.get("median_abs_rank_shift_pct_warn_max")
+        if median_pct_warn_max is not None and displacement["median_abs_rank_shift_pct"] > float(
+            median_pct_warn_max
         ):
             failed_checks.append(
-                "median_abs_rank_shift_above_warn_max:"
-                f"{displacement['median_abs_rank_shift']}>{median_warn_max}"
+                "median_abs_rank_shift_pct_above_warn_max:"
+                f"{displacement['median_abs_rank_shift_pct']}>{median_pct_warn_max}"
             )
-        p90_warn_max = thresholds.get("p90_rank_shift_warn_max")
-        if p90_warn_max is not None and displacement["p90_rank_shift"] > float(p90_warn_max):
+        p90_pct_warn_max = thresholds.get("p90_rank_shift_pct_warn_max")
+        if p90_pct_warn_max is not None and displacement["p90_rank_shift_pct"] > float(
+            p90_pct_warn_max
+        ):
             failed_checks.append(
-                f"p90_rank_shift_above_warn_max:{displacement['p90_rank_shift']}>{p90_warn_max}"
+                "p90_rank_shift_pct_above_warn_max:"
+                f"{displacement['p90_rank_shift_pct']}>{p90_pct_warn_max}"
             )
 
     status = "pass" if not failed_checks else severity
@@ -335,6 +347,8 @@ def _evaluate_segment_stability(
             "rank_correlation": rank_correlation,
             "median_abs_rank_shift": displacement["median_abs_rank_shift"],
             "p90_rank_shift": displacement["p90_rank_shift"],
+            "median_abs_rank_shift_pct": displacement["median_abs_rank_shift_pct"],
+            "p90_rank_shift_pct": displacement["p90_rank_shift_pct"],
         },
         "thresholds": thresholds,
         "violation_details": {"failed_checks": failed_checks},
@@ -635,25 +649,30 @@ def run_scoring_evaluation(
             reference_global_rank,
         )
         full_warn_checks: list[str] = []
-        full_median_warn_max = float(full_dataset_cfg.get("median_abs_rank_shift_warn_max", 200))
-        full_p90_warn_max = float(full_dataset_cfg.get("p90_rank_shift_warn_max", 1000))
-        if full_displacement["median_abs_rank_shift"] > full_median_warn_max:
+        full_median_pct_warn_max = float(
+            full_dataset_cfg.get("median_abs_rank_shift_pct_warn_max", 0.35)
+        )
+        full_p90_pct_warn_max = float(full_dataset_cfg.get("p90_rank_shift_pct_warn_max", 0.80))
+        if full_displacement["median_abs_rank_shift_pct"] > full_median_pct_warn_max:
             full_warn_checks.append(
-                "median_abs_rank_shift_above_warn_max:"
-                f"{full_displacement['median_abs_rank_shift']}>{full_median_warn_max}"
+                "median_abs_rank_shift_pct_above_warn_max:"
+                f"{full_displacement['median_abs_rank_shift_pct']}>{full_median_pct_warn_max}"
             )
-        if full_displacement["p90_rank_shift"] > full_p90_warn_max:
+        if full_displacement["p90_rank_shift_pct"] > full_p90_pct_warn_max:
             full_warn_checks.append(
-                f"p90_rank_shift_above_warn_max:{full_displacement['p90_rank_shift']}>{full_p90_warn_max}"
+                "p90_rank_shift_pct_above_warn_max:"
+                f"{full_displacement['p90_rank_shift_pct']}>{full_p90_pct_warn_max}"
             )
         full_dataset_status = "warn" if full_warn_checks else "pass"
         full_dataset_metrics = {
             "intersection_count": int(full_displacement["intersection_count"]),
             "median_abs_rank_shift": full_displacement["median_abs_rank_shift"],
             "p90_rank_shift": full_displacement["p90_rank_shift"],
+            "median_abs_rank_shift_pct": full_displacement["median_abs_rank_shift_pct"],
+            "p90_rank_shift_pct": full_displacement["p90_rank_shift_pct"],
             "thresholds": {
-                "median_abs_rank_shift_warn_max": full_median_warn_max,
-                "p90_rank_shift_warn_max": full_p90_warn_max,
+                "median_abs_rank_shift_pct_warn_max": full_median_pct_warn_max,
+                "p90_rank_shift_pct_warn_max": full_p90_pct_warn_max,
             },
             "status": full_dataset_status,
             "violation_details": {"failed_checks": full_warn_checks},
diff --git a/backend/tests/test_scoring_v2_evaluation.py b/backend/tests/test_scoring_v2_evaluation.py
index 1b61c4d..c5a43c1 100644
--- a/backend/tests/test_scoring_v2_evaluation.py
+++ b/backend/tests/test_scoring_v2_evaluation.py
@@ -168,6 +168,7 @@ def test_scoring_evaluation_marks_experimental_when_sample_too_small(
     assert "sample_size" in report["warning_gates"]
 
 
+# todo: verify logic is correct
 def test_top_band_displacement_threshold_can_fail(db_session: Session, monkeypatch) -> None:
     current_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True)
     reference_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True)
@@ -175,10 +176,10 @@ def test_top_band_displacement_threshold_can_fail(db_session: Session, monkeypat
     base_config = scoring_evaluation._load_scoring_config()
     strict_config = copy.deepcopy(base_config)
     strict_config["evaluation_thresholds"]["stability"]["segments"]["top_band"][
-        "median_abs_rank_shift_max"
+        "median_abs_rank_shift_pct_max"
     ] = 0
     strict_config["evaluation_thresholds"]["stability"]["segments"]["top_band"][
-        "p90_rank_shift_max"
+        "p90_rank_shift_pct_max"
     ] = 0
     monkeypatch.setattr(scoring_evaluation, "_load_scoring_config", lambda: strict_config)
 
@@ -204,6 +205,7 @@ def test_top_band_displacement_threshold_can_fail(db_session: Session, monkeypat
     assert "stability" in report["failed_gates"]
 
 
+# todo: verify logic is correct
 def test_full_dataset_displacement_warning_is_context_only(
     db_session: Session, monkeypatch
 ) -> None:
@@ -213,17 +215,17 @@ def test_full_dataset_displacement_warning_is_context_only(
     base_config = scoring_evaluation._load_scoring_config()
     warn_config = copy.deepcopy(base_config)
     warn_config["evaluation_thresholds"]["stability"]["full_dataset"][
-        "median_abs_rank_shift_warn_max"
+        "median_abs_rank_shift_pct_warn_max"
     ] = -1
     warn_config["evaluation_thresholds"]["stability"]["full_dataset"][
-        "p90_rank_shift_warn_max"
+        "p90_rank_shift_pct_warn_max"
     ] = -1
     # Keep top thresholds permissive so warning is context-only.
     warn_config["evaluation_thresholds"]["stability"]["segments"]["top_band"][
-        "median_abs_rank_shift_max"
+        "median_abs_rank_shift_pct_max"
     ] = 10_000
     warn_config["evaluation_thresholds"]["stability"]["segments"]["top_band"][
-        "p90_rank_shift_max"
+        "p90_rank_shift_pct_max"
     ] = 10_000
     monkeypatch.setattr(scoring_evaluation, "_load_scoring_config", lambda: warn_config)
 
diff --git a/config/scoring.yaml b/config/scoring.yaml
index c848bf7..d79b76b 100644
--- a/config/scoring.yaml
+++ b/config/scoring.yaml
@@ -64,24 +64,24 @@ evaluation_thresholds:
         jaccard_min: 0.70
         rank_correlation_min: 0.80
         perturbation_overlap_min: 0.60
-        median_abs_rank_shift_max: 30
-        p90_rank_shift_max: 120
+        median_abs_rank_shift_pct_max: 0.15
+        p90_rank_shift_pct_max: 0.60
       middle_band:
         start_pct: 0.45
         end_pct: 0.60
         jaccard_warn_min: 0.30
         rank_correlation_warn_min: 0.50
-        median_abs_rank_shift_warn_max: 250
-        p90_rank_shift_warn_max: 1200
+        median_abs_rank_shift_pct_warn_max: 0.45
+        p90_rank_shift_pct_warn_max: 0.85
       bottom_band:
         start_pct: 0.85
         end_pct: 1.00
         jaccard_warn_min: 0.25
         rank_correlation_warn_min: 0.40
-        median_abs_rank_shift_warn_max: 300
-        p90_rank_shift_warn_max: 1500
+        median_abs_rank_shift_pct_warn_max: 0.50
+        p90_rank_shift_pct_warn_max: 0.90
     full_dataset:
-      median_abs_rank_shift_warn_max: 200
-      p90_rank_shift_warn_max: 1000
+      median_abs_rank_shift_pct_warn_max: 0.35
+      p90_rank_shift_pct_warn_max: 0.80
   decision:
     minimum_sample_for_promote: 100

From b487937b0b4b7fec5697c2e7825e0a61639b93c9 Mon Sep 17 00:00:00 2001
From: William <vimscientist69@gmail.com>
Date: Thu, 23 Apr 2026 11:49:54 +0200
Subject: [PATCH 04/16] feat: refine scoring evaluation metrics with segment
 overlap and displacement measures

- Updated the implementation steps in the Week 2 execution plan to cover segment overlap and rank correlation metrics for `top_band`, `middle_band`, and `bottom_band`.
- Documented the global rank displacement metrics (absolute and normalized `*_pct` forms) and the treatment of full-dataset displacement breaches as warning-level context.
- Updated the plan's descriptions of stability thresholds, gate severity mapping, and auditability (all stability metrics stored in the same evaluation artifact).
- Documentation-only change: this patch modifies `docs/week-2-execution-plan.md` and no code or configuration.
---
 docs/week-2-execution-plan.md | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/docs/week-2-execution-plan.md b/docs/week-2-execution-plan.md
index c041932..0f93f4e 100644
--- a/docs/week-2-execution-plan.md
+++ b/docs/week-2-execution-plan.md
@@ -65,16 +65,23 @@ What this means (simple):
 - Stability checks tell us whether ranking behavior is reliable enough to ship.
 
 Implementation steps:
-1. Add top-N overlap metric:
-   - compute Jaccard overlap between current top-20 and reference top-20
-   - compare against threshold (default `top20_jaccard_min`)
-2. Add rank correlation metric:
-   - compute full-list rank correlation vs reference ranking
-   - compare against threshold (default `rank_correlation_min`)
-3. Add weight perturbation sensitivity:
+1. Add segment overlap + ordering metrics:
+   - compute Jaccard overlap and rank correlation for:
+     - `top_band` (critical)
+     - `middle_band` (warning)
+     - `bottom_band` (warning)
+   - compare against config thresholds under `evaluation_thresholds.stability.segments`
+2. Add rank displacement metrics (global-rank based):
+   - compute `median_abs_rank_shift` and `p90_rank_shift` for shared listings
+   - compute normalized forms `median_abs_rank_shift_pct` and `p90_rank_shift_pct`
+   - gate on normalized (`*_pct`) thresholds so behavior scales across dataset sizes
+3. Add weight perturbation sensitivity for top band:
    - run controlled +/-5% to +/-10% weight perturbation experiments
-   - measure whether top ranking collapses or shifts beyond acceptable limits
-4. Store all stability metrics in the same evaluation artifact for auditability.
+   - measure whether top ranking collapses beyond acceptable limits
+4. Add full-dataset displacement context:
+   - compute global displacement metrics for full dataset
+   - treat full-dataset displacement breaches as warning-level context
+5. Store all stability metrics in the same evaluation artifact for auditability.
 
 ### Phase 3.3 Release Decision Output (promote/revert/experimental)
 
@@ -116,9 +123,9 @@ Create/modify these files during implementation:
    - Modify: `config/scoring.yaml`
    - Add/confirm:
      - data quality thresholds (valid/duplicate/null rate)
-     - stability thresholds (top-N overlap, rank correlation)
+     - stability thresholds (segment overlap, rank correlation, displacement)
      - sensitivity thresholds (acceptable perturbation drift)
-     - gate severity mapping (critical vs warning)
+     - gate severity mapping (critical `top_band` vs warning-level non-top/full-dataset)
 
 4. **CLI entrypoint (CLI-first workflow)**
    - Modify: `backend/app/cli.py`
@@ -134,7 +141,7 @@ Create/modify these files during implementation:
      - `backend/tests/test_scoring_service.py`
      - `backend/tests/test_scoring_v2_evaluation.py` (new)
    - Cover:
-     - metric correctness for top-N overlap and rank correlation
+     - metric correctness for segment overlap/correlation/displacement
      - perturbation sensitivity behavior for stable vs unstable cases
      - decision classification (`promote`/`revert`/`experimental`)
      - artifact shape and required fields

From f4a30cfbe58841dd1df0678bb0decf05e48df55f Mon Sep 17 00:00:00 2001
From: William <vimscientist69@gmail.com>
Date: Thu, 23 Apr 2026 12:41:58 +0200
Subject: [PATCH 05/16] feat: add new tests for rank displacement metrics and
 perturbation thresholds

- Introduced tests for rank displacement metrics, validating expected values for intersection count and rank shifts.
- Added tests for top-band perturbation thresholds, ensuring correct evaluation outcomes for both pass and fail scenarios.
- Updated existing tests to remove outdated comments and improve clarity on evaluation logic.
- Enhanced documentation to reflect the new tests and their significance in scoring evaluation.
---
 backend/tests/test_scoring_v2_evaluation.py | 102 +++++++++++++++++++-
 docs/evaluation-review-protocol.md          |  18 ++--
 docs/week2-implementation-playbook.md       |  11 ++-
 docs/week2-interface-contract.md            |   9 +-
 4 files changed, 125 insertions(+), 15 deletions(-)

diff --git a/backend/tests/test_scoring_v2_evaluation.py b/backend/tests/test_scoring_v2_evaluation.py
index c5a43c1..395258a 100644
--- a/backend/tests/test_scoring_v2_evaluation.py
+++ b/backend/tests/test_scoring_v2_evaluation.py
@@ -168,7 +168,6 @@ def test_scoring_evaluation_marks_experimental_when_sample_too_small(
     assert "sample_size" in report["warning_gates"]
 
 
-# todo: verify logic is correct
 def test_top_band_displacement_threshold_can_fail(db_session: Session, monkeypatch) -> None:
     current_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True)
     reference_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True)
@@ -204,8 +203,6 @@ def test_top_band_displacement_threshold_can_fail(db_session: Session, monkeypat
     assert report["decision"] == "revert"
     assert "stability" in report["failed_gates"]
 
-
-# todo: verify logic is correct
 def test_full_dataset_displacement_warning_is_context_only(
     db_session: Session, monkeypatch
 ) -> None:
@@ -249,3 +246,102 @@ def test_full_dataset_displacement_warning_is_context_only(
     assert "stability_full_dataset" in report["warning_gates"]
     assert report["gates"]["stability"]["metrics"]["full_dataset"]["status"] == "warn"
     assert "stability" not in report["failed_gates"]
+
+
+def test_rank_displacement_metrics_computes_expected_values() -> None:
+    current_ids = ["A", "B", "C", "D"]
+    reference_ids = ["B", "A", "C", "E"]
+    current_global_rank = {
+        "A": 1,
+        "B": 2,
+        "C": 3,
+        "D": 4,
+        "X": 5,
+    }
+    reference_global_rank = {
+        "B": 1,
+        "A": 2,
+        "C": 4,
+        "E": 3,
+        "X": 5,
+    }
+
+    metrics = scoring_evaluation._rank_displacement_metrics(
+        current_ids=current_ids,
+        reference_ids=reference_ids,
+        current_global_rank=current_global_rank,
+        reference_global_rank=reference_global_rank,
+    )
+
+    # Shared IDs are A, B, C -> shifts are [1, 1, 1].
+    assert metrics["intersection_count"] == 3.0
+    assert metrics["median_abs_rank_shift"] == 1.0
+    assert metrics["p90_rank_shift"] == 1.0
+    # rank_span = max(5, 5) - 1 = 4 => each pct shift is 1/4 = 0.25
+    assert metrics["median_abs_rank_shift_pct"] == 0.25
+    assert metrics["p90_rank_shift_pct"] == 0.25
+
+
+def test_top_band_perturbation_threshold_can_fail(db_session: Session, monkeypatch) -> None:
+    current_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True)
+    reference_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True)
+
+    base_config = scoring_evaluation._load_scoring_config()
+    strict_config = copy.deepcopy(base_config)
+    strict_config["evaluation_thresholds"]["stability"]["segments"]["top_band"][
+        "perturbation_overlap_min"
+    ] = 0.90
+    monkeypatch.setattr(scoring_evaluation, "_load_scoring_config", lambda: strict_config)
+    monkeypatch.setattr(
+        scoring_evaluation,
+        "_compute_perturbation_overlap",
+        lambda _rows, top_n, deltas: (
+            0.50,
+            [{"signal": "price_vs_comp", "delta": 0.10, "top_n_jaccard": 0.50}],
+        ),
+    )
+
+    report = run_scoring_evaluation(
+        db_session,
+        job_id=current_job.id,
+        reference_job_id=reference_job.id,
+        top_n=20,
+    )
+    assert report["decision"] == "revert"
+    assert "stability" in report["failed_gates"]
+    top_band = report["gates"]["stability"]["metrics"]["segments"]["top_band"]
+    assert top_band["status"] == "fail"
+    assert any(
+        check.startswith("perturbation_overlap_below_min:")
+        for check in top_band["violation_details"]["failed_checks"]
+    )
+
+
+def test_top_band_perturbation_threshold_can_pass(db_session: Session, monkeypatch) -> None:
+    current_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True)
+    reference_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True)
+
+    base_config = scoring_evaluation._load_scoring_config()
+    strict_config = copy.deepcopy(base_config)
+    strict_config["evaluation_thresholds"]["stability"]["segments"]["top_band"][
+        "perturbation_overlap_min"
+    ] = 0.90
+    monkeypatch.setattr(scoring_evaluation, "_load_scoring_config", lambda: strict_config)
+    monkeypatch.setattr(
+        scoring_evaluation,
+        "_compute_perturbation_overlap",
+        lambda _rows, top_n, deltas: (
+            0.95,
+            [{"signal": "price_vs_comp", "delta": 0.10, "top_n_jaccard": 0.95}],
+        ),
+    )
+
+    report = run_scoring_evaluation(
+        db_session,
+        job_id=current_job.id,
+        reference_job_id=reference_job.id,
+        top_n=20,
+    )
+    assert report["decision"] == "promote"
+    top_band = report["gates"]["stability"]["metrics"]["segments"]["top_band"]
+    assert top_band["status"] == "pass"
diff --git a/docs/evaluation-review-protocol.md b/docs/evaluation-review-protocol.md
index b785120..1eed204 100644
--- a/docs/evaluation-review-protocol.md
+++ b/docs/evaluation-review-protocol.md
@@ -70,14 +70,20 @@ Suggested defaults (adjust later):
 
 ## 4.3 Stability gates
 
-- top-N overlap vs previous version/run
-- rank correlation across full set
-- controlled sensitivity under weight perturbation (+/-5 to 10%)
+- segment overlap vs previous version/run:
+  - top band (critical), middle band (warn), bottom band (warn)
+- rank correlation per band
+- global-rank displacement tracking:
+  - median abs rank shift
+  - p90 rank shift (tail movement)
+- normalized displacement thresholds (`*_pct`) for dataset-size-aware gating
+- controlled sensitivity under top-band weight perturbation (+/-5 to 10%)
 
 Suggested defaults:
-- top20_jaccard >= 0.70
-- rank correlation >= 0.80
-- no severe rank collapse from minor weight changes
+- top-band jaccard >= 0.70
+- top-band rank correlation >= 0.80
+- top-band perturbation overlap >= 0.60
+- non-top/full-dataset displacement threshold breaches are warning-level context
 
 ## 4.4 Explainability gates
 
diff --git a/docs/week2-implementation-playbook.md b/docs/week2-implementation-playbook.md
index d9a9ecb..c6f6327 100644
--- a/docs/week2-implementation-playbook.md
+++ b/docs/week2-implementation-playbook.md
@@ -199,10 +199,13 @@ Checkpoint:
 
 ## Phase 5: Evaluation/stability gates (Day 4)
 
-1. Implement top-N overlap and rank correlation metrics.
-2. Implement perturbation sensitivity checks.
-3. Produce clear go/no-go decision artifact per run.
-4. Add tests for metric calculation correctness.
+1. Implement segment stability metrics:
+   - top/middle/bottom overlap + rank correlation checks
+2. Implement global-rank displacement diagnostics:
+   - absolute metrics plus normalized `*_pct` thresholds for gating
+3. Implement top-band perturbation sensitivity checks.
+4. Produce clear go/no-go decision artifact per run.
+5. Add tests for metric calculation correctness.
 
 Checkpoint:
 - scoring run outputs promotion recommendation deterministically.
diff --git a/docs/week2-interface-contract.md b/docs/week2-interface-contract.md
index 1d907de..28bca88 100644
--- a/docs/week2-interface-contract.md
+++ b/docs/week2-interface-contract.md
@@ -76,8 +76,13 @@ Phase 0 freeze references these default thresholds from:
 - `config/scoring.yaml` (`evaluation_thresholds`)
 
 Current defaults:
-- `top20_jaccard_min = 0.70`
-- `rank_correlation_min = 0.80`
+- `evaluation_thresholds.stability.segments.top_band.jaccard_min = 0.70`
+- `evaluation_thresholds.stability.segments.top_band.rank_correlation_min = 0.80`
+- `evaluation_thresholds.stability.segments.top_band.perturbation_overlap_min = 0.60`
+- `evaluation_thresholds.stability.segments.top_band.median_abs_rank_shift_pct_max = 0.15`
+- `evaluation_thresholds.stability.segments.top_band.p90_rank_shift_pct_max = 0.60`
+- `evaluation_thresholds.stability.full_dataset.median_abs_rank_shift_pct_warn_max = 0.35`
+- `evaluation_thresholds.stability.full_dataset.p90_rank_shift_pct_warn_max = 0.80`
 
 These thresholds gate promote/revert decisions for Week 2 scoring changes.
 

From a89e4faee0d4464d25b762236ecc282f9ca23692 Mon Sep 17 00:00:00 2001
From: William <vimscientist69@gmail.com>
Date: Thu, 23 Apr 2026 13:40:51 +0200
Subject: [PATCH 06/16] feat: update project documentation for Week 2 execution
 and interface contracts

- Revised PROJECT_NOTE.md to reflect the finalized goals and deliverables for the advanced scoring system (`advanced_v2`), including evaluation gates and structured reasoning payloads.
- Updated week2-implementation-playbook.md to clarify the scope and execution order, emphasizing the single source of truth for implementation details.
- Enhanced week2-interface-contract.md to define output expectations and scope boundaries, ensuring clarity on in-scope and out-of-scope elements for Week 2.
---
 .cursor/rules/PROJECT_NOTE.md         | 136 ++++----------------------
 docs/week2-implementation-playbook.md |  30 ++----
 docs/week2-interface-contract.md      |  13 ++-
 3 files changed, 35 insertions(+), 144 deletions(-)

diff --git a/.cursor/rules/PROJECT_NOTE.md b/.cursor/rules/PROJECT_NOTE.md
index a582f8b..9383fb4 100644
--- a/.cursor/rules/PROJECT_NOTE.md
+++ b/.cursor/rules/PROJECT_NOTE.md
@@ -1,16 +1,3 @@
-TODOS:
-
-- [x] First, create a few datasets for testing purposes
-1. 1000 listings set from p24, and 1000 listings set from privateproperty
-2. a dataset with both p24 and 1000 listings from privateproperty joined
-- [ ] phase 3 in week-2-execution plan
-
-Unfinished prompts for phase 3 (scoring_evaluation.py):
-
-1. side note, if I am correct it should not just evaluate the top n, but also the mid n and bottom n.
-
-2. get progress report on phase 3
-
 # 🏠 Real Estate Deal Intelligence Platform (Full System)
 
 ## 🎯 Goal
@@ -446,109 +433,26 @@ Deferred unless core goals are already complete:
 
 ### **Goal**
 
-Ship an **ROI-first, explainable advanced scoring system** that improves ranking quality over the Week 1 baseline by:
-- using **micro-comparables** (location/type/bed/bath segment medians, not a single dataset median),
-- adding **rental yield + transaction-cost adjustments** (net-ish ROI proxy),
-- producing a **reasoning/explanations payload** for every score (so results are inspectable),
-- adding an **analytics engine** that can quantify scoring quality and data health,
-- integrating **LLM enrichment** in a controlled, measurable way (only if it improves outcomes).
-
-### **Deliverables (Week 2)**
-
-#### **2.1 Advanced scoring system (v2)**
-
-- **Micro-comps pricing signals**
-  - Compute segmented medians / distributions for:
-    - `province/city/suburb` (use the deepest level with enough samples)
-    - `property_type`
-    - `bedrooms`, `bathrooms` (bucketed)
-  - Add fallbacks when segment sample size is too small (e.g., suburb → city → province → global).
-  - Replace baseline “single median” price deviation with:
-    - **price_vs_comp_median** (price deviation within the best-available segment)
-    - **price_per_sqm_vs_comp_median** (if floor_size available)
-
-- **ROI proxy signals**
-  - **Transaction-cost adjustment**
-    - Upfront costs modeled as configurable % or fixed schedule (kept in config).
-    - Optional LLM-assisted extraction path:
-      - infer additional upfront-cost signals from listing fields + description text
-      - emit `upfront_cost_estimate`, `cost_drivers`, and `confidence`
-      - use only when confidence is above threshold, otherwise fallback to deterministic config assumptions
-  - **Net yield proxy**
-    - Use available fields (`rates_and_taxes`, `levies`) + configurable assumptions:
-      - vacancy allowance %, maintenance %, management %, insurance (optional)
-    - Rent estimation approach for Week 2:
-      - **Phase 1 (required):** heuristic rent estimate (config-driven by `property_type`, `bedrooms`, `city/province` buckets)
-      - **Phase 2 (optional):** upgrade rent estimate via LLM/external data only if Phase 1 is weak
-  - Add a yield-derived score component such as:
-    - **net_yield_signal** and **payback_signal** (optional, time-boxed)
-
-- **Liquidity & risk adjustments**
-  - Keep time-on-market but improve it:
-    - use `date_posted` where available
-    - add a **stale inventory non-linear curve** (e.g., diminishing returns after N days)
-  - Penalize low-confidence or missing-critical-fields in a consistent way:
-    - separate **data_confidence** (completeness) from **investment_risk** (flags like auction/private seller if used)
-
-- **Scoring versioning**
-  - Output `model_version="advanced_v2"` (keep baseline runnable side-by-side).
-  - Ensure scoring is **idempotent** per job (overwrite results like Week 1).
-
-#### **2.2 Reasoning engine (explainability)**
-
-- Persist a structured explanation per listing score:
-  - top contributing signals with raw values and normalized scores
-  - “why this was ranked high/low”
-  - confidence and missing-field notes
-- Output target:
-  - a single `deal_reason` string (short)
-  - plus a structured `explanation` JSON blob (machine-readable) for later UI.
-
-#### **2.3 Analytics engine (quality + insight)**
-
-- Implement job-level analytics for:
-  - score distribution (histogram bins, min/max/median, percentiles)
-  - top-N listing summaries (score + key drivers)
-  - missingness report for key fields that affect scoring
-  - comps coverage report: what % of listings got suburb-level comps vs city/province/global
-- Add “ranking quality checks” (offline):
-  - sanity checks for pathological outcomes (e.g., missing price scored too high)
-  - stability checks when changing weights (top-N overlap)
-
-#### **2.4 LLM enrichment prototype (Week 2)**
-
-- **Purpose:** extract high-value structured variables from `description` to improve scoring.
-- **Candidate variables (minimal set):**
-  - condition/renovation level (e.g., “newly renovated”, “needs TLC”)
-  - security/amenities not reliably structured (pool, inverter/solar, etc.)
-  - rental hints (furnished, “investment”, “tenant in place”) as weak signals
-  - upfront-cost hints (legal/levy/special conditions) for ROI proxy refinement
-- **Integration approach (controlled):**
-  - store derived fields in a separate enrichment payload (do not overwrite canonical listing fields)
-  - feed enrichment into scoring only behind an **experiment flag**
-- **Week 2 validation gate (must pass to enable by default):**
-  - improves top-N deal quality on offline evaluation metrics (see 2.5)
-  - does not significantly increase invalid/low-confidence scores
-
-#### **2.5 Evaluation + gates (scope control)**
-
-- Add a lightweight offline evaluation process:
-  - compare baseline_v1 vs advanced_v2 on:
-    - top-N stability and reason diversity
-    - fewer “unknown / missing data” in top ranks
-    - comps coverage improvements
-    - yield proxy sanity (high yield not correlated with missing price)
-- **Decision gates:**
-  - only ship LLM-influenced scoring as default if it improves metrics and is stable
-  - otherwise keep LLM enrichment stored but not used in ranking
-
-### **Suggested implementation order**
-
-- Build micro-comps computation + comp-based pricing signals
-- Add ROI proxy (transaction costs + net yield)
-- Add reasoning payload format
-- Add analytics summaries + evaluation scripts
-- Add LLM enrichment prototype + validation gate
+Ship an ROI-first, explainable scoring system (`advanced_v2`) with deterministic evaluation gates that decide promote/revert/experimental outcomes.
+
+### **Week 2 Source-of-Truth Docs (Updated)**
+
+- Canonical scope: `docs/week-2-execution-plan.md`
+- Stability details: `docs/scoring-evaluation-middle-bottom-gating-spec.md`
+- Evaluation policy: `docs/evaluation-review-protocol.md`
+- Interface contract: `docs/week2-interface-contract.md`
+- Implementation playbook: `docs/week2-implementation-playbook.md`
+
+### **Week 2 High-Level Deliverables**
+
+- Advanced scoring (`advanced_v2`) with micro-comps + ROI proxy signals.
+- Structured reasoning payload (`deal_reason` + machine-readable `explanation`).
+- Evaluation gates with deterministic release decisions:
+  - `promote` / `revert` / `experimental`.
+- Segment-based stability checks:
+  - `top_band` (critical), `middle_band`/`bottom_band` (warning),
+  - full-dataset displacement context,
+  - relative displacement thresholds (`*_pct`) for dataset-size-aware gating.
 
 ---
 
diff --git a/docs/week2-implementation-playbook.md b/docs/week2-implementation-playbook.md
index c6f6327..ae2146b 100644
--- a/docs/week2-implementation-playbook.md
+++ b/docs/week2-implementation-playbook.md
@@ -1,13 +1,10 @@
 # Week 2 Implementation Playbook
 
-This is the single Week 2 implementation document that combines:
-- plain-language intent (why each feature matters),
-- code-level change map (what files to change),
-- chronological execution order (how to implement safely).
+This document is an implementation companion for Week 2.
+It should not be used as the primary scope definition.
 
-It merges and supersedes the intent of:
+Canonical Week 2 scope and sequencing live in:
 - `docs/week-2-execution-plan.md`
-- `docs/week2-advanced-scoring-explained.md`
 
 ---
 
@@ -23,22 +20,13 @@ Do **not** include Week 3 dashboard/API strategy UX revamp in this branch.
 
 ---
 
-## 2) Scope (In / Out)
+## 2) Scope Reference
 
-## In scope
-
-- Advanced scoring service (`advanced_v2`)
-- Micro-comps signals with safe fallback
-- ROI proxy signals (deterministic baseline)
-- Structured reasoning payload
-- Evaluation/stability gates and promotion decision output
-- MVP performance safeguards for Week 2 query/compute paths
-
-## Out of scope (for this branch)
-
-- Full strategy-driven dashboard workflows
-- Multi-provider external integrations
-- Default-on LLM scoring influence (LLM remains optional/flagged)
+Scope is intentionally not duplicated here.
+Use `docs/week-2-execution-plan.md` as the single source of truth for:
+- in-scope vs out-of-scope boundaries,
+- phase ordering,
+- done criteria.
 
 ---
 
diff --git a/docs/week2-interface-contract.md b/docs/week2-interface-contract.md
index 28bca88..7e16d06 100644
--- a/docs/week2-interface-contract.md
+++ b/docs/week2-interface-contract.md
@@ -1,12 +1,15 @@
 # Week 2 Interface Contract
 
-This document freezes the Week 2 output interface expectations before implementation work proceeds.
+This document defines Week 2 interface contracts only.
 
 It defines:
 - `advanced_v2` scoring output contract,
 - structured explanation payload contract,
 - promotion threshold references.
 
+Canonical Week 2 scope and execution order live in:
+- `docs/week-2-execution-plan.md`
+
 ---
 
 ## 1) Scoring Output Contract (`advanced_v2`)
@@ -108,9 +111,5 @@ Expected progression:
 
 ## 5) Scope Guardrail
 
-This contract is only for Week 2 scoring and reasoning outputs.
-
-It intentionally excludes:
-- Week 3 dashboard/API strategy workflow contracts,
-- multi-provider external integrations,
-- default-on LLM scoring influence.
+Scope is intentionally not restated here.
+Use `docs/week-2-execution-plan.md` as the single source of truth for Week 2 scope boundaries.

From 87cda98798c5124ebc9db84c5f72ae24ad4f1ac1 Mon Sep 17 00:00:00 2001
From: William <vimscientist69@gmail.com>
Date: Thu, 23 Apr 2026 14:38:14 +0200
Subject: [PATCH 07/16] feat: enhance project documentation and clarify scoring
 evaluation metrics

- Added docs/week2-phase4-performance-baseline-implementation.md, the implementation doc for the Week 2 Phase 4 performance baseline.
- Defined the scope, deliverables, `benchmark-baseline` CLI contract, execution flow, and artifact shapes for the baseline run.
- Documented SLO handling (met/missed/deferred), the code change checklist, and the Phase 4 acceptance criteria.
---
 ...se4-performance-baseline-implementation.md | 205 ++++++++++++++++++
 1 file changed, 205 insertions(+)
 create mode 100644 docs/week2-phase4-performance-baseline-implementation.md

diff --git a/docs/week2-phase4-performance-baseline-implementation.md b/docs/week2-phase4-performance-baseline-implementation.md
new file mode 100644
index 0000000..91d5a6e
--- /dev/null
+++ b/docs/week2-phase4-performance-baseline-implementation.md
@@ -0,0 +1,205 @@
+# Week 2 Phase 4 Implementation Doc: Performance Baseline
+
+This document defines exactly what to implement for Phase 4 in Week 2:
+
+- establish a repeatable performance baseline,
+- record measurable evidence,
+- avoid premature optimization work that belongs in Week 3/4.
+
+Canonical phase reference: `docs/week-2-execution-plan.md` (Phase 4).
+
+---
+
+## 1) Goal
+
+Create a deterministic, CLI-first baseline run that measures core pipeline timings and writes machine-readable artifacts for later comparison.
+
+This phase is measurement-first, not optimization-first.
+
+---
+
+## 2) Scope
+
+### In Scope (Week 2)
+
+1. Run baseline timings for core pipeline stages:
+   - ingest
+   - score
+   - validate-dataset
+   - evaluate-scoring
+2. Support baseline runs for:
+   - single dataset
+   - multi-dataset proxy (run multiple datasets in one benchmark invocation)
+3. Record and persist:
+   - per-stage durations
+   - p50/p95 stage durations
+   - SLO assessment (met/missed/deferred)
+   - unresolved bottlenecks and follow-up actions
+4. Expose as one CLI command.
+
+### Out of Scope (defer to Week 3/4)
+
+- query/index refactors
+- API latency optimization
+- async orchestration and job status APIs
+- caching/invalidation framework
+- large-scale performance tuning loops
+
+---
+
+## 3) Deliverables
+
+1. New service:
+   - `backend/app/services/performance_baseline.py`
+2. New CLI command:
+   - `benchmark-baseline` in `backend/app/cli.py`
+3. Generated artifacts:
+   - `output/performance/<run_id>/baseline_metrics.json`
+   - `output/performance/<run_id>/baseline_summary.md`
+4. Tests:
+   - `backend/tests/test_performance_baseline.py`
+
+---
+
+## 4) CLI Contract
+
+Command:
+
+- `benchmark-baseline`
+
+Suggested options:
+
+- `--dataset <path>` (repeatable, required)
+- `--top-n <int>` (optional, default 20)
+- `--output-dir <path>` (optional; default `output/performance/<run_id>/`)
+
+Example:
+
+- `./scripts/cli-local.sh benchmark-baseline --dataset "/abs/path/a.json" --dataset "/abs/path/b.json" --top-n 20`
+
+---
+
+## 5) Execution Flow
+
+For each dataset path:
+
+1. Ingest dataset and capture duration.
+2. Score the produced job and capture duration.
+3. Validate dataset and capture duration.
+4. Evaluate scoring and capture duration.
+   - Week 2 baseline mode can use `reference_job_id = job_id` for deterministic smoke measurement.
+5. Collect paths to generated validation/evaluation artifacts.
+6. Append stage timings into run-level aggregation.
+
+After all datasets:
+
+1. Compute run-level p50/p95 per stage.
+2. Build SLO assessment:
+   - met
+   - missed
+   - deferred (for API or optimization-heavy SLOs not in Week 2 scope)
+3. Write JSON + Markdown summary artifacts.
+4. Print concise terminal summary with output paths.
+
+---
+
+## 6) Input and Output Shapes
+
+### Input (CLI)
+
+- `datasets: list[str]`
+- `top_n: int` (optional)
+- `output_dir: str` (optional)
+
+### Output (`baseline_metrics.json`)
+
+Top-level schema (minimum):
+
+- `run_id: str`
+- `timestamp_utc: str`
+- `scope: "week2_phase4_minimal_baseline"`
+- `datasets: list[dataset_result]`
+- `aggregate: { stage_stats }`
+- `slo_targets: { ... }`
+- `slo_assessment: { met: [], missed: [], deferred: [] }`
+- `bottlenecks: list[str]`
+- `week3_week4_followups: list[str]`
+
+`dataset_result` minimum:
+
+- `dataset_path: str`
+- `job_id: int | null`
+- `durations_s: { ingest, score, validate_dataset, evaluate_scoring }`
+- `artifacts: { validation_report_path, evaluation_report_path }`
+- `status: "pass" | "warn" | "fail" | "error"`
+- `error: str | null`
+
+### Output (`baseline_summary.md`)
+
+Human-readable summary with:
+
+- run metadata
+- per-stage p50/p95
+- SLO status by item
+- unresolved bottlenecks
+- explicit Week 3/4 follow-ups
+
+---
+
+## 7) SLO Handling for Week 2
+
+Use `docs/mvp-performance-plan.md` targets as references, but classify by implementation feasibility:
+
+- CLI pipeline timing targets: evaluate directly (score/validation duration targets).
+- API p95 targets: mark as deferred if API benchmark harness is not in Week 2 scope.
+
+This keeps the baseline honest without forcing premature architecture work.
+
+---
+
+## 8) Automation Details
+
+Automation is complete when one command:
+
+- runs all selected datasets end-to-end,
+- records stage timings,
+- computes p50/p95,
+- writes both artifacts,
+- exits non-zero only on command/runtime errors (not on SLO misses).
+
+SLO misses should be reported in artifacts, not treated as process crashes.
+
+---
+
+## 9) Code Change Checklist
+
+1. Add `performance_baseline.py` service with:
+   - orchestration logic
+   - timing collection (`time.perf_counter`)
+   - percentile helper
+   - artifact writers
+2. Add `benchmark-baseline` CLI command in `backend/app/cli.py`.
+3. Add tests for:
+   - single dataset run
+   - multi-dataset run
+   - artifact structure
+   - p50/p95 aggregation behavior
+4. Update docs references if command is surfaced in README/CLI docs.
+
+---
+
+## 10) Acceptance Criteria (Phase 4 Done, Week 2 Minimal)
+
+Phase 4 is considered done when:
+
+1. `benchmark-baseline` exists and is runnable from CLI.
+2. It supports one or many datasets in one run.
+3. It produces:
+   - `baseline_metrics.json`
+   - `baseline_summary.md`
+4. Artifacts include:
+   - per-stage durations
+   - p50/p95 by stage
+   - SLO met/missed/deferred
+   - unresolved bottlenecks
+5. Follow-up actions for Week 3/4 are explicitly captured.
\ No newline at end of file

From 6d85a7feafa13545702fdcd89d00eaf9c9639f60 Mon Sep 17 00:00:00 2001
From: William <vimscientist69@gmail.com>
Date: Fri, 24 Apr 2026 13:58:09 +0200
Subject: [PATCH 08/16] feat: implement performance baseline updates and CLI
 enhancements

- Updated PROJECT_NOTE.md to include required performance baseline handoff updates and metrics context.
- Added a `benchmark-baseline` CLI command and a `performance_baseline` service that time the ingest/score/validate/evaluate stages and assess the pipeline SLOs; API latency SLOs are reported as deferred.
- Enhanced week2-phase4-performance-baseline-implementation.md with detailed follow-up actions and required updates for the upcoming API implementations.
- Recorded dataset-size context and throughput metrics as required follow-ups so that baseline durations can be compared meaningfully across dataset sizes.
---
 .cursor/rules/PROJECT_NOTE.md                 |   7 +
 backend/app/cli.py                            |  26 +++
 backend/app/services/performance_baseline.py  | 192 ++++++++++++++++++
 backend/tests/test_performance_baseline.py    |  56 +++++
 docs/week-2-execution-plan.md                 |   2 +
 ...se4-performance-baseline-implementation.md |  41 +++-
 6 files changed, 323 insertions(+), 1 deletion(-)
 create mode 100644 backend/app/services/performance_baseline.py
 create mode 100644 backend/tests/test_performance_baseline.py

diff --git a/.cursor/rules/PROJECT_NOTE.md b/.cursor/rules/PROJECT_NOTE.md
index 9383fb4..1d78e3a 100644
--- a/.cursor/rules/PROJECT_NOTE.md
+++ b/.cursor/rules/PROJECT_NOTE.md
@@ -506,6 +506,10 @@ Turn PropSignal into a **configurable investor decision tool** where users can:
   - pagination and top-N optimized retrieval
   - asynchronous processing for heavy jobs (ingestion/scoring/validation)
   - freshness metadata (`last_ingested_at`, `last_scored_at`, `model/profile version`)
+ - Required performance baseline handoff update (more info in `week2-phase4-performance-baseline-implementation.md`):
+   - after ranking/list/detail APIs are available, update `backend/app/services/performance_baseline.py`
+     to measure API latency and move API SLOs from `deferred` to evaluated (`met`/`missed`)
+   - update `backend/tests/test_performance_baseline.py` to enforce this behavior
 
 #### **3.3 CLI revamp to mirror backend/dashboard capability**
 
@@ -586,6 +590,9 @@ Harden the system for real-world use by running structured validation on real da
 - Optimize bottlenecks (indexes, pagination paths, batch operations).
 - Complete deployment checklist (env config, observability, rollback path, smoke tests).
 - Use `docs/mvp-performance-plan.md` as the implementation checklist and SLO reference.
+- Ensure performance baseline artifacts include dataset-size context and throughput metrics:
+  - `records_total`, `records_valid`
+  - stage throughput (rows/sec) for scoring and validation
 
 #### **4.5 Documentation pack (operator + analyst guidance)**
 
diff --git a/backend/app/cli.py b/backend/app/cli.py
index 749d64b..1580f8b 100644
--- a/backend/app/cli.py
+++ b/backend/app/cli.py
@@ -1,4 +1,5 @@
 from pathlib import Path
+from typing import Annotated
 
 import typer
 
@@ -7,6 +8,7 @@
 from app.services.dataset_validation import run_dataset_validation
 from app.services.exporting import export_job_results
 from app.services.ingestion import ingest_propflux_file
+from app.services.performance_baseline import run_performance_baseline
 from app.services.scoring import run_scoring_job
 from app.services.scoring_evaluation import run_scoring_evaluation
 
@@ -82,5 +84,29 @@ def evaluate_scoring(
     typer.echo(f"Report written to: {report['report_path']}")
 
 
+@app.command("benchmark-baseline")
+def benchmark_baseline(
+    dataset: Annotated[list[str], typer.Option("--dataset")],
+    top_n: Annotated[int, typer.Option("--top-n")] = 20,
+    output_dir: Annotated[str | None, typer.Option("--output-dir")] = None,
+) -> None:
+    """Benchmark each --dataset, print the SLO tally, and exit 1 if any dataset errored."""
+    with SessionLocal() as db:
+        metrics = run_performance_baseline(
+            db, dataset_paths=dataset, top_n=top_n, output_dir=output_dir
+        )
+    assessment = metrics["slo_assessment"]
+    typer.echo(
+        f"Performance baseline completed for {len(dataset)} dataset(s). "
+        f"met={len(assessment['met'])}, missed={len(assessment['missed'])}, "
+        f"deferred={len(assessment['deferred'])}"
+    )
+    typer.echo(f"Metrics written to: {metrics['metrics_path']}")
+    typer.echo(f"Summary written to: {metrics['summary_path']}")
+    # SLO misses are report-only; a dataset whose pipeline raised is a runtime failure.
+    if any(row["status"] == "error" for row in metrics["datasets"]):
+        raise typer.Exit(code=1)
+
+
 if __name__ == "__main__":
     app()
diff --git a/backend/app/services/performance_baseline.py b/backend/app/services/performance_baseline.py
new file mode 100644
index 0000000..6c46069
--- /dev/null
+++ b/backend/app/services/performance_baseline.py
@@ -0,0 +1,192 @@
+from __future__ import annotations
+
+import json
+from datetime import UTC, datetime
+from pathlib import Path
+from time import perf_counter
+from typing import Any
+
+from sqlalchemy.orm import Session
+
+from app.services.dataset_validation import run_dataset_validation
+from app.services.ingestion import ingest_propflux_file
+from app.services.scoring import run_scoring_job
+from app.services.scoring_evaluation import run_scoring_evaluation
+
+
+def _percentile(values: list[float], pct: float) -> float:
+    """Nearest-index percentile; ``pct`` is clamped to [0, 1] and empty input yields 0.0."""
+    if not values:
+        return 0.0
+    position = (len(values) - 1) * max(0.0, min(1.0, pct))
+    return float(sorted(values)[int(round(position))])
+
+
+def _timed(callable_fn: Any, *args: Any, **kwargs: Any) -> tuple[Any, float]:
+    """Call ``callable_fn`` and return ``(result, elapsed seconds rounded to 4 decimals)``."""
+    began_at = perf_counter()
+    outcome = callable_fn(*args, **kwargs)
+    return outcome, round(perf_counter() - began_at, 4)
+
+
+def run_performance_baseline(
+    db: Session,
+    dataset_paths: list[str],
+    *,
+    top_n: int = 20,
+    output_dir: str | None = None,
+) -> dict[str, Any]:
+    """Run the baseline pipeline over each dataset and write the timing artifacts.
+
+    Each dataset is ingested, scored, validated and evaluated, timing every stage. A stage
+    failure is recorded on that dataset's result and the remaining datasets still run.
+
+    Args:
+        db: Session passed to every pipeline stage; rolled back after a failed dataset.
+        dataset_paths: Paths of the dataset files to benchmark.
+        top_n: Forwarded to ``run_scoring_evaluation``.
+        output_dir: Artifact directory; defaults to ``output/performance/<run_id>``.
+
+    Returns:
+        The metrics written to ``baseline_metrics.json``, extended with ``metrics_path``
+        and ``summary_path`` entries that locate the two written artifacts.
+
+    Raises:
+        ValueError: If ``dataset_paths`` is empty.
+    """
+    if not dataset_paths:
+        raise ValueError("At least one dataset path is required.")
+
+    run_time_utc = datetime.now(UTC)
+    run_id = f"phase4-baseline-{run_time_utc.strftime('%Y%m%d%H%M%S')}"
+    base_dir = Path(output_dir) if output_dir else Path("output") / "performance" / run_id
+    base_dir.mkdir(parents=True, exist_ok=True)
+
+    stages = ("ingest", "score", "validate_dataset", "evaluate_scoring")
+    # Run-level samples per stage; a stage only contributes when it actually completed.
+    stage_durations: dict[str, list[float]] = {stage: [] for stage in stages}
+    dataset_results: list[dict[str, Any]] = []
+
+    for dataset_path in dataset_paths:
+        durations = dict.fromkeys(stages, 0.0)
+        artifacts: dict[str, Any] = {
+            "validation_report_path": None,
+            "evaluation_report_path": None,
+        }
+        result: dict[str, Any] = {
+            "dataset_path": dataset_path,
+            "job_id": None,
+            "durations_s": durations,
+            "artifacts": artifacts,
+            "status": "error",
+            "error": None,
+        }
+        try:
+            ingestion_job, ingest_s = _timed(ingest_propflux_file, db, Path(dataset_path))
+            result["job_id"] = ingestion_job.id
+            durations["ingest"] = ingest_s
+            stage_durations["ingest"].append(ingest_s)
+            _scoring_job, score_s = _timed(run_scoring_job, db, ingestion_job.id)
+            durations["score"] = score_s
+            stage_durations["score"].append(score_s)
+            validation_result, validate_s = _timed(run_dataset_validation, db, ingestion_job.id)
+            durations["validate_dataset"] = validate_s
+            artifacts["validation_report_path"] = validation_result.report_path
+            stage_durations["validate_dataset"].append(validate_s)
+            # The job is evaluated against itself as the reference run.
+            evaluation_report, evaluate_s = _timed(
+                run_scoring_evaluation,
+                db,
+                job_id=ingestion_job.id,
+                reference_job_id=ingestion_job.id,
+                top_n=top_n,
+            )
+            durations["evaluate_scoring"] = evaluate_s
+            artifacts["evaluation_report_path"] = evaluation_report.get("report_path")
+            stage_durations["evaluate_scoring"].append(evaluate_s)
+            result["status"] = "pass"
+        except Exception as exc:  # pragma: no cover - defensive path
+            result["error"] = str(exc)
+            result["status"] = "error"
+            # A failed flush leaves the session unusable until it is rolled back; without
+            # this, one bad dataset could make every remaining dataset fail as well.
+            db.rollback()
+        dataset_results.append(result)
+
+    aggregate = {
+        stage: {
+            "p50_s": _percentile(stage_durations[stage], 0.50),
+            "p95_s": _percentile(stage_durations[stage], 0.95),
+        }
+        for stage in stages
+    }
+
+    slo_targets = {
+        "scoring_run_10k_max_s": 600.0,
+        "dataset_validation_10k_max_s": 300.0,
+        "ranking_list_api_p95_ms": 800.0,
+        "filtered_ranking_api_p95_ms": 1200.0,
+        "listing_detail_api_p95_ms": 500.0,
+    }
+    slo_assessment: dict[str, list[str]] = {"met": [], "missed": [], "deferred": []}
+    measured_slos = (
+        ("scoring_run_10k_max_s", "score"),
+        ("dataset_validation_10k_max_s", "validate_dataset"),
+    )
+    for slo_name, stage in measured_slos:
+        # With no samples ``_percentile`` returns 0.0, which would trivially satisfy any
+        # target, so a stage that never completed is reported as missed rather than met.
+        within_target = aggregate[stage]["p95_s"] <= slo_targets[slo_name]
+        verdict = "met" if stage_durations[stage] and within_target else "missed"
+        slo_assessment[verdict].append(slo_name)
+    # No API latency is measured here, so the API SLOs are always listed as deferred.
+    slo_assessment["deferred"].extend(
+        ["ranking_list_api_p95_ms", "filtered_ranking_api_p95_ms", "listing_detail_api_p95_ms"]
+    )
+
+    metrics = {
+        "run_id": run_id,
+        "timestamp_utc": run_time_utc.isoformat(),
+        "scope": "week2_phase4_minimal_baseline",
+        "datasets": dataset_results,
+        "aggregate": aggregate,
+        "slo_targets": slo_targets,
+        "slo_assessment": slo_assessment,
+        "bottlenecks": [],
+        "week3_week4_followups": [
+            "Add API-level latency benchmark harness and capture p95.",
+            "Add query/index optimization for ranking/filter paths.",
+            "Add async orchestration and cache strategy where needed.",
+        ],
+    }
+
+    metrics_path = base_dir / "baseline_metrics.json"
+    metrics_path.write_text(json.dumps(metrics, indent=2), encoding="utf-8")
+
+    summary_lines = [
+        "# Phase 4 Baseline Summary",
+        "",
+        f"- Run ID: {run_id}",
+        f"- Timestamp UTC: {run_time_utc.isoformat()}",
+        f"- Dataset count: {len(dataset_results)}",
+        "",
+        "## Stage p50/p95 (seconds)",
+        *(
+            f"- {stage}: p50={stats['p50_s']}, p95={stats['p95_s']}"
+            for stage, stats in aggregate.items()
+        ),
+        "",
+        "## SLO Assessment",
+        f"- Met: {', '.join(slo_assessment['met']) or 'none'}",
+        f"- Missed: {', '.join(slo_assessment['missed']) or 'none'}",
+        f"- Deferred: {', '.join(slo_assessment['deferred']) or 'none'}",
+        "",
+        "## Follow-ups (Week 3/4)",
+        *(f"- {item}" for item in metrics["week3_week4_followups"]),
+    ]
+    summary_path = base_dir / "baseline_summary.md"
+    summary_path.write_text("\n".join(summary_lines) + "\n", encoding="utf-8")
+
+    metrics["metrics_path"] = str(metrics_path)
+    metrics["summary_path"] = str(summary_path)
+    return metrics
diff --git a/backend/tests/test_performance_baseline.py b/backend/tests/test_performance_baseline.py
new file mode 100644
index 0000000..c65c0ef
--- /dev/null
+++ b/backend/tests/test_performance_baseline.py
@@ -0,0 +1,56 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from app.services.performance_baseline import run_performance_baseline
+from sqlalchemy.orm import Session
+
+FIXTURE_DIR = Path(__file__).parent / "fixtures" / "propflux"
+
+
+def test_performance_baseline_single_dataset_writes_artifacts(
+    db_session: Session, tmp_path: Path
+) -> None:
+    """A single-dataset run passes and writes both artifact files with the expected keys."""
+    dataset = FIXTURE_DIR / "valid_listings.json"
+    metrics = run_performance_baseline(
+        db_session, dataset_paths=[str(dataset)], top_n=20, output_dir=str(tmp_path)
+    )
+
+    assert metrics["scope"] == "week2_phase4_minimal_baseline"
+    results = metrics["datasets"]
+    assert len(results) == 1
+    assert results[0]["status"] == "pass"
+    metrics_file = Path(metrics["metrics_path"])
+    assert metrics_file.exists()
+    assert Path(metrics["summary_path"]).exists()
+    saved = json.loads(metrics_file.read_text(encoding="utf-8"))
+    for key in ("aggregate", "slo_assessment"):
+        assert key in saved
+    assert "score" in saved["aggregate"]
+
+
+def test_performance_baseline_multiple_datasets_aggregates(
+    db_session: Session, tmp_path: Path
+) -> None:
+    """A two-dataset run reports one result per dataset and aggregates all four stages."""
+    fixture_names = ("valid_listings.json", "duplicate_records.json")
+    metrics = run_performance_baseline(
+        db_session,
+        dataset_paths=[str(FIXTURE_DIR / name) for name in fixture_names],
+        top_n=20,
+        output_dir=str(tmp_path),
+    )
+    results = metrics["datasets"]
+    expected_stages = {"ingest", "score", "validate_dataset", "evaluate_scoring"}
+
+    assert len(results) == 2
+    # Either outcome is accepted per dataset; only the result shape is pinned here.
+    for row in results:
+        assert row["status"] in {"pass", "error"}
+    assert set(metrics["aggregate"]) == expected_stages
+    # The API latency SLO must be listed under the "deferred" bucket.
+    assessment = metrics["slo_assessment"]
+    assert "deferred" in assessment
+    assert "ranking_list_api_p95_ms" in assessment["deferred"]
diff --git a/docs/week-2-execution-plan.md b/docs/week-2-execution-plan.md
index 0f93f4e..390401a 100644
--- a/docs/week-2-execution-plan.md
+++ b/docs/week-2-execution-plan.md
@@ -1,5 +1,7 @@
 # Next Phase Execution Plan (Implementation Order)
 
+> TODO (Phase 4 follow-up): include dataset-size context in performance baseline artifacts (`records_total`, `records_valid`, and rows/sec throughput) so duration comparisons are meaningful across different dataset sizes.
+
 This is the practical execution sequence for the next feature branch, optimized for speed and low risk.
 
 ## Scope for This Branch:
diff --git a/docs/week2-phase4-performance-baseline-implementation.md b/docs/week2-phase4-performance-baseline-implementation.md
index 91d5a6e..8036946 100644
--- a/docs/week2-phase4-performance-baseline-implementation.md
+++ b/docs/week2-phase4-performance-baseline-implementation.md
@@ -202,4 +202,43 @@ Phase 4 is considered done when:
    - p50/p95 by stage
    - SLO met/missed/deferred
    - unresolved bottlenecks
-5. Follow-up actions for Week 3/4 are explicitly captured.
\ No newline at end of file
+5. Follow-up actions for Week 3/4 are explicitly captured.
+
+---
+
+## 11) Week 3/4 Handoff: Required Updates (Do Not Skip)
+
+When ranking/list/detail APIs are implemented in Week 3/4, update the baseline implementation immediately.
+
+### Files to update
+
+1. `backend/app/services/performance_baseline.py`
+2. `backend/app/cli.py`
+3. `backend/tests/test_performance_baseline.py`
+4. `docs/mvp-performance-plan.md` (if SLOs change)
+5. `docs/week2-phase4-performance-baseline-implementation.md` (mark Week 3/4 handoff completed)
+
+### Exact required changes
+
+1. Switch the API SLO placeholders from deferred to measured:
+   - `ranking_list_api_p95_ms`
+   - `filtered_ranking_api_p95_ms`
+   - `listing_detail_api_p95_ms`
+2. Add actual API benchmark execution in `run_performance_baseline`:
+   - call ranking/list/detail endpoints (or dedicated benchmark client),
+   - collect endpoint latency samples,
+   - compute p50/p95 for each endpoint.
+3. Remove API SLOs from `deferred` classification once benchmark harness exists.
+4. Add dataset-size context to artifacts:
+   - `records_total`
+   - `records_valid`
+   - throughput fields (for example rows/sec) for score/validation stages.
+5. Extend summary output to include API p95 result lines and pass/fail status.
+6. Add tests that assert:
+   - API latency metrics are present in `aggregate`,
+   - API SLOs are assessed under `met`/`missed` (not always deferred),
+   - dataset-size and throughput fields are written.
+
+### Completion signal
+
+Week 3/4 handoff is complete only when `baseline_metrics.json` contains measured API latency stats and API SLOs are no longer unconditionally deferred.
\ No newline at end of file

From 99db2c2056cc8e14f75dfbbc3bcbc7105634eda3 Mon Sep 17 00:00:00 2001
From: William <vimscientist69@gmail.com>
Date: Fri, 24 Apr 2026 14:33:47 +0200
Subject: [PATCH 09/16] feat: allow unknown extra fields in PropfluxListing
 schema and add corresponding tests

- Updated PropfluxListing model configuration to accept unknown fields, enhancing compatibility with evolving data schemas.
- Added tests to validate ingestion of listings with extra fields, ensuring that records remain valid despite additional attributes.
- Implemented partial validation tests to confirm that unknown fields do not invalidate the payload, supporting forward compatibility.
---
 backend/app/schemas/propflux_listing.py |  4 +++-
 backend/tests/test_ingestion_service.py | 28 +++++++++++++++++++++++++
 backend/tests/test_propflux_schema.py   | 24 +++++++++++++++++++++
 3 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/backend/app/schemas/propflux_listing.py b/backend/app/schemas/propflux_listing.py
index 784262f..a53d163 100644
--- a/backend/app/schemas/propflux_listing.py
+++ b/backend/app/schemas/propflux_listing.py
@@ -16,7 +16,9 @@ class RecordValidationError(BaseModel):
 
 
 class PropfluxListing(BaseModel):
-    model_config = ConfigDict(extra="forbid")
+    # Be permissive with future source schema additions. We still enforce all
+    # required fields/types below, but unknown keys are accepted.
+    model_config = ConfigDict(extra="allow")
 
     # Required fields
     title: str
diff --git a/backend/tests/test_ingestion_service.py b/backend/tests/test_ingestion_service.py
index 6a5cf7c..2e997d8 100644
--- a/backend/tests/test_ingestion_service.py
+++ b/backend/tests/test_ingestion_service.py
@@ -66,3 +66,31 @@ def test_ingestion_allows_land_records_with_missing_bedbath(
     assert listing is not None
     assert listing.bedrooms == 0
     assert listing.bathrooms == 0.0
+
+
+def test_ingestion_allows_unknown_extra_fields(db_session: Session, tmp_path: Path) -> None:
+    payload = [
+        {
+            "title": "4 Bedroom House in Welbedacht",
+            "price": 6250000.0,
+            "location": "Welbedacht, Knysna",
+            "bedrooms": 4,
+            "bathrooms": 4.0,
+            "property_type": "House",
+            "description": "Immaculate home with views",
+            "listing_id": "T5440103",
+            "source_site": "privateproperty",
+            "job_id": "72e50122",
+            "new_marketing_flag": True,
+        }
+    ]
+    input_file = tmp_path / "extra_fields_listing.json"
+    input_file.write_text(json.dumps(payload), encoding="utf-8")
+
+    job = ingest_propflux_file(db_session, input_file)
+    assert job.records_valid == 1
+    assert job.records_invalid == 0
+
+    listing = db_session.scalar(select(Listing))
+    assert listing is not None
+    assert listing.title == "4 Bedroom House in Welbedacht"
diff --git a/backend/tests/test_propflux_schema.py b/backend/tests/test_propflux_schema.py
index 1f088e5..3cb773e 100644
--- a/backend/tests/test_propflux_schema.py
+++ b/backend/tests/test_propflux_schema.py
@@ -61,3 +61,27 @@ def test_partial_validation_allows_known_propflux_optional_fields() -> None:
 
     assert len(valid) == 1
     assert len(invalid) == 0
+
+
+def test_partial_validation_allows_unknown_extra_fields() -> None:
+    payload = [
+        {
+            "title": "4 Bedroom House in Welbedacht",
+            "price": 6250000.0,
+            "location": "Welbedacht, Knysna",
+            "bedrooms": 4,
+            "bathrooms": 4.0,
+            "property_type": "House",
+            "description": "Immaculate home with views",
+            "listing_id": "T5440103",
+            "source_site": "privateproperty",
+            # Unknown/forward-compatible fields from evolving upstream payloads.
+            "job_id": "72e50122",
+            "new_marketing_flag": True,
+            "custom_notes": "future field should not invalidate record",
+        }
+    ]
+    valid, invalid = validate_propflux_payload_partial(payload)
+
+    assert len(valid) == 1
+    assert len(invalid) == 0

From 72df8f102bd99497c0a94a9855c26941cd577a2d Mon Sep 17 00:00:00 2001
From: William <vimscientist69@gmail.com>
Date: Fri, 24 Apr 2026 14:44:13 +0200
Subject: [PATCH 10/16] fix: resolve paths for validation and evaluation
 reports in performance baseline

- Updated the performance baseline service to resolve and store absolute paths for validation and evaluation report files, ensuring consistency in file references.
- Enhanced metrics path resolution for baseline summary and metrics files to prevent potential issues with relative paths.
---
 backend/app/services/performance_baseline.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/backend/app/services/performance_baseline.py b/backend/app/services/performance_baseline.py
index 6c46069..bff2476 100644
--- a/backend/app/services/performance_baseline.py
+++ b/backend/app/services/performance_baseline.py
@@ -79,7 +79,9 @@ def run_performance_baseline(
 
             validation_result, validate_s = _timed(run_dataset_validation, db, ingestion_job.id)
             result["durations_s"]["validate_dataset"] = validate_s
-            result["artifacts"]["validation_report_path"] = validation_result.report_path
+            result["artifacts"]["validation_report_path"] = str(
+                Path(validation_result.report_path).resolve()
+            )
             validate_durations.append(validate_s)
 
             evaluation_report, evaluate_s = _timed(
@@ -90,7 +92,11 @@ def run_performance_baseline(
                 top_n=top_n,
             )
             result["durations_s"]["evaluate_scoring"] = evaluate_s
-            result["artifacts"]["evaluation_report_path"] = evaluation_report.get("report_path")
+            evaluation_report_path = evaluation_report.get("report_path")
+            if evaluation_report_path is not None:
+                result["artifacts"]["evaluation_report_path"] = str(
+                    Path(evaluation_report_path).resolve()
+                )
             evaluate_durations.append(evaluate_s)
 
             result["status"] = "pass"
@@ -187,6 +193,6 @@ def run_performance_baseline(
     summary_path = base_dir / "baseline_summary.md"
     summary_path.write_text("\n".join(summary_lines) + "\n", encoding="utf-8")
 
-    metrics["metrics_path"] = str(metrics_path)
-    metrics["summary_path"] = str(summary_path)
+    metrics["metrics_path"] = str(metrics_path.resolve())
+    metrics["summary_path"] = str(summary_path.resolve())
     return metrics

From ac0ade83e58d7a9dd0e02e3ad7063941c7e8a23e Mon Sep 17 00:00:00 2001
From: William <vimscientist69@gmail.com>
Date: Fri, 24 Apr 2026 15:07:53 +0200
Subject: [PATCH 11/16] feat: add new status to IngestionJob model for enhanced
 tracking

- Introduced a new status "analyzed" to the IngestionJob model, expanding the range of job states for better tracking and management of ingestion processes.
---
 ...0424_0005_add_analyzed_ingestion_status.py |  73 +++++++++++
 backend/app/models/ingestion_job.py           |   1 +
 backend/tests/test_analytics_service.py       |  14 ++
 docs/week2-phase5-validation-cycle-spec.md    | 121 ++++++++++++++++++
 4 files changed, 209 insertions(+)
 create mode 100644 backend/alembic/versions/20260424_0005_add_analyzed_ingestion_status.py
 create mode 100644 backend/tests/test_analytics_service.py
 create mode 100644 docs/week2-phase5-validation-cycle-spec.md

diff --git a/backend/alembic/versions/20260424_0005_add_analyzed_ingestion_status.py b/backend/alembic/versions/20260424_0005_add_analyzed_ingestion_status.py
new file mode 100644
index 0000000..25503dc
--- /dev/null
+++ b/backend/alembic/versions/20260424_0005_add_analyzed_ingestion_status.py
@@ -0,0 +1,73 @@
+"""add analyzed ingestion job status
+
+Revision ID: 20260424_0005
+Revises: 20260415_0004
+Create Date: 2026-04-24 00:05:00
+"""
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "20260424_0005"
+down_revision: str | None = "20260415_0004"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    op.alter_column(
+        "ingestion_jobs",
+        "status",
+        existing_type=sa.Enum(
+            "created",
+            "processing",
+            "completed",
+            "completed_with_errors",
+            "failed",
+            name="ingestion_job_status",
+            native_enum=False,
+        ),
+        type_=sa.Enum(
+            "created",
+            "processing",
+            "completed",
+            "completed_with_errors",
+            "analyzed",
+            "failed",
+            name="ingestion_job_status",
+            native_enum=False,
+        ),
+        existing_nullable=False,
+    )
+
+
+def downgrade() -> None:
+    op.execute("UPDATE ingestion_jobs SET status = 'completed' WHERE status = 'analyzed'")
+    op.alter_column(
+        "ingestion_jobs",
+        "status",
+        existing_type=sa.Enum(
+            "created",
+            "processing",
+            "completed",
+            "completed_with_errors",
+            "analyzed",
+            "failed",
+            name="ingestion_job_status",
+            native_enum=False,
+        ),
+        type_=sa.Enum(
+            "created",
+            "processing",
+            "completed",
+            "completed_with_errors",
+            "failed",
+            name="ingestion_job_status",
+            native_enum=False,
+        ),
+        existing_nullable=False,
+    )
+
diff --git a/backend/app/models/ingestion_job.py b/backend/app/models/ingestion_job.py
index 9dc659d..ee154f2 100644
--- a/backend/app/models/ingestion_job.py
+++ b/backend/app/models/ingestion_job.py
@@ -17,6 +17,7 @@ class IngestionJob(Base):
             "processing",
             "completed",
             "completed_with_errors",
+            "analyzed",
             "failed",
             name="ingestion_job_status",
             native_enum=False,
diff --git a/backend/tests/test_analytics_service.py b/backend/tests/test_analytics_service.py
new file mode 100644
index 0000000..4541ba7
--- /dev/null
+++ b/backend/tests/test_analytics_service.py
@@ -0,0 +1,14 @@
+from app.models.ingestion_job import IngestionJob
+from app.services.analytics import run_analytics_job
+from sqlalchemy.orm import Session
+
+
+def test_run_analytics_job_sets_analyzed_status(db_session: Session) -> None:
+    job = IngestionJob(input_path="fixture.json", status="completed")
+    db_session.add(job)
+    db_session.commit()
+    db_session.refresh(job)
+
+    analyzed = run_analytics_job(db_session, job.id)
+    assert analyzed.status == "analyzed"
+
diff --git a/docs/week2-phase5-validation-cycle-spec.md b/docs/week2-phase5-validation-cycle-spec.md
new file mode 100644
index 0000000..b387f2e
--- /dev/null
+++ b/docs/week2-phase5-validation-cycle-spec.md
@@ -0,0 +1,121 @@
+# Week 2 Phase 5 Spec: Validation Cycle and Release Decision
+
+This spec operationalizes Phase 5 from `docs/week-2-execution-plan.md`:
+
+1. run one full validation cycle,
+2. apply one controlled change set,
+3. rerun and evaluate against baseline,
+4. freeze profile/version and document release decision.
+
+---
+
+## 1) Scope
+
+- Dataset: `data/samples/propflux_pp_1000_listings.json`
+- Baseline run: ingest + score + validate-dataset (evaluation deferred until comparison step)
+- Controlled change: exactly one scoring config bundle
+- Candidate run: ingest + score + validate-dataset
+- Evaluation: candidate vs baseline
+- Finalization: freeze or revert + decision record
+
+Out of scope:
+
+- multi-bundle tuning in a single cycle
+- architecture/performance refactors
+- Week 3/4 API enhancements
+
+---
+
+## 2) Execution Steps
+
+### Step A: Baseline run
+
+Run:
+
+- `ingest <dataset>`
+- `score <baseline_job_id>`
+- `validate-dataset <baseline_job_id>`
+
+Capture:
+
+- baseline job ID
+- validation report path
+- notable quality warnings/errors
+
+### Step B: Controlled change set (single bundle)
+
+Apply one bounded, auditable scoring change:
+
+- allowed: weight-only or threshold-only bundle
+- disallowed: mixed refactors and multiple independent tuning bundles
+
+Record:
+
+- exact before/after values
+- rationale
+
+### Step C: Candidate run
+
+Run same pipeline on same dataset:
+
+- `ingest <dataset>`
+- `score <candidate_job_id>`
+- `validate-dataset <candidate_job_id>`
+
+Capture candidate artifacts.
+
+### Step D: Evaluation compare
+
+Run:
+
+- `evaluate-scoring <candidate_job_id> --reference-job-id <baseline_job_id> --top-n 20`
+
+Capture:
+
+- decision
+- failed/warning gates
+- report path
+
+### Step E: Finalization
+
+- If decision is `promote`: freeze changed profile/version.
+- If decision is `revert`: rollback controlled change set and freeze previous profile.
+- Write the release decision record artifact (required for every outcome, including `experimental`).
+
+---
+
+## 3) Required Artifacts
+
+Produce a decision record containing:
+
+- dataset used
+- baseline job ID + validation report path
+- controlled change set details
+- candidate job ID + validation report path
+- evaluation report path and gate outcome
+- final decision (`promote`/`revert`/`experimental`)
+- freeze/rollback action taken
+- follow-up tasks
+
+---
+
+## 4) Acceptance Criteria
+
+Phase 5 is complete when:
+
+1. baseline run completed and recorded,
+2. one controlled change set applied and documented,
+3. candidate run completed and recorded,
+4. evaluation run completed with explicit decision,
+5. profile/version frozen: the changed profile on `promote`, or the previous profile after rollback on `revert`,
+6. decision record artifact written.
+
+---
+
+## 5) Operational Notes
+
+- Keep commands and dataset constant between baseline and candidate.
+- If an ancillary command fails (for example, a non-critical analytics command issue),
+  continue the required Phase 5 evaluation flow and document the blocker in the decision record.
+- Prefer small reversible change bundles to maximize comparability.
+

From 302355047ab0c50707af7cd7c18048668f7d33f7 Mon Sep 17 00:00:00 2001
From: William <vimscientist69@gmail.com>
Date: Fri, 24 Apr 2026 15:28:10 +0200
Subject: [PATCH 12/16] docs: update PROJECT_NOTE.md with debugging notes and
 file references

- Added notes regarding a failure in scoring evaluation with minimal config changes, prompting a need for debugging.
- Included specific file paths related to the evaluation process for better tracking and context.
---
 .cursor/rules/PROJECT_NOTE.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.cursor/rules/PROJECT_NOTE.md b/.cursor/rules/PROJECT_NOTE.md
index 1d78e3a..6182dfb 100644
--- a/.cursor/rules/PROJECT_NOTE.md
+++ b/.cursor/rules/PROJECT_NOTE.md
@@ -1,5 +1,11 @@
 # 🏠 Real Estate Deal Intelligence Platform (Full System)
 
+backend/output/evaluations/job-21-ref-20-20260424131807/scoring_evaluation_2026-04-24_13-18-07Z.json
+
+For some reason it is failing, with minimal config changes. phase5_week2_validation_decision_2026-04-24_rerun_after_enum_fix.md
+
+debug why this is the case.
+
 ## 🎯 Goal
 
 Build a **production-grade data intelligence system** that transforms large-scale real estate listing datasets into **high-quality investment opportunities** using advanced scoring, analytics, and a clean, interactive dashboard.

From fc236f784f602d08e3bf496ccfa4f09f409ef5f3 Mon Sep 17 00:00:00 2001
From: William <vimscientist69@gmail.com>
Date: Mon, 27 Apr 2026 12:22:58 +0200
Subject: [PATCH 13/16] refactor: update ranking identity mapping to use scored
 listing IDs

- Modified the `_ranking_identity_map` function to accept a list of `ScoreResult` objects instead of a job ID, improving the accuracy of identity mapping.
- Updated calls to `_ranking_identity_map` in `run_scoring_evaluation` to reflect the new parameter structure.
- Added a new test to ensure that identity mapping correctly utilizes scored listing IDs, enhancing the robustness of scoring evaluations.
---
 backend/app/services/scoring_evaluation.py  | 11 ++-
 backend/tests/test_scoring_v2_evaluation.py | 87 +++++++++++++++++++++
 2 files changed, 94 insertions(+), 4 deletions(-)

diff --git a/backend/app/services/scoring_evaluation.py b/backend/app/services/scoring_evaluation.py
index 18ad17a..dec0b38 100644
--- a/backend/app/services/scoring_evaluation.py
+++ b/backend/app/services/scoring_evaluation.py
@@ -50,8 +50,11 @@ def _sorted_scores(db: Session, job_id: int) -> list[ScoreResult]:
     ).all()
 
 
-def _ranking_identity_map(db: Session, job_id: int) -> dict[int, str]:
-    listings = db.scalars(select(Listing).where(Listing.job_id == job_id)).all()
+def _ranking_identity_map(db: Session, score_rows: list[ScoreResult]) -> dict[int, str]:
+    listing_ids = {row.listing_id for row in score_rows}
+    if not listing_ids:
+        return {}
+    listings = db.scalars(select(Listing).where(Listing.id.in_(listing_ids))).all()
     identities: dict[int, str] = {}
     for listing in listings:
         external_listing_id = listing.listing_id
@@ -373,7 +376,7 @@ def run_scoring_evaluation(
         raise ValueError(f"No scored listings found for job: {job_id}")
 
     model_version = current_rows[0].model_version
-    current_identity_map = _ranking_identity_map(db, job_id)
+    current_identity_map = _ranking_identity_map(db, current_rows)
     top_n_effective = int(top_n) if top_n > 0 else 20
     sampled_rows = current_rows[:top_n_effective]
 
@@ -544,7 +547,7 @@ def run_scoring_evaluation(
         }
     else:
         reference_rows = _sorted_scores(db, reference_job_id)
-        reference_identity_map = _ranking_identity_map(db, reference_job_id)
+        reference_identity_map = _ranking_identity_map(db, reference_rows)
         current_global_ids = _segment_identities(
             current_rows, current_identity_map, 0, len(current_rows)
         )
diff --git a/backend/tests/test_scoring_v2_evaluation.py b/backend/tests/test_scoring_v2_evaluation.py
index 395258a..f6b5632 100644
--- a/backend/tests/test_scoring_v2_evaluation.py
+++ b/backend/tests/test_scoring_v2_evaluation.py
@@ -345,3 +345,90 @@ def test_top_band_perturbation_threshold_can_pass(db_session: Session, monkeypat
     assert report["decision"] == "promote"
     top_band = report["gates"]["stability"]["metrics"]["segments"]["top_band"]
     assert top_band["status"] == "pass"
+
+
+def test_stability_identity_mapping_uses_scored_listing_ids_not_listing_job_id(
+    db_session: Session,
+) -> None:
+    baseline_job = IngestionJob(
+        input_path="baseline.json",
+        status="completed",
+        records_total=1,
+        records_valid=1,
+        records_invalid=0,
+    )
+    candidate_job = IngestionJob(
+        input_path="candidate.json",
+        status="completed",
+        records_total=1,
+        records_valid=1,
+        records_invalid=0,
+    )
+    db_session.add_all([baseline_job, candidate_job])
+    db_session.flush()
+
+    listing = Listing(
+        job_id=baseline_job.id,
+        source_hash="shared-listing-hash",
+        title="Shared listing",
+        price=1_500_000.0,
+        location="Cape Town",
+        bedrooms=3,
+        bathrooms=2.0,
+        property_type="house",
+        description="Shared listing across reruns",
+        listing_id="SHARED-1",
+        source_site="propflux",
+        city="Cape Town",
+        province="Western Cape",
+        floor_size=120.0,
+        normalized_payload={"fixture": True},
+    )
+    db_session.add(listing)
+    db_session.flush()
+
+    explanation = _build_explanation(normalized_value=0.8, final_score=80.0)
+    db_session.add_all(
+        [
+            ScoreResult(
+                job_id=baseline_job.id,
+                listing_id=listing.id,
+                score=80.0,
+                confidence=0.9,
+                deal_reason="baseline",
+                explanation=explanation,
+                model_version="advanced_v2",
+            ),
+            ScoreResult(
+                job_id=candidate_job.id,
+                listing_id=listing.id,
+                score=79.5,
+                confidence=0.9,
+                deal_reason="candidate",
+                explanation=explanation,
+                model_version="advanced_v2",
+            ),
+            RawListing(
+                job_id=candidate_job.id,
+                record_index=0,
+                source_site="propflux",
+                listing_id="SHARED-1",
+                payload={"price": listing.price},
+            ),
+        ]
+    )
+
+    # Simulate upsert behavior where canonical listing ownership shifts to the
+    # latest ingestion job, which previously broke identity overlap.
+    listing.job_id = candidate_job.id
+    db_session.commit()
+
+    report = run_scoring_evaluation(
+        db_session,
+        job_id=candidate_job.id,
+        reference_job_id=baseline_job.id,
+        top_n=20,
+    )
+    top_band = report["gates"]["stability"]["metrics"]["segments"]["top_band"]
+    assert top_band["metrics"]["intersection_count"] == 1
+    assert top_band["metrics"]["jaccard_overlap"] == 1.0

From 1d6fc34b4aef2e74637819f1b2bae922a8165957 Mon Sep 17 00:00:00 2001
From: William <vimscientist69@gmail.com>
Date: Mon, 27 Apr 2026 12:33:47 +0200
Subject: [PATCH 14/16] docs: update project status and notes for Week 2
 completion

- Revised PROJECT_NOTE.md to reflect the successful completion of Week 2, including Phase 5 validation outcomes and final scoring profile values.
- Updated current-project-status.md to indicate the transition to Week 3, highlighting the readiness of Week 2 outputs and outlining the next objectives for API/CLI/dashboard implementation.
- Added details on the final validation decision and decision artifact for better tracking of project progress.
---
 .cursor/rules/PROJECT_NOTE.md  | 16 ++++++++------
 docs/current-project-status.md | 39 ++++++++++++++++++----------------
 2 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/.cursor/rules/PROJECT_NOTE.md b/.cursor/rules/PROJECT_NOTE.md
index 6182dfb..be40cd1 100644
--- a/.cursor/rules/PROJECT_NOTE.md
+++ b/.cursor/rules/PROJECT_NOTE.md
@@ -1,11 +1,5 @@
 # 🏠 Real Estate Deal Intelligence Platform (Full System)
 
-backend/output/evaluations/job-21-ref-20-20260424131807/scoring_evaluation_2026-04-24_13-18-07Z.json
-
-For some reason it is failing, with minimal config changes. phase5_week2_validation_decision_2026-04-24_rerun_after_enum_fix.md
-
-debug why this is the case.
-
 ## 🎯 Goal
 
 Build a **production-grade data intelligence system** that transforms large-scale real estate listing datasets into **high-quality investment opportunities** using advanced scoring, analytics, and a clean, interactive dashboard.
@@ -460,6 +454,16 @@ Ship an ROI-first, explainable scoring system (`advanced_v2`) with deterministic
   - full-dataset displacement context,
   - relative displacement thresholds (`*_pct`) for dataset-size-aware gating.
 
+### **Week 2 Completion Status (Latest)**
+
+- Phase 5 rerun after enum + evaluation identity fixes completed successfully.
+- Final validation decision: `promote`.
+- Frozen Week 2 scoring profile values:
+  - `advanced_v2.weights.price_vs_comp = 0.29`
+  - `advanced_v2.weights.roi_proxy = 0.21`
+- Decision artifact:
+  - `backend/output/evaluations/phase5_week2_validation_decision_2026-04-27_post_enum_eval_fix.md`
+
 ---
 
 ## **Week 3**
diff --git a/docs/current-project-status.md b/docs/current-project-status.md
index d2bbee7..c1e7d7e 100644
--- a/docs/current-project-status.md
+++ b/docs/current-project-status.md
@@ -7,13 +7,13 @@ This file is the single reference for:
 
 ## Snapshot
 
-- Current phase: transition from Week 1 completion to Week 2 implementation.
-- Branch readiness: documentation and planning are in place for next-phase execution.
-- Primary next objective: implement Week 2 advanced scoring and explanation payloads.
+- Current phase: Week 2 implementation completed (including Phase 5 validation cycle).
+- Branch readiness: Week 2 outputs are production-candidate and frozen pending Week 3 scope.
+- Primary next objective: begin Week 3 strategy-driven API/CLI/dashboard implementation.
 
 ## Completion Checklist
 
-## Completed (foundation)
+## Completed (foundation + Week 2)
 
 - [x] Project scaffolding, dev scripts, and CI baseline
 - [x] Docker Compose + PostgreSQL setup
@@ -23,16 +23,18 @@ This file is the single reference for:
 - [x] Dataset validation service and CLI command (`validate-dataset`)
 - [x] Core unit/integration test coverage for ingestion, scoring, and dataset validation
 - [x] Week 2 strategy and architecture documentation package
+- [x] Week 2 advanced scoring (`advanced_v2`) with micro-comps + ROI proxy
+- [x] Week 2 structured reasoning payload in scored output
+- [x] Week 2 evaluation gates (`promote`/`revert`/`experimental`) and CLI integration
+- [x] Week 2 segment-based stability checks with relative displacement thresholds
+- [x] Week 2 Phase 4 performance baseline command and artifacts
+- [x] Week 2 Phase 5 validation cycle completed with final promoted profile
 
 ## Planned, not implemented yet
 
-- [ ] Week 2 advanced scoring (`advanced_v2`) with micro-comps
-- [ ] Week 2 ROI proxy (yield + transaction costs)
-- [ ] Week 2 structured reasoning payload in scored output
-- [ ] Week 2 analytics quality/stability checks
-- [ ] Week 2 optional LLM enrichment prototype (gated)
 - [ ] Week 3 strategy-driven ranking API/CLI/dashboard functionality
 - [ ] Week 4 full validation/tuning/release hardening loop
+- [ ] Week 2 optional LLM enrichment prototype (gated)
 
 ## Deferred/optional
 
@@ -40,18 +42,18 @@ This file is the single reference for:
 - [ ] Heavy macro/geospatial modeling
 - [ ] PDF export (if not required for MVP release)
 
-## Next Feature Branch Kickoff Checklist
+## Next Feature Branch Kickoff Checklist (Week 3)
 
 Use this immediately when starting the next branch:
 
-1. Confirm scope: Week 2 must-ship only (do not mix Week 3 UI/API overhaul work).
-2. Define `advanced_v2` signal contract and output schema.
-3. Implement micro-comps computation with safe fallbacks and confidence penalties.
-4. Implement ROI proxy signals with deterministic defaults first.
-5. Add structured reasoning payload persistence + tests.
-6. Add evaluation gates from `docs/evaluation-review-protocol.md`.
-7. Run manual sample review and log results.
-8. Merge only if promotion thresholds pass.
+1. Confirm Week 3 scope: strategy-driven API/CLI/dashboard only.
+2. Keep Week 2 scoring profile as baseline:
+   - `advanced_v2.weights.price_vs_comp = 0.29`
+   - `advanced_v2.weights.roi_proxy = 0.21`
+3. Build ranking/list/detail API endpoints and aligned CLI workflow.
+4. Implement Week 3 performance handoff items from Phase 4 baseline docs.
+5. Preserve Week 2 evaluation contracts while expanding strategy surfaces.
+6. Run regression checks against Week 2 decision artifact before merging.
 
 ## Branch Scope Guardrail (Important)
 
@@ -66,3 +68,4 @@ Use this immediately when starting the next branch:
 - Evaluation protocol: `docs/evaluation-review-protocol.md`
 - Principal audit: `docs/project-note-principal-audit.md`
 - MVP performance plan: `docs/mvp-performance-plan.md`
+- Week 2 final decision (post enum/eval fix): `backend/output/evaluations/phase5_week2_validation_decision_2026-04-27_post_enum_eval_fix.md`

From cb967a4a8c04afbbe04ccb3baab8b3a0182ab916 Mon Sep 17 00:00:00 2001
From: William <vimscientist69@gmail.com>
Date: Mon, 27 Apr 2026 12:38:25 +0200
Subject: [PATCH 15/16] refactor: improve code readability and structure in
 generate_top5_audit_visualization.py

- Rearranged import statements for better organization, moving datetime import above others.
- Enhanced readability by formatting complex expressions and return statements across multiple lines.
- Updated the construction of HTML strings to use list comprehension for clarity and maintainability.
- Made minor adjustments to variable assignments for improved consistency and readability.
---
 .../generate_top5_audit_visualization.py      | 74 ++++++++++++-------
 1 file changed, 49 insertions(+), 25 deletions(-)

diff --git a/backend/scripts/generate_top5_audit_visualization.py b/backend/scripts/generate_top5_audit_visualization.py
index 7eeb90d..8d40882 100644
--- a/backend/scripts/generate_top5_audit_visualization.py
+++ b/backend/scripts/generate_top5_audit_visualization.py
@@ -1,16 +1,13 @@
 from __future__ import annotations
 
 import argparse
-from datetime import UTC, datetime
 import html
+from datetime import UTC, datetime
 from pathlib import Path
 from typing import Any
 
-from matplotlib import pyplot as plt
 import numpy as np
 import plotly.graph_objects as go
-from sqlalchemy import select
-
 from app.db.session import SessionLocal
 from app.models.listing import Listing
 from app.models.score_result import ScoreResult
@@ -22,6 +19,8 @@
     _load_scoring_config,
     _resolve_comp_context,
 )
+from matplotlib import pyplot as plt
+from sqlalchemy import select
 
 
 def _load_top_listings(job_id: int, limit: int) -> list[dict[str, Any]]:
@@ -103,9 +102,14 @@ def _compute_audit_inputs(job_id: int, records: list[dict[str, Any]]) -> dict[in
         all_listings = db.scalars(select(Listing).where(Listing.job_id == job_id)).all()
 
     comp_index = _build_comp_index(
-        all_listings, fallback_order, include_bedrooms=include_bedrooms, include_bathrooms=include_bathrooms
+        all_listings,
+        fallback_order,
+        include_bedrooms=include_bedrooms,
+        include_bathrooms=include_bathrooms,
     )
-    listing_map = {int(listing.id): listing for listing in all_listings if int(listing.id) in selected_ids}
+    listing_map = {
+        int(listing.id): listing for listing in all_listings if int(listing.id) in selected_ids
+    }
     now_date = datetime.now(UTC).date()
     details: dict[int, dict[str, Any]] = {}
 
@@ -132,7 +136,9 @@ def _compute_audit_inputs(job_id: int, records: list[dict[str, Any]]) -> dict[in
             else None
         )
         days_on_market = (
-            max(0, (now_date - listing.date_posted).days) if listing.date_posted is not None else None
+            max(0, (now_date - listing.date_posted).days)
+            if listing.date_posted is not None
+            else None
         )
         confidence = _confidence_signal(listing)
         feature_value = _feature_density_signal(listing)
@@ -161,7 +167,9 @@ def _compute_audit_inputs(job_id: int, records: list[dict[str, Any]]) -> dict[in
                 "fallback_penalty": round(float(fallback_penalty), 4),
             },
             "size_vs_comp_inputs": {
-                "listing_floor_size": float(listing.floor_size) if listing.floor_size is not None else None,
+                "listing_floor_size": (
+                    float(listing.floor_size) if listing.floor_size is not None else None
+                ),
                 "listing_ppsqm": listing_ppsqm,
                 "comp_median_ppsqm": comp_median_ppsqm,
                 "comp_level": comp_level,
@@ -246,15 +254,23 @@ def _build_details_table_html(records: list[dict[str, Any]]) -> str:
             "</tr>"
         )
 
-    return (
-        "<h2>Metric Input Audit Table</h2>"
-        "<p>Inputs below are the listing datapoints and intermediate values used for metric calculations.</p>"
-        "<div style='overflow-x:auto;'>"
-        "<table border='1' cellspacing='0' cellpadding='6' style='border-collapse:collapse;font-family:Arial,sans-serif;font-size:12px;'>"
-        f"<thead><tr>{header_html}</tr></thead>"
-        f"<tbody>{''.join(body_rows)}</tbody>"
-        "</table>"
-        "</div>"
+    return "".join(
+        [
+            "<h2>Metric Input Audit Table</h2>",
+            (
+                "<p>Inputs below are the listing datapoints and intermediate values "
+                "used for metric calculations.</p>"
+            ),
+            "<div style='overflow-x:auto;'>",
+            (
+                "<table border='1' cellspacing='0' cellpadding='6' "
+                "style='border-collapse:collapse;font-family:Arial,sans-serif;font-size:12px;'>"
+            ),
+            f"<thead><tr>{header_html}</tr></thead>",
+            f"<tbody>{''.join(body_rows)}</tbody>",
+            "</table>",
+            "</div>",
+        ]
     )
 
 
@@ -320,14 +336,22 @@ def _build_interactive_chart(
     )
     details_html = _build_details_table_html(records)
     chart_html = fig.to_html(include_plotlyjs="cdn", full_html=False)
-    full_html = (
-        "<!DOCTYPE html><html><head><meta charset='utf-8'><title>PropSignal Top 5 Audit</title></head>"
-        "<body style='margin:18px;font-family:Arial,sans-serif;'>"
-        "<h1>PropSignal Human Audit: Top 5 Listings</h1>"
-        "<p>Interactive chart + metric input audit table for explanation concordance checks.</p>"
-        f"{chart_html}"
-        f"{details_html}"
-        "</body></html>"
+    full_html = "".join(
+        [
+            (
+                "<!DOCTYPE html><html><head><meta charset='utf-8'>"
+                "<title>PropSignal Top 5 Audit</title></head>"
+            ),
+            "<body style='margin:18px;font-family:Arial,sans-serif;'>",
+            "<h1>PropSignal Human Audit: Top 5 Listings</h1>",
+            (
+                "<p>Interactive chart + metric input audit table "
+                "for explanation concordance checks.</p>"
+            ),
+            f"{chart_html}",
+            f"{details_html}",
+            "</body></html>",
+        ]
     )
     output_html.write_text(full_html, encoding="utf-8")
 

From 140fc8d7278a8b91225ad163b4e4070785dc4277 Mon Sep 17 00:00:00 2001
From: William <vimscientist69@gmail.com>
Date: Mon, 27 Apr 2026 12:41:48 +0200
Subject: [PATCH 16/16] refactor: improve type annotations and code clarity in
 scoring evaluation and performance baseline services

- Updated type annotations for `slo_assessment` in `performance_baseline.py` to specify dictionary structure.
- Enhanced type annotations for parameters in `_compute_jaccard` and `_spearman_rank_correlation` functions in `scoring_evaluation.py` to use `Sequence` for better flexibility.
- Rewrote the assignment of `identities` in `_ranking_identity_map` as an explicit `isinstance(..., str)` branch for improved readability and type narrowing.
- Consolidated the construction of `fallback_order` in `generate_top5_audit_visualization.py` for cleaner code.
- Normalized blank lines (removed trailing blank lines in a migration and a test file, added the missing blank line between two tests) to maintain consistency and cleanliness in the codebase.
---
 ...0424_0005_add_analyzed_ingestion_status.py |  1 -
 backend/app/services/performance_baseline.py  |  2 +-
 backend/app/services/scoring_evaluation.py    | 24 +++++++++++--------
 .../generate_top5_audit_visualization.py      |  4 +---
 backend/tests/test_analytics_service.py       |  1 -
 backend/tests/test_scoring_v2_evaluation.py   |  1 +
 6 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/backend/alembic/versions/20260424_0005_add_analyzed_ingestion_status.py b/backend/alembic/versions/20260424_0005_add_analyzed_ingestion_status.py
index 25503dc..cc5e58e 100644
--- a/backend/alembic/versions/20260424_0005_add_analyzed_ingestion_status.py
+++ b/backend/alembic/versions/20260424_0005_add_analyzed_ingestion_status.py
@@ -70,4 +70,3 @@ def downgrade() -> None:
         ),
         existing_nullable=False,
     )
-
diff --git a/backend/app/services/performance_baseline.py b/backend/app/services/performance_baseline.py
index bff2476..812b4cf 100644
--- a/backend/app/services/performance_baseline.py
+++ b/backend/app/services/performance_baseline.py
@@ -131,7 +131,7 @@ def run_performance_baseline(
         "filtered_ranking_api_p95_ms": 1200.0,
         "listing_detail_api_p95_ms": 500.0,
     }
-    slo_assessment = {"met": [], "missed": [], "deferred": []}
+    slo_assessment: dict[str, list[str]] = {"met": [], "missed": [], "deferred": []}
     if aggregate["score"]["p95_s"] <= slo_targets["scoring_run_10k_max_s"]:
         slo_assessment["met"].append("scoring_run_10k_max_s")
     else:
diff --git a/backend/app/services/scoring_evaluation.py b/backend/app/services/scoring_evaluation.py
index dec0b38..0d4ccb0 100644
--- a/backend/app/services/scoring_evaluation.py
+++ b/backend/app/services/scoring_evaluation.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import json
+from collections.abc import Sequence
 from datetime import UTC, datetime
 from pathlib import Path
 from statistics import correlation, median
@@ -21,7 +22,7 @@ def _safe_divide(numerator: float, denominator: float) -> float:
     return numerator / denominator
 
 
-def _compute_jaccard(left_ids: list[str], right_ids: list[str]) -> float:
+def _compute_jaccard(left_ids: Sequence[str | int], right_ids: Sequence[str | int]) -> float:
     left_set = set(left_ids)
     right_set = set(right_ids)
     union_size = len(left_set | right_set)
@@ -30,7 +31,9 @@ def _compute_jaccard(left_ids: list[str], right_ids: list[str]) -> float:
     return round(len(left_set & right_set) / union_size, 4)
 
 
-def _spearman_rank_correlation(current_ids: list[str], reference_ids: list[str]) -> float:
+def _spearman_rank_correlation(
+    current_ids: Sequence[str | int], reference_ids: Sequence[str | int]
+) -> float:
     current_rank = {listing_id: idx + 1 for idx, listing_id in enumerate(current_ids)}
     reference_rank = {listing_id: idx + 1 for idx, listing_id in enumerate(reference_ids)}
     common_ids = sorted(set(current_rank) & set(reference_rank))
@@ -43,11 +46,12 @@ def _spearman_rank_correlation(current_ids: list[str], reference_ids: list[str])
 
 
 def _sorted_scores(db: Session, job_id: int) -> list[ScoreResult]:
-    return db.scalars(
+    rows = db.scalars(
         select(ScoreResult)
         .where(ScoreResult.job_id == job_id)
         .order_by(ScoreResult.score.desc(), ScoreResult.listing_id.asc())
     ).all()
+    return list(rows)
 
 
 def _ranking_identity_map(db: Session, score_rows: list[ScoreResult]) -> dict[int, str]:
@@ -58,11 +62,10 @@ def _ranking_identity_map(db: Session, score_rows: list[ScoreResult]) -> dict[in
     identities: dict[int, str] = {}
     for listing in listings:
         external_listing_id = listing.listing_id
-        identities[listing.id] = (
-            external_listing_id
-            if external_listing_id not in (None, "")
-            else f"internal-{listing.id}"
-        )
+        if isinstance(external_listing_id, str) and external_listing_id:
+            identities[listing.id] = external_listing_id
+        else:
+            identities[listing.id] = f"internal-{listing.id}"
     return identities
 
 
@@ -709,12 +712,13 @@ def run_scoring_evaluation(
     minimum_sample_for_promote = int(decision_thresholds.get("minimum_sample_for_promote", 100))
     warning_gate_keys: list[str] = []
     failed_gate_keys: list[str] = []
-    for gate_key, gate_payload in {
+    gate_payloads: dict[str, dict[str, Any]] = {
         "data_quality": data_quality_gate,
         "scoring_sanity": scoring_sanity_gate,
         "stability": stability_gate,
         "explainability": explainability_gate,
-    }.items():
+    }
+    for gate_key, gate_payload in gate_payloads.items():
         if gate_payload["status"] == "fail":
             failed_gate_keys.append(gate_key)
         elif gate_payload["status"] == "warn":
diff --git a/backend/scripts/generate_top5_audit_visualization.py b/backend/scripts/generate_top5_audit_visualization.py
index 8d40882..347775a 100644
--- a/backend/scripts/generate_top5_audit_visualization.py
+++ b/backend/scripts/generate_top5_audit_visualization.py
@@ -90,9 +90,7 @@ def _compute_audit_inputs(job_id: int, records: list[dict[str, Any]]) -> dict[in
     advanced_v2_cfg = config.get("advanced_v2", {})
     comps_cfg = advanced_v2_cfg.get("comps", {})
     roi_cfg = advanced_v2_cfg.get("roi", {})
-    fallback_order = list(
-        comps_cfg.get("fallback_order", ["suburb", "city", "province", "global"])
-    )
+    fallback_order = list(comps_cfg.get("fallback_order", ["suburb", "city", "province", "global"]))
     include_bedrooms = bool(comps_cfg.get("include_bedrooms", True))
     include_bathrooms = bool(comps_cfg.get("include_bathrooms", True))
     minimum_cohort_size = int(comps_cfg.get("minimum_cohort_size", 12))
diff --git a/backend/tests/test_analytics_service.py b/backend/tests/test_analytics_service.py
index 4541ba7..1afb3f0 100644
--- a/backend/tests/test_analytics_service.py
+++ b/backend/tests/test_analytics_service.py
@@ -11,4 +11,3 @@ def test_run_analytics_job_sets_analyzed_status(db_session: Session) -> None:
 
     analyzed = run_analytics_job(db_session, job.id)
     assert analyzed.status == "analyzed"
-
diff --git a/backend/tests/test_scoring_v2_evaluation.py b/backend/tests/test_scoring_v2_evaluation.py
index f6b5632..19400a3 100644
--- a/backend/tests/test_scoring_v2_evaluation.py
+++ b/backend/tests/test_scoring_v2_evaluation.py
@@ -203,6 +203,7 @@ def test_top_band_displacement_threshold_can_fail(db_session: Session, monkeypat
     assert report["decision"] == "revert"
     assert "stability" in report["failed_gates"]
 
+
 def test_full_dataset_displacement_warning_is_context_only(
     db_session: Session, monkeypatch
 ) -> None: