From b791215ca881fbb985d16548d65c3c8270f69942 Mon Sep 17 00:00:00 2001 From: William Date: Wed, 22 Apr 2026 14:28:16 +0200 Subject: [PATCH 01/16] feat: implement segment-based stability thresholds in scoring evaluation - Introduced segment-based thresholds for scoring evaluation, defining metrics for `top_band`, `middle_band`, and `bottom_band` to enhance diagnostic capabilities. - Updated scoring configuration to reflect new stability metrics, including Jaccard and rank correlation thresholds for each segment. - Enhanced tests to validate the integration of segment thresholds and ensure correct evaluation reporting. - Improved documentation to clarify the purpose and structure of the new segment-based stability diagnostics. --- backend/app/services/scoring.py | 34 +- backend/app/services/scoring_evaluation.py | 440 +++++++++++++------- backend/tests/test_scoring_algorithms.py | 25 +- backend/tests/test_scoring_v2_evaluation.py | 95 +++++ config/scoring.yaml | 29 +- 5 files changed, 456 insertions(+), 167 deletions(-) diff --git a/backend/app/services/scoring.py b/backend/app/services/scoring.py index db10b59..eb13ecf 100644 --- a/backend/app/services/scoring.py +++ b/backend/app/services/scoring.py @@ -73,9 +73,37 @@ "high_score_cutoff": 80.0, }, "stability": { - "top20_jaccard_min": 0.7, - "rank_correlation_min": 0.8, - "perturbation_overlap_min": 0.6, + "segments": { + "top_band": { + "mode": "top_n", + "top_n": 20, + "jaccard_min": 0.7, + "rank_correlation_min": 0.8, + "perturbation_overlap_min": 0.6, + "median_abs_rank_shift_max": 30, + "p90_rank_shift_max": 120, + }, + "middle_band": { + "start_pct": 0.45, + "end_pct": 0.60, + "jaccard_warn_min": 0.3, + "rank_correlation_warn_min": 0.5, + "median_abs_rank_shift_warn_max": 250, + "p90_rank_shift_warn_max": 1200, + }, + "bottom_band": { + "start_pct": 0.85, + "end_pct": 1.00, + "jaccard_warn_min": 0.25, + "rank_correlation_warn_min": 0.4, + "median_abs_rank_shift_warn_max": 300, + "p90_rank_shift_warn_max": 
1500, + }, + }, + "full_dataset": { + "median_abs_rank_shift_warn_max": 200, + "p90_rank_shift_warn_max": 1000, + }, }, "decision": { "minimum_sample_for_promote": 100, diff --git a/backend/app/services/scoring_evaluation.py b/backend/app/services/scoring_evaluation.py index bb9da2c..f864931 100644 --- a/backend/app/services/scoring_evaluation.py +++ b/backend/app/services/scoring_evaluation.py @@ -3,7 +3,7 @@ import json from datetime import UTC, datetime from pathlib import Path -from statistics import correlation +from statistics import correlation, median from typing import Any from sqlalchemy import select @@ -16,28 +16,12 @@ def _safe_divide(numerator: float, denominator: float) -> float: - """Divide safely and return 0.0 on zero denominator. - - Why this exists: - - Evaluation math uses ratios in multiple places. - - A zero denominator should not crash a release-gate run. - """ if denominator == 0: return 0.0 return numerator / denominator def _compute_jaccard(left_ids: list[str], right_ids: list[str]) -> float: - """Measure set overlap using Jaccard similarity. - - Formula: - - |intersection| / |union| - - Interpretation: - - 1.0 means the sets are identical. - - 0.0 means no overlap. - - Used here to check whether top-N listings stayed mostly the same. - """ left_set = set(left_ids) right_set = set(right_ids) union_size = len(left_set | right_set) @@ -47,21 +31,6 @@ def _compute_jaccard(left_ids: list[str], right_ids: list[str]) -> float: def _spearman_rank_correlation(current_ids: list[str], reference_ids: list[str]) -> float: - """Compare ordering consistency between two ranked lists. - - How it works: - - Convert each list into rank positions. - - Keep only items that appear in both lists. - - Compute correlation on those rank positions. - - Interpretation: - - +1.0: near-identical order - - 0.0: weak relationship - - -1.0: near-reversed order - - Why it matters: - - Top-N overlap alone can look fine while overall ordering drifts heavily. 
- """ current_rank = {listing_id: idx + 1 for idx, listing_id in enumerate(current_ids)} reference_rank = {listing_id: idx + 1 for idx, listing_id in enumerate(reference_ids)} common_ids = sorted(set(current_rank) & set(reference_rank)) @@ -74,15 +43,6 @@ def _spearman_rank_correlation(current_ids: list[str], reference_ids: list[str]) def _sorted_scores(db: Session, job_id: int) -> list[ScoreResult]: - """Fetch one job's score rows in deterministic rank order. - - Sort order: - - Primary: score descending (higher scores rank first). - - Secondary: listing_id ascending as a deterministic tie-breaker. - - Tie-breakers are important so repeated runs produce stable ordering for equal scores. - """ - # listing_id is used only as a deterministic tie-breaker when scores are equal. return db.scalars( select(ScoreResult) .where(ScoreResult.job_id == job_id) @@ -91,16 +51,6 @@ def _sorted_scores(db: Session, job_id: int) -> list[ScoreResult]: def _ranking_identity_map(db: Session, job_id: int) -> dict[int, str]: - """Map internal DB listing IDs to cross-run comparable IDs. - - Why this exists: - - `score_results.listing_id` is an internal DB key and differs across jobs. - - Stability comparisons should use a stable external identity. - - Strategy: - - Prefer external `listing.listing_id`. - - Fall back to `internal-` when external ID is missing. - """ listings = db.scalars(select(Listing).where(Listing.job_id == job_id)).all() identities: dict[int, str] = {} for listing in listings: @@ -114,15 +64,6 @@ def _ranking_identity_map(db: Session, job_id: int) -> dict[int, str]: def _dominance_ratio(score_row: ScoreResult) -> float: - """Compute how much one signal dominates the explanation contributions. - - Formula: - - max(abs(weighted_contribution)) / sum(abs(weighted_contribution)) - - Interpretation: - - Values near 1.0 indicate one signal is doing almost all the work. - - High dominance can indicate brittle or unbalanced scoring. 
- """ if not score_row.explanation: return 1.0 signals = score_row.explanation.get("signals") @@ -142,16 +83,6 @@ def _dominance_ratio(score_row: ScoreResult) -> float: def _score_math_consistent(score_row: ScoreResult) -> bool: - """Validate that explanation math matches stored score values. - - Checks: - - Re-sum `signals[].weighted_contribution` and compare to - `score_math.weighted_sum_0_to_1`. - - Compare row `score` to `score_math.final_score_0_to_100`. - - This protects against explainability drift where narrative payloads disagree - with actual scoring outputs. - """ if not score_row.explanation: return False @@ -175,13 +106,6 @@ def _score_math_consistent(score_row: ScoreResult) -> bool: def _extract_signal_vectors(score_row: ScoreResult) -> dict[str, tuple[float, float]]: - """Extract per-signal (normalized_score, weight) pairs from explanation payload. - - Output shape: - - {signal_name: (normalized_score, weight)} - - This is the base structure used for perturbation sensitivity simulations. - """ explanation = score_row.explanation or {} signals = explanation.get("signals") if not isinstance(signals, list): @@ -206,17 +130,6 @@ def _perturbed_score( target_signal: str, delta: float, ) -> float: - """Recompute a simulated score after perturbing one signal weight. - - Process: - - Increase/decrease one target weight by `delta` (e.g., +0.05 or -0.10). - - Keep other weights unchanged. - - Re-normalize all weights to sum to 1. - - Recompute weighted score. - - Purpose: - - Estimate sensitivity: does a small weight tweak cause large ranking movement? - """ if not vectors: return 0.0 @@ -243,17 +156,6 @@ def _compute_perturbation_overlap( top_n: int, deltas: list[float], ) -> tuple[float, list[dict[str, Any]]]: - """Run perturbation experiments and measure top-N stability. 
- - For each signal and each delta: - - recompute perturbed scores - - rerank listings - - compare perturbed top-N vs baseline top-N using Jaccard - - Returns: - - minimum top-N overlap across all experiments (worst-case stability) - - detailed per-experiment metrics for audit/debugging - """ if not rows: return 0.0, [] @@ -300,29 +202,151 @@ def _compute_perturbation_overlap( return round(min(overlaps), 4), experiments -# continue here +def _segment_bounds(total_count: int, start_pct: float, end_pct: float) -> tuple[int, int]: + if total_count <= 0: + return (0, 0) + start = int(total_count * _clamp(start_pct)) + end = int(total_count * _clamp(end_pct)) + start = max(0, min(start, total_count)) + end = max(0, min(end, total_count)) + if end <= start: + end = min(total_count, start + 1) + return (start, end) + + +def _segment_identities( + rows: list[ScoreResult], + identity_map: dict[int, str], + start_idx: int, + end_idx: int, +) -> list[str]: + segment_rows = rows[start_idx:end_idx] + return [identity_map.get(row.listing_id, f"internal-{row.listing_id}") for row in segment_rows] + + +def _global_rank_map(identities: list[str]) -> dict[str, int]: + return {identity: idx + 1 for idx, identity in enumerate(identities)} + + +def _percentile(values: list[float], pct: float) -> float: + if not values: + return 0.0 + ordered = sorted(values) + idx = int(round((len(ordered) - 1) * _clamp(pct))) + return float(ordered[idx]) + + +def _rank_displacement_metrics( + current_ids: list[str], + reference_ids: list[str], + current_global_rank: dict[str, int], + reference_global_rank: dict[str, int], +) -> dict[str, float]: + shared_ids = sorted(set(current_ids) & set(reference_ids)) + if not shared_ids: + return { + "intersection_count": 0.0, + "median_abs_rank_shift": 0.0, + "p90_rank_shift": 0.0, + } + + shifts = [ + float(abs(current_global_rank[listing_id] - reference_global_rank[listing_id])) + for listing_id in shared_ids + ] + return { + "intersection_count": 
float(len(shared_ids)), + "median_abs_rank_shift": round(float(median(shifts)), 4), + "p90_rank_shift": round(_percentile(shifts, 0.90), 4), + } + + +def _evaluate_segment_stability( + *, + segment_name: str, + current_ids: list[str], + reference_ids: list[str], + current_global_rank: dict[str, int], + reference_global_rank: dict[str, int], + thresholds: dict[str, Any], + severity: str, +) -> dict[str, Any]: + jaccard_overlap = _compute_jaccard(current_ids, reference_ids) + rank_correlation = _spearman_rank_correlation(current_ids, reference_ids) + displacement = _rank_displacement_metrics( + current_ids, + reference_ids, + current_global_rank, + reference_global_rank, + ) + intersection_count = int(displacement["intersection_count"]) + + failed_checks: list[str] = [] + if severity == "fail": + jaccard_min = thresholds.get("jaccard_min") + if jaccard_min is not None and jaccard_overlap < float(jaccard_min): + failed_checks.append(f"jaccard_below_min:{jaccard_overlap}<{jaccard_min}") + rank_corr_min = thresholds.get("rank_correlation_min") + if rank_corr_min is not None and rank_correlation < float(rank_corr_min): + failed_checks.append(f"rank_corr_below_min:{rank_correlation}<{rank_corr_min}") + median_max = thresholds.get("median_abs_rank_shift_max") + if median_max is not None and displacement["median_abs_rank_shift"] > float(median_max): + failed_checks.append( + "median_abs_rank_shift_above_max:" + f"{displacement['median_abs_rank_shift']}>{median_max}" + ) + p90_max = thresholds.get("p90_rank_shift_max") + if p90_max is not None and displacement["p90_rank_shift"] > float(p90_max): + failed_checks.append( + f"p90_rank_shift_above_max:{displacement['p90_rank_shift']}>{p90_max}" + ) + else: + jaccard_warn_min = thresholds.get("jaccard_warn_min") + if jaccard_warn_min is not None and jaccard_overlap < float(jaccard_warn_min): + failed_checks.append(f"jaccard_below_warn_min:{jaccard_overlap}<{jaccard_warn_min}") + rank_corr_warn_min = 
thresholds.get("rank_correlation_warn_min") + if rank_corr_warn_min is not None and rank_correlation < float(rank_corr_warn_min): + failed_checks.append( + f"rank_corr_below_warn_min:{rank_correlation}<{rank_corr_warn_min}" + ) + median_warn_max = thresholds.get("median_abs_rank_shift_warn_max") + if median_warn_max is not None and displacement["median_abs_rank_shift"] > float( + median_warn_max + ): + failed_checks.append( + "median_abs_rank_shift_above_warn_max:" + f"{displacement['median_abs_rank_shift']}>{median_warn_max}" + ) + p90_warn_max = thresholds.get("p90_rank_shift_warn_max") + if p90_warn_max is not None and displacement["p90_rank_shift"] > float(p90_warn_max): + failed_checks.append( + f"p90_rank_shift_above_warn_max:{displacement['p90_rank_shift']}>{p90_warn_max}" + ) + + status = "pass" if not failed_checks else severity + return { + "segment_name": segment_name, + "status": status, + "metrics": { + "sample_size_current": len(current_ids), + "sample_size_reference": len(reference_ids), + "intersection_count": intersection_count, + "jaccard_overlap": jaccard_overlap, + "rank_correlation": rank_correlation, + "median_abs_rank_shift": displacement["median_abs_rank_shift"], + "p90_rank_shift": displacement["p90_rank_shift"], + }, + "thresholds": thresholds, + "violation_details": {"failed_checks": failed_checks}, + } + + def run_scoring_evaluation( db: Session, job_id: int, reference_job_id: int | None = None, top_n: int = 20, ) -> dict[str, Any]: - """Evaluate one scoring run and produce a release decision artifact. - - Gate families: - - Data quality: valid/duplicate/null-rate thresholds. - - Scoring sanity: score bounds, impossible top ranks, signal dominance. - - Explainability: payload presence and math consistency. - - Stability: top-N overlap, rank correlation, perturbation robustness. - - Decision logic: - - `revert` if any critical gate fails. - - `experimental` if sample is too small or warnings remain. 
- - `promote` only when all required gates pass. - - Side effect: - - Writes `output/evaluations//scoring_evaluation_.json`. - """ config = _load_scoring_config() thresholds = config.get("evaluation_thresholds", {}) data_thresholds = thresholds.get("data_quality", {}) @@ -335,8 +359,9 @@ def run_scoring_evaluation( raise ValueError(f"No scored listings found for job: {job_id}") model_version = current_rows[0].model_version - sampled_rows = current_rows[:top_n] current_identity_map = _ranking_identity_map(db, job_id) + top_n_effective = int(top_n) if top_n > 0 else 20 + sampled_rows = current_rows[:top_n_effective] validation = run_dataset_validation(db, job_id) valid_rate_min = float(data_thresholds.get("valid_rate_min", 0.85)) @@ -490,11 +515,12 @@ def run_scoring_evaluation( "score_math_mismatches_top_n": len(score_math_mismatch_rows), }, "thresholds": { - "required_top_n": top_n, + "required_top_n": top_n_effective, }, } stability_gate: dict[str, Any] + stability_warning_keys: list[str] = [] if reference_job_id is None: stability_gate = { "status": "warn", @@ -505,52 +531,156 @@ def run_scoring_evaluation( else: reference_rows = _sorted_scores(db, reference_job_id) reference_identity_map = _ranking_identity_map(db, reference_job_id) - reference_top_ids = [ - reference_identity_map.get(row.listing_id, f"internal-{row.listing_id}") - for row in reference_rows[:top_n] - ] - current_top_ids = [ - current_identity_map.get(row.listing_id, f"internal-{row.listing_id}") - for row in current_rows[:top_n] - ] - top_n_jaccard = _compute_jaccard(current_top_ids, reference_top_ids) - rank_corr = _spearman_rank_correlation( - [ - current_identity_map.get(row.listing_id, f"internal-{row.listing_id}") - for row in current_rows - ], - [ - reference_identity_map.get(row.listing_id, f"internal-{row.listing_id}") - for row in reference_rows - ], + current_global_ids = _segment_identities( + current_rows, current_identity_map, 0, len(current_rows) ) + reference_global_ids = 
_segment_identities( + reference_rows, reference_identity_map, 0, len(reference_rows) + ) + current_global_rank = _global_rank_map(current_global_ids) + reference_global_rank = _global_rank_map(reference_global_ids) + + segments_cfg = stability_thresholds.get("segments", {}) + top_cfg = dict(segments_cfg.get("top_band", {})) + middle_cfg = dict(segments_cfg.get("middle_band", {})) + bottom_cfg = dict(segments_cfg.get("bottom_band", {})) + full_dataset_cfg = dict(stability_thresholds.get("full_dataset", {})) + + top_n_cfg = int(top_cfg.get("top_n", top_n_effective)) + top_count = top_n_effective if top_n > 0 else top_n_cfg + top_current_ids = _segment_identities(current_rows, current_identity_map, 0, top_count) + top_reference_ids = _segment_identities( + reference_rows, reference_identity_map, 0, top_count + ) + top_band = _evaluate_segment_stability( + segment_name="top_band", + current_ids=top_current_ids, + reference_ids=top_reference_ids, + current_global_rank=current_global_rank, + reference_global_rank=reference_global_rank, + thresholds=top_cfg, + severity="fail", + ) + perturbation_deltas = [-0.10, -0.05, 0.05, 0.10] perturbation_overlap_min, perturbation_details = _compute_perturbation_overlap( - current_rows, top_n=top_n, deltas=perturbation_deltas + current_rows, top_n=top_count, deltas=perturbation_deltas ) + top_band["metrics"]["perturbation_overlap_min"] = perturbation_overlap_min + top_band["metrics"]["perturbation_checks"] = perturbation_details + perturbation_threshold = float(top_cfg.get("perturbation_overlap_min", 0.60)) + top_band["thresholds"]["perturbation_overlap_min"] = perturbation_threshold + if perturbation_overlap_min < perturbation_threshold: + top_band["status"] = "fail" + top_band["violation_details"]["failed_checks"].append( + f"perturbation_overlap_below_min:{perturbation_overlap_min}<{perturbation_threshold}" + ) - top_n_jaccard_min = float(stability_thresholds.get("top20_jaccard_min", 0.70)) - rank_correlation_min = 
float(stability_thresholds.get("rank_correlation_min", 0.80)) - perturbation_overlap_min_threshold = float( - stability_thresholds.get("perturbation_overlap_min", 0.60) + middle_start_pct = float(middle_cfg.get("start_pct", 0.45)) + middle_end_pct = float(middle_cfg.get("end_pct", 0.60)) + middle_current_bounds = _segment_bounds(len(current_rows), middle_start_pct, middle_end_pct) + middle_reference_bounds = _segment_bounds( + len(reference_rows), middle_start_pct, middle_end_pct ) - stability_pass = ( - top_n_jaccard >= top_n_jaccard_min - and rank_corr >= rank_correlation_min - and perturbation_overlap_min >= perturbation_overlap_min_threshold + middle_band = _evaluate_segment_stability( + segment_name="middle_band", + current_ids=_segment_identities( + current_rows, + current_identity_map, + middle_current_bounds[0], + middle_current_bounds[1], + ), + reference_ids=_segment_identities( + reference_rows, + reference_identity_map, + middle_reference_bounds[0], + middle_reference_bounds[1], + ), + current_global_rank=current_global_rank, + reference_global_rank=reference_global_rank, + thresholds=middle_cfg, + severity="warn", + ) + + bottom_start_pct = float(bottom_cfg.get("start_pct", 0.85)) + bottom_end_pct = float(bottom_cfg.get("end_pct", 1.00)) + bottom_current_bounds = _segment_bounds(len(current_rows), bottom_start_pct, bottom_end_pct) + bottom_reference_bounds = _segment_bounds( + len(reference_rows), bottom_start_pct, bottom_end_pct ) + bottom_band = _evaluate_segment_stability( + segment_name="bottom_band", + current_ids=_segment_identities( + current_rows, + current_identity_map, + bottom_current_bounds[0], + bottom_current_bounds[1], + ), + reference_ids=_segment_identities( + reference_rows, + reference_identity_map, + bottom_reference_bounds[0], + bottom_reference_bounds[1], + ), + current_global_rank=current_global_rank, + reference_global_rank=reference_global_rank, + thresholds=bottom_cfg, + severity="warn", + ) + + full_displacement = 
_rank_displacement_metrics( + current_global_ids, + reference_global_ids, + current_global_rank, + reference_global_rank, + ) + full_warn_checks: list[str] = [] + full_median_warn_max = float(full_dataset_cfg.get("median_abs_rank_shift_warn_max", 200)) + full_p90_warn_max = float(full_dataset_cfg.get("p90_rank_shift_warn_max", 1000)) + if full_displacement["median_abs_rank_shift"] > full_median_warn_max: + full_warn_checks.append( + "median_abs_rank_shift_above_warn_max:" + f"{full_displacement['median_abs_rank_shift']}>{full_median_warn_max}" + ) + if full_displacement["p90_rank_shift"] > full_p90_warn_max: + full_warn_checks.append( + f"p90_rank_shift_above_warn_max:{full_displacement['p90_rank_shift']}>{full_p90_warn_max}" + ) + full_dataset_status = "warn" if full_warn_checks else "pass" + full_dataset_metrics = { + "intersection_count": int(full_displacement["intersection_count"]), + "median_abs_rank_shift": full_displacement["median_abs_rank_shift"], + "p90_rank_shift": full_displacement["p90_rank_shift"], + "thresholds": { + "median_abs_rank_shift_warn_max": full_median_warn_max, + "p90_rank_shift_warn_max": full_p90_warn_max, + }, + "status": full_dataset_status, + "violation_details": {"failed_checks": full_warn_checks}, + } + + if middle_band["status"] == "warn": + stability_warning_keys.append("stability_middle_band") + if bottom_band["status"] == "warn": + stability_warning_keys.append("stability_bottom_band") + if full_dataset_status == "warn": + stability_warning_keys.append("stability_full_dataset") + + has_top_fail = top_band["status"] == "fail" + has_warnings = bool(stability_warning_keys) stability_gate = { - "status": "pass" if stability_pass else "fail", + "status": "fail" if has_top_fail else ("warn" if has_warnings else "pass"), "metrics": { - "top_n_jaccard": top_n_jaccard, - "rank_correlation": rank_corr, - "perturbation_overlap_min": perturbation_overlap_min, - "perturbation_checks": perturbation_details, + "segments": { + "top_band": 
top_band, + "middle_band": middle_band, + "bottom_band": bottom_band, + }, + "full_dataset": full_dataset_metrics, }, "thresholds": { - "top_n_jaccard_min": top_n_jaccard_min, - "rank_correlation_min": rank_correlation_min, - "perturbation_overlap_min": perturbation_overlap_min_threshold, + "segments": segments_cfg, + "full_dataset": full_dataset_cfg, }, } @@ -567,6 +697,7 @@ def run_scoring_evaluation( failed_gate_keys.append(gate_key) elif gate_payload["status"] == "warn": warning_gate_keys.append(gate_key) + warning_gate_keys.extend(stability_warning_keys) decision: str decision_reasons: list[str] = [] @@ -584,7 +715,6 @@ def run_scoring_evaluation( decision = "promote" decision_reasons.append("All required gates passed.") - # continue here recommended_next_actions: list[str] = [] if decision == "revert": recommended_next_actions.append("Rollback scoring profile changes and review failed gates.") @@ -608,7 +738,7 @@ def run_scoring_evaluation( "model_version": model_version, "timestamp_utc": evaluation_time_utc.isoformat(), "sample_size": len(current_rows), - "top_n": top_n, + "top_n": top_n_effective, "gates": { "data_quality": data_quality_gate, "scoring_sanity": scoring_sanity_gate, diff --git a/backend/tests/test_scoring_algorithms.py b/backend/tests/test_scoring_algorithms.py index 1ca76d6..01c72df 100644 --- a/backend/tests/test_scoring_algorithms.py +++ b/backend/tests/test_scoring_algorithms.py @@ -58,7 +58,9 @@ def test_load_scoring_config_returns_defaults_without_config_file( "province", "global", ] - assert config["evaluation_thresholds"]["stability"]["top20_jaccard_min"] == 0.7 + assert ( + config["evaluation_thresholds"]["stability"]["segments"]["top_band"]["jaccard_min"] == 0.7 + ) def test_load_scoring_config_deep_merges_advanced_v2_and_threshold_overrides( @@ -78,7 +80,9 @@ def test_load_scoring_config_deep_merges_advanced_v2_and_threshold_overrides( " transaction_cost_pct: 0.1", "evaluation_thresholds:", " stability:", - " rank_correlation_min: 
0.85", + " segments:", + " top_band:", + " rank_correlation_min: 0.85", ] ), encoding="utf-8", @@ -97,8 +101,13 @@ def test_load_scoring_config_deep_merges_advanced_v2_and_threshold_overrides( ] assert config["advanced_v2"]["roi"]["transaction_cost_pct"] == 0.1 assert config["advanced_v2"]["roi"]["maintenance_pct"] == 0.04 - assert config["evaluation_thresholds"]["stability"]["rank_correlation_min"] == 0.85 - assert config["evaluation_thresholds"]["stability"]["top20_jaccard_min"] == 0.7 + assert ( + config["evaluation_thresholds"]["stability"]["segments"]["top_band"]["rank_correlation_min"] + == 0.85 + ) + assert ( + config["evaluation_thresholds"]["stability"]["segments"]["top_band"]["jaccard_min"] == 0.7 + ) def test_signal_neutral_defaults_when_inputs_missing() -> None: @@ -354,8 +363,12 @@ def test_run_scoring_job_uses_advanced_v2_when_flags_enabled( }, "evaluation_thresholds": { "stability": { - "top20_jaccard_min": 0.7, - "rank_correlation_min": 0.8, + "segments": { + "top_band": { + "jaccard_min": 0.7, + "rank_correlation_min": 0.8, + } + }, } }, }, diff --git a/backend/tests/test_scoring_v2_evaluation.py b/backend/tests/test_scoring_v2_evaluation.py index cf20f6d..1b61c4d 100644 --- a/backend/tests/test_scoring_v2_evaluation.py +++ b/backend/tests/test_scoring_v2_evaluation.py @@ -1,9 +1,12 @@ from __future__ import annotations +import copy + from app.models.ingestion_job import IngestionJob from app.models.listing import Listing from app.models.raw_listing import RawListing from app.models.score_result import ScoreResult +from app.services import scoring_evaluation from app.services.scoring_evaluation import run_scoring_evaluation from sqlalchemy.orm import Session @@ -120,6 +123,17 @@ def test_scoring_evaluation_promotes_when_all_gates_pass(db_session: Session) -> assert report["failed_gates"] == [] assert report["warning_gates"] == [] assert report["gates"]["stability"]["status"] == "pass" + stability = report["gates"]["stability"]["metrics"] + assert 
"segments" in stability + assert "top_band" in stability["segments"] + assert "middle_band" in stability["segments"] + assert "bottom_band" in stability["segments"] + assert "full_dataset" in stability + assert "thresholds" in stability["segments"]["top_band"] + assert "thresholds" in stability["segments"]["middle_band"] + assert "thresholds" in stability["segments"]["bottom_band"] + assert "median_abs_rank_shift" in stability["segments"]["top_band"]["metrics"] + assert "p90_rank_shift" in stability["segments"]["top_band"]["metrics"] def test_scoring_evaluation_reverts_when_stability_fails(db_session: Session) -> None: @@ -152,3 +166,84 @@ def test_scoring_evaluation_marks_experimental_when_sample_too_small( assert report["decision"] == "experimental" assert "sample_size" in report["warning_gates"] + + +def test_top_band_displacement_threshold_can_fail(db_session: Session, monkeypatch) -> None: + current_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True) + reference_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True) + + base_config = scoring_evaluation._load_scoring_config() + strict_config = copy.deepcopy(base_config) + strict_config["evaluation_thresholds"]["stability"]["segments"]["top_band"][ + "median_abs_rank_shift_max" + ] = 0 + strict_config["evaluation_thresholds"]["stability"]["segments"]["top_band"][ + "p90_rank_shift_max" + ] = 0 + monkeypatch.setattr(scoring_evaluation, "_load_scoring_config", lambda: strict_config) + + # Reversing top 20 introduces displacement while keeping enough overlap. 
+ top_rows = ( + db_session.query(ScoreResult) + .filter(ScoreResult.job_id == current_job.id) + .order_by(ScoreResult.score.desc(), ScoreResult.listing_id.asc()) + .limit(20) + .all() + ) + for idx, row in enumerate(top_rows): + row.score = 99.99 - (19 - idx) + db_session.commit() + + report = run_scoring_evaluation( + db_session, + job_id=current_job.id, + reference_job_id=reference_job.id, + top_n=20, + ) + assert report["decision"] == "revert" + assert "stability" in report["failed_gates"] + + +def test_full_dataset_displacement_warning_is_context_only( + db_session: Session, monkeypatch +) -> None: + current_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True) + reference_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True) + + base_config = scoring_evaluation._load_scoring_config() + warn_config = copy.deepcopy(base_config) + warn_config["evaluation_thresholds"]["stability"]["full_dataset"][ + "median_abs_rank_shift_warn_max" + ] = -1 + warn_config["evaluation_thresholds"]["stability"]["full_dataset"][ + "p90_rank_shift_warn_max" + ] = -1 + # Keep top thresholds permissive so warning is context-only. + warn_config["evaluation_thresholds"]["stability"]["segments"]["top_band"][ + "median_abs_rank_shift_max" + ] = 10_000 + warn_config["evaluation_thresholds"]["stability"]["segments"]["top_band"][ + "p90_rank_shift_max" + ] = 10_000 + monkeypatch.setattr(scoring_evaluation, "_load_scoring_config", lambda: warn_config) + + # Introduce non-top ordering movement so full-dataset displacement becomes non-zero. 
+ ordered_rows = ( + db_session.query(ScoreResult) + .filter(ScoreResult.job_id == current_job.id) + .order_by(ScoreResult.score.desc(), ScoreResult.listing_id.asc()) + .all() + ) + ordered_rows[60].score, ordered_rows[80].score = ordered_rows[80].score, ordered_rows[60].score + db_session.commit() + + report = run_scoring_evaluation( + db_session, + job_id=current_job.id, + reference_job_id=reference_job.id, + top_n=20, + ) + assert report["decision"] == "experimental" + assert "stability_full_dataset" in report["warning_gates"] + assert report["gates"]["stability"]["metrics"]["full_dataset"]["status"] == "warn" + assert "stability" not in report["failed_gates"] diff --git a/config/scoring.yaml b/config/scoring.yaml index 74f688b..c848bf7 100644 --- a/config/scoring.yaml +++ b/config/scoring.yaml @@ -57,8 +57,31 @@ evaluation_thresholds: signal_dominance_cap: 0.70 high_score_cutoff: 80.0 stability: - top20_jaccard_min: 0.7 - rank_correlation_min: 0.8 - perturbation_overlap_min: 0.6 + segments: + top_band: + mode: top_n + top_n: 20 + jaccard_min: 0.70 + rank_correlation_min: 0.80 + perturbation_overlap_min: 0.60 + median_abs_rank_shift_max: 30 + p90_rank_shift_max: 120 + middle_band: + start_pct: 0.45 + end_pct: 0.60 + jaccard_warn_min: 0.30 + rank_correlation_warn_min: 0.50 + median_abs_rank_shift_warn_max: 250 + p90_rank_shift_warn_max: 1200 + bottom_band: + start_pct: 0.85 + end_pct: 1.00 + jaccard_warn_min: 0.25 + rank_correlation_warn_min: 0.40 + median_abs_rank_shift_warn_max: 300 + p90_rank_shift_warn_max: 1500 + full_dataset: + median_abs_rank_shift_warn_max: 200 + p90_rank_shift_warn_max: 1000 decision: minimum_sample_for_promote: 100 From c81c4260fdbf1ee0afc05165540a763787714356 Mon Sep 17 00:00:00 2001 From: William Date: Wed, 22 Apr 2026 14:28:16 +0200 Subject: [PATCH 02/16] feat: implement segment-based stability thresholds in scoring evaluation - Introduced segment-based thresholds for scoring evaluation, defining metrics for `top_band`, 
`middle_band`, and `bottom_band` to enhance diagnostic capabilities. - Updated scoring configuration to reflect new stability metrics, including Jaccard and rank correlation thresholds for each segment. - Enhanced tests to validate the integration of segment thresholds and ensure correct evaluation reporting. - Improved documentation to clarify the purpose and structure of the new segment-based stability diagnostics. From 6d7597df8f3f0fb4c8edfaee0bd39e51631cbcb5 Mon Sep 17 00:00:00 2001 From: William Date: Wed, 22 Apr 2026 15:24:17 +0200 Subject: [PATCH 03/16] feat: update scoring evaluation metrics to include percentage-based thresholds - Added percentage-based metrics for median absolute rank shift and p90 rank shift to enhance evaluation sensitivity. - Updated scoring configuration and evaluation logic to incorporate new percentage thresholds for segment stability checks. - Adjusted tests to validate the integration of percentage-based metrics and ensure correct evaluation reporting. - Enhanced documentation to clarify the purpose and structure of the new percentage-based metrics in scoring evaluation. 
--- backend/app/services/scoring.py | 16 ++--- backend/app/services/scoring_evaluation.py | 67 +++++++++++++-------- backend/tests/test_scoring_v2_evaluation.py | 14 +++-- config/scoring.yaml | 16 ++--- 4 files changed, 67 insertions(+), 46 deletions(-) diff --git a/backend/app/services/scoring.py b/backend/app/services/scoring.py index eb13ecf..591cd72 100644 --- a/backend/app/services/scoring.py +++ b/backend/app/services/scoring.py @@ -80,29 +80,29 @@ "jaccard_min": 0.7, "rank_correlation_min": 0.8, "perturbation_overlap_min": 0.6, - "median_abs_rank_shift_max": 30, - "p90_rank_shift_max": 120, + "median_abs_rank_shift_pct_max": 0.15, + "p90_rank_shift_pct_max": 0.60, }, "middle_band": { "start_pct": 0.45, "end_pct": 0.60, "jaccard_warn_min": 0.3, "rank_correlation_warn_min": 0.5, - "median_abs_rank_shift_warn_max": 250, - "p90_rank_shift_warn_max": 1200, + "median_abs_rank_shift_pct_warn_max": 0.45, + "p90_rank_shift_pct_warn_max": 0.85, }, "bottom_band": { "start_pct": 0.85, "end_pct": 1.00, "jaccard_warn_min": 0.25, "rank_correlation_warn_min": 0.4, - "median_abs_rank_shift_warn_max": 300, - "p90_rank_shift_warn_max": 1500, + "median_abs_rank_shift_pct_warn_max": 0.50, + "p90_rank_shift_pct_warn_max": 0.90, }, }, "full_dataset": { - "median_abs_rank_shift_warn_max": 200, - "p90_rank_shift_warn_max": 1000, + "median_abs_rank_shift_pct_warn_max": 0.35, + "p90_rank_shift_pct_warn_max": 0.80, }, }, "decision": { diff --git a/backend/app/services/scoring_evaluation.py b/backend/app/services/scoring_evaluation.py index f864931..18ad17a 100644 --- a/backend/app/services/scoring_evaluation.py +++ b/backend/app/services/scoring_evaluation.py @@ -248,16 +248,23 @@ def _rank_displacement_metrics( "intersection_count": 0.0, "median_abs_rank_shift": 0.0, "p90_rank_shift": 0.0, + "median_abs_rank_shift_pct": 0.0, + "p90_rank_shift_pct": 0.0, } shifts = [ float(abs(current_global_rank[listing_id] - reference_global_rank[listing_id])) for listing_id in shared_ids ] + 
rank_span = max(len(current_global_rank), len(reference_global_rank)) - 1 + rank_span = max(rank_span, 1) + shift_pcts = [shift / rank_span for shift in shifts] return { "intersection_count": float(len(shared_ids)), "median_abs_rank_shift": round(float(median(shifts)), 4), "p90_rank_shift": round(_percentile(shifts, 0.90), 4), + "median_abs_rank_shift_pct": round(float(median(shift_pcts)), 6), + "p90_rank_shift_pct": round(_percentile(shift_pcts, 0.90), 6), } @@ -289,16 +296,18 @@ def _evaluate_segment_stability( rank_corr_min = thresholds.get("rank_correlation_min") if rank_corr_min is not None and rank_correlation < float(rank_corr_min): failed_checks.append(f"rank_corr_below_min:{rank_correlation}<{rank_corr_min}") - median_max = thresholds.get("median_abs_rank_shift_max") - if median_max is not None and displacement["median_abs_rank_shift"] > float(median_max): + median_pct_max = thresholds.get("median_abs_rank_shift_pct_max") + if median_pct_max is not None and displacement["median_abs_rank_shift_pct"] > float( + median_pct_max + ): failed_checks.append( - "median_abs_rank_shift_above_max:" - f"{displacement['median_abs_rank_shift']}>{median_max}" + "median_abs_rank_shift_pct_above_max:" + f"{displacement['median_abs_rank_shift_pct']}>{median_pct_max}" ) - p90_max = thresholds.get("p90_rank_shift_max") - if p90_max is not None and displacement["p90_rank_shift"] > float(p90_max): + p90_pct_max = thresholds.get("p90_rank_shift_pct_max") + if p90_pct_max is not None and displacement["p90_rank_shift_pct"] > float(p90_pct_max): failed_checks.append( - f"p90_rank_shift_above_max:{displacement['p90_rank_shift']}>{p90_max}" + f"p90_rank_shift_pct_above_max:{displacement['p90_rank_shift_pct']}>{p90_pct_max}" ) else: jaccard_warn_min = thresholds.get("jaccard_warn_min") @@ -309,18 +318,21 @@ def _evaluate_segment_stability( failed_checks.append( f"rank_corr_below_warn_min:{rank_correlation}<{rank_corr_warn_min}" ) - median_warn_max = 
thresholds.get("median_abs_rank_shift_warn_max") - if median_warn_max is not None and displacement["median_abs_rank_shift"] > float( - median_warn_max + median_pct_warn_max = thresholds.get("median_abs_rank_shift_pct_warn_max") + if median_pct_warn_max is not None and displacement["median_abs_rank_shift_pct"] > float( + median_pct_warn_max ): failed_checks.append( - "median_abs_rank_shift_above_warn_max:" - f"{displacement['median_abs_rank_shift']}>{median_warn_max}" + "median_abs_rank_shift_pct_above_warn_max:" + f"{displacement['median_abs_rank_shift_pct']}>{median_pct_warn_max}" ) - p90_warn_max = thresholds.get("p90_rank_shift_warn_max") - if p90_warn_max is not None and displacement["p90_rank_shift"] > float(p90_warn_max): + p90_pct_warn_max = thresholds.get("p90_rank_shift_pct_warn_max") + if p90_pct_warn_max is not None and displacement["p90_rank_shift_pct"] > float( + p90_pct_warn_max + ): failed_checks.append( - f"p90_rank_shift_above_warn_max:{displacement['p90_rank_shift']}>{p90_warn_max}" + "p90_rank_shift_pct_above_warn_max:" + f"{displacement['p90_rank_shift_pct']}>{p90_pct_warn_max}" ) status = "pass" if not failed_checks else severity @@ -335,6 +347,8 @@ def _evaluate_segment_stability( "rank_correlation": rank_correlation, "median_abs_rank_shift": displacement["median_abs_rank_shift"], "p90_rank_shift": displacement["p90_rank_shift"], + "median_abs_rank_shift_pct": displacement["median_abs_rank_shift_pct"], + "p90_rank_shift_pct": displacement["p90_rank_shift_pct"], }, "thresholds": thresholds, "violation_details": {"failed_checks": failed_checks}, @@ -635,25 +649,30 @@ def run_scoring_evaluation( reference_global_rank, ) full_warn_checks: list[str] = [] - full_median_warn_max = float(full_dataset_cfg.get("median_abs_rank_shift_warn_max", 200)) - full_p90_warn_max = float(full_dataset_cfg.get("p90_rank_shift_warn_max", 1000)) - if full_displacement["median_abs_rank_shift"] > full_median_warn_max: + full_median_pct_warn_max = float( + 
full_dataset_cfg.get("median_abs_rank_shift_pct_warn_max", 0.35) + ) + full_p90_pct_warn_max = float(full_dataset_cfg.get("p90_rank_shift_pct_warn_max", 0.80)) + if full_displacement["median_abs_rank_shift_pct"] > full_median_pct_warn_max: full_warn_checks.append( - "median_abs_rank_shift_above_warn_max:" - f"{full_displacement['median_abs_rank_shift']}>{full_median_warn_max}" + "median_abs_rank_shift_pct_above_warn_max:" + f"{full_displacement['median_abs_rank_shift_pct']}>{full_median_pct_warn_max}" ) - if full_displacement["p90_rank_shift"] > full_p90_warn_max: + if full_displacement["p90_rank_shift_pct"] > full_p90_pct_warn_max: full_warn_checks.append( - f"p90_rank_shift_above_warn_max:{full_displacement['p90_rank_shift']}>{full_p90_warn_max}" + "p90_rank_shift_pct_above_warn_max:" + f"{full_displacement['p90_rank_shift_pct']}>{full_p90_pct_warn_max}" ) full_dataset_status = "warn" if full_warn_checks else "pass" full_dataset_metrics = { "intersection_count": int(full_displacement["intersection_count"]), "median_abs_rank_shift": full_displacement["median_abs_rank_shift"], "p90_rank_shift": full_displacement["p90_rank_shift"], + "median_abs_rank_shift_pct": full_displacement["median_abs_rank_shift_pct"], + "p90_rank_shift_pct": full_displacement["p90_rank_shift_pct"], "thresholds": { - "median_abs_rank_shift_warn_max": full_median_warn_max, - "p90_rank_shift_warn_max": full_p90_warn_max, + "median_abs_rank_shift_pct_warn_max": full_median_pct_warn_max, + "p90_rank_shift_pct_warn_max": full_p90_pct_warn_max, }, "status": full_dataset_status, "violation_details": {"failed_checks": full_warn_checks}, diff --git a/backend/tests/test_scoring_v2_evaluation.py b/backend/tests/test_scoring_v2_evaluation.py index 1b61c4d..c5a43c1 100644 --- a/backend/tests/test_scoring_v2_evaluation.py +++ b/backend/tests/test_scoring_v2_evaluation.py @@ -168,6 +168,7 @@ def test_scoring_evaluation_marks_experimental_when_sample_too_small( assert "sample_size" in report["warning_gates"] 
+# todo: verify logic is correct def test_top_band_displacement_threshold_can_fail(db_session: Session, monkeypatch) -> None: current_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True) reference_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True) @@ -175,10 +176,10 @@ def test_top_band_displacement_threshold_can_fail(db_session: Session, monkeypat base_config = scoring_evaluation._load_scoring_config() strict_config = copy.deepcopy(base_config) strict_config["evaluation_thresholds"]["stability"]["segments"]["top_band"][ - "median_abs_rank_shift_max" + "median_abs_rank_shift_pct_max" ] = 0 strict_config["evaluation_thresholds"]["stability"]["segments"]["top_band"][ - "p90_rank_shift_max" + "p90_rank_shift_pct_max" ] = 0 monkeypatch.setattr(scoring_evaluation, "_load_scoring_config", lambda: strict_config) @@ -204,6 +205,7 @@ def test_top_band_displacement_threshold_can_fail(db_session: Session, monkeypat assert "stability" in report["failed_gates"] +# todo: verify logic is correct def test_full_dataset_displacement_warning_is_context_only( db_session: Session, monkeypatch ) -> None: @@ -213,17 +215,17 @@ def test_full_dataset_displacement_warning_is_context_only( base_config = scoring_evaluation._load_scoring_config() warn_config = copy.deepcopy(base_config) warn_config["evaluation_thresholds"]["stability"]["full_dataset"][ - "median_abs_rank_shift_warn_max" + "median_abs_rank_shift_pct_warn_max" ] = -1 warn_config["evaluation_thresholds"]["stability"]["full_dataset"][ - "p90_rank_shift_warn_max" + "p90_rank_shift_pct_warn_max" ] = -1 # Keep top thresholds permissive so warning is context-only. 
warn_config["evaluation_thresholds"]["stability"]["segments"]["top_band"][ - "median_abs_rank_shift_max" + "median_abs_rank_shift_pct_max" ] = 10_000 warn_config["evaluation_thresholds"]["stability"]["segments"]["top_band"][ - "p90_rank_shift_max" + "p90_rank_shift_pct_max" ] = 10_000 monkeypatch.setattr(scoring_evaluation, "_load_scoring_config", lambda: warn_config) diff --git a/config/scoring.yaml b/config/scoring.yaml index c848bf7..d79b76b 100644 --- a/config/scoring.yaml +++ b/config/scoring.yaml @@ -64,24 +64,24 @@ evaluation_thresholds: jaccard_min: 0.70 rank_correlation_min: 0.80 perturbation_overlap_min: 0.60 - median_abs_rank_shift_max: 30 - p90_rank_shift_max: 120 + median_abs_rank_shift_pct_max: 0.15 + p90_rank_shift_pct_max: 0.60 middle_band: start_pct: 0.45 end_pct: 0.60 jaccard_warn_min: 0.30 rank_correlation_warn_min: 0.50 - median_abs_rank_shift_warn_max: 250 - p90_rank_shift_warn_max: 1200 + median_abs_rank_shift_pct_warn_max: 0.45 + p90_rank_shift_pct_warn_max: 0.85 bottom_band: start_pct: 0.85 end_pct: 1.00 jaccard_warn_min: 0.25 rank_correlation_warn_min: 0.40 - median_abs_rank_shift_warn_max: 300 - p90_rank_shift_warn_max: 1500 + median_abs_rank_shift_pct_warn_max: 0.50 + p90_rank_shift_pct_warn_max: 0.90 full_dataset: - median_abs_rank_shift_warn_max: 200 - p90_rank_shift_warn_max: 1000 + median_abs_rank_shift_pct_warn_max: 0.35 + p90_rank_shift_pct_warn_max: 0.80 decision: minimum_sample_for_promote: 100 From b487937b0b4b7fec5697c2e7825e0a61639b93c9 Mon Sep 17 00:00:00 2001 From: William Date: Thu, 23 Apr 2026 11:49:54 +0200 Subject: [PATCH 04/16] feat: refine scoring evaluation metrics with segment overlap and displacement measures - Updated implementation steps to include segment overlap and rank correlation metrics for `top_band`, `middle_band`, and `bottom_band`. - Introduced global rank displacement metrics to enhance evaluation sensitivity and context. 
- Adjusted stability thresholds and auditability requirements in the scoring configuration. - Enhanced documentation to reflect the new metrics and their implications for scoring evaluation. --- docs/week-2-execution-plan.md | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/docs/week-2-execution-plan.md b/docs/week-2-execution-plan.md index c041932..0f93f4e 100644 --- a/docs/week-2-execution-plan.md +++ b/docs/week-2-execution-plan.md @@ -65,16 +65,23 @@ What this means (simple): - Stability checks tell us whether ranking behavior is reliable enough to ship. Implementation steps: -1. Add top-N overlap metric: - - compute Jaccard overlap between current top-20 and reference top-20 - - compare against threshold (default `top20_jaccard_min`) -2. Add rank correlation metric: - - compute full-list rank correlation vs reference ranking - - compare against threshold (default `rank_correlation_min`) -3. Add weight perturbation sensitivity: +1. Add segment overlap + ordering metrics: + - compute Jaccard overlap and rank correlation for: + - `top_band` (critical) + - `middle_band` (warning) + - `bottom_band` (warning) + - compare against config thresholds under `evaluation_thresholds.stability.segments` +2. Add rank displacement metrics (global-rank based): + - compute `median_abs_rank_shift` and `p90_rank_shift` for shared listings + - compute normalized forms `median_abs_rank_shift_pct` and `p90_rank_shift_pct` + - gate on normalized (`*_pct`) thresholds so behavior scales across dataset sizes +3. Add weight perturbation sensitivity for top band: - run controlled +/-5% to +/-10% weight perturbation experiments - - measure whether top ranking collapses or shifts beyond acceptable limits -4. Store all stability metrics in the same evaluation artifact for auditability. + - measure whether top ranking collapses beyond acceptable limits +4. 
Add full-dataset displacement context: + - compute global displacement metrics for full dataset + - treat full-dataset displacement breaches as warning-level context +5. Store all stability metrics in the same evaluation artifact for auditability. ### Phase 3.3 Release Decision Output (promote/revert/experimental) @@ -116,9 +123,9 @@ Create/modify these files during implementation: - Modify: `config/scoring.yaml` - Add/confirm: - data quality thresholds (valid/duplicate/null rate) - - stability thresholds (top-N overlap, rank correlation) + - stability thresholds (segment overlap, rank correlation, displacement) - sensitivity thresholds (acceptable perturbation drift) - - gate severity mapping (critical vs warning) + - gate severity mapping (critical `top_band` vs warning-level non-top/full-dataset) 4. **CLI entrypoint (CLI-first workflow)** - Modify: `backend/app/cli.py` @@ -134,7 +141,7 @@ Create/modify these files during implementation: - `backend/tests/test_scoring_service.py` - `backend/tests/test_scoring_v2_evaluation.py` (new) - Cover: - - metric correctness for top-N overlap and rank correlation + - metric correctness for segment overlap/correlation/displacement - perturbation sensitivity behavior for stable vs unstable cases - decision classification (`promote`/`revert`/`experimental`) - artifact shape and required fields From f4a30cfbe58841dd1df0678bb0decf05e48df55f Mon Sep 17 00:00:00 2001 From: William Date: Thu, 23 Apr 2026 12:41:58 +0200 Subject: [PATCH 05/16] feat: add new tests for rank displacement metrics and perturbation thresholds - Introduced tests for rank displacement metrics, validating expected values for intersection count and rank shifts. - Added tests for top-band perturbation thresholds, ensuring correct evaluation outcomes for both pass and fail scenarios. - Updated existing tests to remove outdated comments and improve clarity on evaluation logic. 
- Enhanced documentation to reflect the new tests and their significance in scoring evaluation. --- backend/tests/test_scoring_v2_evaluation.py | 102 +++++++++++++++++++- docs/evaluation-review-protocol.md | 18 ++-- docs/week2-implementation-playbook.md | 11 ++- docs/week2-interface-contract.md | 9 +- 4 files changed, 125 insertions(+), 15 deletions(-) diff --git a/backend/tests/test_scoring_v2_evaluation.py b/backend/tests/test_scoring_v2_evaluation.py index c5a43c1..395258a 100644 --- a/backend/tests/test_scoring_v2_evaluation.py +++ b/backend/tests/test_scoring_v2_evaluation.py @@ -168,7 +168,6 @@ def test_scoring_evaluation_marks_experimental_when_sample_too_small( assert "sample_size" in report["warning_gates"] -# todo: verify logic is correct def test_top_band_displacement_threshold_can_fail(db_session: Session, monkeypatch) -> None: current_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True) reference_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True) @@ -204,8 +203,6 @@ def test_top_band_displacement_threshold_can_fail(db_session: Session, monkeypat assert report["decision"] == "revert" assert "stability" in report["failed_gates"] - -# todo: verify logic is correct def test_full_dataset_displacement_warning_is_context_only( db_session: Session, monkeypatch ) -> None: @@ -249,3 +246,102 @@ def test_full_dataset_displacement_warning_is_context_only( assert "stability_full_dataset" in report["warning_gates"] assert report["gates"]["stability"]["metrics"]["full_dataset"]["status"] == "warn" assert "stability" not in report["failed_gates"] + + +def test_rank_displacement_metrics_computes_expected_values() -> None: + current_ids = ["A", "B", "C", "D"] + reference_ids = ["B", "A", "C", "E"] + current_global_rank = { + "A": 1, + "B": 2, + "C": 3, + "D": 4, + "X": 5, + } + reference_global_rank = { + "B": 1, + "A": 2, + "C": 4, + "E": 3, + "X": 5, + } + + metrics = scoring_evaluation._rank_displacement_metrics( + 
current_ids=current_ids, + reference_ids=reference_ids, + current_global_rank=current_global_rank, + reference_global_rank=reference_global_rank, + ) + + # Shared IDs are A, B, C -> shifts are [1, 1, 1]. + assert metrics["intersection_count"] == 3.0 + assert metrics["median_abs_rank_shift"] == 1.0 + assert metrics["p90_rank_shift"] == 1.0 + # rank_span = max(5, 5) - 1 = 4 => each pct shift is 1/4 = 0.25 + assert metrics["median_abs_rank_shift_pct"] == 0.25 + assert metrics["p90_rank_shift_pct"] == 0.25 + + +def test_top_band_perturbation_threshold_can_fail(db_session: Session, monkeypatch) -> None: + current_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True) + reference_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True) + + base_config = scoring_evaluation._load_scoring_config() + strict_config = copy.deepcopy(base_config) + strict_config["evaluation_thresholds"]["stability"]["segments"]["top_band"][ + "perturbation_overlap_min" + ] = 0.90 + monkeypatch.setattr(scoring_evaluation, "_load_scoring_config", lambda: strict_config) + monkeypatch.setattr( + scoring_evaluation, + "_compute_perturbation_overlap", + lambda _rows, top_n, deltas: ( + 0.50, + [{"signal": "price_vs_comp", "delta": 0.10, "top_n_jaccard": 0.50}], + ), + ) + + report = run_scoring_evaluation( + db_session, + job_id=current_job.id, + reference_job_id=reference_job.id, + top_n=20, + ) + assert report["decision"] == "revert" + assert "stability" in report["failed_gates"] + top_band = report["gates"]["stability"]["metrics"]["segments"]["top_band"] + assert top_band["status"] == "fail" + assert any( + check.startswith("perturbation_overlap_below_min:") + for check in top_band["violation_details"]["failed_checks"] + ) + + +def test_top_band_perturbation_threshold_can_pass(db_session: Session, monkeypatch) -> None: + current_job = _seed_scored_job(db_session, sample_size=120, score_order_desc=True) + reference_job = _seed_scored_job(db_session, 
sample_size=120, score_order_desc=True) + + base_config = scoring_evaluation._load_scoring_config() + strict_config = copy.deepcopy(base_config) + strict_config["evaluation_thresholds"]["stability"]["segments"]["top_band"][ + "perturbation_overlap_min" + ] = 0.90 + monkeypatch.setattr(scoring_evaluation, "_load_scoring_config", lambda: strict_config) + monkeypatch.setattr( + scoring_evaluation, + "_compute_perturbation_overlap", + lambda _rows, top_n, deltas: ( + 0.95, + [{"signal": "price_vs_comp", "delta": 0.10, "top_n_jaccard": 0.95}], + ), + ) + + report = run_scoring_evaluation( + db_session, + job_id=current_job.id, + reference_job_id=reference_job.id, + top_n=20, + ) + assert report["decision"] == "promote" + top_band = report["gates"]["stability"]["metrics"]["segments"]["top_band"] + assert top_band["status"] == "pass" diff --git a/docs/evaluation-review-protocol.md b/docs/evaluation-review-protocol.md index b785120..1eed204 100644 --- a/docs/evaluation-review-protocol.md +++ b/docs/evaluation-review-protocol.md @@ -70,14 +70,20 @@ Suggested defaults (adjust later): ## 4.3 Stability gates -- top-N overlap vs previous version/run -- rank correlation across full set -- controlled sensitivity under weight perturbation (+/-5 to 10%) +- segment overlap vs previous version/run: + - top band (critical), middle band (warn), bottom band (warn) +- rank correlation per band +- global-rank displacement tracking: + - median abs rank shift + - p90 rank shift (tail movement) +- normalized displacement thresholds (`*_pct`) for dataset-size-aware gating +- controlled sensitivity under top-band weight perturbation (+/-5 to 10%) Suggested defaults: -- top20_jaccard >= 0.70 -- rank correlation >= 0.80 -- no severe rank collapse from minor weight changes +- top-band jaccard >= 0.70 +- top-band rank correlation >= 0.80 +- top-band perturbation overlap >= 0.60 +- non-top/full-dataset displacement threshold breaches are warning-level context ## 4.4 Explainability gates diff --git 
a/docs/week2-implementation-playbook.md b/docs/week2-implementation-playbook.md index d9a9ecb..c6f6327 100644 --- a/docs/week2-implementation-playbook.md +++ b/docs/week2-implementation-playbook.md @@ -199,10 +199,13 @@ Checkpoint: ## Phase 5: Evaluation/stability gates (Day 4) -1. Implement top-N overlap and rank correlation metrics. -2. Implement perturbation sensitivity checks. -3. Produce clear go/no-go decision artifact per run. -4. Add tests for metric calculation correctness. +1. Implement segment stability metrics: + - top/middle/bottom overlap + rank correlation checks +2. Implement global-rank displacement diagnostics: + - absolute metrics plus normalized `*_pct` thresholds for gating +3. Implement top-band perturbation sensitivity checks. +4. Produce clear go/no-go decision artifact per run. +5. Add tests for metric calculation correctness. Checkpoint: - scoring run outputs promotion recommendation deterministically. diff --git a/docs/week2-interface-contract.md b/docs/week2-interface-contract.md index 1d907de..28bca88 100644 --- a/docs/week2-interface-contract.md +++ b/docs/week2-interface-contract.md @@ -76,8 +76,13 @@ Phase 0 freeze references these default thresholds from: - `config/scoring.yaml` (`evaluation_thresholds`) Current defaults: -- `top20_jaccard_min = 0.70` -- `rank_correlation_min = 0.80` +- `evaluation_thresholds.stability.segments.top_band.jaccard_min = 0.70` +- `evaluation_thresholds.stability.segments.top_band.rank_correlation_min = 0.80` +- `evaluation_thresholds.stability.segments.top_band.perturbation_overlap_min = 0.60` +- `evaluation_thresholds.stability.segments.top_band.median_abs_rank_shift_pct_max = 0.15` +- `evaluation_thresholds.stability.segments.top_band.p90_rank_shift_pct_max = 0.60` +- `evaluation_thresholds.stability.full_dataset.median_abs_rank_shift_pct_warn_max = 0.35` +- `evaluation_thresholds.stability.full_dataset.p90_rank_shift_pct_warn_max = 0.80` These thresholds gate promote/revert decisions for Week 2 
scoring changes. From a89e4faee0d4464d25b762236ecc282f9ca23692 Mon Sep 17 00:00:00 2001 From: William Date: Thu, 23 Apr 2026 13:40:51 +0200 Subject: [PATCH 06/16] feat: update project documentation for Week 2 execution and interface contracts - Revised PROJECT_NOTE.md to reflect the finalized goals and deliverables for the advanced scoring system (`advanced_v2`), including evaluation gates and structured reasoning payloads. - Updated week2-implementation-playbook.md to clarify the scope and execution order, emphasizing the single source of truth for implementation details. - Enhanced week2-interface-contract.md to define output expectations and scope boundaries, ensuring clarity on in-scope and out-of-scope elements for Week 2. --- .cursor/rules/PROJECT_NOTE.md | 136 ++++---------------------- docs/week2-implementation-playbook.md | 30 ++---- docs/week2-interface-contract.md | 13 ++- 3 files changed, 35 insertions(+), 144 deletions(-) diff --git a/.cursor/rules/PROJECT_NOTE.md b/.cursor/rules/PROJECT_NOTE.md index a582f8b..9383fb4 100644 --- a/.cursor/rules/PROJECT_NOTE.md +++ b/.cursor/rules/PROJECT_NOTE.md @@ -1,16 +1,3 @@ -TODOS: - -- [x] First, create a few datasets for testing purposes -1. 1000 listings set from p24, and 1000 listings set from privateproperty -2. a dataset with both p24 and 1000 listings from privateproperty joined -- [ ] phase 3 in week-2-execution plan - -Unfinished prompts for phase 3 (scoring_evaluation.py): - -1. side note, if I am correct it should not just evaluate the top n, but also the mid n and bottom n. - -2. 
get progress report on phase 3 - # 🏠 Real Estate Deal Intelligence Platform (Full System) ## 🎯 Goal @@ -446,109 +433,26 @@ Deferred unless core goals are already complete: ### **Goal** -Ship an **ROI-first, explainable advanced scoring system** that improves ranking quality over the Week 1 baseline by: -- using **micro-comparables** (location/type/bed/bath segment medians, not a single dataset median), -- adding **rental yield + transaction-cost adjustments** (net-ish ROI proxy), -- producing a **reasoning/explanations payload** for every score (so results are inspectable), -- adding an **analytics engine** that can quantify scoring quality and data health, -- integrating **LLM enrichment** in a controlled, measurable way (only if it improves outcomes). - -### **Deliverables (Week 2)** - -#### **2.1 Advanced scoring system (v2)** - -- **Micro-comps pricing signals** - - Compute segmented medians / distributions for: - - `province/city/suburb` (use the deepest level with enough samples) - - `property_type` - - `bedrooms`, `bathrooms` (bucketed) - - Add fallbacks when segment sample size is too small (e.g., suburb → city → province → global). - - Replace baseline “single median” price deviation with: - - **price_vs_comp_median** (price deviation within the best-available segment) - - **price_per_sqm_vs_comp_median** (if floor_size available) - -- **ROI proxy signals** - - **Transaction-cost adjustment** - - Upfront costs modeled as configurable % or fixed schedule (kept in config). 
- - Optional LLM-assisted extraction path: - - infer additional upfront-cost signals from listing fields + description text - - emit `upfront_cost_estimate`, `cost_drivers`, and `confidence` - - use only when confidence is above threshold, otherwise fallback to deterministic config assumptions - - **Net yield proxy** - - Use available fields (`rates_and_taxes`, `levies`) + configurable assumptions: - - vacancy allowance %, maintenance %, management %, insurance (optional) - - Rent estimation approach for Week 2: - - **Phase 1 (required):** heuristic rent estimate (config-driven by `property_type`, `bedrooms`, `city/province` buckets) - - **Phase 2 (optional):** upgrade rent estimate via LLM/external data only if Phase 1 is weak - - Add a yield-derived score component such as: - - **net_yield_signal** and **payback_signal** (optional, time-boxed) - -- **Liquidity & risk adjustments** - - Keep time-on-market but improve it: - - use `date_posted` where available - - add a **stale inventory non-linear curve** (e.g., diminishing returns after N days) - - Penalize low-confidence or missing-critical-fields in a consistent way: - - separate **data_confidence** (completeness) from **investment_risk** (flags like auction/private seller if used) - -- **Scoring versioning** - - Output `model_version="advanced_v2"` (keep baseline runnable side-by-side). - - Ensure scoring is **idempotent** per job (overwrite results like Week 1). - -#### **2.2 Reasoning engine (explainability)** - -- Persist a structured explanation per listing score: - - top contributing signals with raw values and normalized scores - - “why this was ranked high/low” - - confidence and missing-field notes -- Output target: - - a single `deal_reason` string (short) - - plus a structured `explanation` JSON blob (machine-readable) for later UI. 
- -#### **2.3 Analytics engine (quality + insight)** - -- Implement job-level analytics for: - - score distribution (histogram bins, min/max/median, percentiles) - - top-N listing summaries (score + key drivers) - - missingness report for key fields that affect scoring - - comps coverage report: what % of listings got suburb-level comps vs city/province/global -- Add “ranking quality checks” (offline): - - sanity checks for pathological outcomes (e.g., missing price scored too high) - - stability checks when changing weights (top-N overlap) - -#### **2.4 LLM enrichment prototype (Week 2)** - -- **Purpose:** extract high-value structured variables from `description` to improve scoring. -- **Candidate variables (minimal set):** - - condition/renovation level (e.g., “newly renovated”, “needs TLC”) - - security/amenities not reliably structured (pool, inverter/solar, etc.) - - rental hints (furnished, “investment”, “tenant in place”) as weak signals - - upfront-cost hints (legal/levy/special conditions) for ROI proxy refinement -- **Integration approach (controlled):** - - store derived fields in a separate enrichment payload (do not overwrite canonical listing fields) - - feed enrichment into scoring only behind an **experiment flag** -- **Week 2 validation gate (must pass to enable by default):** - - improves top-N deal quality on offline evaluation metrics (see 2.5) - - does not significantly increase invalid/low-confidence scores - -#### **2.5 Evaluation + gates (scope control)** - -- Add a lightweight offline evaluation process: - - compare baseline_v1 vs advanced_v2 on: - - top-N stability and reason diversity - - fewer “unknown / missing data” in top ranks - - comps coverage improvements - - yield proxy sanity (high yield not correlated with missing price) -- **Decision gates:** - - only ship LLM-influenced scoring as default if it improves metrics and is stable - - otherwise keep LLM enrichment stored but not used in ranking - -### **Suggested implementation 
order** - -- Build micro-comps computation + comp-based pricing signals -- Add ROI proxy (transaction costs + net yield) -- Add reasoning payload format -- Add analytics summaries + evaluation scripts -- Add LLM enrichment prototype + validation gate +Ship an ROI-first, explainable scoring system (`advanced_v2`) with deterministic evaluation gates that decide promote/revert/experimental outcomes. + +### **Week 2 Source-of-Truth Docs (Updated)** + +- Canonical scope: `docs/week-2-execution-plan.md` +- Stability details: `docs/scoring-evaluation-middle-bottom-gating-spec.md` +- Evaluation policy: `docs/evaluation-review-protocol.md` +- Interface contract: `docs/week2-interface-contract.md` +- Implementation playbook: `docs/week2-implementation-playbook.md` + +### **Week 2 High-Level Deliverables** + +- Advanced scoring (`advanced_v2`) with micro-comps + ROI proxy signals. +- Structured reasoning payload (`deal_reason` + machine-readable `explanation`). +- Evaluation gates with deterministic release decisions: + - `promote` / `revert` / `experimental`. +- Segment-based stability checks: + - `top_band` (critical), `middle_band`/`bottom_band` (warning), + - full-dataset displacement context, + - relative displacement thresholds (`*_pct`) for dataset-size-aware gating. --- diff --git a/docs/week2-implementation-playbook.md b/docs/week2-implementation-playbook.md index c6f6327..ae2146b 100644 --- a/docs/week2-implementation-playbook.md +++ b/docs/week2-implementation-playbook.md @@ -1,13 +1,10 @@ # Week 2 Implementation Playbook -This is the single Week 2 implementation document that combines: -- plain-language intent (why each feature matters), -- code-level change map (what files to change), -- chronological execution order (how to implement safely). +This document is an implementation companion for Week 2. +It should not be used as the primary scope definition. 
-It merges and supersedes the intent of: +Canonical Week 2 scope and sequencing live in: - `docs/week-2-execution-plan.md` -- `docs/week2-advanced-scoring-explained.md` --- @@ -23,22 +20,13 @@ Do **not** include Week 3 dashboard/API strategy UX revamp in this branch. --- -## 2) Scope (In / Out) +## 2) Scope Reference -## In scope - -- Advanced scoring service (`advanced_v2`) -- Micro-comps signals with safe fallback -- ROI proxy signals (deterministic baseline) -- Structured reasoning payload -- Evaluation/stability gates and promotion decision output -- MVP performance safeguards for Week 2 query/compute paths - -## Out of scope (for this branch) - -- Full strategy-driven dashboard workflows -- Multi-provider external integrations -- Default-on LLM scoring influence (LLM remains optional/flagged) +Scope is intentionally not duplicated here. +Use `docs/week-2-execution-plan.md` as the single source of truth for: +- in-scope vs out-of-scope boundaries, +- phase ordering, +- done criteria. --- diff --git a/docs/week2-interface-contract.md b/docs/week2-interface-contract.md index 28bca88..7e16d06 100644 --- a/docs/week2-interface-contract.md +++ b/docs/week2-interface-contract.md @@ -1,12 +1,15 @@ # Week 2 Interface Contract -This document freezes the Week 2 output interface expectations before implementation work proceeds. +This document defines Week 2 interface contracts only. It defines: - `advanced_v2` scoring output contract, - structured explanation payload contract, - promotion threshold references. +Canonical Week 2 scope and execution order live in: +- `docs/week-2-execution-plan.md` + --- ## 1) Scoring Output Contract (`advanced_v2`) @@ -108,9 +111,5 @@ Expected progression: ## 5) Scope Guardrail -This contract is only for Week 2 scoring and reasoning outputs. - -It intentionally excludes: -- Week 3 dashboard/API strategy workflow contracts, -- multi-provider external integrations, -- default-on LLM scoring influence. 
+Scope is intentionally not restated here. +Use `docs/week-2-execution-plan.md` as the single source of truth for Week 2 scope boundaries. From 87cda98798c5124ebc9db84c5f72ae24ad4f1ac1 Mon Sep 17 00:00:00 2001 From: William Date: Thu, 23 Apr 2026 14:38:14 +0200 Subject: [PATCH 07/16] feat: enhance project documentation and clarify scoring evaluation metrics - Updated PROJECT_NOTE.md to include new insights on scoring evaluation metrics and their implications. - Revised week2-implementation-playbook.md to improve clarity on execution steps and responsibilities. - Enhanced week2-interface-contract.md to better define output expectations and scope for the upcoming evaluation phase. --- ...se4-performance-baseline-implementation.md | 205 ++++++++++++++++++ 1 file changed, 205 insertions(+) create mode 100644 docs/week2-phase4-performance-baseline-implementation.md diff --git a/docs/week2-phase4-performance-baseline-implementation.md b/docs/week2-phase4-performance-baseline-implementation.md new file mode 100644 index 0000000..91d5a6e --- /dev/null +++ b/docs/week2-phase4-performance-baseline-implementation.md @@ -0,0 +1,205 @@ +# Week 2 Phase 4 Implementation Doc: Performance Baseline + +This document defines exactly what to implement for Phase 4 in Week 2: + +- establish a repeatable performance baseline, +- record measurable evidence, +- avoid premature optimization work that belongs in Week 3/4. + +Canonical phase reference: `docs/week-2-execution-plan.md` (Phase 4). + +--- + +## 1) Goal + +Create a deterministic, CLI-first baseline run that measures core pipeline timings and writes machine-readable artifacts for later comparison. + +This phase is measurement-first, not optimization-first. + +--- + +## 2) Scope + +### In Scope (Week 2) + +1. Run baseline timings for core pipeline stages: + - ingest + - score + - validate-dataset + - evaluate-scoring +2. 
Support baseline runs for: + - single dataset + - multi-dataset proxy (run multiple datasets in one benchmark invocation) +3. Record and persist: + - per-stage durations + - p50/p95 stage durations + - SLO assessment (met/missed/deferred) + - unresolved bottlenecks and follow-up actions +4. Expose as one CLI command. + +### Out of Scope (defer to Week 3/4) + +- query/index refactors +- API latency optimization +- async orchestration and job status APIs +- caching/invalidation framework +- large-scale performance tuning loops + +--- + +## 3) Deliverables + +1. New service: + - `backend/app/services/performance_baseline.py` +2. New CLI command: + - `benchmark-baseline` in `backend/app/cli.py` +3. Generated artifacts: + - `output/performance/<run_id>/baseline_metrics.json` + - `output/performance/<run_id>/baseline_summary.md` +4. Tests: + - `backend/tests/test_performance_baseline.py` + +--- + +## 4) CLI Contract + +Command: + +- `benchmark-baseline` + +Suggested options: + +- `--dataset <path>` (repeatable, required) +- `--top-n <int>` (optional, default 20) +- `--output-dir <path>` (optional; default `output/performance/<run_id>/`) + +Example: + +- `./scripts/cli-local.sh benchmark-baseline --dataset "/abs/path/a.json" --dataset "/abs/path/b.json" --top-n 20` + +--- + +## 5) Execution Flow + +For each dataset path: + +1. Ingest dataset and capture duration. +2. Score the produced job and capture duration. +3. Validate dataset and capture duration. +4. Evaluate scoring and capture duration. + - Week 2 baseline mode can use `reference_job_id = job_id` for deterministic smoke measurement. +5. Collect paths to generated validation/evaluation artifacts. +6. Append stage timings into run-level aggregation. + +After all datasets: + +1. Compute run-level p50/p95 per stage. +2. Build SLO assessment: + - met + - missed + - deferred (for API or optimization-heavy SLOs not in Week 2 scope) +3. Write JSON + Markdown summary artifacts. +4. Print concise terminal summary with output paths. 
+ +--- + +## 6) Input and Output Shapes + +### Input (CLI) + +- `datasets: list[str]` +- `top_n: int` (optional) +- `output_dir: str` (optional) + +### Output (`baseline_metrics.json`) + +Top-level schema (minimum): + +- `run_id: str` +- `timestamp_utc: str` +- `scope: "week2_phase4_minimal_baseline"` +- `datasets: list[dataset_result]` +- `aggregate: { stage_stats }` +- `slo_targets: { ... }` +- `slo_assessment: { met: [], missed: [], deferred: [] }` +- `bottlenecks: list[str]` +- `week3_week4_followups: list[str]` + +`dataset_result` minimum: + +- `dataset_path: str` +- `job_id: int | null` +- `durations_s: { ingest, score, validate_dataset, evaluate_scoring }` +- `artifacts: { validation_report_path, evaluation_report_path }` +- `status: "pass" | "warn" | "fail" | "error"` +- `error: str | null` + +### Output (`baseline_summary.md`) + +Human-readable summary with: + +- run metadata +- per-stage p50/p95 +- SLO status by item +- unresolved bottlenecks +- explicit Week 3/4 follow-ups + +--- + +## 7) SLO Handling for Week 2 + +Use `docs/mvp-performance-plan.md` targets as references, but classify by implementation feasibility: + +- CLI pipeline timing targets: evaluate directly (score/validation duration targets). +- API p95 targets: mark as deferred if API benchmark harness is not in Week 2 scope. + +This keeps the baseline honest without forcing premature architecture work. + +--- + +## 8) Automation Details + +Automation is complete when one command: + +- runs all selected datasets end-to-end, +- records stage timings, +- computes p50/p95, +- writes both artifacts, +- exits non-zero only on command/runtime errors (not on SLO misses). + +SLO misses should be reported in artifacts, not treated as process crashes. + +--- + +## 9) Code Change Checklist + +1. Add `performance_baseline.py` service with: + - orchestration logic + - timing collection (`time.perf_counter`) + - percentile helper + - artifact writers +2. 
Add `benchmark-baseline` CLI command in `backend/app/cli.py`. +3. Add tests for: + - single dataset run + - multi-dataset run + - artifact structure + - p50/p95 aggregation behavior +4. Update docs references if command is surfaced in README/CLI docs. + +--- + +## 10) Acceptance Criteria (Phase 4 Done, Week 2 Minimal) + +Phase 4 is considered done when: + +1. `benchmark-baseline` exists and is runnable from CLI. +2. It supports one or many datasets in one run. +3. It produces: + - `baseline_metrics.json` + - `baseline_summary.md` +4. Artifacts include: + - per-stage durations + - p50/p95 by stage + - SLO met/missed/deferred + - unresolved bottlenecks +5. Follow-up actions for Week 3/4 are explicitly captured. \ No newline at end of file From 6d85a7feafa13545702fdcd89d00eaf9c9639f60 Mon Sep 17 00:00:00 2001 From: William Date: Fri, 24 Apr 2026 13:58:09 +0200 Subject: [PATCH 08/16] feat: implement performance baseline updates and CLI enhancements - Updated PROJECT_NOTE.md to include required performance baseline handoff updates and metrics context. - Added a new CLI command for benchmarking performance baselines, allowing users to assess API latency and SLOs. - Enhanced week2-phase4-performance-baseline-implementation.md with detailed follow-up actions and required updates for the upcoming API implementations. - Ensured documentation reflects the inclusion of dataset-size context and throughput metrics for meaningful performance comparisons. 
--- .cursor/rules/PROJECT_NOTE.md | 7 + backend/app/cli.py | 26 +++ backend/app/services/performance_baseline.py | 192 ++++++++++++++++++ backend/tests/test_performance_baseline.py | 56 +++++ docs/week-2-execution-plan.md | 2 + ...se4-performance-baseline-implementation.md | 41 +++- 6 files changed, 323 insertions(+), 1 deletion(-) create mode 100644 backend/app/services/performance_baseline.py create mode 100644 backend/tests/test_performance_baseline.py diff --git a/.cursor/rules/PROJECT_NOTE.md b/.cursor/rules/PROJECT_NOTE.md index 9383fb4..1d78e3a 100644 --- a/.cursor/rules/PROJECT_NOTE.md +++ b/.cursor/rules/PROJECT_NOTE.md @@ -506,6 +506,10 @@ Turn PropSignal into a **configurable investor decision tool** where users can: - pagination and top-N optimized retrieval - asynchronous processing for heavy jobs (ingestion/scoring/validation) - freshness metadata (`last_ingested_at`, `last_scored_at`, `model/profile version`) + - Required performance baseline handoff update (more info in `week2-phase4-performance-baseline-implementation.md`): + - after ranking/list/detail APIs are available, update `backend/app/services/performance_baseline.py` + to measure API latency and move API SLOs from `deferred` to evaluated (`met`/`missed`) + - update `backend/tests/test_performance_baseline.py` to enforce this behavior #### **3.3 CLI revamp to mirror backend/dashboard capability** @@ -586,6 +590,9 @@ Harden the system for real-world use by running structured validation on real da - Optimize bottlenecks (indexes, pagination paths, batch operations). - Complete deployment checklist (env config, observability, rollback path, smoke tests). - Use `docs/mvp-performance-plan.md` as the implementation checklist and SLO reference. 
+- Ensure performance baseline artifacts include dataset-size context and throughput metrics: + - `records_total`, `records_valid` + - stage throughput (rows/sec) for scoring and validation #### **4.5 Documentation pack (operator + analyst guidance)** diff --git a/backend/app/cli.py b/backend/app/cli.py index 749d64b..1580f8b 100644 --- a/backend/app/cli.py +++ b/backend/app/cli.py @@ -1,4 +1,5 @@ from pathlib import Path +from typing import Annotated import typer @@ -7,6 +8,7 @@ from app.services.dataset_validation import run_dataset_validation from app.services.exporting import export_job_results from app.services.ingestion import ingest_propflux_file +from app.services.performance_baseline import run_performance_baseline from app.services.scoring import run_scoring_job from app.services.scoring_evaluation import run_scoring_evaluation @@ -82,5 +84,29 @@ def evaluate_scoring( typer.echo(f"Report written to: {report['report_path']}") +@app.command("benchmark-baseline") +def benchmark_baseline( + dataset: Annotated[list[str], typer.Option("--dataset")], + top_n: Annotated[int, typer.Option("--top-n")] = 20, + output_dir: Annotated[str | None, typer.Option("--output-dir")] = None, +) -> None: + with SessionLocal() as db: + metrics = run_performance_baseline( + db, + dataset_paths=dataset, + top_n=top_n, + output_dir=output_dir, + ) + typer.echo( + "Performance baseline completed for " + f"{len(dataset)} dataset(s). 
" + f"met={len(metrics['slo_assessment']['met'])}, " + f"missed={len(metrics['slo_assessment']['missed'])}, " + f"deferred={len(metrics['slo_assessment']['deferred'])}" + ) + typer.echo(f"Metrics written to: {metrics['metrics_path']}") + typer.echo(f"Summary written to: {metrics['summary_path']}") + + if __name__ == "__main__": app() diff --git a/backend/app/services/performance_baseline.py b/backend/app/services/performance_baseline.py new file mode 100644 index 0000000..6c46069 --- /dev/null +++ b/backend/app/services/performance_baseline.py @@ -0,0 +1,192 @@ +from __future__ import annotations + +import json +from datetime import UTC, datetime +from pathlib import Path +from time import perf_counter +from typing import Any + +from sqlalchemy.orm import Session + +from app.services.dataset_validation import run_dataset_validation +from app.services.ingestion import ingest_propflux_file +from app.services.scoring import run_scoring_job +from app.services.scoring_evaluation import run_scoring_evaluation + + +def _percentile(values: list[float], pct: float) -> float: + if not values: + return 0.0 + ordered = sorted(values) + idx = int(round((len(ordered) - 1) * max(0.0, min(1.0, pct)))) + return float(ordered[idx]) + + +def _timed(callable_fn: Any, *args: Any, **kwargs: Any) -> tuple[Any, float]: + start = perf_counter() + result = callable_fn(*args, **kwargs) + elapsed = perf_counter() - start + return result, round(elapsed, 4) + + +def run_performance_baseline( + db: Session, + dataset_paths: list[str], + *, + top_n: int = 20, + output_dir: str | None = None, +) -> dict[str, Any]: + if not dataset_paths: + raise ValueError("At least one dataset path is required.") + + run_time_utc = datetime.now(UTC) + run_id = f"phase4-baseline-{run_time_utc.strftime('%Y%m%d%H%M%S')}" + base_dir = Path(output_dir) if output_dir else Path("output") / "performance" / run_id + base_dir.mkdir(parents=True, exist_ok=True) + + dataset_results: list[dict[str, Any]] = [] + 
ingest_durations: list[float] = [] + score_durations: list[float] = [] + validate_durations: list[float] = [] + evaluate_durations: list[float] = [] + + for dataset_path in dataset_paths: + result: dict[str, Any] = { + "dataset_path": dataset_path, + "job_id": None, + "durations_s": { + "ingest": 0.0, + "score": 0.0, + "validate_dataset": 0.0, + "evaluate_scoring": 0.0, + }, + "artifacts": { + "validation_report_path": None, + "evaluation_report_path": None, + }, + "status": "error", + "error": None, + } + try: + ingestion_job, ingest_s = _timed(ingest_propflux_file, db, Path(dataset_path)) + result["job_id"] = ingestion_job.id + result["durations_s"]["ingest"] = ingest_s + ingest_durations.append(ingest_s) + + _scoring_job, score_s = _timed(run_scoring_job, db, ingestion_job.id) + result["durations_s"]["score"] = score_s + score_durations.append(score_s) + + validation_result, validate_s = _timed(run_dataset_validation, db, ingestion_job.id) + result["durations_s"]["validate_dataset"] = validate_s + result["artifacts"]["validation_report_path"] = validation_result.report_path + validate_durations.append(validate_s) + + evaluation_report, evaluate_s = _timed( + run_scoring_evaluation, + db, + job_id=ingestion_job.id, + reference_job_id=ingestion_job.id, + top_n=top_n, + ) + result["durations_s"]["evaluate_scoring"] = evaluate_s + result["artifacts"]["evaluation_report_path"] = evaluation_report.get("report_path") + evaluate_durations.append(evaluate_s) + + result["status"] = "pass" + except Exception as exc: # pragma: no cover - defensive path + result["error"] = str(exc) + result["status"] = "error" + dataset_results.append(result) + + aggregate = { + "ingest": { + "p50_s": _percentile(ingest_durations, 0.50), + "p95_s": _percentile(ingest_durations, 0.95), + }, + "score": { + "p50_s": _percentile(score_durations, 0.50), + "p95_s": _percentile(score_durations, 0.95), + }, + "validate_dataset": { + "p50_s": _percentile(validate_durations, 0.50), + "p95_s": 
_percentile(validate_durations, 0.95), + }, + "evaluate_scoring": { + "p50_s": _percentile(evaluate_durations, 0.50), + "p95_s": _percentile(evaluate_durations, 0.95), + }, + } + + slo_targets = { + "scoring_run_10k_max_s": 600.0, + "dataset_validation_10k_max_s": 300.0, + "ranking_list_api_p95_ms": 800.0, + "filtered_ranking_api_p95_ms": 1200.0, + "listing_detail_api_p95_ms": 500.0, + } + slo_assessment = {"met": [], "missed": [], "deferred": []} + if aggregate["score"]["p95_s"] <= slo_targets["scoring_run_10k_max_s"]: + slo_assessment["met"].append("scoring_run_10k_max_s") + else: + slo_assessment["missed"].append("scoring_run_10k_max_s") + if aggregate["validate_dataset"]["p95_s"] <= slo_targets["dataset_validation_10k_max_s"]: + slo_assessment["met"].append("dataset_validation_10k_max_s") + else: + slo_assessment["missed"].append("dataset_validation_10k_max_s") + slo_assessment["deferred"].extend( + ["ranking_list_api_p95_ms", "filtered_ranking_api_p95_ms", "listing_detail_api_p95_ms"] + ) + + metrics = { + "run_id": run_id, + "timestamp_utc": run_time_utc.isoformat(), + "scope": "week2_phase4_minimal_baseline", + "datasets": dataset_results, + "aggregate": aggregate, + "slo_targets": slo_targets, + "slo_assessment": slo_assessment, + "bottlenecks": [], + "week3_week4_followups": [ + "Add API-level latency benchmark harness and capture p95.", + "Add query/index optimization for ranking/filter paths.", + "Add async orchestration and cache strategy where needed.", + ], + } + + metrics_path = base_dir / "baseline_metrics.json" + metrics_path.write_text(json.dumps(metrics, indent=2), encoding="utf-8") + + summary_lines = [ + "# Phase 4 Baseline Summary", + "", + f"- Run ID: {run_id}", + f"- Timestamp UTC: {run_time_utc.isoformat()}", + f"- Dataset count: {len(dataset_results)}", + "", + "## Stage p50/p95 (seconds)", + f"- ingest: p50={aggregate['ingest']['p50_s']}, p95={aggregate['ingest']['p95_s']}", + f"- score: p50={aggregate['score']['p50_s']}, 
p95={aggregate['score']['p95_s']}", + ( + f"- validate_dataset: p50={aggregate['validate_dataset']['p50_s']}, " + f"p95={aggregate['validate_dataset']['p95_s']}" + ), + ( + f"- evaluate_scoring: p50={aggregate['evaluate_scoring']['p50_s']}, " + f"p95={aggregate['evaluate_scoring']['p95_s']}" + ), + "", + "## SLO Assessment", + f"- Met: {', '.join(slo_assessment['met']) or 'none'}", + f"- Missed: {', '.join(slo_assessment['missed']) or 'none'}", + f"- Deferred: {', '.join(slo_assessment['deferred']) or 'none'}", + "", + "## Follow-ups (Week 3/4)", + ] + summary_lines.extend(f"- {item}" for item in metrics["week3_week4_followups"]) + summary_path = base_dir / "baseline_summary.md" + summary_path.write_text("\n".join(summary_lines) + "\n", encoding="utf-8") + + metrics["metrics_path"] = str(metrics_path) + metrics["summary_path"] = str(summary_path) + return metrics diff --git a/backend/tests/test_performance_baseline.py b/backend/tests/test_performance_baseline.py new file mode 100644 index 0000000..c65c0ef --- /dev/null +++ b/backend/tests/test_performance_baseline.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from app.services.performance_baseline import run_performance_baseline +from sqlalchemy.orm import Session + +FIXTURE_DIR = Path(__file__).parent / "fixtures" / "propflux" + + +def test_performance_baseline_single_dataset_writes_artifacts( + db_session: Session, tmp_path: Path +) -> None: + metrics = run_performance_baseline( + db_session, + dataset_paths=[str(FIXTURE_DIR / "valid_listings.json")], + top_n=20, + output_dir=str(tmp_path), + ) + + assert metrics["scope"] == "week2_phase4_minimal_baseline" + assert len(metrics["datasets"]) == 1 + assert metrics["datasets"][0]["status"] == "pass" + assert Path(metrics["metrics_path"]).exists() + assert Path(metrics["summary_path"]).exists() + + saved = json.loads(Path(metrics["metrics_path"]).read_text(encoding="utf-8")) + assert "aggregate" in saved + assert 
"slo_assessment" in saved + assert "score" in saved["aggregate"] + + +def test_performance_baseline_multiple_datasets_aggregates( + db_session: Session, tmp_path: Path +) -> None: + metrics = run_performance_baseline( + db_session, + dataset_paths=[ + str(FIXTURE_DIR / "valid_listings.json"), + str(FIXTURE_DIR / "duplicate_records.json"), + ], + top_n=20, + output_dir=str(tmp_path), + ) + + assert len(metrics["datasets"]) == 2 + assert all(row["status"] in {"pass", "error"} for row in metrics["datasets"]) + assert set(metrics["aggregate"].keys()) == { + "ingest", + "score", + "validate_dataset", + "evaluate_scoring", + } + assert "deferred" in metrics["slo_assessment"] + assert "ranking_list_api_p95_ms" in metrics["slo_assessment"]["deferred"] diff --git a/docs/week-2-execution-plan.md b/docs/week-2-execution-plan.md index 0f93f4e..390401a 100644 --- a/docs/week-2-execution-plan.md +++ b/docs/week-2-execution-plan.md @@ -1,5 +1,7 @@ # Next Phase Execution Plan (Implementation Order) +> TODO (Phase 4 follow-up): include dataset-size context in performance baseline artifacts (`records_total`, `records_valid`, and rows/sec throughput) so duration comparisons are meaningful across different dataset sizes. + This is the practical execution sequence for the next feature branch, optimized for speed and low risk. ## Scope for This Branch: diff --git a/docs/week2-phase4-performance-baseline-implementation.md b/docs/week2-phase4-performance-baseline-implementation.md index 91d5a6e..8036946 100644 --- a/docs/week2-phase4-performance-baseline-implementation.md +++ b/docs/week2-phase4-performance-baseline-implementation.md @@ -202,4 +202,43 @@ Phase 4 is considered done when: - p50/p95 by stage - SLO met/missed/deferred - unresolved bottlenecks -5. Follow-up actions for Week 3/4 are explicitly captured. \ No newline at end of file +5. Follow-up actions for Week 3/4 are explicitly captured. 
+ +--- + +## 11) Week 3/4 Handoff: Required Updates (Do Not Skip) + +When ranking/list/detail APIs are implemented in Week 3/4, update the baseline implementation immediately. + +### Files to update + +1. `backend/app/services/performance_baseline.py` +2. `backend/app/cli.py` +3. `backend/tests/test_performance_baseline.py` +4. `docs/mvp-performance-plan.md` (if SLOs change) +5. `docs/week2-phase4-performance-baseline-implementation.md` (mark Week 3/4 handoff completed) + +### Exact required changes + +1. Replace API SLO placeholders from deferred to measured: + - `ranking_list_api_p95_ms` + - `filtered_ranking_api_p95_ms` + - `listing_detail_api_p95_ms` +2. Add actual API benchmark execution in `run_performance_baseline`: + - call ranking/list/detail endpoints (or dedicated benchmark client), + - collect endpoint latency samples, + - compute p50/p95 for each endpoint. +3. Remove API SLOs from `deferred` classification once benchmark harness exists. +4. Add dataset-size context to artifacts: + - `records_total` + - `records_valid` + - throughput fields (for example rows/sec) for score/validation stages. +5. Extend summary output to include API p95 result lines and pass/fail status. +6. Add tests that assert: + - API latency metrics are present in `aggregate`, + - API SLOs are assessed under `met`/`missed` (not always deferred), + - dataset-size and throughput fields are written. + +### Completion signal + +Week 3/4 handoff is complete only when `baseline_metrics.json` contains measured API latency stats and API SLOs are no longer unconditionally deferred. \ No newline at end of file From 99db2c2056cc8e14f75dfbbc3bcbc7105634eda3 Mon Sep 17 00:00:00 2001 From: William Date: Fri, 24 Apr 2026 14:33:47 +0200 Subject: [PATCH 09/16] feat: allow unknown extra fields in PropfluxListing schema and add corresponding tests - Updated PropfluxListing model configuration to accept unknown fields, enhancing compatibility with evolving data schemas. 
- Added tests to validate ingestion of listings with extra fields, ensuring that records remain valid despite additional attributes. - Implemented partial validation tests to confirm that unknown fields do not invalidate the payload, supporting forward compatibility. --- backend/app/schemas/propflux_listing.py | 4 +++- backend/tests/test_ingestion_service.py | 28 +++++++++++++++++++++++++ backend/tests/test_propflux_schema.py | 24 +++++++++++++++++++++ 3 files changed, 55 insertions(+), 1 deletion(-) diff --git a/backend/app/schemas/propflux_listing.py b/backend/app/schemas/propflux_listing.py index 784262f..a53d163 100644 --- a/backend/app/schemas/propflux_listing.py +++ b/backend/app/schemas/propflux_listing.py @@ -16,7 +16,9 @@ class RecordValidationError(BaseModel): class PropfluxListing(BaseModel): - model_config = ConfigDict(extra="forbid") + # Be permissive with future source schema additions. We still enforce all + # required fields/types below, but unknown keys are accepted. + model_config = ConfigDict(extra="allow") # Required fields title: str diff --git a/backend/tests/test_ingestion_service.py b/backend/tests/test_ingestion_service.py index 6a5cf7c..2e997d8 100644 --- a/backend/tests/test_ingestion_service.py +++ b/backend/tests/test_ingestion_service.py @@ -66,3 +66,31 @@ def test_ingestion_allows_land_records_with_missing_bedbath( assert listing is not None assert listing.bedrooms == 0 assert listing.bathrooms == 0.0 + + +def test_ingestion_allows_unknown_extra_fields(db_session: Session, tmp_path: Path) -> None: + payload = [ + { + "title": "4 Bedroom House in Welbedacht", + "price": 6250000.0, + "location": "Welbedacht, Knysna", + "bedrooms": 4, + "bathrooms": 4.0, + "property_type": "House", + "description": "Immaculate home with views", + "listing_id": "T5440103", + "source_site": "privateproperty", + "job_id": "72e50122", + "new_marketing_flag": True, + } + ] + input_file = tmp_path / "extra_fields_listing.json" + 
input_file.write_text(json.dumps(payload), encoding="utf-8") + + job = ingest_propflux_file(db_session, input_file) + assert job.records_valid == 1 + assert job.records_invalid == 0 + + listing = db_session.scalar(select(Listing)) + assert listing is not None + assert listing.title == "4 Bedroom House in Welbedacht" diff --git a/backend/tests/test_propflux_schema.py b/backend/tests/test_propflux_schema.py index 1f088e5..3cb773e 100644 --- a/backend/tests/test_propflux_schema.py +++ b/backend/tests/test_propflux_schema.py @@ -61,3 +61,27 @@ def test_partial_validation_allows_known_propflux_optional_fields() -> None: assert len(valid) == 1 assert len(invalid) == 0 + + +def test_partial_validation_allows_unknown_extra_fields() -> None: + payload = [ + { + "title": "4 Bedroom House in Welbedacht", + "price": 6250000.0, + "location": "Welbedacht, Knysna", + "bedrooms": 4, + "bathrooms": 4.0, + "property_type": "House", + "description": "Immaculate home with views", + "listing_id": "T5440103", + "source_site": "privateproperty", + # Unknown/forward-compatible fields from evolving upstream payloads. + "job_id": "72e50122", + "new_marketing_flag": True, + "custom_notes": "future field should not invalidate record", + } + ] + valid, invalid = validate_propflux_payload_partial(payload) + + assert len(valid) == 1 + assert len(invalid) == 0 From 72df8f102bd99497c0a94a9855c26941cd577a2d Mon Sep 17 00:00:00 2001 From: William Date: Fri, 24 Apr 2026 14:44:13 +0200 Subject: [PATCH 10/16] fix: resolve paths for validation and evaluation reports in performance baseline - Updated the performance baseline service to resolve and store absolute paths for validation and evaluation report files, ensuring consistency in file references. - Enhanced metrics path resolution for baseline summary and metrics files to prevent potential issues with relative paths. 
--- backend/app/services/performance_baseline.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/backend/app/services/performance_baseline.py b/backend/app/services/performance_baseline.py index 6c46069..bff2476 100644 --- a/backend/app/services/performance_baseline.py +++ b/backend/app/services/performance_baseline.py @@ -79,7 +79,9 @@ def run_performance_baseline( validation_result, validate_s = _timed(run_dataset_validation, db, ingestion_job.id) result["durations_s"]["validate_dataset"] = validate_s - result["artifacts"]["validation_report_path"] = validation_result.report_path + result["artifacts"]["validation_report_path"] = str( + Path(validation_result.report_path).resolve() + ) validate_durations.append(validate_s) evaluation_report, evaluate_s = _timed( @@ -90,7 +92,11 @@ def run_performance_baseline( top_n=top_n, ) result["durations_s"]["evaluate_scoring"] = evaluate_s - result["artifacts"]["evaluation_report_path"] = evaluation_report.get("report_path") + evaluation_report_path = evaluation_report.get("report_path") + if evaluation_report_path is not None: + result["artifacts"]["evaluation_report_path"] = str( + Path(evaluation_report_path).resolve() + ) evaluate_durations.append(evaluate_s) result["status"] = "pass" @@ -187,6 +193,6 @@ def run_performance_baseline( summary_path = base_dir / "baseline_summary.md" summary_path.write_text("\n".join(summary_lines) + "\n", encoding="utf-8") - metrics["metrics_path"] = str(metrics_path) - metrics["summary_path"] = str(summary_path) + metrics["metrics_path"] = str(metrics_path.resolve()) + metrics["summary_path"] = str(summary_path.resolve()) return metrics From ac0ade83e58d7a9dd0e02e3ad7063941c7e8a23e Mon Sep 17 00:00:00 2001 From: William Date: Fri, 24 Apr 2026 15:07:53 +0200 Subject: [PATCH 11/16] feat: add new status to IngestionJob model for enhanced tracking - Introduced a new status "analyzed" to the IngestionJob model, expanding the range of job states for better 
tracking and management of ingestion processes. --- ...0424_0005_add_analyzed_ingestion_status.py | 73 +++++++++++ backend/app/models/ingestion_job.py | 1 + backend/tests/test_analytics_service.py | 14 ++ docs/week2-phase5-validation-cycle-spec.md | 121 ++++++++++++++++++ 4 files changed, 209 insertions(+) create mode 100644 backend/alembic/versions/20260424_0005_add_analyzed_ingestion_status.py create mode 100644 backend/tests/test_analytics_service.py create mode 100644 docs/week2-phase5-validation-cycle-spec.md diff --git a/backend/alembic/versions/20260424_0005_add_analyzed_ingestion_status.py b/backend/alembic/versions/20260424_0005_add_analyzed_ingestion_status.py new file mode 100644 index 0000000..25503dc --- /dev/null +++ b/backend/alembic/versions/20260424_0005_add_analyzed_ingestion_status.py @@ -0,0 +1,73 @@ +"""add analyzed ingestion job status + +Revision ID: 20260424_0005 +Revises: 20260415_0004 +Create Date: 2026-04-24 00:05:00 +""" + +from collections.abc import Sequence + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. 
+revision: str = "20260424_0005" +down_revision: str | None = "20260415_0004" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + op.alter_column( + "ingestion_jobs", + "status", + existing_type=sa.Enum( + "created", + "processing", + "completed", + "completed_with_errors", + "failed", + name="ingestion_job_status", + native_enum=False, + ), + type_=sa.Enum( + "created", + "processing", + "completed", + "completed_with_errors", + "analyzed", + "failed", + name="ingestion_job_status", + native_enum=False, + ), + existing_nullable=False, + ) + + +def downgrade() -> None: + op.execute("UPDATE ingestion_jobs SET status = 'completed' WHERE status = 'analyzed'") + op.alter_column( + "ingestion_jobs", + "status", + existing_type=sa.Enum( + "created", + "processing", + "completed", + "completed_with_errors", + "analyzed", + "failed", + name="ingestion_job_status", + native_enum=False, + ), + type_=sa.Enum( + "created", + "processing", + "completed", + "completed_with_errors", + "failed", + name="ingestion_job_status", + native_enum=False, + ), + existing_nullable=False, + ) + diff --git a/backend/app/models/ingestion_job.py b/backend/app/models/ingestion_job.py index 9dc659d..ee154f2 100644 --- a/backend/app/models/ingestion_job.py +++ b/backend/app/models/ingestion_job.py @@ -17,6 +17,7 @@ class IngestionJob(Base): "processing", "completed", "completed_with_errors", + "analyzed", "failed", name="ingestion_job_status", native_enum=False, diff --git a/backend/tests/test_analytics_service.py b/backend/tests/test_analytics_service.py new file mode 100644 index 0000000..4541ba7 --- /dev/null +++ b/backend/tests/test_analytics_service.py @@ -0,0 +1,14 @@ +from app.models.ingestion_job import IngestionJob +from app.services.analytics import run_analytics_job +from sqlalchemy.orm import Session + + +def test_run_analytics_job_sets_analyzed_status(db_session: Session) -> None: + job = 
IngestionJob(input_path="fixture.json", status="completed") + db_session.add(job) + db_session.commit() + db_session.refresh(job) + + analyzed = run_analytics_job(db_session, job.id) + assert analyzed.status == "analyzed" + diff --git a/docs/week2-phase5-validation-cycle-spec.md b/docs/week2-phase5-validation-cycle-spec.md new file mode 100644 index 0000000..b387f2e --- /dev/null +++ b/docs/week2-phase5-validation-cycle-spec.md @@ -0,0 +1,121 @@ +# Week 2 Phase 5 Spec: Validation Cycle and Release Decision + +This spec operationalizes Phase 5 from `docs/week-2-execution-plan.md`: + +1. run one full validation cycle, +2. apply one controlled change set, +3. rerun and evaluate against baseline, +4. freeze profile/version and document release decision. + +--- + +## 1) Scope + +- Dataset: `data/samples/propflux_pp_1000_listings.json` +- Baseline run: ingest + score + validate-dataset (evaluation deferred until comparison step) +- Controlled change: exactly one scoring config bundle +- Candidate run: ingest + score + validate-dataset +- Evaluation: candidate vs baseline +- Finalization: freeze or revert + decision record + +Out of scope: + +- multi-bundle tuning in a single cycle +- architecture/performance refactors +- Week 3/4 API enhancements + +--- + +## 2) Execution Steps + +### Step A: Baseline run + +Run: + +- `ingest ` +- `score ` +- `validate-dataset ` + +Capture: + +- baseline job ID +- validation report path +- notable quality warnings/errors + +### Step B: Controlled change set (single bundle) + +Apply one bounded, auditable scoring change: + +- allowed: weight-only or threshold-only bundle +- disallowed: mixed refactors and multiple independent tuning bundles + +Record: + +- exact before/after values +- rationale + +### Step C: Candidate run + +Run same pipeline on same dataset: + +- `ingest ` +- `score ` +- `validate-dataset ` + +Capture candidate artifacts. 
+ +### Step D: Evaluation compare + +Run: + +- `evaluate-scoring <candidate_job_id> --reference-job-id <baseline_job_id> --top-n 20` + +Capture: + +- decision +- failed/warning gates +- report path + +### Step E: Finalization + +- If decision is `promote`: freeze changed profile/version. +- If decision is `revert`: rollback controlled change set and freeze previous profile. +- Write release decision record artifact. + +--- + +## 3) Required Artifacts + +Produce a decision record containing: + +- dataset used +- baseline job ID + validation report path +- controlled change set details +- candidate job ID + validation report path +- evaluation report path and gate outcome +- final decision (`promote`/`revert`/`experimental`) +- freeze/rollback action taken +- follow-up tasks + +--- + +## 4) Acceptance Criteria + +Phase 5 is complete when: + +1. baseline run completed and recorded, +2. one controlled change set applied and documented, +3. candidate run completed and recorded, +4. evaluation run completed with explicit decision, +5. profile/version frozen via promote freeze or revert rollback, +6. decision record artifact written. + +--- + +## 5) Operational Notes + +- Keep commands and dataset constant between baseline and candidate. +- If ancillary command paths fail (for example non-critical analytics command issues), + continue required Phase 5 evaluation flow and document blocker in decision record. +- Prefer small reversible change bundles to maximize comparability. + From 302355047ab0c50707af7cd7c18048668f7d33f7 Mon Sep 17 00:00:00 2001 From: William Date: Fri, 24 Apr 2026 15:28:10 +0200 Subject: [PATCH 12/16] docs: update PROJECT_NOTE.md with debugging notes and file references - Added notes regarding a failure in scoring evaluation with minimal config changes, prompting a need for debugging. - Included specific file paths related to the evaluation process for better tracking and context.
--- .cursor/rules/PROJECT_NOTE.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.cursor/rules/PROJECT_NOTE.md b/.cursor/rules/PROJECT_NOTE.md index 1d78e3a..6182dfb 100644 --- a/.cursor/rules/PROJECT_NOTE.md +++ b/.cursor/rules/PROJECT_NOTE.md @@ -1,5 +1,11 @@ # 🏠 Real Estate Deal Intelligence Platform (Full System) +backend/output/evaluations/job-21-ref-20-20260424131807/scoring_evaluation_2026-04-24_13-18-07Z.json + +For some reason it is failing, with minimal config changes. phase5_week2_validation_decision_2026-04-24_rerun_after_enum_fix.md + +debug why this is the case. + ## 🎯 Goal Build a **production-grade data intelligence system** that transforms large-scale real estate listing datasets into **high-quality investment opportunities** using advanced scoring, analytics, and a clean, interactive dashboard. From fc236f784f602d08e3bf496ccfa4f09f409ef5f3 Mon Sep 17 00:00:00 2001 From: William Date: Mon, 27 Apr 2026 12:22:58 +0200 Subject: [PATCH 13/16] refactor: update ranking identity mapping to use scored listing IDs - Modified the `_ranking_identity_map` function to accept a list of `ScoreResult` objects instead of a job ID, improving the accuracy of identity mapping. - Updated calls to `_ranking_identity_map` in `run_scoring_evaluation` to reflect the new parameter structure. - Added a new test to ensure that identity mapping correctly utilizes scored listing IDs, enhancing the robustness of scoring evaluations. 
--- backend/app/services/scoring_evaluation.py | 11 ++- backend/tests/test_scoring_v2_evaluation.py | 87 +++++++++++++++++++++ 2 files changed, 94 insertions(+), 4 deletions(-) diff --git a/backend/app/services/scoring_evaluation.py b/backend/app/services/scoring_evaluation.py index 18ad17a..dec0b38 100644 --- a/backend/app/services/scoring_evaluation.py +++ b/backend/app/services/scoring_evaluation.py @@ -50,8 +50,11 @@ def _sorted_scores(db: Session, job_id: int) -> list[ScoreResult]: ).all() -def _ranking_identity_map(db: Session, job_id: int) -> dict[int, str]: - listings = db.scalars(select(Listing).where(Listing.job_id == job_id)).all() +def _ranking_identity_map(db: Session, score_rows: list[ScoreResult]) -> dict[int, str]: + listing_ids = {row.listing_id for row in score_rows} + if not listing_ids: + return {} + listings = db.scalars(select(Listing).where(Listing.id.in_(listing_ids))).all() identities: dict[int, str] = {} for listing in listings: external_listing_id = listing.listing_id @@ -373,7 +376,7 @@ def run_scoring_evaluation( raise ValueError(f"No scored listings found for job: {job_id}") model_version = current_rows[0].model_version - current_identity_map = _ranking_identity_map(db, job_id) + current_identity_map = _ranking_identity_map(db, current_rows) top_n_effective = int(top_n) if top_n > 0 else 20 sampled_rows = current_rows[:top_n_effective] @@ -544,7 +547,7 @@ def run_scoring_evaluation( } else: reference_rows = _sorted_scores(db, reference_job_id) - reference_identity_map = _ranking_identity_map(db, reference_job_id) + reference_identity_map = _ranking_identity_map(db, reference_rows) current_global_ids = _segment_identities( current_rows, current_identity_map, 0, len(current_rows) ) diff --git a/backend/tests/test_scoring_v2_evaluation.py b/backend/tests/test_scoring_v2_evaluation.py index 395258a..f6b5632 100644 --- a/backend/tests/test_scoring_v2_evaluation.py +++ b/backend/tests/test_scoring_v2_evaluation.py @@ -345,3 +345,90 @@ def 
test_top_band_perturbation_threshold_can_pass(db_session: Session, monkeypat assert report["decision"] == "promote" top_band = report["gates"]["stability"]["metrics"]["segments"]["top_band"] assert top_band["status"] == "pass" + + +def test_stability_identity_mapping_uses_scored_listing_ids_not_listing_job_id( + db_session: Session, +) -> None: + baseline_job = IngestionJob( + input_path="baseline.json", + status="completed", + records_total=1, + records_valid=1, + records_invalid=0, + ) + candidate_job = IngestionJob( + input_path="candidate.json", + status="completed", + records_total=1, + records_valid=1, + records_invalid=0, + ) + db_session.add_all([baseline_job, candidate_job]) + db_session.flush() + + listing = Listing( + job_id=baseline_job.id, + source_hash="shared-listing-hash", + title="Shared listing", + price=1_500_000.0, + location="Cape Town", + bedrooms=3, + bathrooms=2.0, + property_type="house", + description="Shared listing across reruns", + listing_id="SHARED-1", + source_site="propflux", + city="Cape Town", + province="Western Cape", + floor_size=120.0, + normalized_payload={"fixture": True}, + ) + db_session.add(listing) + db_session.flush() + + explanation = _build_explanation(normalized_value=0.8, final_score=80.0) + db_session.add_all( + [ + ScoreResult( + job_id=baseline_job.id, + listing_id=listing.id, + score=80.0, + confidence=0.9, + deal_reason="baseline", + explanation=explanation, + model_version="advanced_v2", + ), + ScoreResult( + job_id=candidate_job.id, + listing_id=listing.id, + score=79.5, + confidence=0.9, + deal_reason="candidate", + explanation=explanation, + model_version="advanced_v2", + ), + RawListing( + job_id=candidate_job.id, + record_index=0, + source_site="propflux", + listing_id="SHARED-1", + payload={"price": listing.price}, + ), + ] + ) + + # Simulate upsert behavior where canonical listing ownership shifts to the + # latest ingestion job, which previously broke identity overlap. 
+ listing.job_id = candidate_job.id + db_session.commit() + + report = run_scoring_evaluation( + db_session, + job_id=candidate_job.id, + reference_job_id=baseline_job.id, + top_n=20, + ) + top_band = report["gates"]["stability"]["metrics"]["segments"]["top_band"] + assert top_band["metrics"]["intersection_count"] == 1 + assert top_band["metrics"]["jaccard_overlap"] == 1.0 From 1d6fc34b4aef2e74637819f1b2bae922a8165957 Mon Sep 17 00:00:00 2001 From: William Date: Mon, 27 Apr 2026 12:33:47 +0200 Subject: [PATCH 14/16] docs: update project status and notes for Week 2 completion - Revised PROJECT_NOTE.md to reflect the successful completion of Week 2, including Phase 5 validation outcomes and final scoring profile values. - Updated current-project-status.md to indicate the transition to Week 3, highlighting the readiness of Week 2 outputs and outlining the next objectives for API/CLI/dashboard implementation. - Added details on the final validation decision and decision artifact for better tracking of project progress. --- .cursor/rules/PROJECT_NOTE.md | 16 ++++++++------ docs/current-project-status.md | 39 ++++++++++++++++++---------------- 2 files changed, 31 insertions(+), 24 deletions(-) diff --git a/.cursor/rules/PROJECT_NOTE.md b/.cursor/rules/PROJECT_NOTE.md index 6182dfb..be40cd1 100644 --- a/.cursor/rules/PROJECT_NOTE.md +++ b/.cursor/rules/PROJECT_NOTE.md @@ -1,11 +1,5 @@ # 🏠 Real Estate Deal Intelligence Platform (Full System) -backend/output/evaluations/job-21-ref-20-20260424131807/scoring_evaluation_2026-04-24_13-18-07Z.json - -For some reason it is failing, with minimal config changes. phase5_week2_validation_decision_2026-04-24_rerun_after_enum_fix.md - -debug why this is the case. - ## 🎯 Goal Build a **production-grade data intelligence system** that transforms large-scale real estate listing datasets into **high-quality investment opportunities** using advanced scoring, analytics, and a clean, interactive dashboard. 
@@ -460,6 +454,16 @@ Ship an ROI-first, explainable scoring system (`advanced_v2`) with deterministic - full-dataset displacement context, - relative displacement thresholds (`*_pct`) for dataset-size-aware gating. +### **Week 2 Completion Status (Latest)** + +- Phase 5 rerun after enum + evaluation identity fixes completed successfully. +- Final validation decision: `promote`. +- Frozen Week 2 scoring profile values: + - `advanced_v2.weights.price_vs_comp = 0.29` + - `advanced_v2.weights.roi_proxy = 0.21` +- Decision artifact: + - `backend/output/evaluations/phase5_week2_validation_decision_2026-04-27_post_enum_eval_fix.md` + --- ## **Week 3** diff --git a/docs/current-project-status.md b/docs/current-project-status.md index d2bbee7..c1e7d7e 100644 --- a/docs/current-project-status.md +++ b/docs/current-project-status.md @@ -7,13 +7,13 @@ This file is the single reference for: ## Snapshot -- Current phase: transition from Week 1 completion to Week 2 implementation. -- Branch readiness: documentation and planning are in place for next-phase execution. -- Primary next objective: implement Week 2 advanced scoring and explanation payloads. +- Current phase: Week 2 implementation completed (including Phase 5 validation cycle). +- Branch readiness: Week 2 outputs are production-candidate and frozen pending Week 3 scope. +- Primary next objective: begin Week 3 strategy-driven API/CLI/dashboard implementation. 
## Completion Checklist -## Completed (foundation) +## Completed (foundation + Week 2) - [x] Project scaffolding, dev scripts, and CI baseline - [x] Docker Compose + PostgreSQL setup @@ -23,16 +23,18 @@ This file is the single reference for: - [x] Dataset validation service and CLI command (`validate-dataset`) - [x] Core unit/integration test coverage for ingestion, scoring, and dataset validation - [x] Week 2 strategy and architecture documentation package +- [x] Week 2 advanced scoring (`advanced_v2`) with micro-comps + ROI proxy +- [x] Week 2 structured reasoning payload in scored output +- [x] Week 2 evaluation gates (`promote`/`revert`/`experimental`) and CLI integration +- [x] Week 2 segment-based stability checks with relative displacement thresholds +- [x] Week 2 Phase 4 performance baseline command and artifacts +- [x] Week 2 Phase 5 validation cycle completed with final promoted profile ## Planned, not implemented yet -- [ ] Week 2 advanced scoring (`advanced_v2`) with micro-comps -- [ ] Week 2 ROI proxy (yield + transaction costs) -- [ ] Week 2 structured reasoning payload in scored output -- [ ] Week 2 analytics quality/stability checks -- [ ] Week 2 optional LLM enrichment prototype (gated) - [ ] Week 3 strategy-driven ranking API/CLI/dashboard functionality - [ ] Week 4 full validation/tuning/release hardening loop +- [ ] Week 2 optional LLM enrichment prototype (gated) ## Deferred/optional @@ -40,18 +42,18 @@ This file is the single reference for: - [ ] Heavy macro/geospatial modeling - [ ] PDF export (if not required for MVP release) -## Next Feature Branch Kickoff Checklist +## Next Feature Branch Kickoff Checklist (Week 3) Use this immediately when starting the next branch: -1. Confirm scope: Week 2 must-ship only (do not mix Week 3 UI/API overhaul work). -2. Define `advanced_v2` signal contract and output schema. -3. Implement micro-comps computation with safe fallbacks and confidence penalties. -4. 
Implement ROI proxy signals with deterministic defaults first. -5. Add structured reasoning payload persistence + tests. -6. Add evaluation gates from `docs/evaluation-review-protocol.md`. -7. Run manual sample review and log results. -8. Merge only if promotion thresholds pass. +1. Confirm Week 3 scope: strategy-driven API/CLI/dashboard only. +2. Keep Week 2 scoring profile as baseline: + - `advanced_v2.weights.price_vs_comp = 0.29` + - `advanced_v2.weights.roi_proxy = 0.21` +3. Build ranking/list/detail API endpoints and aligned CLI workflow. +4. Implement Week 3 performance handoff items from Phase 4 baseline docs. +5. Preserve Week 2 evaluation contracts while expanding strategy surfaces. +6. Run regression checks against Week 2 decision artifact before merging. ## Branch Scope Guardrail (Important) @@ -66,3 +68,4 @@ Use this immediately when starting the next branch: - Evaluation protocol: `docs/evaluation-review-protocol.md` - Principal audit: `docs/project-note-principal-audit.md` - MVP performance plan: `docs/mvp-performance-plan.md` +- Week 2 final decision (post enum/eval fix): `backend/output/evaluations/phase5_week2_validation_decision_2026-04-27_post_enum_eval_fix.md` From cb967a4a8c04afbbe04ccb3baab8b3a0182ab916 Mon Sep 17 00:00:00 2001 From: William Date: Mon, 27 Apr 2026 12:38:25 +0200 Subject: [PATCH 15/16] refactor: improve code readability and structure in generate_top5_audit_visualization.py - Rearranged import statements for better organization, moving datetime import above others. - Enhanced readability by formatting complex expressions and return statements across multiple lines. - Updated the construction of HTML strings to use list comprehension for clarity and maintainability. - Made minor adjustments to variable assignments for improved consistency and readability. 
--- .../generate_top5_audit_visualization.py | 74 ++++++++++++------- 1 file changed, 49 insertions(+), 25 deletions(-) diff --git a/backend/scripts/generate_top5_audit_visualization.py b/backend/scripts/generate_top5_audit_visualization.py index 7eeb90d..8d40882 100644 --- a/backend/scripts/generate_top5_audit_visualization.py +++ b/backend/scripts/generate_top5_audit_visualization.py @@ -1,16 +1,13 @@ from __future__ import annotations import argparse -from datetime import UTC, datetime import html +from datetime import UTC, datetime from pathlib import Path from typing import Any -from matplotlib import pyplot as plt import numpy as np import plotly.graph_objects as go -from sqlalchemy import select - from app.db.session import SessionLocal from app.models.listing import Listing from app.models.score_result import ScoreResult @@ -22,6 +19,8 @@ _load_scoring_config, _resolve_comp_context, ) +from matplotlib import pyplot as plt +from sqlalchemy import select def _load_top_listings(job_id: int, limit: int) -> list[dict[str, Any]]: @@ -103,9 +102,14 @@ def _compute_audit_inputs(job_id: int, records: list[dict[str, Any]]) -> dict[in all_listings = db.scalars(select(Listing).where(Listing.job_id == job_id)).all() comp_index = _build_comp_index( - all_listings, fallback_order, include_bedrooms=include_bedrooms, include_bathrooms=include_bathrooms + all_listings, + fallback_order, + include_bedrooms=include_bedrooms, + include_bathrooms=include_bathrooms, ) - listing_map = {int(listing.id): listing for listing in all_listings if int(listing.id) in selected_ids} + listing_map = { + int(listing.id): listing for listing in all_listings if int(listing.id) in selected_ids + } now_date = datetime.now(UTC).date() details: dict[int, dict[str, Any]] = {} @@ -132,7 +136,9 @@ def _compute_audit_inputs(job_id: int, records: list[dict[str, Any]]) -> dict[in else None ) days_on_market = ( - max(0, (now_date - listing.date_posted).days) if listing.date_posted is not None else None + 
max(0, (now_date - listing.date_posted).days) + if listing.date_posted is not None + else None ) confidence = _confidence_signal(listing) feature_value = _feature_density_signal(listing) @@ -161,7 +167,9 @@ def _compute_audit_inputs(job_id: int, records: list[dict[str, Any]]) -> dict[in "fallback_penalty": round(float(fallback_penalty), 4), }, "size_vs_comp_inputs": { - "listing_floor_size": float(listing.floor_size) if listing.floor_size is not None else None, + "listing_floor_size": ( + float(listing.floor_size) if listing.floor_size is not None else None + ), "listing_ppsqm": listing_ppsqm, "comp_median_ppsqm": comp_median_ppsqm, "comp_level": comp_level, @@ -246,15 +254,23 @@ def _build_details_table_html(records: list[dict[str, Any]]) -> str: "" ) - return ( - "

Metric Input Audit Table

" - "

Inputs below are the listing datapoints and intermediate values used for metric calculations.

" - "
" - "" - f"{header_html}" - f"{''.join(body_rows)}" - "
" - "
" + return "".join( + [ + "

Metric Input Audit Table

", + ( + "

Inputs below are the listing datapoints and intermediate values " + "used for metric calculations.

" + ), + "
", + ( + "" + ), + f"{header_html}", + f"{''.join(body_rows)}", + "
", + "
", + ] ) @@ -320,14 +336,22 @@ def _build_interactive_chart( ) details_html = _build_details_table_html(records) chart_html = fig.to_html(include_plotlyjs="cdn", full_html=False) - full_html = ( - "PropSignal Top 5 Audit" - "" - "

PropSignal Human Audit: Top 5 Listings

" - "

Interactive chart + metric input audit table for explanation concordance checks.

" - f"{chart_html}" - f"{details_html}" - "" + full_html = "".join( + [ + ( + "" + "PropSignal Top 5 Audit" + ), + "", + "

PropSignal Human Audit: Top 5 Listings

", + ( + "

Interactive chart + metric input audit table " + "for explanation concordance checks.

" + ), + f"{chart_html}", + f"{details_html}", + "", + ] ) output_html.write_text(full_html, encoding="utf-8") From 140fc8d7278a8b91225ad163b4e4070785dc4277 Mon Sep 17 00:00:00 2001 From: William Date: Mon, 27 Apr 2026 12:41:48 +0200 Subject: [PATCH 16/16] refactor: improve type annotations and code clarity in scoring evaluation and performance baseline services - Updated type annotations for `slo_assessment` in `performance_baseline.py` to specify dictionary structure. - Enhanced type annotations for parameters in `_compute_jaccard` and `_spearman_rank_correlation` functions in `scoring_evaluation.py` to use `Sequence` for better flexibility. - Simplified the assignment of `identities` in `_ranking_identity_map` for improved readability. - Consolidated the construction of `fallback_order` in `generate_top5_audit_visualization.py` for cleaner code. - Removed unnecessary blank lines in various test files to maintain consistency and cleanliness in the codebase. --- ...0424_0005_add_analyzed_ingestion_status.py | 1 - backend/app/services/performance_baseline.py | 2 +- backend/app/services/scoring_evaluation.py | 24 +++++++++++-------- .../generate_top5_audit_visualization.py | 4 +--- backend/tests/test_analytics_service.py | 1 - backend/tests/test_scoring_v2_evaluation.py | 1 + 6 files changed, 17 insertions(+), 16 deletions(-) diff --git a/backend/alembic/versions/20260424_0005_add_analyzed_ingestion_status.py b/backend/alembic/versions/20260424_0005_add_analyzed_ingestion_status.py index 25503dc..cc5e58e 100644 --- a/backend/alembic/versions/20260424_0005_add_analyzed_ingestion_status.py +++ b/backend/alembic/versions/20260424_0005_add_analyzed_ingestion_status.py @@ -70,4 +70,3 @@ def downgrade() -> None: ), existing_nullable=False, ) - diff --git a/backend/app/services/performance_baseline.py b/backend/app/services/performance_baseline.py index bff2476..812b4cf 100644 --- a/backend/app/services/performance_baseline.py +++ 
b/backend/app/services/performance_baseline.py @@ -131,7 +131,7 @@ def run_performance_baseline( "filtered_ranking_api_p95_ms": 1200.0, "listing_detail_api_p95_ms": 500.0, } - slo_assessment = {"met": [], "missed": [], "deferred": []} + slo_assessment: dict[str, list[str]] = {"met": [], "missed": [], "deferred": []} if aggregate["score"]["p95_s"] <= slo_targets["scoring_run_10k_max_s"]: slo_assessment["met"].append("scoring_run_10k_max_s") else: diff --git a/backend/app/services/scoring_evaluation.py b/backend/app/services/scoring_evaluation.py index dec0b38..0d4ccb0 100644 --- a/backend/app/services/scoring_evaluation.py +++ b/backend/app/services/scoring_evaluation.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +from collections.abc import Sequence from datetime import UTC, datetime from pathlib import Path from statistics import correlation, median @@ -21,7 +22,7 @@ def _safe_divide(numerator: float, denominator: float) -> float: return numerator / denominator -def _compute_jaccard(left_ids: list[str], right_ids: list[str]) -> float: +def _compute_jaccard(left_ids: Sequence[str | int], right_ids: Sequence[str | int]) -> float: left_set = set(left_ids) right_set = set(right_ids) union_size = len(left_set | right_set) @@ -30,7 +31,9 @@ def _compute_jaccard(left_ids: list[str], right_ids: list[str]) -> float: return round(len(left_set & right_set) / union_size, 4) -def _spearman_rank_correlation(current_ids: list[str], reference_ids: list[str]) -> float: +def _spearman_rank_correlation( + current_ids: Sequence[str | int], reference_ids: Sequence[str | int] +) -> float: current_rank = {listing_id: idx + 1 for idx, listing_id in enumerate(current_ids)} reference_rank = {listing_id: idx + 1 for idx, listing_id in enumerate(reference_ids)} common_ids = sorted(set(current_rank) & set(reference_rank)) @@ -43,11 +46,12 @@ def _spearman_rank_correlation(current_ids: list[str], reference_ids: list[str]) def _sorted_scores(db: Session, job_id: int) -> 
list[ScoreResult]: - return db.scalars( + rows = db.scalars( select(ScoreResult) .where(ScoreResult.job_id == job_id) .order_by(ScoreResult.score.desc(), ScoreResult.listing_id.asc()) ).all() + return list(rows) def _ranking_identity_map(db: Session, score_rows: list[ScoreResult]) -> dict[int, str]: @@ -58,11 +62,10 @@ def _ranking_identity_map(db: Session, score_rows: list[ScoreResult]) -> dict[in identities: dict[int, str] = {} for listing in listings: external_listing_id = listing.listing_id - identities[listing.id] = ( - external_listing_id - if external_listing_id not in (None, "") - else f"internal-{listing.id}" - ) + if isinstance(external_listing_id, str) and external_listing_id: + identities[listing.id] = external_listing_id + else: + identities[listing.id] = f"internal-{listing.id}" return identities @@ -709,12 +712,13 @@ def run_scoring_evaluation( minimum_sample_for_promote = int(decision_thresholds.get("minimum_sample_for_promote", 100)) warning_gate_keys: list[str] = [] failed_gate_keys: list[str] = [] - for gate_key, gate_payload in { + gate_payloads: dict[str, dict[str, Any]] = { "data_quality": data_quality_gate, "scoring_sanity": scoring_sanity_gate, "stability": stability_gate, "explainability": explainability_gate, - }.items(): + } + for gate_key, gate_payload in gate_payloads.items(): if gate_payload["status"] == "fail": failed_gate_keys.append(gate_key) elif gate_payload["status"] == "warn": diff --git a/backend/scripts/generate_top5_audit_visualization.py b/backend/scripts/generate_top5_audit_visualization.py index 8d40882..347775a 100644 --- a/backend/scripts/generate_top5_audit_visualization.py +++ b/backend/scripts/generate_top5_audit_visualization.py @@ -90,9 +90,7 @@ def _compute_audit_inputs(job_id: int, records: list[dict[str, Any]]) -> dict[in advanced_v2_cfg = config.get("advanced_v2", {}) comps_cfg = advanced_v2_cfg.get("comps", {}) roi_cfg = advanced_v2_cfg.get("roi", {}) - fallback_order = list( - comps_cfg.get("fallback_order", 
["suburb", "city", "province", "global"]) - ) + fallback_order = list(comps_cfg.get("fallback_order", ["suburb", "city", "province", "global"])) include_bedrooms = bool(comps_cfg.get("include_bedrooms", True)) include_bathrooms = bool(comps_cfg.get("include_bathrooms", True)) minimum_cohort_size = int(comps_cfg.get("minimum_cohort_size", 12)) diff --git a/backend/tests/test_analytics_service.py b/backend/tests/test_analytics_service.py index 4541ba7..1afb3f0 100644 --- a/backend/tests/test_analytics_service.py +++ b/backend/tests/test_analytics_service.py @@ -11,4 +11,3 @@ def test_run_analytics_job_sets_analyzed_status(db_session: Session) -> None: analyzed = run_analytics_job(db_session, job.id) assert analyzed.status == "analyzed" - diff --git a/backend/tests/test_scoring_v2_evaluation.py b/backend/tests/test_scoring_v2_evaluation.py index f6b5632..19400a3 100644 --- a/backend/tests/test_scoring_v2_evaluation.py +++ b/backend/tests/test_scoring_v2_evaluation.py @@ -203,6 +203,7 @@ def test_top_band_displacement_threshold_can_fail(db_session: Session, monkeypat assert report["decision"] == "revert" assert "stability" in report["failed_gates"] + def test_full_dataset_displacement_warning_is_context_only( db_session: Session, monkeypatch ) -> None: