diff --git a/app/domains/commit/services/matching_evaluation.py b/app/domains/commit/services/matching_evaluation.py new file mode 100644 index 0000000..c6cf2ff --- /dev/null +++ b/app/domains/commit/services/matching_evaluation.py @@ -0,0 +1,391 @@ +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +DEFAULT_CONFIDENCE_THRESHOLD = 70 + + +@dataclass(frozen=True) +class GoldenMatchCase: + case_id: str + application_id: int | None + application_title: str + application_reasons: tuple[str, ...] + expected_commit_hashes: tuple[str, ...] + accepted_commit_hashes: tuple[str, ...] + distractor_commit_hashes: tuple[str, ...] + should_match: bool + tags: tuple[str, ...] + + +@dataclass(frozen=True) +class CaseEvaluation: + case_id: str + application_title: str + should_match: bool + passed: bool + expected_found: bool + first_expected_rank: int | None + recommended_count: int + recommended_hashes: tuple[str, ...] + false_positive_hashes: tuple[str, ...] + high_confidence_false_positive_hashes: tuple[str, ...] + distractor_hit_hashes: tuple[str, ...] 
+ + def as_dict(self) -> dict[str, Any]: + return { + "case_id": self.case_id, + "application_title": self.application_title, + "should_match": self.should_match, + "passed": self.passed, + "expected_found": self.expected_found, + "first_expected_rank": self.first_expected_rank, + "recommended_count": self.recommended_count, + "recommended_hashes": list(self.recommended_hashes), + "false_positive_hashes": list(self.false_positive_hashes), + "high_confidence_false_positive_hashes": list( + self.high_confidence_false_positive_hashes + ), + "distractor_hit_hashes": list(self.distractor_hit_hashes), + } + + +@dataclass(frozen=True) +class MatchEvaluationSummary: + total_cases: int + passed_cases: int + match_cases: int + no_match_cases: int + recall_at_k: float + precision_at_k: float + mean_reciprocal_rank: float + no_match_accuracy: float + false_positive_count: int + high_confidence_false_positive_count: int + distractor_hit_count: int + cases: tuple[CaseEvaluation, ...] + + def as_dict(self) -> dict[str, Any]: + return { + "total_cases": self.total_cases, + "passed_cases": self.passed_cases, + "match_cases": self.match_cases, + "no_match_cases": self.no_match_cases, + "recall_at_k": self.recall_at_k, + "precision_at_k": self.precision_at_k, + "mean_reciprocal_rank": self.mean_reciprocal_rank, + "no_match_accuracy": self.no_match_accuracy, + "false_positive_count": self.false_positive_count, + "high_confidence_false_positive_count": ( + self.high_confidence_false_positive_count + ), + "distractor_hit_count": self.distractor_hit_count, + "cases": [case.as_dict() for case in self.cases], + } + + +@dataclass(frozen=True) +class _Recommendation: + commit_hash: str + confidence: int | None + + +def load_golden_cases(path: str | Path) -> list[GoldenMatchCase]: + raw = json.loads(Path(path).read_text(encoding="utf-8")) + cases = raw["cases"] if isinstance(raw, dict) else raw + parsed_cases = [_golden_case_from_mapping(case) for case in cases] + 
_validate_unique_case_ids(parsed_cases) + return parsed_cases + + +def evaluate_match_response( + cases: list[GoldenMatchCase], + response: dict[str, Any], + *, + top_k: int = 5, + confidence_threshold: int = DEFAULT_CONFIDENCE_THRESHOLD, + fail_on_false_positive: bool = False, +) -> MatchEvaluationSummary: + result = _extract_result(response) + application_items = result.get("applications") or [] + by_application_id = { + item.get("application_id"): item + for item in application_items + if item.get("application_id") is not None + } + by_application_title = { + item.get("application_title"): item + for item in application_items + if item.get("application_title") + } + + evaluations: list[CaseEvaluation] = [] + total_relevant_recommendations = 0 + total_recommendations = 0 + reciprocal_ranks: list[float] = [] + + for case in cases: + item = _find_application_item( + case, + by_application_id=by_application_id, + by_application_title=by_application_title, + ) + raw_recommendations = (item or {}).get("recommended_commits") or [] + recommendations = _to_recommendations(raw_recommendations[:top_k]) + recommended_hashes = tuple( + recommendation.commit_hash for recommendation in recommendations + ) + + relevant_hashes = case.expected_commit_hashes + case.accepted_commit_hashes + first_expected_rank = _first_matching_rank( + recommended_hashes, + case.expected_commit_hashes, + ) + expected_found = first_expected_rank is not None + false_positive_hashes = tuple( + recommendation.commit_hash + for recommendation in recommendations + if not _hash_in_set(recommendation.commit_hash, relevant_hashes) + ) + high_confidence_false_positive_hashes = tuple( + recommendation.commit_hash + for recommendation in recommendations + if _is_high_confidence(recommendation, confidence_threshold) + and not _hash_in_set(recommendation.commit_hash, relevant_hashes) + ) + high_confidence_hashes = tuple( + recommendation.commit_hash + for recommendation in recommendations + if 
_is_high_confidence(recommendation, confidence_threshold) + ) + distractor_hit_hashes = tuple( + recommendation.commit_hash + for recommendation in recommendations + if _hash_in_set(recommendation.commit_hash, case.distractor_commit_hashes) + ) + passed = _case_passed( + case=case, + expected_found=expected_found, + high_confidence_hashes=high_confidence_hashes, + high_confidence_false_positive_hashes=( + high_confidence_false_positive_hashes + ), + fail_on_false_positive=fail_on_false_positive, + ) + + total_recommendations += len(recommended_hashes) + total_relevant_recommendations += sum( + 1 + for commit_hash in recommended_hashes + if _hash_in_set(commit_hash, relevant_hashes) + ) + if case.should_match: + reciprocal_ranks.append( + 0.0 if first_expected_rank is None else 1 / first_expected_rank + ) + + evaluations.append( + CaseEvaluation( + case_id=case.case_id, + application_title=case.application_title, + should_match=case.should_match, + passed=passed, + expected_found=expected_found, + first_expected_rank=first_expected_rank, + recommended_count=len(recommended_hashes), + recommended_hashes=recommended_hashes, + false_positive_hashes=false_positive_hashes, + high_confidence_false_positive_hashes=( + high_confidence_false_positive_hashes + ), + distractor_hit_hashes=distractor_hit_hashes, + ) + ) + + match_cases = sum(1 for case in cases if case.should_match) + no_match_cases = len(cases) - match_cases + passed_cases = sum(1 for evaluation in evaluations if evaluation.passed) + found_cases = sum(1 for evaluation in evaluations if evaluation.expected_found) + no_match_passes = sum( + 1 + for evaluation in evaluations + if not evaluation.should_match and evaluation.passed + ) + false_positive_count = sum( + len(evaluation.false_positive_hashes) for evaluation in evaluations + ) + high_confidence_false_positive_count = sum( + len(evaluation.high_confidence_false_positive_hashes) + for evaluation in evaluations + ) + distractor_hit_count = sum( + 
len(evaluation.distractor_hit_hashes) for evaluation in evaluations + ) + + return MatchEvaluationSummary( + total_cases=len(cases), + passed_cases=passed_cases, + match_cases=match_cases, + no_match_cases=no_match_cases, + recall_at_k=_safe_div(found_cases, match_cases), + precision_at_k=_safe_div(total_relevant_recommendations, total_recommendations), + mean_reciprocal_rank=_safe_div(sum(reciprocal_ranks), len(reciprocal_ranks)), + no_match_accuracy=_safe_div(no_match_passes, no_match_cases), + false_positive_count=false_positive_count, + high_confidence_false_positive_count=high_confidence_false_positive_count, + distractor_hit_count=distractor_hit_count, + cases=tuple(evaluations), + ) + + +def _golden_case_from_mapping(raw: dict[str, Any]) -> GoldenMatchCase: + expected_commit_hashes = tuple( + _normalize_hash(value) + for value in raw.get("expected_commit_hashes", []) + if _normalize_hash(value) + ) + should_match = bool(raw.get("should_match", True)) + if should_match and not expected_commit_hashes: + raise ValueError(f"expected_commit_hashes is required: {raw.get('case_id')}") + + return GoldenMatchCase( + case_id=str(raw["case_id"]), + application_id=raw.get("application_id"), + application_title=str(raw["application_title"]), + application_reasons=tuple( + str(reason) for reason in raw.get("application_reasons", []) + ), + expected_commit_hashes=expected_commit_hashes, + accepted_commit_hashes=tuple( + _normalize_hash(value) + for value in raw.get("accepted_commit_hashes", []) + if _normalize_hash(value) + ), + distractor_commit_hashes=tuple( + _normalize_hash(value) + for value in raw.get("distractor_commit_hashes", []) + if _normalize_hash(value) + ), + should_match=should_match, + tags=tuple(str(tag) for tag in raw.get("tags", [])), + ) + + +def _validate_unique_case_ids(cases: list[GoldenMatchCase]) -> None: + seen: set[str] = set() + duplicates: set[str] = set() + for case in cases: + if case.case_id in seen: + duplicates.add(case.case_id) + 
seen.add(case.case_id) + if duplicates: + raise ValueError(f"duplicate case_id values: {sorted(duplicates)}") + + +def _extract_result(response: dict[str, Any]) -> dict[str, Any]: + result = response.get("result") + if isinstance(result, dict) and "applications" in result: + return result + return response + + +def _find_application_item( + case: GoldenMatchCase, + *, + by_application_id: dict[int, dict[str, Any]], + by_application_title: dict[str, dict[str, Any]], +) -> dict[str, Any] | None: + if case.application_id is not None and case.application_id in by_application_id: + return by_application_id[case.application_id] + return by_application_title.get(case.application_title) + + +def _to_recommendations(raw_recommendations: list[Any]) -> tuple[_Recommendation, ...]: + recommendations: list[_Recommendation] = [] + for raw in raw_recommendations: + if not isinstance(raw, dict): + continue + commit_hash = _normalize_hash(raw.get("commit_hash")) + if not commit_hash: + continue + recommendations.append( + _Recommendation( + commit_hash=commit_hash, + confidence=_normalize_confidence(raw.get("confidence")), + ) + ) + return tuple(recommendations) + + +def _normalize_confidence(value: Any) -> int | None: + if isinstance(value, bool): + return None + if isinstance(value, int): + return value + if isinstance(value, float): + return int(value) + if isinstance(value, str): + try: + return int(float(value.strip())) + except ValueError: + return None + return None + + +def _is_high_confidence( + recommendation: _Recommendation, + confidence_threshold: int, +) -> bool: + # Older response fixtures may not have confidence. Treat those recommendations as + # above-threshold because /api/commit/match already filters recommendations. 
+ if recommendation.confidence is None: + return True + return recommendation.confidence >= confidence_threshold + + +def _case_passed( + *, + case: GoldenMatchCase, + expected_found: bool, + high_confidence_hashes: tuple[str, ...], + high_confidence_false_positive_hashes: tuple[str, ...], + fail_on_false_positive: bool, +) -> bool: + if not case.should_match: + return not high_confidence_hashes + if not expected_found: + return False + if fail_on_false_positive and high_confidence_false_positive_hashes: + return False + return True + + +def _first_matching_rank( + recommended_hashes: tuple[str, ...], + expected_hashes: tuple[str, ...], +) -> int | None: + for index, commit_hash in enumerate(recommended_hashes, start=1): + if _hash_in_set(commit_hash, expected_hashes): + return index + return None + + +def _hash_in_set(commit_hash: str, expected_hashes: tuple[str, ...]) -> bool: + return any(_hash_matches(commit_hash, expected) for expected in expected_hashes) + + +def _hash_matches(left: str, right: str) -> bool: + if len(left) < 7 or len(right) < 7: + return left == right + return left.startswith(right) or right.startswith(left) + + +def _normalize_hash(value: Any) -> str: + return str(value or "").strip().lower() + + +def _safe_div(numerator: float, denominator: float) -> float: + if denominator == 0: + return 0.0 + return numerator / denominator diff --git a/docs/commit_matching_evaluation.md b/docs/commit_matching_evaluation.md new file mode 100644 index 0000000..9da85b7 --- /dev/null +++ b/docs/commit_matching_evaluation.md @@ -0,0 +1,106 @@ +# Commit Matching Evaluation + +## 목적 + +회의 적용사항과 커밋의 실제 정답 관계가 운영 DB에 직접 저장되어 있지 않기 때문에, +실제 커밋을 기준으로 만든 평가용 적용사항을 별도 golden dataset으로 관리한다. + +이 평가셋은 다음 작업에 사용한다. 
+ +- `/api/commit/match` 결과가 기대 커밋을 상위 K개 안에 포함하는지 확인 +- 70점 이상 false positive 후보가 생기는지 확인 +- 아직 구현되지 않은 적용사항이 빈 추천 목록으로 반환되는지 확인 +- 점수식, threshold, keyword/context 정책 변경 전후의 품질 비교 + +## 평가셋 위치 + +```bash +tests/fixtures/commit_matching_golden_cases.json +``` + +각 case는 실제 Whylog-AI 커밋 해시를 기준으로 구성한다. + +- `application_title`: 시연용/평가용 가짜 회의 적용사항 제목 +- `application_reasons`: 적용사항 근거 +- `expected_commit_hashes`: 반드시 추천되어야 하는 실제 커밋 해시 +- `accepted_commit_hashes`: 정답은 아니지만 false positive로 세지 않을 허용 커밋 해시 +- `distractor_commit_hashes`: 헷갈리지만 정답이 아닌 커밋 해시 +- `should_match`: 추천이 있어야 하는지 여부 +- `tags`: 분석용 태그 + +## 평가 실행 + +먼저 실제 또는 로컬 `/api/commit/match` 응답을 JSON 파일로 저장한다. +응답은 FastAPI 공통 응답 wrapper가 있는 형태와 result 본문만 있는 형태를 모두 지원한다. + +```bash +uv run python scripts/evaluate_commit_matching.py \ + --cases tests/fixtures/commit_matching_golden_cases.json \ + --response /path/to/commit-match-response.json \ + --top-k 5 +``` + +JSON 요약이 필요하면 다음 옵션을 사용한다. + +```bash +uv run python scripts/evaluate_commit_matching.py \ + --response /path/to/commit-match-response.json \ + --json +``` + +CI나 회귀 검증에서 실패 시 non-zero exit code가 필요하면 다음 옵션을 추가한다. + +```bash +uv run python scripts/evaluate_commit_matching.py \ + --response /path/to/commit-match-response.json \ + --fail-on-failure +``` + +정답 커밋이 포함되어도 70점 이상 오탐 커밋이 함께 추천되면 실패로 보고 싶을 때는 +다음 옵션을 함께 사용한다. + +```bash +uv run python scripts/evaluate_commit_matching.py \ + --response /path/to/commit-match-response.json \ + --fail-on-false-positive \ + --fail-on-failure +``` + +기본 high-confidence 기준은 70점이며, 필요하면 조정할 수 있다. 
+ +```bash +uv run python scripts/evaluate_commit_matching.py \ + --response /path/to/commit-match-response.json \ + --confidence-threshold 75 +``` + +## 지표 + +- `recall_at_k`: 정답 커밋이 상위 K개 추천 안에 포함된 비율 +- `precision_at_k`: 추천된 커밋 중 정답 또는 허용 커밋 비율 +- `mean_reciprocal_rank`: 정답 커밋이 몇 번째에 나왔는지 반영한 순위 지표 +- `no_match_accuracy`: 추천이 없어야 하는 적용사항에서 confidence threshold 이상 추천이 없었던 비율 (threshold 미만 추천은 허용) +- `false_positive_count`: 정답 또는 허용 커밋이 아닌 추천 개수 +- `high_confidence_false_positive_count`: confidence threshold 이상인 오탐 추천 개수 +- `distractor_hit_count`: hard negative로 지정한 distractor 커밋이 추천된 개수 + +`accepted_commit_hashes`는 precision 계산과 false positive 판정에서만 사용한다. +해당 커밋만 추천되고 `expected_commit_hashes`가 빠진 경우에는 해당 case를 실패로 본다. + +`distractor_commit_hashes`는 정답이 아니지만 헷갈리기 쉬운 커밋을 명시하는 필드다. +추천 결과에 distractor가 포함되면 `distractor_hit_hashes`와 `distractor_hit_count`로 +별도 집계된다. + +기본 pass 기준은 다음과 같다. + +- `should_match=true`: expected commit이 top-k 안에 있으면 pass +- `should_match=false`: confidence threshold 이상 추천이 없으면 pass +- `--fail-on-false-positive`: match case에서도 threshold 이상 오탐이 있으면 fail + +## 고도화 흐름 + +1. 실제 커밋을 기준으로 golden case를 추가한다. +2. 현재 `/api/commit/match` 결과를 저장하고 평가 스크립트로 baseline을 기록한다. +3. 일반어 제거, intent keyword 보정, LLM rerank 같은 정책을 한 번에 하나씩 적용한다. +4. 같은 response fixture 또는 같은 운영 데이터로 지표를 비교한다. +5. 시연용 데이터는 easy case, hard negative, no-match case를 모두 포함하도록 구성한다. 
"""Command-line tool: score a saved ``/api/commit/match`` response.

Reads a golden-case file and a response file, prints a text or JSON summary,
and optionally exits non-zero when a golden case fails.
"""

import argparse
import json
import sys
from pathlib import Path

# Put the repository root on sys.path so the ``app`` package resolves when the
# script is run directly, without installing the project.
ROOT_DIR = Path(__file__).resolve().parents[1]
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))


def _positive_int(value: str) -> int:
    """argparse ``type`` callable accepting only integers >= 1.

    Raises:
        argparse.ArgumentTypeError: ``value`` is not an integer or is < 1.
    """
    try:
        parsed = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"expected an integer, got {value!r}"
        ) from None
    if parsed < 1:
        raise argparse.ArgumentTypeError(f"must be at least 1, got {parsed}")
    return parsed


def main() -> int:
    """Parse arguments, run the evaluation, print the report.

    Returns:
        The process exit code: 1 when ``--fail-on-failure`` is set and at
        least one golden case failed, otherwise 0.
    """
    # Imported here rather than at module top: the import depends on ROOT_DIR
    # already being on sys.path, and keeping it local lets the helpers in this
    # file be imported without the application package.
    from app.domains.commit.services.matching_evaluation import (
        DEFAULT_CONFIDENCE_THRESHOLD,
        evaluate_match_response,
        load_golden_cases,
    )

    parser = argparse.ArgumentParser(
        description="Evaluate /api/commit/match response with golden cases.",
    )
    parser.add_argument(
        "--cases",
        default="tests/fixtures/commit_matching_golden_cases.json",
        help="Golden case JSON path.",
    )
    parser.add_argument(
        "--response",
        required=True,
        help="Saved /api/commit/match response JSON path.",
    )
    parser.add_argument(
        "--top-k",
        # Zero or a negative cutoff would slice the recommendation list in a
        # way that is never a "top k", so reject it at the CLI boundary.
        type=_positive_int,
        default=5,
        help="Evaluation cutoff for recommended commits.",
    )
    parser.add_argument(
        "--confidence-threshold",
        type=int,
        default=DEFAULT_CONFIDENCE_THRESHOLD,
        help="Confidence cutoff for high-confidence false positive checks.",
    )
    parser.add_argument(
        "--fail-on-false-positive",
        action="store_true",
        help=(
            "Fail matched cases when a non-relevant commit is recommended at or "
            "above the confidence threshold."
        ),
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Print machine-readable JSON summary.",
    )
    parser.add_argument(
        "--fail-on-failure",
        action="store_true",
        help="Exit with code 1 when at least one golden case fails.",
    )
    args = parser.parse_args()

    cases = load_golden_cases(args.cases)
    response = json.loads(Path(args.response).read_text(encoding="utf-8"))
    summary = evaluate_match_response(
        cases,
        response,
        top_k=args.top_k,
        confidence_threshold=args.confidence_threshold,
        fail_on_false_positive=args.fail_on_false_positive,
    )

    report = summary.as_dict()
    if args.json:
        print(json.dumps(report, ensure_ascii=False, indent=2))
    else:
        _print_summary(report)

    if args.fail_on_failure and summary.passed_cases < summary.total_cases:
        return 1
    return 0


def _print_summary(summary: dict) -> None:
    """Print the human-readable report for a summary mapping.

    ``summary`` is the mapping produced by the evaluation summary's
    ``as_dict()``: aggregate metrics first, then one line per case, followed
    by that case's non-empty hash lists.
    """
    print("Commit Matching Evaluation")
    print(f"- total_cases: {summary['total_cases']}")
    print(f"- passed_cases: {summary['passed_cases']}")
    for ratio_key in (
        "recall_at_k",
        "precision_at_k",
        "mean_reciprocal_rank",
        "no_match_accuracy",
    ):
        print(f"- {ratio_key}: {summary[ratio_key]:.3f}")
    for count_key in (
        "false_positive_count",
        "high_confidence_false_positive_count",
        "distractor_hit_count",
    ):
        print(f"- {count_key}: {summary[count_key]}")
    print()

    for case in summary["cases"]:
        status = "PASS" if case["passed"] else "FAIL"
        print(
            f"[{status}] {case['case_id']} - "
            f"{case['application_title']} "
            f"(rank={case['first_expected_rank']}, "
            f"recommended={case['recommended_count']})"
        )
        # Only non-empty lists are shown to keep passing cases to one line.
        for detail_key in (
            "false_positive_hashes",
            "high_confidence_false_positive_hashes",
            "distractor_hit_hashes",
        ):
            if case[detail_key]:
                print(f" {detail_key}={case[detail_key]}")


if __name__ == "__main__":
    sys.exit(main())
"""Tests for the golden-case evaluation of ``/api/commit/match`` responses."""

from pathlib import Path

from app.domains.commit.services.matching_evaluation import (
    GoldenMatchCase,
    evaluate_match_response,
    load_golden_cases,
)

# Anchored to this file so the tests do not depend on the directory pytest is
# launched from.
FIXTURE_PATH = (
    Path(__file__).resolve().parent / "fixtures" / "commit_matching_golden_cases.json"
)


def _commit(commit_hash: str) -> dict:
    """Build a recommended-commit payload with a high (90) confidence."""
    return {
        "commit_hash": commit_hash,
        "commit_message": "test commit",
        "confidence": 90,
    }


def _application(
    application_id: int,
    application_title: str,
    commits: list[dict],
) -> dict:
    """Build one application entry of a match response."""
    return {
        "application_id": application_id,
        "application_document_id": f"eval_application_{application_id}",
        "application_title": application_title,
        "recommended_commits": commits,
    }


def test_load_golden_cases_from_fixture():
    """The bundled fixture parses into five cases with normalized fields."""
    cases = load_golden_cases(FIXTURE_PATH)

    assert len(cases) == 5
    assert cases[0].case_id == "gemini-quota-retry"
    assert cases[0].should_match is True
    assert cases[0].application_reasons
    assert cases[0].distractor_commit_hashes == (
        "7c67758a4dfaca4cf18e264da011569ae868ba64",
    )
    assert cases[-1].case_id == "no-match-dinner-menu"
    assert cases[-1].should_match is False


def test_evaluate_match_response_passes_when_expected_commits_are_returned():
    """Abbreviated expected hashes in a wrapped response pass every case."""
    cases = load_golden_cases(FIXTURE_PATH)
    response = {
        "result": {
            "applications": [
                _application(
                    9101,
                    "Gemini API 사용량 초과 시 재시도 로직 추가",
                    [_commit("5706bbf")],
                ),
                _application(
                    9102,
                    "WebSocket 발화 로그와 STT 전사 병합 기준 보강",
                    [_commit("cd139b0")],
                ),
                _application(
                    9103,
                    "적용사항 임베딩에 회의 타임라인 맥락 반영",
                    [_commit("3b50917")],
                ),
                _application(
                    9104,
                    "커밋 파일 경로 기반 모듈 토큰 보강",
                    [_commit("daafb71")],
                ),
                _application(9105, "저녁 메뉴로 치킨과 맥주를 선정", []),
            ]
        }
    }

    summary = evaluate_match_response(cases, response, top_k=5)

    assert summary.total_cases == 5
    assert summary.passed_cases == 5
    assert summary.recall_at_k == 1.0
    assert summary.precision_at_k == 1.0
    assert summary.mean_reciprocal_rank == 1.0
    assert summary.no_match_accuracy == 1.0
    assert summary.false_positive_count == 0


def test_evaluate_match_response_counts_false_positives_and_missed_no_match():
    """A distractor ranked first and a recommended no-match case are counted."""
    cases = load_golden_cases(FIXTURE_PATH)
    response = {
        "applications": [
            _application(
                9101,
                "Gemini API 사용량 초과 시 재시도 로직 추가",
                [_commit("7c67758"), _commit("5706bbf")],
            ),
            _application(
                9105,
                "저녁 메뉴로 치킨과 맥주를 선정",
                [_commit("8af78f0")],
            ),
        ]
    }

    summary = evaluate_match_response(cases[:1] + cases[-1:], response, top_k=5)

    assert summary.total_cases == 2
    assert summary.passed_cases == 1
    assert summary.recall_at_k == 1.0
    assert summary.precision_at_k == 1 / 3
    assert summary.mean_reciprocal_rank == 0.5
    assert summary.no_match_accuracy == 0.0
    assert summary.false_positive_count == 2
    assert summary.cases[0].first_expected_rank == 2


def test_evaluate_match_response_uses_title_fallback_when_id_is_missing():
    """A case without an application ID is matched by its title."""
    cases = [
        GoldenMatchCase(
            case_id="title-fallback",
            application_id=None,
            application_title="회의 타임라인 맥락 임베딩",
            application_reasons=(),
            expected_commit_hashes=("3b50917d1ca2f9f811734ac02604bdae93dec86f",),
            accepted_commit_hashes=(),
            distractor_commit_hashes=(),
            should_match=True,
            tags=(),
        )
    ]
    response = {
        "applications": [
            _application(
                9103,
                "회의 타임라인 맥락 임베딩",
                [_commit("3b50917")],
            )
        ]
    }

    summary = evaluate_match_response(cases, response, top_k=5)

    assert summary.passed_cases == 1
    assert summary.cases[0].first_expected_rank == 1


def test_evaluate_match_response_treats_accepted_hash_as_relevant_not_expected():
    """An accepted-only recommendation keeps precision but fails the case."""
    cases = [
        GoldenMatchCase(
            case_id="accepted-only",
            application_id=9101,
            application_title="Gemini 재시도 로직",
            application_reasons=(),
            expected_commit_hashes=("5706bbfddb71a374545106ee3c7fc83797925964",),
            accepted_commit_hashes=("7c67758a4dfaca4cf18e264da011569ae868ba64",),
            distractor_commit_hashes=(),
            should_match=True,
            tags=(),
        )
    ]
    response = {
        "applications": [
            _application(
                9101,
                "Gemini 재시도 로직",
                [_commit("7c67758")],
            )
        ]
    }

    summary = evaluate_match_response(cases, response, top_k=5)

    assert summary.passed_cases == 0
    assert summary.recall_at_k == 0.0
    assert summary.precision_at_k == 1.0
    assert summary.cases[0].expected_found is False
    assert summary.cases[0].false_positive_hashes == ()


def test_evaluate_match_response_fails_match_case_when_application_is_missing():
    """A match case whose application is absent from the response fails."""
    cases = [
        GoldenMatchCase(
            case_id="missing-application",
            application_id=9999,
            application_title="없는 적용사항",
            application_reasons=(),
            expected_commit_hashes=("5706bbfddb71a374545106ee3c7fc83797925964",),
            accepted_commit_hashes=(),
            distractor_commit_hashes=(),
            should_match=True,
            tags=(),
        )
    ]

    summary = evaluate_match_response(cases, {"applications": []}, top_k=5)

    assert summary.passed_cases == 0
    assert summary.recall_at_k == 0.0
    assert summary.cases[0].recommended_count == 0


def test_evaluate_match_response_tracks_distractor_hits():
    """A recommended distractor is reported as a hit and a false positive."""
    cases = load_golden_cases(FIXTURE_PATH)
    response = {
        "applications": [
            _application(
                9101,
                "Gemini API 사용량 초과 시 재시도 로직 추가",
                [_commit("7c67758"), _commit("5706bbf")],
            )
        ]
    }

    summary = evaluate_match_response(cases[:1], response, top_k=5)

    assert summary.distractor_hit_count == 1
    assert summary.high_confidence_false_positive_count == 1
    assert summary.cases[0].distractor_hit_hashes == ("7c67758",)
    assert summary.cases[0].high_confidence_false_positive_hashes == ("7c67758",)


def test_evaluate_match_response_can_fail_match_case_on_false_positive():
    """With ``fail_on_false_positive`` a found-but-polluted case fails."""
    cases = load_golden_cases(FIXTURE_PATH)
    response = {
        "applications": [
            _application(
                9101,
                "Gemini API 사용량 초과 시 재시도 로직 추가",
                [_commit("7c67758"), _commit("5706bbf")],
            )
        ]
    }

    summary = evaluate_match_response(
        cases[:1],
        response,
        top_k=5,
        fail_on_false_positive=True,
    )

    assert summary.passed_cases == 0
    assert summary.cases[0].expected_found is True
    assert summary.cases[0].first_expected_rank == 2


def test_no_match_case_ignores_below_threshold_recommendation():
    """A below-threshold recommendation does not fail a no-match case."""
    cases = load_golden_cases(FIXTURE_PATH)
    low_confidence_commit = _commit("8af78f0") | {"confidence": 49}
    response = {
        "applications": [
            _application(
                9105,
                "저녁 메뉴로 치킨과 맥주를 선정",
                [low_confidence_commit],
            ),
        ]
    }

    summary = evaluate_match_response(cases[-1:], response, top_k=5)

    assert summary.passed_cases == 1
    assert summary.no_match_accuracy == 1.0
    assert summary.false_positive_count == 1
    assert summary.high_confidence_false_positive_count == 0