diff --git a/app/domains/commit/services/matching_evaluation.py b/app/domains/commit/services/matching_evaluation.py
new file mode 100644
index 0000000..c6cf2ff
--- /dev/null
+++ b/app/domains/commit/services/matching_evaluation.py
@@ -0,0 +1,391 @@
+import json
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+# Default confidence cutoff for evaluate_match_response: a recommendation whose
+# confidence is at or above this value is treated as high-confidence.
+DEFAULT_CONFIDENCE_THRESHOLD = 70
+
+
+@dataclass(frozen=True)
+class GoldenMatchCase:
+    """One golden-dataset entry: an application item and its reference commits."""
+
+    case_id: str  # identifier; load_golden_cases rejects duplicates
+    application_id: int | None  # looked up first in the response; None -> title lookup
+    application_title: str  # fallback lookup key when the id is missing/not found
+    application_reasons: tuple[str, ...]  # supporting reasons; not read by the evaluator
+    expected_commit_hashes: tuple[str, ...]  # commits that must be recommended
+    accepted_commit_hashes: tuple[str, ...]  # tolerated: not counted as false positives
+    distractor_commit_hashes: tuple[str, ...]  # confusable wrong commits, tracked separately
+    should_match: bool  # False means no recommendation is expected for this item
+    tags: tuple[str, ...]  # free-form analysis tags; not read by the evaluator
+
+
+@dataclass(frozen=True)
+class CaseEvaluation:
+    """Outcome of scoring a single golden case against a match response."""
+
+    case_id: str
+    application_title: str
+    should_match: bool
+    passed: bool
+    expected_found: bool
+    first_expected_rank: int | None
+    recommended_count: int
+    recommended_hashes: tuple[str, ...]
+    false_positive_hashes: tuple[str, ...]
+    high_confidence_false_positive_hashes: tuple[str, ...]
+    distractor_hit_hashes: tuple[str, ...]
+
+    def as_dict(self) -> dict[str, Any]:
+        """Return a JSON-friendly mapping; hash tuples are emitted as lists."""
+        high_confidence_false_positives = list(
+            self.high_confidence_false_positive_hashes
+        )
+        return dict(
+            case_id=self.case_id,
+            application_title=self.application_title,
+            should_match=self.should_match,
+            passed=self.passed,
+            expected_found=self.expected_found,
+            first_expected_rank=self.first_expected_rank,
+            recommended_count=self.recommended_count,
+            recommended_hashes=list(self.recommended_hashes),
+            false_positive_hashes=list(self.false_positive_hashes),
+            high_confidence_false_positive_hashes=high_confidence_false_positives,
+            distractor_hit_hashes=list(self.distractor_hit_hashes),
+        )
+
+
+@dataclass(frozen=True)
+class MatchEvaluationSummary:
+    """Aggregate metrics for one evaluation run plus the per-case results."""
+
+    total_cases: int
+    passed_cases: int
+    match_cases: int
+    no_match_cases: int
+    recall_at_k: float
+    precision_at_k: float
+    mean_reciprocal_rank: float
+    no_match_accuracy: float
+    false_positive_count: int
+    high_confidence_false_positive_count: int
+    distractor_hit_count: int
+    cases: tuple[CaseEvaluation, ...]
+
+    def as_dict(self) -> dict[str, Any]:
+        """Return a JSON-friendly mapping with each case expanded via as_dict()."""
+        per_case = [evaluation.as_dict() for evaluation in self.cases]
+        return dict(
+            total_cases=self.total_cases,
+            passed_cases=self.passed_cases,
+            match_cases=self.match_cases,
+            no_match_cases=self.no_match_cases,
+            recall_at_k=self.recall_at_k,
+            precision_at_k=self.precision_at_k,
+            mean_reciprocal_rank=self.mean_reciprocal_rank,
+            no_match_accuracy=self.no_match_accuracy,
+            false_positive_count=self.false_positive_count,
+            high_confidence_false_positive_count=(
+                self.high_confidence_false_positive_count
+            ),
+            distractor_hit_count=self.distractor_hit_count,
+            cases=per_case,
+        )
+
+
+@dataclass(frozen=True)
+class _Recommendation:
+    """Normalized view of one recommended commit taken from a match response."""
+
+    commit_hash: str  # stripped, lower-cased hash as produced by _normalize_hash
+    confidence: int | None  # None when the response value is missing or unparseable
+
+
+def load_golden_cases(path: str | Path) -> list[GoldenMatchCase]:
+    """Read golden cases from a UTF-8 JSON file.
+
+    The document may be either a mapping with a ``"cases"`` list or a bare list
+    of case mappings. Duplicate ``case_id`` values raise ``ValueError``.
+    """
+    payload = json.loads(Path(path).read_text(encoding="utf-8"))
+    if isinstance(payload, dict):
+        raw_cases = payload["cases"]
+    else:
+        raw_cases = payload
+    golden_cases = list(map(_golden_case_from_mapping, raw_cases))
+    _validate_unique_case_ids(golden_cases)
+    return golden_cases
+
+
+def evaluate_match_response(
+    cases: list[GoldenMatchCase],
+    response: dict[str, Any],
+    *,
+    top_k: int = 5,
+    confidence_threshold: int = DEFAULT_CONFIDENCE_THRESHOLD,
+    fail_on_false_positive: bool = False,
+) -> MatchEvaluationSummary:
+    """Score a commit-match response against golden cases.
+
+    Args:
+        cases: Golden cases to evaluate.
+        response: Decoded match response, either the bare result mapping or a
+            wrapper that carries it under ``"result"``.
+        top_k: Only the first ``top_k`` recommended commits of each application
+            are considered.
+        confidence_threshold: Recommendations at or above this confidence (or
+            without a usable confidence) count as high-confidence.
+        fail_on_false_positive: When true, a should-match case also fails if a
+            high-confidence recommendation is neither expected nor accepted.
+
+    Returns:
+        Aggregate metrics plus one ``CaseEvaluation`` per case, in input order.
+
+    Raises:
+        ValueError: If ``top_k`` is negative.
+    """
+    if top_k < 0:
+        # A negative cutoff would silently slice from the end of the list.
+        raise ValueError(f"top_k must be non-negative: {top_k}")
+
+    result = _extract_result(response)
+    # Skip malformed (non-mapping) entries instead of failing on ``.get``.
+    application_items = [
+        item
+        for item in (result.get("applications") or [])
+        if isinstance(item, dict)
+    ]
+    by_application_id = {
+        item.get("application_id"): item
+        for item in application_items
+        if item.get("application_id") is not None
+    }
+    by_application_title = {
+        item.get("application_title"): item
+        for item in application_items
+        if item.get("application_title")
+    }
+
+    evaluations: list[CaseEvaluation] = []
+    total_relevant_recommendations = 0
+    total_recommendations = 0
+    reciprocal_ranks: list[float] = []
+
+    for case in cases:
+        item = _find_application_item(
+            case,
+            by_application_id=by_application_id,
+            by_application_title=by_application_title,
+        )
+        raw_recommendations = (item or {}).get("recommended_commits") or []
+        if not isinstance(raw_recommendations, list):
+            # Anything but a list cannot be ranked; treat it as no recommendation.
+            raw_recommendations = []
+        recommendations = _to_recommendations(raw_recommendations[:top_k])
+        recommended_hashes = tuple(
+            recommendation.commit_hash for recommendation in recommendations
+        )
+
+        relevant_hashes = case.expected_commit_hashes + case.accepted_commit_hashes
+        first_expected_rank = _first_matching_rank(
+            recommended_hashes,
+            case.expected_commit_hashes,
+        )
+        expected_found = first_expected_rank is not None
+        # Classify relevance once; both false-positive views and the precision
+        # numerator are derived from this single pass.
+        non_relevant = tuple(
+            recommendation
+            for recommendation in recommendations
+            if not _hash_in_set(recommendation.commit_hash, relevant_hashes)
+        )
+        false_positive_hashes = tuple(
+            recommendation.commit_hash for recommendation in non_relevant
+        )
+        high_confidence_false_positive_hashes = tuple(
+            recommendation.commit_hash
+            for recommendation in non_relevant
+            if _is_high_confidence(recommendation, confidence_threshold)
+        )
+        high_confidence_hashes = tuple(
+            recommendation.commit_hash
+            for recommendation in recommendations
+            if _is_high_confidence(recommendation, confidence_threshold)
+        )
+        distractor_hit_hashes = tuple(
+            recommendation.commit_hash
+            for recommendation in recommendations
+            if _hash_in_set(recommendation.commit_hash, case.distractor_commit_hashes)
+        )
+        passed = _case_passed(
+            case=case,
+            expected_found=expected_found,
+            high_confidence_hashes=high_confidence_hashes,
+            high_confidence_false_positive_hashes=(
+                high_confidence_false_positive_hashes
+            ),
+            fail_on_false_positive=fail_on_false_positive,
+        )
+
+        total_recommendations += len(recommendations)
+        total_relevant_recommendations += len(recommendations) - len(non_relevant)
+        if case.should_match:
+            reciprocal_ranks.append(
+                0.0 if first_expected_rank is None else 1 / first_expected_rank
+            )
+
+        evaluations.append(
+            CaseEvaluation(
+                case_id=case.case_id,
+                application_title=case.application_title,
+                should_match=case.should_match,
+                passed=passed,
+                expected_found=expected_found,
+                first_expected_rank=first_expected_rank,
+                recommended_count=len(recommended_hashes),
+                recommended_hashes=recommended_hashes,
+                false_positive_hashes=false_positive_hashes,
+                high_confidence_false_positive_hashes=(
+                    high_confidence_false_positive_hashes
+                ),
+                distractor_hit_hashes=distractor_hit_hashes,
+            )
+        )
+
+    match_cases = sum(1 for case in cases if case.should_match)
+    no_match_cases = len(cases) - match_cases
+    passed_cases = sum(1 for evaluation in evaluations if evaluation.passed)
+    # Recall is defined over should-match cases only, so the numerator must be
+    # restricted to them as well; otherwise a no-match case that happens to list
+    # expected hashes could push the ratio above 1.
+    found_cases = sum(
+        1
+        for evaluation in evaluations
+        if evaluation.should_match and evaluation.expected_found
+    )
+    no_match_passes = sum(
+        1
+        for evaluation in evaluations
+        if not evaluation.should_match and evaluation.passed
+    )
+    false_positive_count = sum(
+        len(evaluation.false_positive_hashes) for evaluation in evaluations
+    )
+    high_confidence_false_positive_count = sum(
+        len(evaluation.high_confidence_false_positive_hashes)
+        for evaluation in evaluations
+    )
+    distractor_hit_count = sum(
+        len(evaluation.distractor_hit_hashes) for evaluation in evaluations
+    )
+
+    return MatchEvaluationSummary(
+        total_cases=len(cases),
+        passed_cases=passed_cases,
+        match_cases=match_cases,
+        no_match_cases=no_match_cases,
+        recall_at_k=_safe_div(found_cases, match_cases),
+        precision_at_k=_safe_div(total_relevant_recommendations, total_recommendations),
+        mean_reciprocal_rank=_safe_div(sum(reciprocal_ranks), len(reciprocal_ranks)),
+        no_match_accuracy=_safe_div(no_match_passes, no_match_cases),
+        false_positive_count=false_positive_count,
+        high_confidence_false_positive_count=high_confidence_false_positive_count,
+        distractor_hit_count=distractor_hit_count,
+        cases=tuple(evaluations),
+    )
+
+
+def _golden_case_from_mapping(raw: dict[str, Any]) -> GoldenMatchCase:
+    """Build a ``GoldenMatchCase`` from one decoded JSON case mapping.
+
+    Hash lists are normalized and blank entries dropped. ``should_match``
+    defaults to true when absent.
+
+    Raises:
+        KeyError: If ``case_id`` or ``application_title`` is missing.
+        ValueError: If the case should match but lists no expected hashes.
+    """
+
+    def _hash_tuple(key: str) -> tuple[str, ...]:
+        # ``null`` is treated like a missing list.
+        values = raw.get(key) or []
+        if isinstance(values, str):
+            # A bare string would otherwise be iterated character by character
+            # and silently turn into a tuple of one-letter "hashes".
+            values = [values]
+        return tuple(
+            normalized for value in values if (normalized := _normalize_hash(value))
+        )
+
+    expected_commit_hashes = _hash_tuple("expected_commit_hashes")
+    should_match = bool(raw.get("should_match", True))
+    if should_match and not expected_commit_hashes:
+        raise ValueError(f"expected_commit_hashes is required: {raw.get('case_id')}")
+
+    return GoldenMatchCase(
+        case_id=str(raw["case_id"]),
+        application_id=raw.get("application_id"),
+        application_title=str(raw["application_title"]),
+        application_reasons=tuple(
+            str(reason) for reason in raw.get("application_reasons") or ()
+        ),
+        expected_commit_hashes=expected_commit_hashes,
+        accepted_commit_hashes=_hash_tuple("accepted_commit_hashes"),
+        distractor_commit_hashes=_hash_tuple("distractor_commit_hashes"),
+        should_match=should_match,
+        tags=tuple(str(tag) for tag in raw.get("tags") or ()),
+    )
+
+
+def _validate_unique_case_ids(cases: list[GoldenMatchCase]) -> None:
+    """Raise ``ValueError`` listing every ``case_id`` that occurs more than once."""
+    first_seen: set[str] = set()
+    repeated: set[str] = set()
+    for case_id in (case.case_id for case in cases):
+        # An id goes to ``repeated`` from its second occurrence onwards.
+        bucket = repeated if case_id in first_seen else first_seen
+        bucket.add(case_id)
+    if not repeated:
+        return
+    raise ValueError(f"duplicate case_id values: {sorted(repeated)}")
+
+
+def _extract_result(response: dict[str, Any]) -> dict[str, Any]:
+    """Unwrap ``response["result"]`` when it is a mapping holding ``applications``.
+
+    Any other shape is returned unchanged and treated as the result itself.
+    """
+    wrapped = response.get("result")
+    is_wrapper = isinstance(wrapped, dict) and "applications" in wrapped
+    return wrapped if is_wrapper else response
+
+
+def _find_application_item(
+    case: GoldenMatchCase,
+    *,
+    by_application_id: dict[int, dict[str, Any]],
+    by_application_title: dict[str, dict[str, Any]],
+) -> dict[str, Any] | None:
+    """Return the response item for ``case``: by id first, then by exact title."""
+    wanted_id = case.application_id
+    if wanted_id is None or wanted_id not in by_application_id:
+        # No usable id match; fall back to the title (None when that misses too).
+        return by_application_title.get(case.application_title)
+    return by_application_id[wanted_id]
+
+
+def _to_recommendations(raw_recommendations: list[Any]) -> tuple[_Recommendation, ...]:
+    """Convert raw response entries into ``_Recommendation`` values.
+
+    Entries that are not mappings, or whose ``commit_hash`` normalizes to an
+    empty string, are dropped; the order of the remaining entries is kept.
+    """
+    mapping_entries = (
+        entry for entry in raw_recommendations if isinstance(entry, dict)
+    )
+    return tuple(
+        _Recommendation(
+            commit_hash=normalized_hash,
+            confidence=_normalize_confidence(entry.get("confidence")),
+        )
+        for entry in mapping_entries
+        if (normalized_hash := _normalize_hash(entry.get("commit_hash")))
+    )
+
+
+def _normalize_confidence(value: Any) -> int | None:
+    """Coerce a response confidence value to ``int``, or ``None`` if unusable.
+
+    Integers pass through, floats and numeric strings are truncated toward
+    zero, and booleans, non-numeric strings, NaN/infinite values and every
+    other type yield ``None``.
+    """
+    if isinstance(value, bool):
+        # bool is an int subclass; True/False are not meaningful scores.
+        return None
+    if isinstance(value, int):
+        return value
+    if isinstance(value, str):
+        try:
+            value = float(value.strip())
+        except ValueError:
+            return None
+    if isinstance(value, float):
+        try:
+            return int(value)
+        except (ValueError, OverflowError):
+            # int() raises ValueError for NaN and OverflowError for +/-inf;
+            # json.loads accepts both literals, so they can reach this point.
+            return None
+    return None
+
+
+def _is_high_confidence(
+    recommendation: _Recommendation,
+    confidence_threshold: int,
+) -> bool:
+    """Return whether ``recommendation`` counts as at or above the threshold."""
+    score = recommendation.confidence
+    # A missing confidence (e.g. older response fixtures) is treated as
+    # above-threshold, because /api/commit/match already filters what it returns.
+    return score is None or score >= confidence_threshold
+
+
+def _case_passed(
+    *,
+    case: GoldenMatchCase,
+    expected_found: bool,
+    high_confidence_hashes: tuple[str, ...],
+    high_confidence_false_positive_hashes: tuple[str, ...],
+    fail_on_false_positive: bool,
+) -> bool:
+    """Decide pass/fail for one case.
+
+    A no-match case passes only without any high-confidence recommendation.
+    A should-match case passes when an expected commit was found and, if
+    ``fail_on_false_positive`` is set, no high-confidence false positive exists.
+    """
+    if case.should_match:
+        vetoed = bool(fail_on_false_positive) and bool(
+            high_confidence_false_positive_hashes
+        )
+        return bool(expected_found) and not vetoed
+    return len(high_confidence_hashes) == 0
+
+
+def _first_matching_rank(
+    recommended_hashes: tuple[str, ...],
+    expected_hashes: tuple[str, ...],
+) -> int | None:
+    """Return the 1-based position of the first expected hash, or ``None``."""
+    matching_ranks = (
+        rank
+        for rank, candidate in enumerate(recommended_hashes, start=1)
+        if _hash_in_set(candidate, expected_hashes)
+    )
+    return next(matching_ranks, None)
+
+
+def _hash_in_set(commit_hash: str, expected_hashes: tuple[str, ...]) -> bool:
+    """Return whether ``commit_hash`` matches at least one of ``expected_hashes``."""
+    for candidate in expected_hashes:
+        if _hash_matches(commit_hash, candidate):
+            return True
+    return False
+
+
+def _hash_matches(left: str, right: str) -> bool:
+    """Compare two commit hashes, allowing abbreviated forms.
+
+    When both hashes have at least 7 characters, one being a prefix of the
+    other counts as a match; anything shorter must be exactly equal.
+    """
+    shorter, longer = sorted((left, right), key=len)
+    if len(shorter) < 7:
+        return left == right
+    return longer.startswith(shorter)
+
+
+def _normalize_hash(value: Any) -> str:
+    """Return ``value`` as a stripped, lower-cased string; falsy values give ``""``."""
+    text = str(value) if value else ""
+    return text.strip().lower()
+
+
+def _safe_div(numerator: float, denominator: float) -> float:
+    """Divide, returning 0.0 instead of raising when the denominator is zero."""
+    return numerator / denominator if denominator != 0 else 0.0
diff --git a/docs/commit_matching_evaluation.md b/docs/commit_matching_evaluation.md
new file mode 100644
index 0000000..9da85b7
--- /dev/null
+++ b/docs/commit_matching_evaluation.md
@@ -0,0 +1,106 @@
+# Commit Matching Evaluation
+
+## 목적
+
+회의 적용사항과 커밋의 실제 정답 관계가 운영 DB에 직접 저장되어 있지 않기 때문에,
+실제 커밋을 기준으로 만든 평가용 적용사항을 별도 golden dataset으로 관리한다.
+
+이 평가셋은 다음 작업에 사용한다.
+
+- `/api/commit/match` 결과가 기대 커밋을 상위 K개 안에 포함하는지 확인
+- 70점 이상 false positive 후보가 생기는지 확인
+- 아직 구현되지 않은 적용사항에 대해 confidence threshold 이상 추천이 반환되지 않는지 확인
+- 점수식, threshold, keyword/context 정책 변경 전후의 품질 비교
+
+## 평가셋 위치
+
+```bash
+tests/fixtures/commit_matching_golden_cases.json
+```
+
+각 case는 실제 Whylog-AI 커밋 해시를 기준으로 구성한다.
+
+- `application_title`: 시연용/평가용 가짜 회의 적용사항 제목
+- `application_reasons`: 적용사항 근거
+- `expected_commit_hashes`: 반드시 추천되어야 하는 실제 커밋 해시
+- `accepted_commit_hashes`: 정답은 아니지만 false positive로 세지 않을 허용 커밋 해시
+- `distractor_commit_hashes`: 헷갈리지만 정답이 아닌 커밋 해시
+- `should_match`: 추천이 있어야 하는지 여부
+- `tags`: 분석용 태그
+
+## 평가 실행
+
+먼저 실제 또는 로컬 `/api/commit/match` 응답을 JSON 파일로 저장한다.
+응답은 FastAPI 공통 응답 wrapper가 있는 형태와 result 본문만 있는 형태를 모두 지원한다.
+
+```bash
+uv run python scripts/evaluate_commit_matching.py \
+  --cases tests/fixtures/commit_matching_golden_cases.json \
+  --response /path/to/commit-match-response.json \
+  --top-k 5
+```
+
+JSON 요약이 필요하면 다음 옵션을 사용한다.
+
+```bash
+uv run python scripts/evaluate_commit_matching.py \
+  --response /path/to/commit-match-response.json \
+  --json
+```
+
+CI나 회귀 검증에서 실패 시 non-zero exit code가 필요하면 다음 옵션을 추가한다.
+
+```bash
+uv run python scripts/evaluate_commit_matching.py \
+  --response /path/to/commit-match-response.json \
+  --fail-on-failure
+```
+
+정답 커밋이 포함되어도 70점 이상 오탐 커밋이 함께 추천되면 실패로 보고 싶을 때는
+다음 옵션을 함께 사용한다.
+
+```bash
+uv run python scripts/evaluate_commit_matching.py \
+  --response /path/to/commit-match-response.json \
+  --fail-on-false-positive \
+  --fail-on-failure
+```
+
+기본 high-confidence 기준은 70점이며, 필요하면 조정할 수 있다.
+
+```bash
+uv run python scripts/evaluate_commit_matching.py \
+  --response /path/to/commit-match-response.json \
+  --confidence-threshold 75
+```
+
+## 지표
+
+- `recall_at_k`: `should_match=true` case 중 expected 커밋이 상위 K개 추천 안에 하나 이상 포함된 case의 비율
+- `precision_at_k`: 추천된 커밋 중 정답 또는 허용 커밋 비율
+- `mean_reciprocal_rank`: 정답 커밋이 몇 번째에 나왔는지 반영한 순위 지표
+- `no_match_accuracy`: 추천이 없어야 하는 적용사항(`should_match=false`) 중 confidence threshold 이상 추천이 하나도 없었던 비율 (threshold 미만 추천만 있는 경우도 통과로 집계)
+- `false_positive_count`: 정답 또는 허용 커밋이 아닌 추천 개수
+- `high_confidence_false_positive_count`: confidence threshold 이상인 오탐 추천 개수 (confidence 값이 없는 추천은 threshold 이상으로 간주)
+- `distractor_hit_count`: hard negative로 지정한 distractor 커밋이 추천된 개수
+
+`accepted_commit_hashes`는 precision 계산과 false positive 판정에서만 사용한다.
+해당 커밋만 추천되고 `expected_commit_hashes`가 빠진 경우에는 해당 case를 실패로 본다.
+
+`distractor_commit_hashes`는 정답이 아니지만 헷갈리기 쉬운 커밋을 명시하는 필드다.
+추천 결과에 distractor가 포함되면 `distractor_hit_hashes`와 `distractor_hit_count`로
+별도 집계된다.
+
+기본 pass 기준은 다음과 같다.
+
+- `should_match=true`: expected commit이 top-k 안에 있으면 pass
+- `should_match=false`: confidence threshold 이상 추천이 없으면 pass
+- `--fail-on-false-positive`: match case에서도 threshold 이상 오탐이 있으면 fail
+
+## 고도화 흐름
+
+1. 실제 커밋을 기준으로 golden case를 추가한다.
+2. 현재 `/api/commit/match` 결과를 저장하고 평가 스크립트로 baseline을 기록한다.
+3. 일반어 제거, intent keyword 보정, LLM rerank 같은 정책을 한 번에 하나씩 적용한다.
+4. 같은 response fixture 또는 같은 운영 데이터로 지표를 비교한다.
+5. 시연용 데이터는 easy case, hard negative, no-match case를 모두 포함하도록 구성한다.
diff --git a/scripts/evaluate_commit_matching.py b/scripts/evaluate_commit_matching.py
new file mode 100644
index 0000000..1fb90a6
--- /dev/null
+++ b/scripts/evaluate_commit_matching.py
@@ -0,0 +1,118 @@
+import argparse
+import json
+import sys
+from pathlib import Path
+
+# Put the repository root (the parent of scripts/) on sys.path so that the
+# `app` package import below resolves when this file is run directly.
+ROOT_DIR = Path(__file__).resolve().parents[1]
+if str(ROOT_DIR) not in sys.path:
+    sys.path.insert(0, str(ROOT_DIR))
+
+from app.domains.commit.services.matching_evaluation import (  # noqa: E402
+    evaluate_match_response,
+    load_golden_cases,
+)
+
+
+def main() -> int:
+    """Run the golden-case evaluation from the command line.
+
+    Returns:
+        0 on success, 1 when ``--fail-on-failure`` is set and at least one case
+        failed, 2 when the case or response file cannot be loaded.
+    """
+    parser = argparse.ArgumentParser(
+        description="Evaluate /api/commit/match response with golden cases.",
+    )
+    parser.add_argument(
+        "--cases",
+        # Anchored at the repository root so the default also works when the
+        # script is started from another working directory.
+        default=str(
+            ROOT_DIR / "tests" / "fixtures" / "commit_matching_golden_cases.json"
+        ),
+        help="Golden case JSON path.",
+    )
+    parser.add_argument(
+        "--response",
+        required=True,
+        help="Saved /api/commit/match response JSON path.",
+    )
+    parser.add_argument(
+        "--top-k",
+        type=int,
+        default=5,
+        help="Evaluation cutoff for recommended commits.",
+    )
+    parser.add_argument(
+        "--confidence-threshold",
+        type=int,
+        default=70,
+        help="Confidence cutoff for high-confidence false positive checks.",
+    )
+    parser.add_argument(
+        "--fail-on-false-positive",
+        action="store_true",
+        help=(
+            "Fail matched cases when a non-relevant commit is recommended at or "
+            "above the confidence threshold."
+        ),
+    )
+    parser.add_argument(
+        "--json",
+        action="store_true",
+        help="Print machine-readable JSON summary.",
+    )
+    parser.add_argument(
+        "--fail-on-failure",
+        action="store_true",
+        help="Exit with code 1 when at least one golden case fails.",
+    )
+    args = parser.parse_args()
+    if args.top_k < 0:
+        # A negative cutoff would slice recommendations from the end.
+        parser.error("--top-k must be non-negative")
+
+    try:
+        cases = load_golden_cases(args.cases)
+        response = json.loads(Path(args.response).read_text(encoding="utf-8"))
+    except (OSError, ValueError, KeyError) as exc:
+        # Covers unreadable files, invalid JSON/encoding (ValueError subclasses)
+        # and golden cases with missing keys or duplicate ids.
+        print(f"error: failed to load evaluation input: {exc!r}", file=sys.stderr)
+        return 2
+
+    summary = evaluate_match_response(
+        cases,
+        response,
+        top_k=args.top_k,
+        confidence_threshold=args.confidence_threshold,
+        fail_on_false_positive=args.fail_on_false_positive,
+    )
+
+    if args.json:
+        print(json.dumps(summary.as_dict(), ensure_ascii=False, indent=2))
+    else:
+        _print_summary(summary.as_dict())
+
+    if args.fail_on_failure and summary.passed_cases < summary.total_cases:
+        return 1
+    return 0
+
+
+def _print_summary(summary: dict) -> None:
+    """Print the summary mapping as a human-readable report on stdout."""
+    count_keys_head = ("total_cases", "passed_cases")
+    ratio_keys = (
+        "recall_at_k",
+        "precision_at_k",
+        "mean_reciprocal_rank",
+        "no_match_accuracy",
+    )
+    count_keys_tail = (
+        "false_positive_count",
+        "high_confidence_false_positive_count",
+        "distractor_hit_count",
+    )
+    # Per-case hash lists, printed only when non-empty.
+    detail_keys = (
+        "false_positive_hashes",
+        "high_confidence_false_positive_hashes",
+        "distractor_hit_hashes",
+    )
+
+    print("Commit Matching Evaluation")
+    for key in count_keys_head:
+        print(f"- {key}: {summary[key]}")
+    for key in ratio_keys:
+        print(f"- {key}: {summary[key]:.3f}")
+    for key in count_keys_tail:
+        print(f"- {key}: {summary[key]}")
+    print()
+
+    for case in summary["cases"]:
+        status = "FAIL" if not case["passed"] else "PASS"
+        headline = (
+            f"[{status}] {case['case_id']} - "
+            f"{case['application_title']} "
+            f"(rank={case['first_expected_rank']}, "
+            f"recommended={case['recommended_count']})"
+        )
+        print(headline)
+        for key in detail_keys:
+            if case[key]:
+                print(f"  {key}={case[key]}")
+
+
+# Propagate main()'s return value as the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/fixtures/commit_matching_golden_cases.json b/tests/fixtures/commit_matching_golden_cases.json
new file mode 100644
index 0000000..376eb0e
--- /dev/null
+++ b/tests/fixtures/commit_matching_golden_cases.json
@@ -0,0 +1,82 @@
+{
+  "version": 1,
+  "description": "실제 Whylog-AI 커밋을 기준으로 만든 적용사항-커밋 매칭 평가셋입니다.",
+  "cases": [
+    {
+      "case_id": "gemini-quota-retry",
+      "application_id": 9101,
+      "application_title": "Gemini API 사용량 초과 시 재시도 로직 추가",
+      "application_reasons": [
+        "Gemini 호출이 429 또는 일시적 오류로 실패할 때 분석 파이프라인이 중단되지 않아야 합니다.",
+        "재시도와 백오프를 통해 회의 분석/커밋 분석 안정성을 높여야 합니다."
+      ],
+      "expected_commit_hashes": [
+        "5706bbfddb71a374545106ee3c7fc83797925964"
+      ],
+      "distractor_commit_hashes": [
+        "7c67758a4dfaca4cf18e264da011569ae868ba64"
+      ],
+      "should_match": true,
+      "tags": ["gemini", "retry", "hard_negative"]
+    },
+    {
+      "case_id": "live-transcript-merge",
+      "application_id": 9102,
+      "application_title": "WebSocket 발화 로그와 STT 전사 병합 기준 보강",
+      "application_reasons": [
+        "회의 발화자 정보와 전사 텍스트를 더 정확히 결합해야 합니다.",
+        "짧은 WebSocket 발화가 긴 STT 전사 결과를 과도하게 덮어쓰지 않도록 해야 합니다."
+      ],
+      "expected_commit_hashes": [
+        "cd139b09dc4f1d88b500af35cc39984ef714ce2b"
+      ],
+      "distractor_commit_hashes": [],
+      "should_match": true,
+      "tags": ["transcribe", "websocket", "stt"]
+    },
+    {
+      "case_id": "application-timeline-embedding",
+      "application_id": 9103,
+      "application_title": "적용사항 임베딩에 회의 타임라인 맥락 반영",
+      "application_reasons": [
+        "적용사항 제목과 근거만으로는 매칭 의도가 부족합니다.",
+        "대안논의와 적용합의 내용을 임베딩 텍스트에 포함해 커밋 추천 품질을 높여야 합니다."
+      ],
+      "expected_commit_hashes": [
+        "3b50917d1ca2f9f811734ac02604bdae93dec86f"
+      ],
+      "distractor_commit_hashes": [],
+      "should_match": true,
+      "tags": ["application", "embedding", "timeline"]
+    },
+    {
+      "case_id": "commit-path-module-token",
+      "application_id": 9104,
+      "application_title": "커밋 파일 경로 기반 모듈 토큰 보강",
+      "application_reasons": [
+        "LLM이 모듈 태그를 놓쳐도 변경 파일 경로에서 도메인 맥락을 추출해야 합니다.",
+        "controller/service 같은 구조 단어는 제외하고 실제 도메인 토큰을 매칭 점수에 반영해야 합니다."
+      ],
+      "expected_commit_hashes": [
+        "daafb71463b87525d65187a04f78d9d9e98c5e24"
+      ],
+      "distractor_commit_hashes": [],
+      "should_match": true,
+      "tags": ["commit", "path", "module"]
+    },
+    {
+      "case_id": "no-match-dinner-menu",
+      "application_id": 9105,
+      "application_title": "저녁 메뉴로 치킨과 맥주를 선정",
+      "application_reasons": [
+        "서비스 구현과 무관한 회의 잡담성 결정은 커밋 추천이 없어야 합니다."
+      ],
+      "expected_commit_hashes": [],
+      "distractor_commit_hashes": [
+        "8af78f0e97ca30b22a8c8782ce372b2a59fe3819"
+      ],
+      "should_match": false,
+      "tags": ["no_match", "false_positive_guard"]
+    }
+  ]
+}
diff --git a/tests/test_commit_matching_evaluation.py b/tests/test_commit_matching_evaluation.py
new file mode 100644
index 0000000..bf5d30a
--- /dev/null
+++ b/tests/test_commit_matching_evaluation.py
@@ -0,0 +1,264 @@
+from pathlib import Path
+
+from app.domains.commit.services.matching_evaluation import (
+    GoldenMatchCase,
+    evaluate_match_response,
+    load_golden_cases,
+)
+
+# Resolved relative to this test module rather than the current working
+# directory, so the suite also passes when pytest is started outside the repo root.
+FIXTURE_PATH = (
+    Path(__file__).resolve().parent / "fixtures" / "commit_matching_golden_cases.json"
+)
+
+
+def _commit(commit_hash: str) -> dict:
+    """Build a recommended-commit payload with a fixed confidence of 90."""
+    fixed_fields = dict(commit_message="test commit", confidence=90)
+    return {"commit_hash": commit_hash, **fixed_fields}
+
+
+def _application(
+    application_id: int,
+    application_title: str,
+    commits: list[dict],
+) -> dict:
+    """Build one application entry of a match response for the given commits."""
+    document_id = f"eval_application_{application_id}"
+    return dict(
+        application_id=application_id,
+        application_document_id=document_id,
+        application_title=application_title,
+        recommended_commits=commits,
+    )
+
+
+def test_load_golden_cases_from_fixture():
+    """The bundled fixture loads as five cases with the expected first/last entry."""
+    golden_cases = load_golden_cases(FIXTURE_PATH)
+
+    assert len(golden_cases) == 5
+
+    first_case = golden_cases[0]
+    assert first_case.case_id == "gemini-quota-retry"
+    assert first_case.should_match is True
+    assert first_case.application_reasons
+    assert first_case.distractor_commit_hashes == (
+        "7c67758a4dfaca4cf18e264da011569ae868ba64",
+    )
+
+    last_case = golden_cases[-1]
+    assert last_case.case_id == "no-match-dinner-menu"
+    assert last_case.should_match is False
+
+
+def test_evaluate_match_response_passes_when_expected_commits_are_returned():
+    """A wrapped response returning every expected commit gives perfect metrics."""
+    golden_cases = load_golden_cases(FIXTURE_PATH)
+    # (application_id, title, abbreviated hashes recommended for it)
+    returned = [
+        (9101, "Gemini API 사용량 초과 시 재시도 로직 추가", ["5706bbf"]),
+        (9102, "WebSocket 발화 로그와 STT 전사 병합 기준 보강", ["cd139b0"]),
+        (9103, "적용사항 임베딩에 회의 타임라인 맥락 반영", ["3b50917"]),
+        (9104, "커밋 파일 경로 기반 모듈 토큰 보강", ["daafb71"]),
+        (9105, "저녁 메뉴로 치킨과 맥주를 선정", []),
+    ]
+    applications = [
+        _application(app_id, title, [_commit(short_hash) for short_hash in hashes])
+        for app_id, title, hashes in returned
+    ]
+    wrapped_response = {"result": {"applications": applications}}
+
+    summary = evaluate_match_response(golden_cases, wrapped_response, top_k=5)
+
+    assert summary.total_cases == 5
+    assert summary.passed_cases == 5
+    assert summary.recall_at_k == 1.0
+    assert summary.precision_at_k == 1.0
+    assert summary.mean_reciprocal_rank == 1.0
+    assert summary.no_match_accuracy == 1.0
+    assert summary.false_positive_count == 0
+
+
+def test_evaluate_match_response_counts_false_positives_and_missed_no_match():
+    """A distractor ranked first and a recommended no-match case are both counted."""
+    golden_cases = load_golden_cases(FIXTURE_PATH)
+    selected_cases = [golden_cases[0], golden_cases[-1]]
+    gemini_item = _application(
+        9101,
+        "Gemini API 사용량 초과 시 재시도 로직 추가",
+        [_commit("7c67758"), _commit("5706bbf")],
+    )
+    dinner_item = _application(
+        9105,
+        "저녁 메뉴로 치킨과 맥주를 선정",
+        [_commit("8af78f0")],
+    )
+    bare_response = {"applications": [gemini_item, dinner_item]}
+
+    summary = evaluate_match_response(selected_cases, bare_response, top_k=5)
+
+    assert summary.total_cases == 2
+    assert summary.passed_cases == 1
+    assert summary.recall_at_k == 1.0
+    assert summary.precision_at_k == 1 / 3
+    assert summary.mean_reciprocal_rank == 0.5
+    assert summary.no_match_accuracy == 0.0
+    assert summary.false_positive_count == 2
+    assert summary.cases[0].first_expected_rank == 2
+
+
+def test_evaluate_match_response_uses_title_fallback_when_id_is_missing():
+    """A case without application_id is matched to the response item by title."""
+    shared_title = "회의 타임라인 맥락 임베딩"
+    id_less_case = GoldenMatchCase(
+        case_id="title-fallback",
+        application_id=None,
+        application_title=shared_title,
+        application_reasons=(),
+        expected_commit_hashes=("3b50917d1ca2f9f811734ac02604bdae93dec86f",),
+        accepted_commit_hashes=(),
+        distractor_commit_hashes=(),
+        should_match=True,
+        tags=(),
+    )
+    response_item = _application(9103, shared_title, [_commit("3b50917")])
+
+    summary = evaluate_match_response(
+        [id_less_case],
+        {"applications": [response_item]},
+        top_k=5,
+    )
+
+    assert summary.passed_cases == 1
+    assert summary.cases[0].first_expected_rank == 1
+
+
+def test_evaluate_match_response_treats_accepted_hash_as_relevant_not_expected():
+    """An accepted-only recommendation keeps precision but still fails the case."""
+    title = "Gemini 재시도 로직"
+    accepted_only_case = GoldenMatchCase(
+        case_id="accepted-only",
+        application_id=9101,
+        application_title=title,
+        application_reasons=(),
+        expected_commit_hashes=("5706bbfddb71a374545106ee3c7fc83797925964",),
+        accepted_commit_hashes=("7c67758a4dfaca4cf18e264da011569ae868ba64",),
+        distractor_commit_hashes=(),
+        should_match=True,
+        tags=(),
+    )
+    response_item = _application(9101, title, [_commit("7c67758")])
+
+    summary = evaluate_match_response(
+        [accepted_only_case],
+        {"applications": [response_item]},
+        top_k=5,
+    )
+    case_result = summary.cases[0]
+
+    assert summary.passed_cases == 0
+    assert summary.recall_at_k == 0.0
+    assert summary.precision_at_k == 1.0
+    assert case_result.expected_found is False
+    assert case_result.false_positive_hashes == ()
+
+
+def test_evaluate_match_response_fails_match_case_when_application_is_missing():
+    """A should-match case absent from the response fails with zero recommendations."""
+    absent_case = GoldenMatchCase(
+        case_id="missing-application",
+        application_id=9999,
+        application_title="없는 적용사항",
+        application_reasons=(),
+        expected_commit_hashes=("5706bbfddb71a374545106ee3c7fc83797925964",),
+        accepted_commit_hashes=(),
+        distractor_commit_hashes=(),
+        should_match=True,
+        tags=(),
+    )
+    empty_response = {"applications": []}
+
+    summary = evaluate_match_response([absent_case], empty_response, top_k=5)
+
+    assert summary.passed_cases == 0
+    assert summary.recall_at_k == 0.0
+    assert summary.cases[0].recommended_count == 0
+
+
+def test_evaluate_match_response_tracks_distractor_hits():
+    """A recommended distractor is reported as a distractor hit and a false positive."""
+    gemini_case = load_golden_cases(FIXTURE_PATH)[:1]
+    gemini_item = _application(
+        9101,
+        "Gemini API 사용량 초과 시 재시도 로직 추가",
+        [_commit("7c67758"), _commit("5706bbf")],
+    )
+
+    summary = evaluate_match_response(
+        gemini_case,
+        {"applications": [gemini_item]},
+        top_k=5,
+    )
+    case_result = summary.cases[0]
+
+    assert summary.distractor_hit_count == 1
+    assert summary.high_confidence_false_positive_count == 1
+    assert case_result.distractor_hit_hashes == ("7c67758",)
+    assert case_result.high_confidence_false_positive_hashes == ("7c67758",)
+
+
+def test_evaluate_match_response_can_fail_match_case_on_false_positive():
+    """With fail_on_false_positive, a found case still fails on a confident miss."""
+    gemini_case = load_golden_cases(FIXTURE_PATH)[:1]
+    gemini_item = _application(
+        9101,
+        "Gemini API 사용량 초과 시 재시도 로직 추가",
+        [_commit("7c67758"), _commit("5706bbf")],
+    )
+    options = dict(top_k=5, fail_on_false_positive=True)
+
+    summary = evaluate_match_response(
+        gemini_case,
+        {"applications": [gemini_item]},
+        **options,
+    )
+    case_result = summary.cases[0]
+
+    assert summary.passed_cases == 0
+    assert case_result.expected_found is True
+    assert case_result.first_expected_rank == 2
+
+
+def test_no_match_case_ignores_below_threshold_recommendation():
+    """A below-threshold recommendation does not fail a no-match case."""
+    dinner_case = load_golden_cases(FIXTURE_PATH)[-1:]
+    weak_commit = {**_commit("8af78f0"), "confidence": 49}
+    dinner_item = _application(
+        9105,
+        "저녁 메뉴로 치킨과 맥주를 선정",
+        [weak_commit],
+    )
+
+    summary = evaluate_match_response(
+        dinner_case,
+        {"applications": [dinner_item]},
+        top_k=5,
+    )
+
+    assert summary.passed_cases == 1
+    assert summary.no_match_accuracy == 1.0
+    assert summary.false_positive_count == 1
+    assert summary.high_confidence_false_positive_count == 0