diff --git a/pyproject.toml b/pyproject.toml index e63930e..c5c7901 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "rageval" -version = "0.1.0" +version = "0.2.0" description = "Production-grade RAG evaluation framework with retrieval and generation metrics" readme = "README.md" license = { text = "MIT" } @@ -22,12 +22,14 @@ dependencies = [ "pydantic>=2.0", "structlog>=23.0", "duckdb>=0.9", +] + +[project.optional-dependencies] +llm = [ "llama-index-core>=0.11", "llama-index-llms-openai>=0.2", "llama-index-embeddings-openai>=0.2", ] - -[project.optional-dependencies] dev = [ "pytest>=8.0", "pytest-asyncio>=0.23", @@ -35,6 +37,9 @@ dev = [ "ruff>=0.4", "mypy>=1.9", ] +all = [ + "rageval[llm,dev]", +] [tool.hatch.build.targets.wheel] packages = ["src/rageval"] diff --git a/src/rageval/__init__.py b/src/rageval/__init__.py index 7a782e8..d4fd634 100644 --- a/src/rageval/__init__.py +++ b/src/rageval/__init__.py @@ -1,8 +1,11 @@ """rageval - Production-grade RAG evaluation framework. -Evaluates Retrieval-Augmented Generation pipelines with retrieval metrics -(precision, recall, F1, MRR, NDCG) and generation quality metrics -(faithfulness, relevance, answer correctness) using LlamaIndex. 
class EvalMode(StrEnum):
    """Which evaluation mode produced the generation metrics."""

    # Scores were produced by the LLM-as-Judge evaluators.
    LLM_JUDGE = "llm_judge"
    # Scores were produced by the deterministic heuristic fallback.
    HEURISTIC = "heuristic"
    # No evaluation recorded; this is the default of GenerationMetrics.eval_mode.
    NONE = "none"
@@ -95,12 +103,22 @@ class GenerationMetrics(BaseModel): le=1.0, description="Semantic similarity to the reference answer", ) + rouge_l: float = Field( + default=0.0, + ge=0.0, + le=1.0, + description="ROUGE-L F1 score vs reference (word-order-aware similarity)", + ) context_utilization: float = Field( default=0.0, ge=0.0, le=1.0, description="How well the answer uses relevant retrieved context", ) + eval_mode: EvalMode = Field( + default=EvalMode.NONE, + description="Which evaluation mode produced these metrics", + ) class EvalSample(BaseModel): @@ -171,6 +189,7 @@ class EvalRunSummary(BaseModel): avg_faithfulness: float = Field(default=0.0, ge=0.0, le=1.0) avg_relevance: float = Field(default=0.0, ge=0.0, le=1.0) avg_correctness: float = Field(default=0.0, ge=0.0, le=1.0) + avg_rouge_l: float = Field(default=0.0, ge=0.0, le=1.0) avg_context_utilization: float = Field(default=0.0, ge=0.0, le=1.0) # Timing avg_latency_ms: float = Field(default=0.0, ge=0.0) diff --git a/src/rageval/metrics/__init__.py b/src/rageval/metrics/__init__.py index 6c89ebf..6477b7f 100644 --- a/src/rageval/metrics/__init__.py +++ b/src/rageval/metrics/__init__.py @@ -1,13 +1,16 @@ """Metrics computation for RAG evaluation. -Provides retrieval metrics (precision, recall, F1, MRR, NDCG) -and generation quality metrics (faithfulness, relevance, correctness). +Provides retrieval metrics (precision, recall, F1, MRR, NDCG), +generation quality metrics (faithfulness, relevance, correctness), +and ROUGE-L (word-order-aware text similarity). 
""" from rageval.metrics.generation import GenerationEvaluator from rageval.metrics.retrieval import RetrievalEvaluator +from rageval.metrics.rouge import rouge_l_score __all__ = [ "GenerationEvaluator", "RetrievalEvaluator", + "rouge_l_score", ] diff --git a/src/rageval/metrics/generation.py b/src/rageval/metrics/generation.py index 5bdc69b..3b266f4 100644 --- a/src/rageval/metrics/generation.py +++ b/src/rageval/metrics/generation.py @@ -1,18 +1,20 @@ -"""Generation quality metrics using LLM-as-Judge. +"""Generation quality metrics using LLM-as-Judge or heuristic fallback. -Evaluates the quality of RAG-generated answers using LlamaIndex's -evaluation modules. Measures faithfulness (grounding in context), -relevance (addressing the query), and correctness (vs. reference answer). +Evaluates the quality of RAG-generated answers using either: +1. LLM-as-Judge (LlamaIndex evaluators) — accurate, ~2s/sample +2. Heuristic fallback (ROUGE-L + Jaccard + context overlap) — fast, free -When no LLM is configured, falls back to deterministic heuristics -(token overlap, embedding similarity proxies). +When no LLM is configured or LlamaIndex is not installed, falls back +to heuristic mode automatically. The evaluation mode is tracked in +every GenerationMetrics result via the eval_mode field. """ from __future__ import annotations import structlog -from rageval.core.models import GenerationMetrics +from rageval.core.models import EvalMode, GenerationMetrics +from rageval.metrics.rouge import rouge_l_score logger = structlog.get_logger(__name__) @@ -20,6 +22,13 @@ class GenerationEvaluator: """Evaluate generation quality with LLM judge or heuristic fallback. + The evaluator automatically selects the best available mode: + - If an API key is provided and LlamaIndex is installed: LLM-as-Judge + - Otherwise: heuristic mode (ROUGE-L + Jaccard + context utilization) + + Every result includes an ``eval_mode`` field so you always know + which mode produced the scores. 
@property
def mode(self) -> EvalMode:
    """Return the evaluation mode this evaluator is configured for.

    The value is derived only from whether a non-empty ``api_key`` was
    passed to the constructor: ``EvalMode.LLM_JUDGE`` if so, otherwise
    ``EvalMode.HEURISTIC``.

    It does not check that LlamaIndex can actually be imported. When the
    import fails at evaluation time the evaluator falls back to the
    heuristic path, so the ``eval_mode`` stored on an individual
    ``GenerationMetrics`` result may be ``HEURISTIC`` even though this
    property reports ``LLM_JUDGE``. Treat the per-result field as the
    source of truth.
    """
    return EvalMode.LLM_JUDGE if self._llm_available else EvalMode.HEURISTIC
- This uses LlamaIndex's FaithfulnessEvaluator, RelevancyEvaluator, - and CorrectnessEvaluator under the hood. + Each evaluator is called independently so a failure in one + metric doesn't lose the others. Falls back to heuristic + for the entire sample only if LlamaIndex is not installed. """ try: from llama_index.core.evaluation import ( @@ -125,47 +149,6 @@ def _evaluate_with_llm( RelevancyEvaluator, ) from llama_index.llms.openai import OpenAI - - llm = OpenAI(model=self._judge_model, api_key=self._api_key) - - # Faithfulness: is the answer grounded in the context? - faith_eval = FaithfulnessEvaluator(llm=llm) - faith_result = faith_eval.evaluate( - query=query, - response=generated_answer, - contexts=retrieved_contexts, - ) - faithfulness = 1.0 if faith_result.passing else 0.0 - - # Relevancy: does the answer address the query? - rel_eval = RelevancyEvaluator(llm=llm) - rel_result = rel_eval.evaluate( - query=query, - response=generated_answer, - contexts=retrieved_contexts, - ) - relevance = 1.0 if rel_result.passing else 0.0 - - # Correctness: semantic match to reference answer - correctness = 0.0 - if reference_answer: - corr_eval = CorrectnessEvaluator(llm=llm) - corr_result = corr_eval.evaluate( - query=query, - response=generated_answer, - reference=reference_answer, - ) - correctness = (corr_result.score or 0.0) / 5.0 # Normalize to 0-1 - - context_util = self._context_utilization(generated_answer, retrieved_contexts) - - return GenerationMetrics( - faithfulness=faithfulness, - relevance=relevance, - correctness=correctness, - context_utilization=context_util, - ) - except ImportError: logger.warning("generation.llm_unavailable", msg="LlamaIndex not installed") return self._evaluate_with_heuristics( @@ -174,14 +157,108 @@ def _evaluate_with_llm( retrieved_contexts=retrieved_contexts, reference_answer=reference_answer, ) + + llm = OpenAI(model=self._judge_model, api_key=self._api_key) + + # Faithfulness: is the answer grounded in context? 
@staticmethod
def _llm_faithfulness(
    llm: object,
    evaluator_cls: type,
    query: str,
    answer: str,
    contexts: list[str],
) -> float:
    """Score faithfulness with an LLM judge, isolating failures to this metric.

    Args:
        llm: LLM handle forwarded to the evaluator constructor.
        evaluator_cls: Evaluator class, instantiated as ``evaluator_cls(llm=llm)``.
        query: The original user query.
        answer: The generated answer being judged.
        contexts: Retrieved context passages the answer should be grounded in.

    Returns:
        1.0 when the evaluator's result is ``passing``, otherwise 0.0.
        Any exception raised while building or running the evaluator is
        logged and also yields 0.0, so a failed evaluation is not
        distinguishable from a failing verdict in the return value.
    """
    try:
        verdict = evaluator_cls(llm=llm).evaluate(
            query=query,
            response=answer,
            contexts=contexts,
        )
        # Read `passing` inside the try so a malformed result is also caught.
        if verdict.passing:
            return 1.0
    except Exception:
        logger.exception("generation.llm_faithfulness_failed")
    return 0.0
Exception: + logger.exception("generation.llm_relevance_failed") + return 0.0 + + @staticmethod + def _llm_correctness( + llm: object, + evaluator_cls: type, + query: str, + answer: str, + reference: str, + ) -> float: + """Evaluate correctness with per-metric error handling.""" + if not reference: + return 0.0 + try: + evaluator = evaluator_cls(llm=llm) + result = evaluator.evaluate( + query=query, + response=answer, + reference=reference, ) + score = result.score or 0.0 + return min(score / 5.0, 1.0) # Normalize 0-5 → 0-1 + except Exception: + logger.exception("generation.llm_correctness_failed") + return 0.0 + + # ------------------------------------------------------------------ + # Heuristic mode + # ------------------------------------------------------------------ def _evaluate_with_heuristics( self, @@ -191,10 +268,14 @@ def _evaluate_with_heuristics( retrieved_contexts: list[str], reference_answer: str, ) -> GenerationMetrics: - """Deterministic heuristic evaluation using token overlap. - - Used when no LLM judge is configured. Provides rough proxy - metrics based on word overlap analysis. + """Deterministic heuristic evaluation using ROUGE-L and token overlap. + + Used when no LLM judge is configured. 
Provides proxy metrics: + - Faithfulness: Jaccard overlap between answer and context tokens + - Relevance: Jaccard overlap between answer and query tokens + - Correctness: Jaccard overlap between answer and reference tokens + - ROUGE-L: Longest Common Subsequence F1 vs reference (order-aware) + - Context utilization: fraction of contexts with >20% token overlap """ answer_tokens = set(generated_answer.lower().split()) @@ -213,6 +294,9 @@ def _evaluate_with_heuristics( ref_tokens = set(reference_answer.lower().split()) correctness = self._token_overlap(answer_tokens, ref_tokens) + # ROUGE-L: order-aware similarity vs reference + rouge_l = rouge_l_score(generated_answer, reference_answer) if reference_answer else 0.0 + # Context utilization context_util = self._context_utilization(generated_answer, retrieved_contexts) @@ -220,12 +304,21 @@ def _evaluate_with_heuristics( faithfulness=min(faithfulness, 1.0), relevance=min(relevance, 1.0), correctness=min(correctness, 1.0), + rouge_l=min(rouge_l, 1.0), context_utilization=min(context_util, 1.0), + eval_mode=EvalMode.HEURISTIC, ) + # ------------------------------------------------------------------ + # Shared utilities + # ------------------------------------------------------------------ + @staticmethod def _token_overlap(set_a: set[str], set_b: set[str]) -> float: - """Compute Jaccard-like overlap between two token sets.""" + """Compute Jaccard similarity between two token sets. + + Jaccard = |A & B| / |A | B| + """ if not set_a or not set_b: return 0.0 intersection = set_a & set_b @@ -234,7 +327,12 @@ def _token_overlap(set_a: set[str], set_b: set[str]) -> float: @staticmethod def _context_utilization(answer: str, contexts: list[str]) -> float: - """Measure how much of the retrieved context is used in the answer.""" + """Measure what fraction of retrieved contexts contributed to the answer. + + A context is considered "utilized" if >20% of its tokens appear + in the generated answer. 
def _lcs_length(seq_a: list[str], seq_b: list[str]) -> int:
    """Compute the length of the Longest Common Subsequence of two sequences.

    Uses two rolling rows sized by the shorter sequence, i.e. O(min(m, n))
    extra space instead of a full m*n table.

    Args:
        seq_a: First token sequence.
        seq_b: Second token sequence.

    Returns:
        Length of the LCS (0 if either sequence is empty).
    """
    if not seq_a or not seq_b:
        return 0

    # Walk the longer sequence in the outer loop so the rows are sized by
    # the shorter one.
    if len(seq_a) < len(seq_b):
        seq_a, seq_b = seq_b, seq_a

    width = len(seq_b) + 1
    prev = [0] * width
    curr = [0] * width

    for token_a in seq_a:
        for j, token_b in enumerate(seq_b, start=1):
            if token_a == token_b:
                curr[j] = prev[j - 1] + 1
            else:
                curr[j] = max(curr[j - 1], prev[j])
        # Swap the rows instead of allocating a new one: cells 1..m of the
        # next `curr` are all overwritten before they are read, and cell 0
        # is never written, so it stays 0 in both rows.
        prev, curr = curr, prev

    return prev[-1]


def _lcs_stats(candidate: str, reference: str) -> tuple[int, int, int]:
    """Tokenize both texts and return ``(lcs_len, candidate_len, reference_len)``.

    Tokens are lower-cased, whitespace-separated words. Falsy inputs yield
    ``(0, 0, 0)`` so callers only need to test ``lcs_len``.
    """
    if not candidate or not reference:
        return 0, 0, 0
    cand_tokens = candidate.lower().split()
    ref_tokens = reference.lower().split()
    return _lcs_length(cand_tokens, ref_tokens), len(cand_tokens), len(ref_tokens)


def rouge_l_score(candidate: str, reference: str) -> float:
    """Compute the ROUGE-L F1 score between candidate and reference text.

    ROUGE-L uses the Longest Common Subsequence (LCS) of the two token
    sequences, so it rewards shared words that appear in the same order.

    Args:
        candidate: The generated text to evaluate.
        reference: The ground truth reference text.

    Returns:
        ROUGE-L F1 score in [0.0, 1.0]; 0.0 when either text is empty or
        whitespace-only, or when no token is shared.

    Example::

        >>> round(rouge_l_score("the cat sat on the mat", "the cat on the mat"), 3)
        0.909
    """
    lcs_len, cand_len, ref_len = _lcs_stats(candidate, reference)
    if lcs_len == 0:
        return 0.0

    # lcs_len > 0 implies both lengths are > 0 and precision + recall > 0,
    # so no further zero-division guard is needed.
    precision = lcs_len / cand_len
    recall = lcs_len / ref_len
    return 2 * (precision * recall) / (precision + recall)


def rouge_l_precision(candidate: str, reference: str) -> float:
    """Compute ROUGE-L precision (LCS length / candidate length).

    Args:
        candidate: The generated text.
        reference: The reference text.

    Returns:
        ROUGE-L precision in [0.0, 1.0]; 0.0 when either text has no tokens.
    """
    lcs_len, cand_len, _ = _lcs_stats(candidate, reference)
    return lcs_len / cand_len if lcs_len else 0.0


def rouge_l_recall(candidate: str, reference: str) -> float:
    """Compute ROUGE-L recall (LCS length / reference length).

    Args:
        candidate: The generated text.
        reference: The reference text.

    Returns:
        ROUGE-L recall in [0.0, 1.0]; 0.0 when either text has no tokens.
    """
    lcs_len, _, ref_len = _lcs_stats(candidate, reference)
    return lcs_len / ref_len if lcs_len else 0.0
+Provides persistent storage, querying, comparison, and export of +evaluation results across runs, models, and dataset versions. """ from __future__ import annotations +import json from typing import Any import duckdb @@ -19,12 +20,17 @@ class ResultStore: """Store and query evaluation results in DuckDB. + Supports both in-memory (for tests) and file-backed (for production) + databases. Provides SQL-powered aggregation, cross-run comparison, + filtered queries, and JSON export. + Example:: store = ResultStore(db_path=":memory:") store.store_results("run-1", results) summary = store.get_run_summary("run-1") print(f"Avg F1: {summary.avg_f1:.3f}") + store.export_json("run-1", "/tmp/run-1-results.json") """ def __init__(self, *, db_path: str = ":memory:") -> None: @@ -56,7 +62,9 @@ def _ensure_tables(self) -> None: faithfulness DOUBLE DEFAULT 0, relevance DOUBLE DEFAULT 0, correctness DOUBLE DEFAULT 0, + rouge_l DOUBLE DEFAULT 0, context_utilization DOUBLE DEFAULT 0, + eval_mode VARCHAR DEFAULT 'none', -- Timing latency_ms DOUBLE DEFAULT 0, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP @@ -78,6 +86,7 @@ def _ensure_tables(self) -> None: avg_faithfulness DOUBLE DEFAULT 0, avg_relevance DOUBLE DEFAULT 0, avg_correctness DOUBLE DEFAULT 0, + avg_rouge_l DOUBLE DEFAULT 0, avg_context_utilization DOUBLE DEFAULT 0, avg_latency_ms DOUBLE DEFAULT 0, total_duration_seconds DOUBLE DEFAULT 0, @@ -89,6 +98,9 @@ def _ensure_tables(self) -> None: def store_results(self, run_id: str, results: list[EvalResult]) -> int: """Store evaluation results for a run. + Uses INSERT OR REPLACE for idempotent upserts — re-running + an evaluation on the same samples updates rather than duplicates. + Args: run_id: Unique identifier for this evaluation run. results: List of evaluation results to store. 
@@ -104,8 +116,9 @@ def store_results(self, run_id: str, results: list[EvalResult]) -> int: generated_answer, precision, recall, f1_score, mrr, ndcg, hit_rate, retrieved_count, relevant_count, relevant_retrieved_count, faithfulness, relevance, - correctness, context_utilization, latency_ms, created_at - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + correctness, rouge_l, context_utilization, eval_mode, + latency_ms, created_at + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """, [ result.id, @@ -126,7 +139,9 @@ def store_results(self, run_id: str, results: list[EvalResult]) -> int: result.generation_metrics.faithfulness, result.generation_metrics.relevance, result.generation_metrics.correctness, + result.generation_metrics.rouge_l, result.generation_metrics.context_utilization, + result.generation_metrics.eval_mode.value, result.latency_ms, result.created_at.isoformat(), ], @@ -136,7 +151,9 @@ def store_results(self, run_id: str, results: list[EvalResult]) -> int: return len(results) def get_run_summary(self, run_id: str) -> EvalRunSummary: - """Compute aggregate metrics for a run. + """Compute aggregate metrics for a run via SQL. + + Uses DuckDB's AVG() aggregation — no materializing rows into Python. Args: run_id: The run to summarize. 
@@ -157,6 +174,7 @@ def get_run_summary(self, run_id: str) -> EvalRunSummary: AVG(faithfulness) as avg_faithfulness, AVG(relevance) as avg_relevance, AVG(correctness) as avg_correctness, + AVG(rouge_l) as avg_rouge_l, AVG(context_utilization) as avg_context_utilization, AVG(latency_ms) as avg_latency_ms FROM eval_results @@ -181,13 +199,16 @@ def get_run_summary(self, run_id: str) -> EvalRunSummary: avg_faithfulness=row[7] or 0.0, avg_relevance=row[8] or 0.0, avg_correctness=row[9] or 0.0, - avg_context_utilization=row[10] or 0.0, - avg_latency_ms=row[11] or 0.0, + avg_rouge_l=row[10] or 0.0, + avg_context_utilization=row[11] or 0.0, + avg_latency_ms=row[12] or 0.0, ) def compare_runs(self, run_id_a: str, run_id_b: str) -> dict[str, dict[str, float]]: """Compare two evaluation runs side by side. + Computes delta (candidate - baseline) for every metric. + Args: run_id_a: First run (baseline). run_id_b: Second run (candidate). @@ -209,6 +230,7 @@ def compare_runs(self, run_id_a: str, run_id_b: str) -> dict[str, dict[str, floa "avg_faithfulness", "avg_relevance", "avg_correctness", + "avg_rouge_l", "avg_context_utilization", ] @@ -230,14 +252,27 @@ def query_results( *, min_f1: float | None = None, max_f1: float | None = None, + min_faithfulness: float | None = None, + max_faithfulness: float | None = None, + min_rouge_l: float | None = None, + max_rouge_l: float | None = None, + eval_mode: str | None = None, limit: int = 100, ) -> list[dict[str, Any]]: - """Query results with optional filters. + """Query results with flexible filters. + + Supports filtering on any metric range, evaluation mode, and + result count limit. Args: run_id: Run to query. min_f1: Minimum F1 score filter. max_f1: Maximum F1 score filter. + min_faithfulness: Minimum faithfulness filter. + max_faithfulness: Maximum faithfulness filter. + min_rouge_l: Minimum ROUGE-L filter. + max_rouge_l: Maximum ROUGE-L filter. + eval_mode: Filter by evaluation mode ('llm_judge' or 'heuristic'). 
def export_json(self, run_id: str, file_path: str, *, limit: int = 10000) -> int:
    """Export a run's results and aggregate summary to a JSON file.

    Writes a single JSON document containing the run id, the aggregate
    summary from ``get_run_summary`` and the per-sample rows from
    ``query_results``, suitable for dashboards, reports, or archiving.

    Args:
        run_id: Run to export.
        file_path: Output file path (.json). Written as UTF-8; an existing
            file is overwritten.
        limit: Maximum number of per-sample rows to export. Defaults to
            10000, the cap that was previously hard-coded.

    Returns:
        Number of per-sample results written (at most ``limit``).
    """
    results = self.query_results(run_id, limit=limit)
    summary = self.get_run_summary(run_id)

    # The summary aggregates over the whole run while `results` is capped by
    # `limit`; make the mismatch visible instead of truncating silently.
    # NOTE(review): assumes summary.sample_count is the run's total row
    # count -- confirm against get_run_summary.
    if len(results) < summary.sample_count:
        logger.warning(
            "storage.export_truncated",
            run_id=run_id,
            exported=len(results),
            sample_count=summary.sample_count,
            limit=limit,
        )

    export_data = {
        "run_id": run_id,
        "summary": {
            "status": summary.status.value,
            "sample_count": summary.sample_count,
            "avg_precision": summary.avg_precision,
            "avg_recall": summary.avg_recall,
            "avg_f1": summary.avg_f1,
            "avg_mrr": summary.avg_mrr,
            "avg_ndcg": summary.avg_ndcg,
            "avg_hit_rate": summary.avg_hit_rate,
            "avg_faithfulness": summary.avg_faithfulness,
            "avg_relevance": summary.avg_relevance,
            "avg_correctness": summary.avg_correctness,
            "avg_rouge_l": summary.avg_rouge_l,
            "avg_context_utilization": summary.avg_context_utilization,
            "avg_latency_ms": summary.avg_latency_ms,
        },
        "results": _serialize_results(results),
    }

    # Pin the encoding so the output does not depend on the platform locale.
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(export_data, f, indent=2, default=str)

    logger.info("storage.exported_json", run_id=run_id, count=len(results), path=file_path)
    return len(results)
+ """ + rows = self._conn.execute( + """ + SELECT run_id, COUNT(*) as sample_count, + MIN(created_at) as first_result, + MAX(created_at) as last_result + FROM eval_results + GROUP BY run_id + ORDER BY MIN(created_at) DESC + """ + ).fetchall() + + return [ + { + "run_id": row[0], + "sample_count": row[1], + "first_result": str(row[2]), + "last_result": str(row[3]), + } + for row in rows + ] + def close(self) -> None: """Close the database connection.""" self._conn.close() + + +def _serialize_results(results: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Convert DuckDB result rows to JSON-serializable dicts.""" + serialized = [] + for row in results: + clean: dict[str, Any] = {} + for key, value in row.items(): + clean[key] = str(value) if hasattr(value, "isoformat") else value + serialized.append(clean) + return serialized diff --git a/tests/test_generation_metrics.py b/tests/test_generation_metrics.py index 89d57f4..784c09f 100644 --- a/tests/test_generation_metrics.py +++ b/tests/test_generation_metrics.py @@ -1,9 +1,10 @@ -"""Tests for generation quality metrics (faithfulness, relevance, correctness).""" +"""Tests for generation quality metrics (faithfulness, relevance, correctness, ROUGE-L).""" from __future__ import annotations import pytest +from rageval.core.models import EvalMode from rageval.metrics.generation import GenerationEvaluator @@ -174,6 +175,61 @@ def test_batch_all_metrics_present(self, evaluator: GenerationEvaluator) -> None assert results[0].context_utilization >= 0.0 +class TestRougeL: + """ROUGE-L integration in generation metrics.""" + + def test_rouge_l_with_reference(self, evaluator: GenerationEvaluator) -> None: + """ROUGE-L should be computed when reference is provided.""" + result = evaluator.evaluate( + query="What is RAG?", + generated_answer="RAG combines retrieval with generation for better answers", + retrieved_contexts=["RAG is a technique."], + reference_answer="RAG combines retrieval with generation", + ) + assert 
result.rouge_l > 0.5 + + def test_rouge_l_without_reference(self, evaluator: GenerationEvaluator) -> None: + """ROUGE-L should be 0 when no reference is provided.""" + result = evaluator.evaluate( + query="What is RAG?", + generated_answer="RAG combines retrieval with generation", + retrieved_contexts=["RAG is a technique."], + ) + assert result.rouge_l == 0.0 + + def test_rouge_l_identical(self, evaluator: GenerationEvaluator) -> None: + """Identical answer and reference should give ROUGE-L = 1.0.""" + result = evaluator.evaluate( + query="What is 2+2?", + generated_answer="The answer is four", + retrieved_contexts=["Basic arithmetic."], + reference_answer="The answer is four", + ) + assert result.rouge_l == 1.0 + + +class TestEvalMode: + """Test evaluation mode tracking.""" + + def test_heuristic_mode_tracked(self, evaluator: GenerationEvaluator) -> None: + """Heuristic evaluator should report its mode.""" + result = evaluator.evaluate( + query="What is RAG?", + generated_answer="RAG is a technique", + retrieved_contexts=["RAG is a technique."], + ) + assert result.eval_mode == EvalMode.HEURISTIC + + def test_mode_property(self, evaluator: GenerationEvaluator) -> None: + """Mode property should reflect configuration.""" + assert evaluator.mode == EvalMode.HEURISTIC + + def test_llm_mode_when_configured(self) -> None: + """Evaluator with API key should report LLM mode.""" + llm_eval = GenerationEvaluator(api_key="test-key-123") + assert llm_eval.mode == EvalMode.LLM_JUDGE + + class TestEdgeCases: """Edge cases for generation evaluation.""" @@ -185,6 +241,7 @@ def test_empty_answer(self, evaluator: GenerationEvaluator) -> None: ) assert result.faithfulness == 0.0 assert result.relevance == 0.0 + assert result.rouge_l == 0.0 def test_empty_query(self, evaluator: GenerationEvaluator) -> None: result = evaluator.evaluate( diff --git a/tests/test_models.py b/tests/test_models.py index a6da5b2..0acb47a 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -7,6 
+7,7 @@ from rageval.core.config import EvalConfig, LLMProviderConfig, RetrieverConfig from rageval.core.models import ( + EvalMode, EvalResult, EvalRunSummary, EvalSample, @@ -73,16 +74,22 @@ def test_default_values(self) -> None: assert metrics.faithfulness == 0.0 assert metrics.relevance == 0.0 assert metrics.correctness == 0.0 + assert metrics.rouge_l == 0.0 assert metrics.context_utilization == 0.0 + assert metrics.eval_mode == EvalMode.NONE def test_valid_scores(self) -> None: metrics = GenerationMetrics( faithfulness=0.9, relevance=0.85, correctness=0.7, + rouge_l=0.8, context_utilization=0.6, + eval_mode=EvalMode.HEURISTIC, ) assert metrics.faithfulness == 0.9 + assert metrics.rouge_l == 0.8 + assert metrics.eval_mode == EvalMode.HEURISTIC def test_rejects_out_of_range(self) -> None: with pytest.raises(ValidationError): @@ -92,6 +99,19 @@ def test_rejects_negative(self) -> None: with pytest.raises(ValidationError): GenerationMetrics(relevance=-0.1) + def test_rejects_rouge_l_out_of_range(self) -> None: + with pytest.raises(ValidationError): + GenerationMetrics(rouge_l=1.5) + + +class TestEvalMode: + """Test EvalMode enum.""" + + def test_values(self) -> None: + assert EvalMode.LLM_JUDGE == "llm_judge" + assert EvalMode.HEURISTIC == "heuristic" + assert EvalMode.NONE == "none" + class TestEvalSample: """Test EvalSample model.""" diff --git a/tests/test_rouge.py b/tests/test_rouge.py new file mode 100644 index 0000000..7929a9d --- /dev/null +++ b/tests/test_rouge.py @@ -0,0 +1,110 @@ +"""Tests for ROUGE-L metric implementation.""" + +from __future__ import annotations + +import pytest + +from rageval.metrics.rouge import _lcs_length, rouge_l_precision, rouge_l_recall, rouge_l_score + + +class TestLCSLength: + """Test the Longest Common Subsequence computation.""" + + def test_identical_sequences(self) -> None: + assert _lcs_length(["a", "b", "c"], ["a", "b", "c"]) == 3 + + def test_no_overlap(self) -> None: + assert _lcs_length(["a", "b"], ["c", "d"]) == 0 + 
+ def test_partial_overlap(self) -> None: + assert _lcs_length(["a", "b", "c", "d"], ["a", "c", "e"]) == 2 + + def test_subsequence_not_substring(self) -> None: + """LCS finds non-contiguous matches.""" + assert _lcs_length(["the", "cat", "sat", "on", "mat"], ["the", "on", "mat"]) == 3 + + def test_empty_first(self) -> None: + assert _lcs_length([], ["a", "b"]) == 0 + + def test_empty_second(self) -> None: + assert _lcs_length(["a", "b"], []) == 0 + + def test_both_empty(self) -> None: + assert _lcs_length([], []) == 0 + + def test_single_match(self) -> None: + assert _lcs_length(["hello"], ["hello"]) == 1 + + def test_single_no_match(self) -> None: + assert _lcs_length(["hello"], ["world"]) == 0 + + +class TestRougeL: + """Test ROUGE-L F1 score.""" + + def test_identical_text(self) -> None: + score = rouge_l_score("the cat sat on the mat", "the cat sat on the mat") + assert score == 1.0 + + def test_completely_different(self) -> None: + score = rouge_l_score("hello world", "goodbye universe") + assert score == 0.0 + + def test_partial_overlap_preserves_order(self) -> None: + """ROUGE-L should reward matching word order.""" + score = rouge_l_score( + "RAG combines retrieval with generation for better answers", + "RAG combines retrieval with generation", + ) + assert score > 0.7 + + def test_reordered_text_scores_lower(self) -> None: + """Reordered words should score lower than ordered matches.""" + ordered = rouge_l_score("the cat sat on the mat", "the cat sat on the mat") + reordered = rouge_l_score("mat the on sat cat the", "the cat sat on the mat") + assert ordered > reordered + + def test_empty_candidate(self) -> None: + assert rouge_l_score("", "some reference") == 0.0 + + def test_empty_reference(self) -> None: + assert rouge_l_score("some candidate", "") == 0.0 + + def test_both_empty(self) -> None: + assert rouge_l_score("", "") == 0.0 + + def test_case_insensitive(self) -> None: + score = rouge_l_score("The Cat SAT", "the cat sat") + assert score == 1.0 + 
+ def test_known_value(self) -> None: + """Verify against a hand-computed ROUGE-L score. + + Candidate: "the cat on the mat" (5 tokens) + Reference: "the cat sat on the mat" (6 tokens) + LCS: "the cat on the mat" = 5 + P = 5/5 = 1.0, R = 5/6 = 0.833 + F1 = 2 * (1.0 * 0.833) / (1.0 + 0.833) = 0.909 + """ + score = rouge_l_score("the cat on the mat", "the cat sat on the mat") + assert score == pytest.approx(0.909, abs=0.01) + + +class TestRougeLPrecisionRecall: + """Test precision and recall separately.""" + + def test_precision_perfect(self) -> None: + """Every candidate word is in the reference subsequence.""" + p = rouge_l_precision("the cat", "the cat sat on the mat") + assert p == 1.0 + + def test_recall_perfect(self) -> None: + """Every reference word is in the candidate subsequence.""" + r = rouge_l_recall("the cat sat on the mat and more", "the cat sat on the mat") + assert r == 1.0 + + def test_precision_empty(self) -> None: + assert rouge_l_precision("", "reference") == 0.0 + + def test_recall_empty(self) -> None: + assert rouge_l_recall("candidate", "") == 0.0 diff --git a/tests/test_storage.py b/tests/test_storage.py index 49776a2..ab67eba 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -2,6 +2,8 @@ from __future__ import annotations +from pathlib import Path + import pytest from rageval.core.models import ( @@ -162,3 +164,58 @@ def test_query_with_limit(self, store: ResultStore) -> None: def test_query_empty_run(self, store: ResultStore) -> None: results = store.query_results("nonexistent") assert len(results) == 0 + + def test_query_by_faithfulness(self, store: ResultStore) -> None: + results = [ + _make_result("s1", faithfulness=0.2), + _make_result("s2", faithfulness=0.8), + _make_result("s3", faithfulness=0.9), + ] + store.store_results("run-1", results) + filtered = store.query_results("run-1", min_faithfulness=0.5) + assert len(filtered) == 2 + + +class TestExportJson: + """Test JSON export functionality.""" + + def 
test_export_creates_file(self, store: ResultStore, tmp_path: Path) -> None: + """Export should create a valid JSON file.""" + import json + + results = [_make_result("s1"), _make_result("s2")] + store.store_results("run-1", results) + + out_path = str(tmp_path / "export.json") + count = store.export_json("run-1", out_path) + assert count == 2 + + with open(out_path) as f: + data = json.load(f) + + assert data["run_id"] == "run-1" + assert data["summary"]["sample_count"] == 2 + assert len(data["results"]) == 2 + + def test_export_empty_run(self, store: ResultStore, tmp_path: Path) -> None: + """Exporting a nonexistent run should create a file with 0 results.""" + out_path = str(tmp_path / "empty.json") + count = store.export_json("nonexistent", out_path) + assert count == 0 + + +class TestListRuns: + """Test listing all runs.""" + + def test_list_runs(self, store: ResultStore) -> None: + store.store_results("run-a", [_make_result("s1")]) + store.store_results("run-b", [_make_result("s2"), _make_result("s3")]) + + runs = store.list_runs() + assert len(runs) == 2 + run_ids = {r["run_id"] for r in runs} + assert run_ids == {"run-a", "run-b"} + + def test_list_runs_empty(self, store: ResultStore) -> None: + runs = store.list_runs() + assert len(runs) == 0