11 changes: 8 additions & 3 deletions pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "rageval"
version = "0.1.0"
version = "0.2.0"
description = "Production-grade RAG evaluation framework with retrieval and generation metrics"
readme = "README.md"
license = { text = "MIT" }
@@ -22,19 +22,24 @@ dependencies = [
    "pydantic>=2.0",
    "structlog>=23.0",
    "duckdb>=0.9",
]

[project.optional-dependencies]
llm = [
"llama-index-core>=0.11",
"llama-index-llms-openai>=0.2",
"llama-index-embeddings-openai>=0.2",
]

dev = [
"pytest>=8.0",
"pytest-asyncio>=0.23",
"pytest-cov>=4.0",
"ruff>=0.4",
"mypy>=1.9",
]
all = [
"rageval[llm,dev]",
]

[tool.hatch.build.targets.wheel]
packages = ["src/rageval"]
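The new `llm` extra keeps LlamaIndex out of the base install, so rageval has to detect at runtime whether the LLM-as-Judge path is usable and otherwise fall back to heuristics. A minimal sketch of such a guard, assuming the check targets `llama_index.core`; the actual detection code is not part of this diff:

def llm_judge_available() -> bool:
    """Return True when the optional LlamaIndex dependencies are importable."""
    try:
        # Present only after: pip install "rageval[llm]"
        import llama_index.core  # noqa: F401
    except ImportError:
        return False
    return True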
11 changes: 7 additions & 4 deletions src/rageval/__init__.py
@@ -1,8 +1,11 @@
"""rageval - Production-grade RAG evaluation framework.

-Evaluates Retrieval-Augmented Generation pipelines with retrieval metrics
-(precision, recall, F1, MRR, NDCG) and generation quality metrics
-(faithfulness, relevance, answer correctness) using LlamaIndex.
+Evaluates Retrieval-Augmented Generation pipelines with:
+- Retrieval metrics: precision, recall, F1, MRR, NDCG, hit rate
+- Generation metrics: faithfulness, relevance, correctness, ROUGE-L
+- Dual evaluation: LLM-as-Judge (LlamaIndex) + heuristic fallback
+- DuckDB storage: persistent results, cross-run comparison, JSON export
+- CI/CD gate: threshold-based regression detection
"""

__version__ = "0.1.0"
__version__ = "0.2.0"
2 changes: 2 additions & 0 deletions src/rageval/core/__init__.py
@@ -2,6 +2,7 @@

from rageval.core.config import EvalConfig, LLMProviderConfig
from rageval.core.models import (
    EvalMode,
    EvalResult,
    EvalSample,
    GenerationMetrics,
@@ -10,6 +11,7 @@

__all__ = [
    "EvalConfig",
    "EvalMode",
    "EvalResult",
    "EvalSample",
    "GenerationMetrics",
19 changes: 19 additions & 0 deletions src/rageval/core/models.py
@@ -22,6 +22,14 @@ class EvalStatus(StrEnum):
    FAILED = "failed"


class EvalMode(StrEnum):
    """Which evaluation mode produced the generation metrics."""

    LLM_JUDGE = "llm_judge"
    HEURISTIC = "heuristic"
    NONE = "none"


class RetrievalMetrics(BaseModel):
    """Retrieval quality metrics for a single sample.

@@ -95,12 +103,22 @@ class GenerationMetrics(BaseModel):
        le=1.0,
        description="Semantic similarity to the reference answer",
    )
    rouge_l: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description="ROUGE-L F1 score vs reference (word-order-aware similarity)",
    )
    context_utilization: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description="How well the answer uses relevant retrieved context",
    )
    eval_mode: EvalMode = Field(
        default=EvalMode.NONE,
        description="Which evaluation mode produced these metrics",
    )


class EvalSample(BaseModel):
@@ -171,6 +189,7 @@ class EvalRunSummary(BaseModel):
    avg_faithfulness: float = Field(default=0.0, ge=0.0, le=1.0)
    avg_relevance: float = Field(default=0.0, ge=0.0, le=1.0)
    avg_correctness: float = Field(default=0.0, ge=0.0, le=1.0)
    avg_rouge_l: float = Field(default=0.0, ge=0.0, le=1.0)
    avg_context_utilization: float = Field(default=0.0, ge=0.0, le=1.0)
    # Timing
    avg_latency_ms: float = Field(default=0.0, ge=0.0)
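The extended model can be exercised directly. A small sketch using only fields visible in this diff (the values are made up, and the remaining GenerationMetrics fields are assumed to keep their defaults, as the diff suggests):

from rageval.core.models import EvalMode, GenerationMetrics

metrics = GenerationMetrics(
    rouge_l=0.47,                  # word-order-aware similarity vs reference
    context_utilization=0.61,
    eval_mode=EvalMode.HEURISTIC,  # records that no LLM judge was used
)
# pydantic enforces the declared bounds: rouge_l=1.2 raises a ValidationError.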
7 changes: 5 additions & 2 deletions src/rageval/metrics/__init__.py
@@ -1,13 +1,16 @@
"""Metrics computation for RAG evaluation.

-Provides retrieval metrics (precision, recall, F1, MRR, NDCG)
-and generation quality metrics (faithfulness, relevance, correctness).
+Provides retrieval metrics (precision, recall, F1, MRR, NDCG),
+generation quality metrics (faithfulness, relevance, correctness),
+and ROUGE-L (word-order-aware text similarity).
"""

from rageval.metrics.generation import GenerationEvaluator
from rageval.metrics.retrieval import RetrievalEvaluator
from rageval.metrics.rouge import rouge_l_score

__all__ = [
"GenerationEvaluator",
"RetrievalEvaluator",
"rouge_l_score",
]
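The newly exported `rouge_l_score` is described above as an LCS-based F1. Its implementation is not shown in this diff; a minimal word-level sketch of the standard formulation (precision = LCS/|candidate|, recall = LCS/|reference|):

def rouge_l_score(candidate: str, reference: str) -> float:
    """ROUGE-L F1 between two texts via word-level longest common subsequence."""
    cand, ref = candidate.lower().split(), reference.lower().split()
    if not cand or not ref:
        return 0.0
    # Dynamic-programming table: dp[i][j] = LCS length of cand[:i] and ref[:j].
    dp = [[0] * (len(ref) + 1) for _ in range(len(cand) + 1)]
    for i, c in enumerate(cand, 1):
        for j, r in enumerate(ref, 1):
            dp[i][j] = dp[i - 1][j - 1] + 1 if c == r else max(dp[i - 1][j], dp[i][j - 1])
    lcs = dp[-1][-1]
    precision, recall = lcs / len(cand), lcs / len(ref)
    return 2 * precision * recall / (precision + recall) if precision + recall else 0.0

For example, rouge_l_score("the cat sat", "the cat sat down") gives precision 1.0, recall 0.75, and F1 of roughly 0.857.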