diff --git a/pyproject.toml b/pyproject.toml
index e63930e..c5c7901 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "rageval"
-version = "0.1.0"
+version = "0.2.0"
 description = "Production-grade RAG evaluation framework with retrieval and generation metrics"
 readme = "README.md"
 license = { text = "MIT" }
@@ -22,12 +22,14 @@ dependencies = [
     "pydantic>=2.0",
     "structlog>=23.0",
     "duckdb>=0.9",
+]
+
+[project.optional-dependencies]
+llm = [
     "llama-index-core>=0.11",
     "llama-index-llms-openai>=0.2",
     "llama-index-embeddings-openai>=0.2",
 ]
-
-[project.optional-dependencies]
 dev = [
     "pytest>=8.0",
     "pytest-asyncio>=0.23",
@@ -35,6 +37,9 @@ dev = [
     "ruff>=0.4",
     "mypy>=1.9",
 ]
+all = [
+    "rageval[llm,dev]",
+]
 
 [tool.hatch.build.targets.wheel]
 packages = ["src/rageval"]
diff --git a/src/rageval/__init__.py b/src/rageval/__init__.py
index 7a782e8..d4fd634 100644
--- a/src/rageval/__init__.py
+++ b/src/rageval/__init__.py
@@ -1,8 +1,11 @@
 """rageval - Production-grade RAG evaluation framework.
 
-Evaluates Retrieval-Augmented Generation pipelines with retrieval metrics
-(precision, recall, F1, MRR, NDCG) and generation quality metrics
-(faithfulness, relevance, answer correctness) using LlamaIndex.
+Evaluates Retrieval-Augmented Generation pipelines with:
+- Retrieval metrics: precision, recall, F1, MRR, NDCG, hit rate
+- Generation metrics: faithfulness, relevance, correctness, ROUGE-L
+- Dual evaluation: LLM-as-Judge (LlamaIndex) + heuristic fallback
+- DuckDB storage: persistent results, cross-run comparison, JSON export
+- CI/CD gate: threshold-based regression detection
 """
 
-__version__ = "0.1.0"
+__version__ = "0.2.0"
diff --git a/src/rageval/core/__init__.py b/src/rageval/core/__init__.py
index 3ab1559..48955f8 100644
--- a/src/rageval/core/__init__.py
+++ b/src/rageval/core/__init__.py
@@ -2,6 +2,7 @@
 
 from rageval.core.config import EvalConfig, LLMProviderConfig
 from rageval.core.models import (
+    EvalMode,
     EvalResult,
     EvalSample,
     GenerationMetrics,
@@ -10,6 +11,7 @@
 
 __all__ = [
     "EvalConfig",
+    "EvalMode",
     "EvalResult",
     "EvalSample",
     "GenerationMetrics",
diff --git a/src/rageval/core/models.py b/src/rageval/core/models.py
index 999d0ab..9121f99 100644
--- a/src/rageval/core/models.py
+++ b/src/rageval/core/models.py
@@ -22,6 +22,14 @@ class EvalStatus(StrEnum):
     FAILED = "failed"
 
 
+class EvalMode(StrEnum):
+    """Which evaluation mode produced the generation metrics."""
+
+    LLM_JUDGE = "llm_judge"  # set on results built by the LLM-as-Judge path
+    HEURISTIC = "heuristic"  # set on results built by the heuristic path
+    NONE = "none"  # field default of GenerationMetrics.eval_mode
+
+
 class RetrievalMetrics(BaseModel):
     """Retrieval quality metrics for a single sample.
 
@@ -95,12 +103,22 @@ class GenerationMetrics(BaseModel):
         le=1.0,
         description="Semantic similarity to the reference answer",
     )
+    rouge_l: float = Field(
+        default=0.0,
+        ge=0.0,
+        le=1.0,
+        description="ROUGE-L F1 score vs reference (word-order-aware similarity)",
+    )
     context_utilization: float = Field(
         default=0.0,
         ge=0.0,
         le=1.0,
         description="How well the answer uses relevant retrieved context",
     )
+    eval_mode: EvalMode = Field(
+        default=EvalMode.NONE,
+        description="Which evaluation mode produced these metrics",
+    )
 
 
 class EvalSample(BaseModel):
@@ -171,6 +189,7 @@ class EvalRunSummary(BaseModel):
     avg_faithfulness: float = Field(default=0.0, ge=0.0, le=1.0)
     avg_relevance: float = Field(default=0.0, ge=0.0, le=1.0)
     avg_correctness: float = Field(default=0.0, ge=0.0, le=1.0)
+    avg_rouge_l: float = Field(default=0.0, ge=0.0, le=1.0)
     avg_context_utilization: float = Field(default=0.0, ge=0.0, le=1.0)
     # Timing
     avg_latency_ms: float = Field(default=0.0, ge=0.0)
diff --git a/src/rageval/metrics/__init__.py b/src/rageval/metrics/__init__.py
index 6c89ebf..6477b7f 100644
--- a/src/rageval/metrics/__init__.py
+++ b/src/rageval/metrics/__init__.py
@@ -1,13 +1,16 @@
 """Metrics computation for RAG evaluation.
 
-Provides retrieval metrics (precision, recall, F1, MRR, NDCG)
-and generation quality metrics (faithfulness, relevance, correctness).
+Provides retrieval metrics (precision, recall, F1, MRR, NDCG),
+generation quality metrics (faithfulness, relevance, correctness),
+and ROUGE-L (word-order-aware text similarity).
 """
 
 from rageval.metrics.generation import GenerationEvaluator
 from rageval.metrics.retrieval import RetrievalEvaluator
+from rageval.metrics.rouge import rouge_l_score
 
 __all__ = [
     "GenerationEvaluator",
     "RetrievalEvaluator",
+    "rouge_l_score",
 ]
diff --git a/src/rageval/metrics/generation.py b/src/rageval/metrics/generation.py
index 5bdc69b..3b266f4 100644
--- a/src/rageval/metrics/generation.py
+++ b/src/rageval/metrics/generation.py
@@ -1,18 +1,20 @@
-"""Generation quality metrics using LLM-as-Judge.
+"""Generation quality metrics using LLM-as-Judge or heuristic fallback.
 
-Evaluates the quality of RAG-generated answers using LlamaIndex's
-evaluation modules. Measures faithfulness (grounding in context),
-relevance (addressing the query), and correctness (vs. reference answer).
+Evaluates the quality of RAG-generated answers using either:
+1. LLM-as-Judge (LlamaIndex evaluators) — accurate, ~2s/sample
+2. Heuristic fallback (ROUGE-L + Jaccard + context overlap) — fast, free
 
-When no LLM is configured, falls back to deterministic heuristics
-(token overlap, embedding similarity proxies).
+When no LLM is configured or LlamaIndex is not installed, falls back
+to heuristic mode automatically. The evaluation mode is tracked in
+every GenerationMetrics result via the eval_mode field.
 """
 
 from __future__ import annotations
 
 import structlog
 
-from rageval.core.models import GenerationMetrics
+from rageval.core.models import EvalMode, GenerationMetrics
+from rageval.metrics.rouge import rouge_l_score
 
 logger = structlog.get_logger(__name__)
 
@@ -20,6 +22,13 @@
 class GenerationEvaluator:
     """Evaluate generation quality with LLM judge or heuristic fallback.
 
+    The evaluator automatically selects the best available mode:
+    - If an API key is provided and LlamaIndex is installed: LLM-as-Judge
+    - Otherwise: heuristic mode (ROUGE-L + Jaccard + context utilization)
+
+    Every result includes an ``eval_mode`` field so you always know
+    which mode produced the scores.
+
     Example::
 
         evaluator = GenerationEvaluator()
@@ -30,19 +39,26 @@ class GenerationEvaluator:
             reference_answer="Retrieval-Augmented Generation...",
         )
         print(f"Faithfulness: {metrics.faithfulness:.2f}")
+        print(f"ROUGE-L: {metrics.rouge_l:.2f}")
+        print(f"Mode: {metrics.eval_mode}")
     """
 
     def __init__(self, *, judge_model: str = "gpt-4o", api_key: str = "") -> None:
         """Initialize the generation evaluator.
 
         Args:
-            judge_model: LLM model to use as judge.
-            api_key: API key for the LLM provider.
+            judge_model: LLM model to use as judge (when LLM mode is active).
+            api_key: API key for the LLM provider. If empty, uses heuristic mode.
         """
         self._judge_model = judge_model
         self._api_key = api_key
         self._llm_available = bool(api_key)
 
+    @property
+    def mode(self) -> EvalMode:
+        """Return the configured mode: LLM judge if an API key was given, else heuristic."""
+        return EvalMode.LLM_JUDGE if self._llm_available else EvalMode.HEURISTIC
+
     def evaluate(
         self,
         *,
@@ -53,6 +69,9 @@ def evaluate(
     ) -> GenerationMetrics:
         """Evaluate a single generated answer.
 
+        Uses LLM-as-Judge when an API key is configured, else heuristics.
+        Falls back to heuristics only if LlamaIndex cannot be imported.
+
         Args:
             query: The original user query.
             generated_answer: The RAG pipeline's answer.
@@ -60,7 +79,7 @@ def evaluate(
             reference_answer: Ground truth answer for correctness scoring.
 
         Returns:
-            GenerationMetrics with all quality scores.
+            GenerationMetrics with all quality scores and eval_mode.
         """
         if self._llm_available:
             return self._evaluate_with_llm(
@@ -105,6 +124,10 @@ def evaluate_batch(
             results.append(metrics)
         return results
 
+    # ------------------------------------------------------------------
+    # LLM-as-Judge mode
+    # ------------------------------------------------------------------
+
     def _evaluate_with_llm(
         self,
         *,
@@ -115,8 +138,9 @@ def _evaluate_with_llm(
     ) -> GenerationMetrics:
         """Use LlamaIndex LLM evaluators for quality assessment.
 
-        This uses LlamaIndex's FaithfulnessEvaluator, RelevancyEvaluator,
-        and CorrectnessEvaluator under the hood.
+        Each evaluator is called independently so a failure in one
+        metric doesn't lose the others. Falls back to heuristic
+        for the entire sample only if LlamaIndex is not installed.
         """
         try:
             from llama_index.core.evaluation import (
@@ -125,47 +149,6 @@ def _evaluate_with_llm(
                 RelevancyEvaluator,
             )
             from llama_index.llms.openai import OpenAI
-
-            llm = OpenAI(model=self._judge_model, api_key=self._api_key)
-
-            # Faithfulness: is the answer grounded in the context?
-            faith_eval = FaithfulnessEvaluator(llm=llm)
-            faith_result = faith_eval.evaluate(
-                query=query,
-                response=generated_answer,
-                contexts=retrieved_contexts,
-            )
-            faithfulness = 1.0 if faith_result.passing else 0.0
-
-            # Relevancy: does the answer address the query?
-            rel_eval = RelevancyEvaluator(llm=llm)
-            rel_result = rel_eval.evaluate(
-                query=query,
-                response=generated_answer,
-                contexts=retrieved_contexts,
-            )
-            relevance = 1.0 if rel_result.passing else 0.0
-
-            # Correctness: semantic match to reference answer
-            correctness = 0.0
-            if reference_answer:
-                corr_eval = CorrectnessEvaluator(llm=llm)
-                corr_result = corr_eval.evaluate(
-                    query=query,
-                    response=generated_answer,
-                    reference=reference_answer,
-                )
-                correctness = (corr_result.score or 0.0) / 5.0  # Normalize to 0-1
-
-            context_util = self._context_utilization(generated_answer, retrieved_contexts)
-
-            return GenerationMetrics(
-                faithfulness=faithfulness,
-                relevance=relevance,
-                correctness=correctness,
-                context_utilization=context_util,
-            )
-
         except ImportError:
             logger.warning("generation.llm_unavailable", msg="LlamaIndex not installed")
             return self._evaluate_with_heuristics(
@@ -174,14 +157,108 @@ def _evaluate_with_llm(
                 retrieved_contexts=retrieved_contexts,
                 reference_answer=reference_answer,
             )
+
+        llm = OpenAI(model=self._judge_model, api_key=self._api_key)
+
+        # Faithfulness: is the answer grounded in context?
+        faithfulness = self._llm_faithfulness(
+            llm, FaithfulnessEvaluator, query, generated_answer, retrieved_contexts
+        )
+
+        # Relevance: does the answer address the query?
+        relevance = self._llm_relevance(
+            llm, RelevancyEvaluator, query, generated_answer, retrieved_contexts
+        )
+
+        # Correctness: semantic match to reference answer
+        correctness = self._llm_correctness(
+            llm, CorrectnessEvaluator, query, generated_answer, reference_answer
+        )
+
+        # ROUGE-L: always computed (deterministic, free)
+        rouge_l = rouge_l_score(generated_answer, reference_answer) if reference_answer else 0.0
+
+        # Context utilization: always computed (deterministic)
+        context_util = self._context_utilization(generated_answer, retrieved_contexts)
+
+        return GenerationMetrics(
+            faithfulness=faithfulness,
+            relevance=relevance,
+            correctness=correctness,
+            rouge_l=rouge_l,
+            context_utilization=context_util,
+            eval_mode=EvalMode.LLM_JUDGE,
+        )
+
+    @staticmethod
+    def _llm_faithfulness(
+        llm: object,
+        evaluator_cls: type,
+        query: str,
+        answer: str,
+        contexts: list[str],
+    ) -> float:
+        """Return 1.0 when the judge result is ``passing``; 0.0 otherwise or on error."""
+        try:
+            evaluator = evaluator_cls(llm=llm)
+            result = evaluator.evaluate(
+                query=query,
+                response=answer,
+                contexts=contexts,
+            )
+            return 1.0 if result.passing else 0.0
         except Exception:
-            logger.exception("generation.llm_eval_failed")
-            return self._evaluate_with_heuristics(
+            logger.exception("generation.llm_faithfulness_failed")
+            return 0.0
+
+    @staticmethod
+    def _llm_relevance(
+        llm: object,
+        evaluator_cls: type,
+        query: str,
+        answer: str,
+        contexts: list[str],
+    ) -> float:
+        """Return 1.0 when the judge result is ``passing``; 0.0 otherwise or on error."""
+        try:
+            evaluator = evaluator_cls(llm=llm)
+            result = evaluator.evaluate(
                 query=query,
-                generated_answer=generated_answer,
-                retrieved_contexts=retrieved_contexts,
-                reference_answer=reference_answer,
+                response=answer,
+                contexts=contexts,
+            )
+            return 1.0 if result.passing else 0.0
+        except Exception:
+            logger.exception("generation.llm_relevance_failed")
+            return 0.0
+
+    @staticmethod
+    def _llm_correctness(
+        llm: object,
+        evaluator_cls: type,
+        query: str,
+        answer: str,
+        reference: str,
+    ) -> float:
+        """Return the judge score scaled into [0.0, 1.0]; 0.0 without a reference or on error."""
+        if not reference:
+            return 0.0
+        try:
+            evaluator = evaluator_cls(llm=llm)
+            result = evaluator.evaluate(
+                query=query,
+                response=answer,
+                reference=reference,
             )
+            score = result.score or 0.0
+            return max(0.0, min(score / 5.0, 1.0))  # Normalize 0-5 → 0-1, clamped both ways
+        except Exception:
+            logger.exception("generation.llm_correctness_failed")
+            return 0.0
+
+    # ------------------------------------------------------------------
+    # Heuristic mode
+    # ------------------------------------------------------------------
 
     def _evaluate_with_heuristics(
         self,
@@ -191,10 +268,14 @@ def _evaluate_with_heuristics(
         retrieved_contexts: list[str],
         reference_answer: str,
     ) -> GenerationMetrics:
-        """Deterministic heuristic evaluation using token overlap.
-
-        Used when no LLM judge is configured. Provides rough proxy
-        metrics based on word overlap analysis.
+        """Deterministic heuristic evaluation using ROUGE-L and token overlap.
+
+        Used when no LLM judge is configured. Provides proxy metrics:
+        - Faithfulness: Jaccard overlap between answer and context tokens
+        - Relevance: Jaccard overlap between answer and query tokens
+        - Correctness: Jaccard overlap between answer and reference tokens
+        - ROUGE-L: Longest Common Subsequence F1 vs reference (order-aware)
+        - Context utilization: fraction of contexts with >20% token overlap
         """
         answer_tokens = set(generated_answer.lower().split())
 
@@ -213,6 +294,9 @@ def _evaluate_with_heuristics(
             ref_tokens = set(reference_answer.lower().split())
             correctness = self._token_overlap(answer_tokens, ref_tokens)
 
+        # ROUGE-L: order-aware similarity vs reference
+        rouge_l = rouge_l_score(generated_answer, reference_answer) if reference_answer else 0.0
+
         # Context utilization
         context_util = self._context_utilization(generated_answer, retrieved_contexts)
 
@@ -220,12 +304,21 @@ def _evaluate_with_heuristics(
             faithfulness=min(faithfulness, 1.0),
             relevance=min(relevance, 1.0),
             correctness=min(correctness, 1.0),
+            rouge_l=min(rouge_l, 1.0),
             context_utilization=min(context_util, 1.0),
+            eval_mode=EvalMode.HEURISTIC,
         )
 
+    # ------------------------------------------------------------------
+    # Shared utilities
+    # ------------------------------------------------------------------
+
     @staticmethod
     def _token_overlap(set_a: set[str], set_b: set[str]) -> float:
-        """Compute Jaccard-like overlap between two token sets."""
+        """Compute Jaccard similarity between two token sets.
+
+        Jaccard = |A & B| / |A | B|
+        """
         if not set_a or not set_b:
             return 0.0
         intersection = set_a & set_b
@@ -234,7 +327,12 @@ def _token_overlap(set_a: set[str], set_b: set[str]) -> float:
 
     @staticmethod
     def _context_utilization(answer: str, contexts: list[str]) -> float:
-        """Measure how much of the retrieved context is used in the answer."""
+        """Measure what fraction of retrieved contexts contributed to the answer.
+
+        A context is considered "utilized" if >20% of its tokens appear
+        in the generated answer. This threshold is empirical — adjust
+        per use case.
+        """
         if not contexts or not answer:
             return 0.0
 
diff --git a/src/rageval/metrics/rouge.py b/src/rageval/metrics/rouge.py
new file mode 100644
index 0000000..b4b5d11
--- /dev/null
+++ b/src/rageval/metrics/rouge.py
@@ -0,0 +1,133 @@
+"""ROUGE-L metric implementation (pure Python, zero dependencies).
+
+Computes ROUGE-L (Longest Common Subsequence) score between two texts.
+This is a standard NLP metric for measuring text similarity that captures
+word ordering — unlike Jaccard which treats text as unordered bags of words.
+
+Reference: Lin, C.Y. (2004). ROUGE: A Package for Automatic Evaluation
+of Summaries. ACL Workshop on Text Summarization.
+"""
+
+from __future__ import annotations
+
+
+def _lcs_length(seq_a: list[str], seq_b: list[str]) -> int:
+    """Compute length of the Longest Common Subsequence.
+
+    Uses two rolling rows of size min(m, n) + 1 instead of a full m*n table.
+
+    Args:
+        seq_a: First token sequence.
+        seq_b: Second token sequence.
+
+    Returns:
+        Length of the LCS (0 when either sequence is empty).
+    """
+    if not seq_a or not seq_b:
+        return 0
+
+    # LCS is symmetric, so make seq_b the shorter one to keep the rows small
+    if len(seq_a) < len(seq_b):
+        seq_a, seq_b = seq_b, seq_a
+    m = len(seq_b)
+    prev = [0] * (m + 1)
+    curr = [0] * (m + 1)
+
+    for token_a in seq_a:
+        for j, token_b in enumerate(seq_b):
+            if token_a == token_b:
+                curr[j + 1] = prev[j] + 1
+            else:
+                curr[j + 1] = max(curr[j], prev[j + 1])
+        # Swap rows (no realloc): slots 1..m get overwritten, slot 0 stays 0.
+        prev, curr = curr, prev
+
+    return prev[m]
+
+
+def rouge_l_score(candidate: str, reference: str) -> float:
+    """Compute ROUGE-L F1 score between candidate and reference text.
+
+    ROUGE-L uses the Longest Common Subsequence (LCS) to measure
+    similarity while preserving word order information. Tokens are
+    lower-cased, whitespace-separated words; punctuation is kept.
+
+    Args:
+        candidate: The generated text to evaluate.
+        reference: The ground truth reference text.
+
+    Returns:
+        ROUGE-L F1 score in [0.0, 1.0]; 0.0 if either text has no tokens
+        or the two texts share no token.
+
+    Example::
+
+        >>> round(rouge_l_score("the cat sat on the mat", "the cat on the mat"), 3)
+        0.909
+    """
+    if not candidate or not reference:
+        return 0.0
+
+    cand_tokens = candidate.lower().split()
+    ref_tokens = reference.lower().split()
+
+    if not cand_tokens or not ref_tokens:
+        return 0.0
+
+    lcs_len = _lcs_length(cand_tokens, ref_tokens)
+
+    if lcs_len == 0:
+        return 0.0
+
+    precision = lcs_len / len(cand_tokens)
+    recall = lcs_len / len(ref_tokens)
+
+    # lcs_len > 0 guarantees precision + recall > 0, so no extra
+    # zero-denominator guard is needed before the harmonic mean.
+    return 2 * (precision * recall) / (precision + recall)
+
+
+def rouge_l_precision(candidate: str, reference: str) -> float:
+    """Return the share of candidate tokens that lie on the LCS with the reference.
+
+    Args:
+        candidate: The generated text.
+        reference: The reference text.
+
+    Returns:
+        LCS length divided by the candidate token count, in [0.0, 1.0];
+        0.0 when either text is empty or contains only whitespace.
+    """
+    if not (candidate and reference):
+        return 0.0
+
+    hyp_words = candidate.lower().split()
+    gold_words = reference.lower().split()
+    if not (hyp_words and gold_words):
+        return 0.0
+
+    shared = _lcs_length(hyp_words, gold_words)
+    return shared / len(hyp_words)
+
+
+def rouge_l_recall(candidate: str, reference: str) -> float:
+    """Return the share of reference tokens that lie on the LCS with the candidate.
+
+    Args:
+        candidate: The generated text.
+        reference: The reference text.
+
+    Returns:
+        LCS length divided by the reference token count, in [0.0, 1.0];
+        0.0 when either text is empty or contains only whitespace.
+    """
+    if not (candidate and reference):
+        return 0.0
+
+    hyp_words = candidate.lower().split()
+    gold_words = reference.lower().split()
+    if not (hyp_words and gold_words):
+        return 0.0
+
+    shared = _lcs_length(hyp_words, gold_words)
+    return shared / len(gold_words)
diff --git a/src/rageval/pipeline/storage.py b/src/rageval/pipeline/storage.py
index 8c8894d..404d917 100644
--- a/src/rageval/pipeline/storage.py
+++ b/src/rageval/pipeline/storage.py
@@ -1,11 +1,12 @@
 """DuckDB-backed result storage for evaluation runs.
 
-Provides persistent storage, querying, and comparison of evaluation
-results across runs, models, and dataset versions.
+Provides persistent storage, querying, comparison, and export of
+evaluation results across runs, models, and dataset versions.
 """
 
 from __future__ import annotations
 
+import json
 from typing import Any
 
 import duckdb
@@ -19,12 +20,17 @@
 class ResultStore:
     """Store and query evaluation results in DuckDB.
 
+    Supports both in-memory (for tests) and file-backed (for production)
+    databases. Provides SQL-powered aggregation, cross-run comparison,
+    filtered queries, and JSON export.
+
     Example::
 
         store = ResultStore(db_path=":memory:")
         store.store_results("run-1", results)
         summary = store.get_run_summary("run-1")
         print(f"Avg F1: {summary.avg_f1:.3f}")
+        store.export_json("run-1", "/tmp/run-1-results.json")
     """
 
     def __init__(self, *, db_path: str = ":memory:") -> None:
@@ -56,7 +62,9 @@ def _ensure_tables(self) -> None:
                 faithfulness DOUBLE DEFAULT 0,
                 relevance DOUBLE DEFAULT 0,
                 correctness DOUBLE DEFAULT 0,
+                rouge_l DOUBLE DEFAULT 0,
                 context_utilization DOUBLE DEFAULT 0,
+                eval_mode VARCHAR DEFAULT 'none',
                 -- Timing
                 latency_ms DOUBLE DEFAULT 0,
                 created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
@@ -78,6 +86,7 @@ def _ensure_tables(self) -> None:
                 avg_faithfulness DOUBLE DEFAULT 0,
                 avg_relevance DOUBLE DEFAULT 0,
                 avg_correctness DOUBLE DEFAULT 0,
+                avg_rouge_l DOUBLE DEFAULT 0,
                 avg_context_utilization DOUBLE DEFAULT 0,
                 avg_latency_ms DOUBLE DEFAULT 0,
                 total_duration_seconds DOUBLE DEFAULT 0,
@@ -89,6 +98,9 @@ def _ensure_tables(self) -> None:
     def store_results(self, run_id: str, results: list[EvalResult]) -> int:
         """Store evaluation results for a run.
 
+        Uses INSERT OR REPLACE for idempotent upserts — re-running
+        an evaluation on the same samples updates rather than duplicates.
+
         Args:
             run_id: Unique identifier for this evaluation run.
             results: List of evaluation results to store.
@@ -104,8 +116,9 @@ def store_results(self, run_id: str, results: list[EvalResult]) -> int:
                     generated_answer, precision, recall, f1_score,
                     mrr, ndcg, hit_rate, retrieved_count, relevant_count,
                     relevant_retrieved_count, faithfulness, relevance,
-                    correctness, context_utilization, latency_ms, created_at
-                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                    correctness, rouge_l, context_utilization, eval_mode,
+                    latency_ms, created_at
+                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                 """,
                 [
                     result.id,
@@ -126,7 +139,9 @@ def store_results(self, run_id: str, results: list[EvalResult]) -> int:
                     result.generation_metrics.faithfulness,
                     result.generation_metrics.relevance,
                     result.generation_metrics.correctness,
+                    result.generation_metrics.rouge_l,
                     result.generation_metrics.context_utilization,
+                    result.generation_metrics.eval_mode.value,
                     result.latency_ms,
                     result.created_at.isoformat(),
                 ],
@@ -136,7 +151,9 @@ def store_results(self, run_id: str, results: list[EvalResult]) -> int:
         return len(results)
 
     def get_run_summary(self, run_id: str) -> EvalRunSummary:
-        """Compute aggregate metrics for a run.
+        """Compute aggregate metrics for a run via SQL.
+
+        Uses DuckDB's AVG() aggregation — no materializing rows into Python.
 
         Args:
             run_id: The run to summarize.
@@ -157,6 +174,7 @@ def get_run_summary(self, run_id: str) -> EvalRunSummary:
                 AVG(faithfulness) as avg_faithfulness,
                 AVG(relevance) as avg_relevance,
                 AVG(correctness) as avg_correctness,
+                AVG(rouge_l) as avg_rouge_l,
                 AVG(context_utilization) as avg_context_utilization,
                 AVG(latency_ms) as avg_latency_ms
             FROM eval_results
@@ -181,13 +199,16 @@ def get_run_summary(self, run_id: str) -> EvalRunSummary:
             avg_faithfulness=row[7] or 0.0,
             avg_relevance=row[8] or 0.0,
             avg_correctness=row[9] or 0.0,
-            avg_context_utilization=row[10] or 0.0,
-            avg_latency_ms=row[11] or 0.0,
+            avg_rouge_l=row[10] or 0.0,
+            avg_context_utilization=row[11] or 0.0,
+            avg_latency_ms=row[12] or 0.0,
         )
 
     def compare_runs(self, run_id_a: str, run_id_b: str) -> dict[str, dict[str, float]]:
         """Compare two evaluation runs side by side.
 
+        Computes delta (candidate - baseline) for every metric.
+
         Args:
             run_id_a: First run (baseline).
             run_id_b: Second run (candidate).
@@ -209,6 +230,7 @@ def compare_runs(self, run_id_a: str, run_id_b: str) -> dict[str, dict[str, floa
             "avg_faithfulness",
             "avg_relevance",
             "avg_correctness",
+            "avg_rouge_l",
             "avg_context_utilization",
         ]
 
@@ -230,14 +252,27 @@ def query_results(
         *,
         min_f1: float | None = None,
         max_f1: float | None = None,
+        min_faithfulness: float | None = None,
+        max_faithfulness: float | None = None,
+        min_rouge_l: float | None = None,
+        max_rouge_l: float | None = None,
+        eval_mode: str | None = None,
         limit: int = 100,
     ) -> list[dict[str, Any]]:
-        """Query results with optional filters.
+        """Query results with flexible filters.
+
+        Supports filtering on any metric range, evaluation mode, and
+        result count limit.
 
         Args:
             run_id: Run to query.
             min_f1: Minimum F1 score filter.
             max_f1: Maximum F1 score filter.
+            min_faithfulness: Minimum faithfulness filter.
+            max_faithfulness: Maximum faithfulness filter.
+            min_rouge_l: Minimum ROUGE-L filter.
+            max_rouge_l: Maximum ROUGE-L filter.
+            eval_mode: Filter by evaluation mode ('llm_judge', 'heuristic', or 'none').
             limit: Max results to return.
 
         Returns:
@@ -246,12 +281,20 @@ def query_results(
         conditions = ["run_id = ?"]
         params: list[Any] = [run_id]
 
-        if min_f1 is not None:
-            conditions.append("f1_score >= ?")
-            params.append(min_f1)
-        if max_f1 is not None:
-            conditions.append("f1_score <= ?")
-            params.append(max_f1)
+        filter_map: list[tuple[str, str, float | str | None]] = [
+            ("f1_score >= ?", "f1_score >= ?", min_f1),
+            ("f1_score <= ?", "f1_score <= ?", max_f1),
+            ("faithfulness >= ?", "faithfulness >= ?", min_faithfulness),
+            ("faithfulness <= ?", "faithfulness <= ?", max_faithfulness),
+            ("rouge_l >= ?", "rouge_l >= ?", min_rouge_l),
+            ("rouge_l <= ?", "rouge_l <= ?", max_rouge_l),
+            ("eval_mode = ?", "eval_mode = ?", eval_mode),
+        ]
+
+        for condition, _, value in filter_map:
+            if value is not None:
+                conditions.append(condition)
+                params.append(value)
 
         where = " AND ".join(conditions)
         params.append(limit)
@@ -265,6 +308,88 @@ def query_results(
         columns = [d[0] for d in desc] if desc else []
         return [dict(zip(columns, row)) for row in rows]
 
+    def export_json(self, run_id: str, file_path: str) -> int:
+        """Export a run's results and aggregate summary to a single JSON file.
+
+        Args:
+            run_id: Run to export.
+            file_path: Output file path (.json), written as UTF-8.
+
+        Returns:
+            Number of results exported (capped at 10000, with a warning).
+        """
+        max_rows = 10000
+        results = self.query_results(run_id, limit=max_rows)
+        summary = self.get_run_summary(run_id)
+        if len(results) >= max_rows:
+            # Reaching the cap means rows may be missing from the export.
+            logger.warning("storage.export_truncated", run_id=run_id, limit=max_rows)
+
+        export_data = {
+            "run_id": run_id,
+            "summary": {
+                "status": summary.status.value,
+                "sample_count": summary.sample_count,
+                "avg_precision": summary.avg_precision,
+                "avg_recall": summary.avg_recall,
+                "avg_f1": summary.avg_f1,
+                "avg_mrr": summary.avg_mrr,
+                "avg_ndcg": summary.avg_ndcg,
+                "avg_hit_rate": summary.avg_hit_rate,
+                "avg_faithfulness": summary.avg_faithfulness,
+                "avg_relevance": summary.avg_relevance,
+                "avg_correctness": summary.avg_correctness,
+                "avg_rouge_l": summary.avg_rouge_l,
+                "avg_context_utilization": summary.avg_context_utilization,
+                "avg_latency_ms": summary.avg_latency_ms,
+            },
+            "results": _serialize_results(results),
+        }
+
+        with open(file_path, "w", encoding="utf-8") as f:
+            json.dump(export_data, f, indent=2, default=str)
+
+        logger.info("storage.exported_json", run_id=run_id, count=len(results), path=file_path)
+        return len(results)
+
+    def list_runs(self) -> list[dict[str, Any]]:
+        """Enumerate stored runs, ordered by first result time (latest first).
+
+        Returns:
+            One dict per run holding ``run_id``, ``sample_count``, and the
+            ``first_result`` / ``last_result`` timestamps rendered with str().
+        """
+        query = """
+            SELECT run_id, COUNT(*) as sample_count,
+                   MIN(created_at) as first_result,
+                   MAX(created_at) as last_result
+            FROM eval_results
+            GROUP BY run_id
+            ORDER BY MIN(created_at) DESC
+            """
+        rows = self._conn.execute(query).fetchall()
+
+        return [
+            {
+                "run_id": run_id,
+                "sample_count": sample_count,
+                "first_result": str(first_ts),
+                "last_result": str(last_ts),
+            }
+            for run_id, sample_count, first_ts, last_ts in rows
+        ]
+
     def close(self) -> None:
         """Close the database connection."""
         self._conn.close()
+
+
+def _serialize_results(results: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Convert DuckDB rows to JSON-safe dicts (date/time values become ISO-8601)."""
+    serialized = []
+    for row in results:
+        clean: dict[str, Any] = {}
+        for key, value in row.items():
+            clean[key] = value.isoformat() if hasattr(value, "isoformat") else value
+        serialized.append(clean)
+    return serialized
diff --git a/tests/test_generation_metrics.py b/tests/test_generation_metrics.py
index 89d57f4..784c09f 100644
--- a/tests/test_generation_metrics.py
+++ b/tests/test_generation_metrics.py
@@ -1,9 +1,10 @@
-"""Tests for generation quality metrics (faithfulness, relevance, correctness)."""
+"""Tests for generation quality metrics (faithfulness, relevance, correctness, ROUGE-L)."""
 
 from __future__ import annotations
 
 import pytest
 
+from rageval.core.models import EvalMode
 from rageval.metrics.generation import GenerationEvaluator
 
 
@@ -174,6 +175,61 @@ def test_batch_all_metrics_present(self, evaluator: GenerationEvaluator) -> None
         assert results[0].context_utilization >= 0.0
 
 
+class TestRougeL:
+    """ROUGE-L integration in generation metrics."""
+
+    def test_rouge_l_with_reference(self, evaluator: GenerationEvaluator) -> None:
+        """ROUGE-L should be computed when reference is provided."""
+        result = evaluator.evaluate(
+            query="What is RAG?",
+            generated_answer="RAG combines retrieval with generation for better answers",
+            retrieved_contexts=["RAG is a technique."],
+            reference_answer="RAG combines retrieval with generation",
+        )
+        assert result.rouge_l > 0.5
+
+    def test_rouge_l_without_reference(self, evaluator: GenerationEvaluator) -> None:
+        """ROUGE-L should be 0 when no reference is provided."""
+        result = evaluator.evaluate(
+            query="What is RAG?",
+            generated_answer="RAG combines retrieval with generation",
+            retrieved_contexts=["RAG is a technique."],
+        )
+        assert result.rouge_l == 0.0
+
+    def test_rouge_l_identical(self, evaluator: GenerationEvaluator) -> None:
+        """Identical answer and reference should give ROUGE-L = 1.0."""
+        result = evaluator.evaluate(
+            query="What is 2+2?",
+            generated_answer="The answer is four",
+            retrieved_contexts=["Basic arithmetic."],
+            reference_answer="The answer is four",
+        )
+        assert result.rouge_l == 1.0
+
+
+class TestEvalMode:
+    """Test evaluation mode tracking."""
+
+    def test_heuristic_mode_tracked(self, evaluator: GenerationEvaluator) -> None:
+        """Heuristic evaluator should report its mode."""
+        result = evaluator.evaluate(
+            query="What is RAG?",
+            generated_answer="RAG is a technique",
+            retrieved_contexts=["RAG is a technique."],
+        )
+        assert result.eval_mode == EvalMode.HEURISTIC
+
+    def test_mode_property(self, evaluator: GenerationEvaluator) -> None:
+        """Mode property should reflect configuration."""
+        assert evaluator.mode == EvalMode.HEURISTIC
+
+    def test_llm_mode_when_configured(self) -> None:
+        """Evaluator with API key should report LLM mode."""
+        llm_eval = GenerationEvaluator(api_key="test-key-123")
+        assert llm_eval.mode == EvalMode.LLM_JUDGE
+
+
 class TestEdgeCases:
     """Edge cases for generation evaluation."""
 
@@ -185,6 +241,7 @@ def test_empty_answer(self, evaluator: GenerationEvaluator) -> None:
         )
         assert result.faithfulness == 0.0
         assert result.relevance == 0.0
+        assert result.rouge_l == 0.0
 
     def test_empty_query(self, evaluator: GenerationEvaluator) -> None:
         result = evaluator.evaluate(
diff --git a/tests/test_models.py b/tests/test_models.py
index a6da5b2..0acb47a 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -7,6 +7,7 @@
 
 from rageval.core.config import EvalConfig, LLMProviderConfig, RetrieverConfig
 from rageval.core.models import (
+    EvalMode,
     EvalResult,
     EvalRunSummary,
     EvalSample,
@@ -73,16 +74,22 @@ def test_default_values(self) -> None:
         assert metrics.faithfulness == 0.0
         assert metrics.relevance == 0.0
         assert metrics.correctness == 0.0
+        assert metrics.rouge_l == 0.0
         assert metrics.context_utilization == 0.0
+        assert metrics.eval_mode == EvalMode.NONE
 
     def test_valid_scores(self) -> None:
         metrics = GenerationMetrics(
             faithfulness=0.9,
             relevance=0.85,
             correctness=0.7,
+            rouge_l=0.8,
             context_utilization=0.6,
+            eval_mode=EvalMode.HEURISTIC,
         )
         assert metrics.faithfulness == 0.9
+        assert metrics.rouge_l == 0.8
+        assert metrics.eval_mode == EvalMode.HEURISTIC
 
     def test_rejects_out_of_range(self) -> None:
         with pytest.raises(ValidationError):
@@ -92,6 +99,19 @@ def test_rejects_negative(self) -> None:
         with pytest.raises(ValidationError):
             GenerationMetrics(relevance=-0.1)
 
+    def test_rejects_rouge_l_out_of_range(self) -> None:
+        with pytest.raises(ValidationError):
+            GenerationMetrics(rouge_l=1.5)
+
+
+class TestEvalMode:
+    """Test EvalMode enum."""
+
+    def test_values(self) -> None:
+        assert EvalMode.LLM_JUDGE == "llm_judge"
+        assert EvalMode.HEURISTIC == "heuristic"
+        assert EvalMode.NONE == "none"
+
 
 class TestEvalSample:
     """Test EvalSample model."""
diff --git a/tests/test_rouge.py b/tests/test_rouge.py
new file mode 100644
index 0000000..7929a9d
--- /dev/null
+++ b/tests/test_rouge.py
@@ -0,0 +1,110 @@
+"""Tests for ROUGE-L metric implementation."""
+
+from __future__ import annotations
+
+import pytest
+
+from rageval.metrics.rouge import _lcs_length, rouge_l_precision, rouge_l_recall, rouge_l_score
+
+
+class TestLCSLength:
+    """Test the Longest Common Subsequence computation."""
+
+    def test_identical_sequences(self) -> None:
+        assert _lcs_length(["a", "b", "c"], ["a", "b", "c"]) == 3
+
+    def test_no_overlap(self) -> None:
+        assert _lcs_length(["a", "b"], ["c", "d"]) == 0
+
+    def test_partial_overlap(self) -> None:
+        assert _lcs_length(["a", "b", "c", "d"], ["a", "c", "e"]) == 2
+
+    def test_subsequence_not_substring(self) -> None:
+        """LCS finds non-contiguous matches."""
+        assert _lcs_length(["the", "cat", "sat", "on", "mat"], ["the", "on", "mat"]) == 3
+
+    def test_empty_first(self) -> None:
+        assert _lcs_length([], ["a", "b"]) == 0
+
+    def test_empty_second(self) -> None:
+        assert _lcs_length(["a", "b"], []) == 0
+
+    def test_both_empty(self) -> None:
+        assert _lcs_length([], []) == 0
+
+    def test_single_match(self) -> None:
+        assert _lcs_length(["hello"], ["hello"]) == 1
+
+    def test_single_no_match(self) -> None:
+        assert _lcs_length(["hello"], ["world"]) == 0
+
+
+class TestRougeL:
+    """Test ROUGE-L F1 score."""
+
+    def test_identical_text(self) -> None:
+        score = rouge_l_score("the cat sat on the mat", "the cat sat on the mat")
+        assert score == 1.0
+
+    def test_completely_different(self) -> None:
+        score = rouge_l_score("hello world", "goodbye universe")
+        assert score == 0.0
+
+    def test_partial_overlap_preserves_order(self) -> None:
+        """ROUGE-L should reward matching word order."""
+        score = rouge_l_score(
+            "RAG combines retrieval with generation for better answers",
+            "RAG combines retrieval with generation",
+        )
+        assert score > 0.7
+
+    def test_reordered_text_scores_lower(self) -> None:
+        """Reordered words should score lower than ordered matches."""
+        ordered = rouge_l_score("the cat sat on the mat", "the cat sat on the mat")
+        reordered = rouge_l_score("mat the on sat cat the", "the cat sat on the mat")
+        assert ordered > reordered
+
+    def test_empty_candidate(self) -> None:
+        assert rouge_l_score("", "some reference") == 0.0
+
+    def test_empty_reference(self) -> None:
+        assert rouge_l_score("some candidate", "") == 0.0
+
+    def test_both_empty(self) -> None:
+        assert rouge_l_score("", "") == 0.0
+
+    def test_case_insensitive(self) -> None:
+        score = rouge_l_score("The Cat SAT", "the cat sat")
+        assert score == 1.0
+
+    def test_known_value(self) -> None:
+        """Verify against a hand-computed ROUGE-L score.
+
+        Candidate: "the cat on the mat"  (5 tokens)
+        Reference: "the cat sat on the mat"  (6 tokens)
+        LCS: "the cat on the mat" = 5
+        P = 5/5 = 1.0, R = 5/6 = 0.833
+        F1 = 2 * (1.0 * 0.833) / (1.0 + 0.833) = 0.909
+        """
+        score = rouge_l_score("the cat on the mat", "the cat sat on the mat")
+        assert score == pytest.approx(0.909, abs=0.01)
+
+
+class TestRougeLPrecisionRecall:
+    """Test precision and recall separately."""
+
+    def test_precision_perfect(self) -> None:
+        """Every candidate word is in the reference subsequence."""
+        p = rouge_l_precision("the cat", "the cat sat on the mat")
+        assert p == 1.0
+
+    def test_recall_perfect(self) -> None:
+        """Every reference word is in the candidate subsequence."""
+        r = rouge_l_recall("the cat sat on the mat and more", "the cat sat on the mat")
+        assert r == 1.0
+
+    def test_precision_empty(self) -> None:
+        assert rouge_l_precision("", "reference") == 0.0
+
+    def test_recall_empty(self) -> None:
+        assert rouge_l_recall("candidate", "") == 0.0
diff --git a/tests/test_storage.py b/tests/test_storage.py
index 49776a2..ab67eba 100644
--- a/tests/test_storage.py
+++ b/tests/test_storage.py
@@ -2,6 +2,8 @@
 
 from __future__ import annotations
 
+from pathlib import Path
+
 import pytest
 
 from rageval.core.models import (
@@ -162,3 +164,58 @@ def test_query_with_limit(self, store: ResultStore) -> None:
     def test_query_empty_run(self, store: ResultStore) -> None:
         results = store.query_results("nonexistent")
         assert len(results) == 0
+
+    def test_query_by_faithfulness(self, store: ResultStore) -> None:
+        results = [
+            _make_result("s1", faithfulness=0.2),
+            _make_result("s2", faithfulness=0.8),
+            _make_result("s3", faithfulness=0.9),
+        ]
+        store.store_results("run-1", results)
+        filtered = store.query_results("run-1", min_faithfulness=0.5)
+        assert len(filtered) == 2
+
+
+class TestExportJson:
+    """Test JSON export functionality."""
+
+    def test_export_creates_file(self, store: ResultStore, tmp_path: Path) -> None:
+        """Export should create a valid JSON file."""
+        import json
+
+        results = [_make_result("s1"), _make_result("s2")]
+        store.store_results("run-1", results)
+
+        out_path = str(tmp_path / "export.json")
+        count = store.export_json("run-1", out_path)
+        assert count == 2
+
+        with open(out_path) as f:
+            data = json.load(f)
+
+        assert data["run_id"] == "run-1"
+        assert data["summary"]["sample_count"] == 2
+        assert len(data["results"]) == 2
+
+    def test_export_empty_run(self, store: ResultStore, tmp_path: Path) -> None:
+        """Exporting a nonexistent run should create a file with 0 results."""
+        out_path = str(tmp_path / "empty.json")
+        count = store.export_json("nonexistent", out_path)
+        assert count == 0
+
+
+class TestListRuns:
+    """Test listing all runs."""
+
+    def test_list_runs(self, store: ResultStore) -> None:
+        store.store_results("run-a", [_make_result("s1")])
+        store.store_results("run-b", [_make_result("s2"), _make_result("s3")])
+
+        runs = store.list_runs()
+        assert len(runs) == 2
+        run_ids = {r["run_id"] for r in runs}
+        assert run_ids == {"run-a", "run-b"}
+
+    def test_list_runs_empty(self, store: ResultStore) -> None:
+        runs = store.list_runs()
+        assert len(runs) == 0