diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
new file mode 100644
index 0000000..986f4fb
--- /dev/null
+++ b/.github/workflows/codeql.yml
@@ -0,0 +1,34 @@
+name: "CodeQL"  # CodeQL code scanning for the Python sources
+
+on:
+  push:
+    branches: [main]  # analyze commits pushed to main
+  pull_request:
+    branches: [main]  # analyze pull requests targeting main
+  schedule:
+    - cron: "25 14 * * 1"  # weekly, Mondays at 14:25 (GitHub cron runs in UTC)
+
+jobs:
+  analyze:
+    name: Analyze (Python)
+    runs-on: ubuntu-latest
+    permissions:  # explicit job-level scopes for the workflow token
+      security-events: write  # needed to upload the analysis results
+      packages: read  # NOTE(review): presumably for private CodeQL packs - confirm
+      actions: read  # NOTE(review): presumably only needed in private repos - confirm
+      contents: read  # read access for the checkout step
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Initialize CodeQL
+        uses: github/codeql-action/init@v3
+        with:
+          languages: python  # only Python is analyzed by this job
+          queries: security-and-quality  # query suite handed to the init action
+
+      - name: Perform CodeQL Analysis
+        uses: github/codeql-action/analyze@v3
+        with:
+          category: "/language:python"  # category label attached to the results
diff --git a/.github/workflows/pr-checks.yml b/.github/workflows/pr-checks.yml
new file mode 100644
index 0000000..df39cd3
--- /dev/null
+++ b/.github/workflows/pr-checks.yml
@@ -0,0 +1,52 @@
+# Lint, import-order, and unit-test gate for pull requests and pushes to main.
+name: PR Checks
+
+on:
+  pull_request:
+    branches: [main]
+  push:
+    branches: [main]
+
+# Least privilege: every step below only reads the checked-out repository.
+permissions:
+  contents: read
+
+jobs:
+  checks:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        # Quoted so YAML keeps the versions as strings, not floats.
+        python-version: ["3.11", "3.12", "3.13"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: pip
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install flake8 isort
+
+      - name: Lint with flake8
+        run: |
+          # Blocking pass: syntax errors and undefined names fail the job.
+          flake8 src/ tests/ --count --select=E9,F63,F7,F82 --show-source --statistics
+          # Advisory pass: --exit-zero reports style findings without failing.
+          flake8 src/ tests/ --count --max-line-length=120 --statistics --exit-zero
+
+      - name: Check import ordering with isort
+        run: |
+          isort --check-only --diff src/ tests/
+
+      - name: Run unit tests
+        run: |
+          # -v and --tb=short come from addopts in pytest.ini; repeating -v
+          # here would raise the verbosity to -vv.
+          python -m pytest tests/
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..9855d94
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,6 @@
+[pytest]
+testpaths = tests
+python_files = test_*.py
+python_classes = Test*
+python_functions = test_*
+addopts = -v --tb=short
diff --git a/requirements.txt b/requirements.txt
index cd6ed8e..713b8b2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 # Azure SDK dependencies
 azure-identity>=1.25.3
 azure-ai-projects>=2.2.0
-azure-ai-evaluation==1.16.9
+azure-ai-evaluation==1.17.0
 azure-ai-inference>=1.0.0b9
 # Core Python packages
 python-dotenv>=1.2.2
@@ -9,8 +9,11 @@ pyyaml>=6.0.3
 pip-system-certs>=5.3
 azure-monitor-query>=2.0.0
 azure-monitor-opentelemetry>=1.8.8
-aiohttp>=3.13.5
-agent-framework==1.7.0
+aiohttp>=3.14.1
+agent-framework>=1.9.0
 streamlit>=1.58.0
 pandas==2.3.3
-plotly>=6.7.0
+plotly>=6.8.0
+# Test dependencies
+pytest>=9.0.0
+pytest-asyncio>=1.3.0
diff --git a/src/agent_evaluation/agentic_ops/base_evaluator.py b/src/agent_evaluation/agentic_ops/base_evaluator.py
index ba6e9f3..2ba3971 100644
--- a/src/agent_evaluation/agentic_ops/base_evaluator.py
+++ b/src/agent_evaluation/agentic_ops/base_evaluator.py
@@ -1,9 +1,9 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+import logging
 import os
 import re
-import logging
 from typing import Dict, Union
 
 from .client import LLMClient
diff --git a/src/agent_evaluation/agentic_ops/client.py b/src/agent_evaluation/agentic_ops/client.py
index c34aeb1..199f127 100644
--- a/src/agent_evaluation/agentic_ops/client.py
+++ b/src/agent_evaluation/agentic_ops/client.py
@@ -8,12 +8,13 @@
 
 import json
 import logging
-from openai import AzureOpenAI
-from azure.identity import DefaultAzureCredential, get_bearer_token_provider
 import os
-from dotenv import load_dotenv
 import time
 
+from azure.identity import DefaultAzureCredential, get_bearer_token_provider
+from dotenv import load_dotenv
+from openai import AzureOpenAI
+
 load_dotenv()
 
 # Configure logging
diff --git a/src/agent_evaluation/agentic_ops/run_eval.py b/src/agent_evaluation/agentic_ops/run_eval.py
index f06928e..ca4dd4e 100644
--- a/src/agent_evaluation/agentic_ops/run_eval.py
+++ b/src/agent_evaluation/agentic_ops/run_eval.py
@@ -1,11 +1,13 @@
-import os
 import inspect
+import logging
+import os
 import uuid
-from dotenv import load_dotenv
+
 from azure.ai.evaluation import evaluate
 from azure.ai.projects import AIProjectClient
-import logging
 from azure.identity import DefaultAzureCredential
+from dotenv import load_dotenv
+
 
 def get_logger(name: str):
     level = os.environ.get("LOG_LEVEL", "INFO").upper()
diff --git a/src/agent_evaluation/agentic_ops/runner.py b/src/agent_evaluation/agentic_ops/runner.py
index 5126171..a6719dd 100644
--- a/src/agent_evaluation/agentic_ops/runner.py
+++ b/src/agent_evaluation/agentic_ops/runner.py
@@ -1,12 +1,14 @@
 import argparse
 import importlib
+import logging
+import os
 import sys
 import time
 from pathlib import Path
-from typing import Optional, Any, Dict
+from typing import Any, Dict, Optional
+
 import yaml
-import os
-import logging
+
 
 def get_logger(name: str):
     level = os.environ.get("LOG_LEVEL", "INFO").upper()
diff --git a/src/agent_evaluation/cli.py b/src/agent_evaluation/cli.py
index 495a8fd..013efb5 100644
--- a/src/agent_evaluation/cli.py
+++ b/src/agent_evaluation/cli.py
@@ -9,11 +9,10 @@
 import argparse
 import sys
 from pathlib import Path
-from typing import List, Dict, Optional
+from typing import Dict, List, Optional
 
 import yaml
 
-
 # Root of the project (two levels up from this file)
 ROOT_DIR = Path(__file__).resolve().parents[2]
 SAMPLES_DIR = ROOT_DIR / "src" / "evaluations" / "offline"
@@ -74,7 +73,8 @@ def print_samples_table(samples: List[Dict[str, str]]) -> None:
 
 def run_sample(sample: Dict[str, str], extra_args: Optional[List[str]] = None) -> int:
     """Run a selected evaluation sample."""
-    from src.agent_evaluation.agentic_ops.runner import run_pipeline, parse_args
+    from src.agent_evaluation.agentic_ops.runner import (parse_args,
+                                                         run_pipeline)
 
     config_path = sample["config_path"]
     print(f"\n{'='*70}")
diff --git a/src/evaluations/offline/agentic_evaluation/eval_factory.py b/src/evaluations/offline/agentic_evaluation/eval_factory.py
index 19f9c89..440fff9 100644
--- a/src/evaluations/offline/agentic_evaluation/eval_factory.py
+++ b/src/evaluations/offline/agentic_evaluation/eval_factory.py
@@ -1,8 +1,12 @@
-from azure.ai.evaluation import RelevanceEvaluator,  TaskAdherenceEvaluator, ToolCallAccuracyEvaluator
-from .evaluator.evaluator_repo.evaluate_agent_invoked import EvaluateAgentsInvoked
-
-import os
 import logging
+import os
+
+from azure.ai.evaluation import (RelevanceEvaluator, TaskAdherenceEvaluator,
+                                 ToolCallAccuracyEvaluator)
+
+from .evaluator.evaluator_repo.evaluate_agent_invoked import \
+    EvaluateAgentsInvoked
+
 
 def get_logger(name: str):
     level = os.environ.get("LOG_LEVEL", "INFO").upper()
diff --git a/src/evaluations/offline/agentic_evaluation/evaluator/eval_main.py b/src/evaluations/offline/agentic_evaluation/evaluator/eval_main.py
index 62254d2..919f28e 100644
--- a/src/evaluations/offline/agentic_evaluation/evaluator/eval_main.py
+++ b/src/evaluations/offline/agentic_evaluation/evaluator/eval_main.py
@@ -1,9 +1,11 @@
+import logging
 import os
 from pathlib import Path
+
 from src.agent_evaluation.agentic_ops.run_eval import execute_eval
-import logging
 from src.evaluations.offline.utils.constants import EVAL_NAME
 from src.evaluations.offline.utils.file_operations import get_next_run_id
+
 from ..eval_factory import EvaluatorFactory
 
 
diff --git a/src/evaluations/offline/agentic_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py b/src/evaluations/offline/agentic_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py
index 1158b21..1eb437f 100644
--- a/src/evaluations/offline/agentic_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py
+++ b/src/evaluations/offline/agentic_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py
@@ -1,5 +1,6 @@
 from .eval_utils.evaluation_utils import agent_invoked_accuracy, compute_recall
 
+
 class EvaluateAgentsInvoked:
     def __init__(self):
         pass
diff --git a/src/evaluations/offline/ai_judge_evaluation_custom/eval_factory.py b/src/evaluations/offline/ai_judge_evaluation_custom/eval_factory.py
index 81545b1..312fea0 100644
--- a/src/evaluations/offline/ai_judge_evaluation_custom/eval_factory.py
+++ b/src/evaluations/offline/ai_judge_evaluation_custom/eval_factory.py
@@ -1,11 +1,13 @@
-from azure.ai.evaluation import RelevanceEvaluator, CoherenceEvaluator
+import logging
+import os
+
+from azure.ai.evaluation import CoherenceEvaluator, RelevanceEvaluator
+
 from .evaluator.evaluator_repo.coherence import CoherenceEvaluatorCustom
-from .evaluator.evaluator_repo.relevance import RelevanceEvaluatorCustom
 from .evaluator.evaluator_repo.fluency import FluencyEvaluatorCustom
+from .evaluator.evaluator_repo.relevance import RelevanceEvaluatorCustom
 from .evaluator.evaluator_repo.similarity import SimilarityEvaluatorCustom
 
-import os
-import logging
 
 def get_logger(name: str):
     level = os.environ.get("LOG_LEVEL", "INFO").upper()
diff --git a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/eval_main.py b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/eval_main.py
index 14245ef..141df04 100644
--- a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/eval_main.py
+++ b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/eval_main.py
@@ -1,9 +1,11 @@
+import logging
 import os
 from pathlib import Path
+
 from src.agent_evaluation.agentic_ops.run_eval import execute_eval
-import logging
 from src.evaluations.offline.utils.constants import EVAL_NAME
 from src.evaluations.offline.utils.file_operations import get_next_run_id
+
 from ..eval_factory import EvaluatorFactory
 
 
diff --git a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/coherence.py b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/coherence.py
index 3d06c39..7fec400 100644
--- a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/coherence.py
+++ b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/coherence.py
@@ -1,5 +1,7 @@
 from typing import Dict, Union
-from ......agent_evaluation.agentic_ops.base_evaluator import BaseCustomEvaluator
+
+from ......agent_evaluation.agentic_ops.base_evaluator import \
+    BaseCustomEvaluator
 
 
 class CoherenceEvaluatorCustom(BaseCustomEvaluator):
diff --git a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/fluency.py b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/fluency.py
index 5a03f3d..aeff3a1 100644
--- a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/fluency.py
+++ b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/fluency.py
@@ -2,7 +2,9 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from typing import Dict, Union
-from ......agent_evaluation.agentic_ops.base_evaluator import BaseCustomEvaluator
+
+from ......agent_evaluation.agentic_ops.base_evaluator import \
+    BaseCustomEvaluator
 
 
 class FluencyEvaluatorCustom(BaseCustomEvaluator):
diff --git a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/relevance.py b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/relevance.py
index 71cd9f3..3cd9b73 100644
--- a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/relevance.py
+++ b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/relevance.py
@@ -1,5 +1,7 @@
 from typing import Dict, Union
-from ......agent_evaluation.agentic_ops.base_evaluator import BaseCustomEvaluator
+
+from ......agent_evaluation.agentic_ops.base_evaluator import \
+    BaseCustomEvaluator
 
 
 class RelevanceEvaluatorCustom(BaseCustomEvaluator):
diff --git a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/similarity.py b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/similarity.py
index b2a34bd..3dd147f 100644
--- a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/similarity.py
+++ b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/similarity.py
@@ -2,7 +2,9 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from typing import Dict, Union
-from ......agent_evaluation.agentic_ops.base_evaluator import BaseCustomEvaluator
+
+from ......agent_evaluation.agentic_ops.base_evaluator import \
+    BaseCustomEvaluator
 
 
 class SimilarityEvaluatorCustom(BaseCustomEvaluator):
diff --git a/src/evaluations/offline/pipeline_experiment_evaluation/eval_factory.py b/src/evaluations/offline/pipeline_experiment_evaluation/eval_factory.py
index 1fd459c..1fa5560 100644
--- a/src/evaluations/offline/pipeline_experiment_evaluation/eval_factory.py
+++ b/src/evaluations/offline/pipeline_experiment_evaluation/eval_factory.py
@@ -1,7 +1,9 @@
-from azure.ai.evaluation import RelevanceEvaluator, TaskAdherenceEvaluator, ToolCallAccuracyEvaluator
-
-import os
 import logging
+import os
+
+from azure.ai.evaluation import (RelevanceEvaluator, TaskAdherenceEvaluator,
+                                 ToolCallAccuracyEvaluator)
+
 
 def get_logger(name: str):
     level = os.environ.get("LOG_LEVEL", "INFO").upper()
diff --git a/src/evaluations/offline/pipeline_experiment_evaluation/evaluator/eval_main.py b/src/evaluations/offline/pipeline_experiment_evaluation/evaluator/eval_main.py
index 14245ef..141df04 100644
--- a/src/evaluations/offline/pipeline_experiment_evaluation/evaluator/eval_main.py
+++ b/src/evaluations/offline/pipeline_experiment_evaluation/evaluator/eval_main.py
@@ -1,9 +1,11 @@
+import logging
 import os
 from pathlib import Path
+
 from src.agent_evaluation.agentic_ops.run_eval import execute_eval
-import logging
 from src.evaluations.offline.utils.constants import EVAL_NAME
 from src.evaluations.offline.utils.file_operations import get_next_run_id
+
 from ..eval_factory import EvaluatorFactory
 
 
diff --git a/src/evaluations/offline/pipeline_experiment_evaluation/experiment/agent_inference.py b/src/evaluations/offline/pipeline_experiment_evaluation/experiment/agent_inference.py
index ddb1e73..8def77a 100644
--- a/src/evaluations/offline/pipeline_experiment_evaluation/experiment/agent_inference.py
+++ b/src/evaluations/offline/pipeline_experiment_evaluation/experiment/agent_inference.py
@@ -11,11 +11,14 @@
 FLOW:
     Load Queries → Run Inference → Save Responses
 """
-import os
 import logging
+import os
 import random
 from pathlib import Path
-from src.evaluations.offline.utils.file_operations import load_queries_from_jsonl
+
+from src.evaluations.offline.utils.file_operations import \
+    load_queries_from_jsonl
+
 from .experiment_utils import get_file_paths, prepare_output_file, save_result
 
 
@@ -101,7 +104,7 @@ def inference_main(config: dict, args=None) -> None:
 if __name__ == "__main__":
     # For standalone execution, load config from experiment.yaml
     import yaml
-    
+
     # Get project root (go up 5 levels from this file)
     current_file = Path(__file__)  # .../experiment/agent_inference.py
     project_root = current_file.parent.parent.parent.parent.parent.parent  # Go up to project root
diff --git a/src/evaluations/offline/pipeline_experiment_evaluation/experiment/experiment_utils/file_utils.py b/src/evaluations/offline/pipeline_experiment_evaluation/experiment/experiment_utils/file_utils.py
index 0f27fc0..379241b 100644
--- a/src/evaluations/offline/pipeline_experiment_evaluation/experiment/experiment_utils/file_utils.py
+++ b/src/evaluations/offline/pipeline_experiment_evaluation/experiment/experiment_utils/file_utils.py
@@ -3,11 +3,11 @@
 ==============
 Helper functions for file management in inference pipelines.
 """
-import os
 import logging
+import os
 from pathlib import Path
-from src.evaluations.offline.utils.file_operations import append_to_jsonl
 
+from src.evaluations.offline.utils.file_operations import append_to_jsonl
 
 logger = logging.getLogger(__name__)
 
diff --git a/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/agent_tools.py b/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/agent_tools.py
index 24d2636..dd14b67 100644
--- a/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/agent_tools.py
+++ b/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/agent_tools.py
@@ -1,9 +1,10 @@
 """Tool functions for the device agents in the Multi-Agent system."""
 
+from random import choice, randint
 from typing import Annotated
-from pydantic import Field
-from random import randint, choice
+
 from agent_framework import tool
+from pydantic import Field
 
 
 # =============================================================================
diff --git a/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/multi_agent_orchestrator.py b/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/multi_agent_orchestrator.py
index b7f7de5..6efccf7 100644
--- a/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/multi_agent_orchestrator.py
+++ b/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/multi_agent_orchestrator.py
@@ -22,60 +22,36 @@
 """
 import asyncio
 import json
-import os
 import logging
-from typing import Annotated
+import os
 from pathlib import Path
+from typing import Annotated
+
 from agent_framework import Agent, tool
 from agent_framework.observability import enable_instrumentation, get_tracer
+from agent_framework.openai import OpenAIChatClient
+from azure.identity import AzureCliCredential
 from azure.monitor.opentelemetry import configure_azure_monitor
 from opentelemetry import context as otel_context
 from opentelemetry.trace import SpanKind
 from opentelemetry.trace.span import format_trace_id
-from agent_framework.openai import OpenAIChatClient
-from azure.identity import AzureCliCredential
 from pydantic import Field
 
 # Handle both standalone and package execution
 try:
     # When run as part of pipeline (package import)
-    from .agent_tools import (
-        # AC tools
-        set_ac_temperature,
-        turn_ac_on,
-        turn_ac_off,
-        set_ac_mode,
-        get_ac_status,
-        # TV tools
-        turn_tv_on,
-        turn_tv_off,
-        set_tv_channel,
-        set_tv_volume,
-        get_tv_status,
-        # Dishwasher tools
-        start_dishwasher,
-        stop_dishwasher,
-        get_dishwasher_status,
-        set_dishwasher_delay,
-    )
+    from .agent_tools import (  # AC tools; TV tools; Dishwasher tools
+        get_ac_status, get_dishwasher_status, get_tv_status, set_ac_mode,
+        set_ac_temperature, set_dishwasher_delay, set_tv_channel,
+        set_tv_volume, start_dishwasher, stop_dishwasher, turn_ac_off,
+        turn_ac_on, turn_tv_off, turn_tv_on)
 except ImportError:
     # When run standalone
-    from agent_tools import (
-        set_ac_temperature,
-        turn_ac_on,
-        turn_ac_off,
-        set_ac_mode,
-        get_ac_status,
-        turn_tv_on,
-        turn_tv_off,
-        set_tv_channel,
-        set_tv_volume,
-        get_tv_status,
-        start_dishwasher,
-        stop_dishwasher,
-        get_dishwasher_status,
-        set_dishwasher_delay,
-    )
+    from agent_tools import (get_ac_status, get_dishwasher_status,
+                             get_tv_status, set_ac_mode, set_ac_temperature,
+                             set_dishwasher_delay, set_tv_channel,
+                             set_tv_volume, start_dishwasher, stop_dishwasher,
+                             turn_ac_off, turn_ac_on, turn_tv_off, turn_tv_on)
 
 try:
     from src.evaluations.offline.utils.file_operations import append_to_jsonl
@@ -86,6 +62,7 @@
     from src.evaluations.offline.utils.file_operations import append_to_jsonl
 
 from dotenv import load_dotenv
+
 load_dotenv()
 
 
diff --git a/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py b/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py
index 11c82b7..eef116d 100644
--- a/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py
+++ b/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py
@@ -3,12 +3,14 @@
 
 Delegates to the shared trace_to_jsonl module in utils/.
 """
-from src.evaluations.offline.utils.trace_to_jsonl import get_trace_main  # noqa: F401
+from src.evaluations.offline.utils.trace_to_jsonl import \
+    get_trace_main  # noqa: F401
 
 if __name__ == "__main__":
-    import yaml
     from pathlib import Path
 
+    import yaml
+
     script_dir = Path(__file__).parent
     config_path = script_dir.parent / "experiment.yaml"
 
diff --git a/src/evaluations/offline/pipeline_multi_agent_evaluation/eval_factory.py b/src/evaluations/offline/pipeline_multi_agent_evaluation/eval_factory.py
index 8ae2cb8..fbd30ad 100644
--- a/src/evaluations/offline/pipeline_multi_agent_evaluation/eval_factory.py
+++ b/src/evaluations/offline/pipeline_multi_agent_evaluation/eval_factory.py
@@ -1,8 +1,12 @@
-from azure.ai.evaluation import RelevanceEvaluator, TaskAdherenceEvaluator, ToolCallAccuracyEvaluator
-from .evaluator.evaluator_repo.evaluate_agent_invoked import EvaluateAgentsInvoked
-
-import os
 import logging
+import os
+
+from azure.ai.evaluation import (RelevanceEvaluator, TaskAdherenceEvaluator,
+                                 ToolCallAccuracyEvaluator)
+
+from .evaluator.evaluator_repo.evaluate_agent_invoked import \
+    EvaluateAgentsInvoked
+
 
 def get_logger(name: str):
     level = os.environ.get("LOG_LEVEL", "INFO").upper()
diff --git a/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/eval_main.py b/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/eval_main.py
index 7e3748c..141df04 100644
--- a/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/eval_main.py
+++ b/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/eval_main.py
@@ -1,11 +1,12 @@
+import logging
 import os
 from pathlib import Path
+
 from src.agent_evaluation.agentic_ops.run_eval import execute_eval
-import logging
 from src.evaluations.offline.utils.constants import EVAL_NAME
 from src.evaluations.offline.utils.file_operations import get_next_run_id
-from ..eval_factory import EvaluatorFactory
 
+from ..eval_factory import EvaluatorFactory
 
 
 def get_logger(name: str):
diff --git a/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py b/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py
index 1102f3e..77a61df 100644
--- a/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py
+++ b/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py
@@ -1,4 +1,6 @@
-from .eval_utils.evaluation_utils import agent_invoked_accuracy, calculate_match_percentage
+from .eval_utils.evaluation_utils import (agent_invoked_accuracy,
+                                          calculate_match_percentage)
+
 
 class EvaluateAgentsInvoked:
     def __init__(self):
diff --git a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/agent_tools.py b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/agent_tools.py
index 7eec989..1d80b7b 100644
--- a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/agent_tools.py
+++ b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/agent_tools.py
@@ -1,8 +1,10 @@
 """Tool functions for the Multi-Tool Agent."""
 
+from random import randint
 from typing import Annotated
+
 from pydantic import Field
-from random import randint
+
 
 def get_weather(
     location: Annotated[str, Field(description="The location to get the weather for.")],
diff --git a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/multi_tool_agent.py b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/multi_tool_agent.py
index 4382c79..faac3a9 100644
--- a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/multi_tool_agent.py
+++ b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/multi_tool_agent.py
@@ -9,44 +9,31 @@
 """
 import asyncio
 import json
-import os
 import logging
+import os
 from datetime import datetime
 from pathlib import Path
+
 from agent_framework import Agent, ChatOptions
 from agent_framework.observability import enable_instrumentation, get_tracer
+from agent_framework.openai import OpenAIChatClient
+from azure.identity import AzureCliCredential
 from azure.monitor.opentelemetry import configure_azure_monitor
 from opentelemetry import context as otel_context
 from opentelemetry.trace import SpanKind
 from opentelemetry.trace.span import format_trace_id
-from agent_framework.openai import OpenAIChatClient
-from azure.identity import AzureCliCredential
 
 # Handle both standalone and package execution
 try:
     # When run as part of pipeline (package import)
-    from .agent_tools import (
-        get_current_datetime,
-        calculate_sum,
-        calculate_product,
-        convert_temperature,
-        count_words,
-        generate_uuid,
-        format_json,
-        get_weather
-    )
+    from .agent_tools import (calculate_product, calculate_sum,
+                              convert_temperature, count_words, format_json,
+                              generate_uuid, get_current_datetime, get_weather)
 except ImportError:
     # When run standalone
-    from agent_tools import (
-        get_current_datetime,
-        calculate_sum,
-        calculate_product,
-        convert_temperature,
-        count_words,
-        generate_uuid,
-        format_json,
-        get_weather
-    )
+    from agent_tools import (calculate_product, calculate_sum,
+                             convert_temperature, count_words, format_json,
+                             generate_uuid, get_current_datetime, get_weather)
 
 try:
     from src.evaluations.offline.utils.file_operations import append_to_jsonl
@@ -57,6 +44,7 @@
     from src.evaluations.offline.utils.file_operations import append_to_jsonl
 
 from dotenv import load_dotenv
+
 load_dotenv()
 
 
@@ -193,7 +181,8 @@ async def run_inference_async(config: dict) -> None:
         # agent_framework's own spans, so disabling just this instrumentor
         # is safe.
         try:
-            from azure.ai.projects.telemetry._responses_instrumentor import ResponsesInstrumentor
+            from azure.ai.projects.telemetry._responses_instrumentor import \
+                ResponsesInstrumentor
             if ResponsesInstrumentor().is_instrumented():
                 ResponsesInstrumentor().uninstrument()
                 logger.info("[AGENT] Disabled azure-ai-projects ResponsesInstrumentor (parallel-tool-call bug workaround)")
@@ -275,7 +264,7 @@ def inference_main(config: dict, args=None) -> None:
 # =============================================================================
 if __name__ == "__main__":
     import yaml
-    
+
     # Get config path relative to this file
     script_dir = Path(__file__).parent
     config_path = script_dir.parent / "experiment.yaml"
diff --git a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py
index 74eaee7..79b0532 100644
--- a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py
+++ b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py
@@ -4,12 +4,14 @@
 
 Delegates to the shared trace_to_jsonl module in utils/.
 """
-from src.evaluations.offline.utils.trace_to_jsonl import get_trace_main  # noqa: F401
+from src.evaluations.offline.utils.trace_to_jsonl import \
+    get_trace_main  # noqa: F401
 
 if __name__ == "__main__":
-    import yaml
     from pathlib import Path
 
+    import yaml
+
     script_dir = Path(__file__).parent
     config_path = script_dir.parent / "experiment.yaml"
 
diff --git a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/eval_factory.py b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/eval_factory.py
index 1fd459c..1fa5560 100644
--- a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/eval_factory.py
+++ b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/eval_factory.py
@@ -1,7 +1,9 @@
-from azure.ai.evaluation import RelevanceEvaluator, TaskAdherenceEvaluator, ToolCallAccuracyEvaluator
-
-import os
 import logging
+import os
+
+from azure.ai.evaluation import (RelevanceEvaluator, TaskAdherenceEvaluator,
+                                 ToolCallAccuracyEvaluator)
+
 
 def get_logger(name: str):
     level = os.environ.get("LOG_LEVEL", "INFO").upper()
diff --git a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/evaluator/eval_main.py b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/evaluator/eval_main.py
index 7e3748c..141df04 100644
--- a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/evaluator/eval_main.py
+++ b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/evaluator/eval_main.py
@@ -1,11 +1,12 @@
+import logging
 import os
 from pathlib import Path
+
 from src.agent_evaluation.agentic_ops.run_eval import execute_eval
-import logging
 from src.evaluations.offline.utils.constants import EVAL_NAME
 from src.evaluations.offline.utils.file_operations import get_next_run_id
-from ..eval_factory import EvaluatorFactory
 
+from ..eval_factory import EvaluatorFactory
 
 
 def get_logger(name: str):
diff --git a/src/evaluations/offline/rag_evaluation_foundry/eval_factory.py b/src/evaluations/offline/rag_evaluation_foundry/eval_factory.py
index a4af652..949a336 100644
--- a/src/evaluations/offline/rag_evaluation_foundry/eval_factory.py
+++ b/src/evaluations/offline/rag_evaluation_foundry/eval_factory.py
@@ -1,7 +1,8 @@
+import logging
+import os
+
 from azure.ai.evaluation import RelevanceEvaluator
 
-import os
-import logging
 
 def get_logger(name: str):
     level = os.environ.get("LOG_LEVEL", "INFO").upper()
diff --git a/src/evaluations/offline/rag_evaluation_foundry/evaluator/eval_main.py b/src/evaluations/offline/rag_evaluation_foundry/evaluator/eval_main.py
index 14245ef..141df04 100644
--- a/src/evaluations/offline/rag_evaluation_foundry/evaluator/eval_main.py
+++ b/src/evaluations/offline/rag_evaluation_foundry/evaluator/eval_main.py
@@ -1,9 +1,11 @@
+import logging
 import os
 from pathlib import Path
+
 from src.agent_evaluation.agentic_ops.run_eval import execute_eval
-import logging
 from src.evaluations.offline.utils.constants import EVAL_NAME
 from src.evaluations.offline.utils.file_operations import get_next_run_id
+
 from ..eval_factory import EvaluatorFactory
 
 
diff --git a/src/evaluations/offline/utils/trace_to_jsonl.py b/src/evaluations/offline/utils/trace_to_jsonl.py
index fbe0696..b56f6ff 100644
--- a/src/evaluations/offline/utils/trace_to_jsonl.py
+++ b/src/evaluations/offline/utils/trace_to_jsonl.py
@@ -12,16 +12,18 @@
 Used by both pipeline_multi_agent_evaluation and
 pipeline_multi_tool_agent_evaluation.
 """
-import os
 import json
 import logging
+import os
 import time
-from pathlib import Path
-from typing import Dict, List, Any
 from datetime import timedelta
-from azure.monitor.query import LogsQueryClient, LogsQueryStatus
+from pathlib import Path
+from typing import Any, Dict, List
+
+from azure.core.exceptions import (HttpResponseError, ServiceRequestError,
+                                   ServiceResponseError)
 from azure.identity import DefaultAzureCredential
-from azure.core.exceptions import HttpResponseError, ServiceRequestError, ServiceResponseError
+from azure.monitor.query import LogsQueryClient, LogsQueryStatus
 from dotenv import load_dotenv
 
 load_dotenv()
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/unit/test_agent_tools.py b/tests/unit/test_agent_tools.py
new file mode 100644
index 0000000..8844ab1
--- /dev/null
+++ b/tests/unit/test_agent_tools.py
@@ -0,0 +1,157 @@
+"""Unit tests for the agent_tools module (device agent tools)."""
+
+from src.evaluations.offline.pipeline_multi_agent_evaluation.agent_inference.agent_tools import (
+    get_ac_status, get_dishwasher_status, get_tv_status, set_ac_mode,
+    set_ac_temperature, set_dishwasher_delay, set_tv_channel, set_tv_volume,
+    start_dishwasher, stop_dishwasher, turn_ac_off, turn_ac_on, turn_tv_off,
+    turn_tv_on)
+
+# ---------------------------------------------------------------------------
+# AC Tools
+# ---------------------------------------------------------------------------
+
+class TestACTools:
+    def test_set_temperature_valid(self):
+        """A mid-range setpoint is echoed back in the confirmation."""
+        message = set_ac_temperature(temperature=72)
+        assert "72" in message
+        assert "72°F" in message or "set to" in message.lower()
+
+    def test_set_temperature_too_low(self):
+        """A setpoint of 50 is rejected with an error / out-of-range message."""
+        lowered = set_ac_temperature(temperature=50).lower()
+        assert "error" in lowered or "out of range" in lowered
+
+    def test_set_temperature_too_high(self):
+        """A setpoint of 90 is rejected with an error / out-of-range message."""
+        lowered = set_ac_temperature(temperature=90).lower()
+        assert "error" in lowered or "out of range" in lowered
+
+    def test_set_temperature_boundary_low(self):
+        """60°F is accepted and echoed back."""
+        message = set_ac_temperature(temperature=60)
+        assert "60" in message
+
+    def test_set_temperature_boundary_high(self):
+        """85°F is accepted and echoed back."""
+        message = set_ac_temperature(temperature=85)
+        assert "85" in message
+
+    def test_turn_ac_on(self):
+        """The power-on confirmation mentions 'on'."""
+        message = turn_ac_on()
+        assert "on" in message.lower()
+
+    def test_turn_ac_off(self):
+        """The power-off confirmation mentions 'off'."""
+        message = turn_ac_off()
+        assert "off" in message.lower()
+
+    def test_set_ac_mode_valid(self):
+        """Each of the four tested mode names appears in its confirmation."""
+        for requested in ("cool", "heat", "fan", "auto"):
+            confirmation = set_ac_mode(mode=requested)
+            assert requested in confirmation.lower()
+
+    def test_set_ac_mode_invalid(self):
+        """An unknown mode is rejected with an error / invalid message."""
+        lowered = set_ac_mode(mode="turbo").lower()
+        assert "error" in lowered or "invalid" in lowered
+
+    def test_get_ac_status(self):
+        """The status is a string that mentions the AC (case-insensitive)."""
+        status = get_ac_status()
+        assert isinstance(status, str)
+        assert "ac" in status.lower()
+
+
+# ---------------------------------------------------------------------------
+# TV Tools
+# ---------------------------------------------------------------------------
+
+class TestTVTools:
+    def test_turn_tv_on(self):
+        """The power-on confirmation mentions 'on'."""
+        message = turn_tv_on()
+        assert "on" in message.lower()
+
+    def test_turn_tv_off(self):
+        """The power-off confirmation mentions 'off'."""
+        message = turn_tv_off()
+        assert "off" in message.lower()
+
+    def test_set_channel_valid(self):
+        """A mid-range channel is echoed back in the confirmation."""
+        message = set_tv_channel(channel=42)
+        assert "42" in message
+
+    def test_set_channel_too_low(self):
+        """Channel 0 is rejected with an error / out-of-range message."""
+        lowered = set_tv_channel(channel=0).lower()
+        assert "error" in lowered or "out of range" in lowered
+
+    def test_set_channel_too_high(self):
+        """Channel 1000 is rejected with an error / out-of-range message."""
+        lowered = set_tv_channel(channel=1000).lower()
+        assert "error" in lowered or "out of range" in lowered
+
+    def test_set_channel_boundary(self):
+        """Channels 1 and 999 are both accepted and echoed back."""
+        for edge in (1, 999):
+            assert str(edge) in set_tv_channel(channel=edge)
+
+    def test_set_volume_valid(self):
+        """A mid-range volume is echoed back in the confirmation."""
+        message = set_tv_volume(volume=50)
+        assert "50" in message
+
+    def test_set_volume_too_low(self):
+        """Volume -1 is rejected with an error / out-of-range message."""
+        lowered = set_tv_volume(volume=-1).lower()
+        assert "error" in lowered or "out of range" in lowered
+
+    def test_set_volume_too_high(self):
+        """Volume 101 is rejected with an error / out-of-range message."""
+        lowered = set_tv_volume(volume=101).lower()
+        assert "error" in lowered or "out of range" in lowered
+
+    def test_set_volume_boundaries(self):
+        """Volumes 0 and 100 are both accepted and echoed back."""
+        for edge in (0, 100):
+            assert str(edge) in set_tv_volume(volume=edge)
+
+    def test_get_tv_status(self):
+        """The status is a string that mentions the TV (case-insensitive)."""
+        status = get_tv_status()
+        assert isinstance(status, str)
+        assert "tv" in status.lower()
+
+
+# ---------------------------------------------------------------------------
+# Dishwasher Tools
+# ---------------------------------------------------------------------------
+
+class TestDishwasherTools:
+    def test_start_dishwasher(self):
+        """Starting the dishwasher returns a non-empty string."""
+        reply = start_dishwasher()
+        assert isinstance(reply, str)
+        assert reply != ""
+
+    def test_stop_dishwasher(self):
+        """Stopping the dishwasher returns a non-empty string."""
+        reply = stop_dishwasher()
+        assert isinstance(reply, str)
+        assert reply != ""
+
+    def test_get_dishwasher_status(self):
+        """The status query returns a non-empty string."""
+        reply = get_dishwasher_status()
+        assert isinstance(reply, str)
+        assert reply != ""
+
+    def test_set_dishwasher_delay(self):
+        """Setting a 2-hour delay returns a non-empty string."""
+        reply = set_dishwasher_delay(hours=2)
+        assert isinstance(reply, str)
+        assert reply != ""
diff --git a/tests/unit/test_base_evaluator.py b/tests/unit/test_base_evaluator.py
new file mode 100644
index 0000000..97a7b83
--- /dev/null
+++ b/tests/unit/test_base_evaluator.py
@@ -0,0 +1,207 @@
+"""Unit tests for the BaseCustomEvaluator (src/agent_evaluation/agentic_ops/base_evaluator.py)."""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from src.agent_evaluation.agentic_ops.base_evaluator import BaseCustomEvaluator
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+class ConcreteEvaluator(BaseCustomEvaluator):
+    """Minimal concrete subclass of BaseCustomEvaluator; not referenced by any test in this module."""
+
+    def __init__(self, model_config=None):
+        super().__init__(  # fixed prompty name and result key; only model_config is caller-supplied
+            prompty_file_name="test.prompty",
+            result_key="test_score",
+            model_config=model_config,
+        )
+
+    def __call__(self, query: str, response: str, **kwargs):
+        return self.evaluate(query=query, response=response, **kwargs)  # thin pass-through to evaluate()
+
+
+# ---------------------------------------------------------------------------
+# _extract_score
+# ---------------------------------------------------------------------------
+
+class TestExtractScore:
+    @pytest.fixture
+    def evaluator(self):
+        """Bare evaluator built via __new__, which never runs __init__, so nothing needs patching."""
+        bare = BaseCustomEvaluator.__new__(BaseCustomEvaluator)
+        bare.result_key = "test_score"
+        bare.prompty_file_name = "test.prompty"
+        return bare
+
+    def test_extracts_structured_s2_tag(self, evaluator):
+        """A score wrapped as <S2>4</S2> is extracted as 4."""
+        assert evaluator._extract_score("<S2>4</S2>") == 4
+
+    def test_extracts_score_colon_format(self, evaluator):
+        """A trailing 'Score: 5' is extracted as 5."""
+        assert evaluator._extract_score("The evaluation is complete. Score: 5") == 5
+
+    def test_extracts_rating_format(self, evaluator):
+        """A leading 'Rating: 3' is extracted as 3."""
+        assert evaluator._extract_score("Rating: 3 - The response is adequate") == 3
+
+    def test_returns_default_on_no_match(self, evaluator):
+        """Text without any score yields the default of 3."""
+        assert evaluator._extract_score("No numeric content here at all") == 3
+
+    def test_custom_default(self, evaluator):
+        """A caller-supplied default_score is returned when nothing matches."""
+        assert evaluator._extract_score("no score", default_score=1) == 1
+
+    def test_ignores_out_of_range_scores(self, evaluator):
+        """'Score: 9' is outside 1-5, so the default of 3 is returned instead."""
+        # Per the original author: the S2 tag path does not validate range, the pattern path does.
+        extracted = evaluator._extract_score("Score: 9")
+        # 9 must not be accepted by the pattern-matching fallback.
+        assert extracted == 3
+
+    def test_s2_tag_any_value(self, evaluator):
+        """S2 tags at both ends of the 1-5 scale are extracted unchanged."""
+        assert evaluator._extract_score("<S2>1</S2>") == 1
+        assert evaluator._extract_score("<S2>5</S2>") == 5
+
+
+# ---------------------------------------------------------------------------
+# _create_user_prompt
+# ---------------------------------------------------------------------------
+
+class TestCreateUserPrompt:
+    @pytest.fixture
+    def evaluator(self):
+        """Bare evaluator built via __new__, which never runs __init__, so nothing needs patching."""
+        bare = BaseCustomEvaluator.__new__(BaseCustomEvaluator)
+        bare.result_key = "test_score"
+        bare.prompty_file_name = "test.prompty"
+        return bare
+
+    def test_replaces_single_placeholder(self, evaluator):
+        """{{query}} is replaced by the query keyword value."""
+        template = "Evaluate: {{query}}"
+        rendered = evaluator._create_user_prompt(template, query="What is AI?")
+        assert rendered == "Evaluate: What is AI?"
+
+    def test_replaces_multiple_placeholders(self, evaluator):
+        """Several placeholders are each replaced by their keyword value."""
+        template = "Query: {{query}}\nResponse: {{response}}"
+        rendered = evaluator._create_user_prompt(template, query="Q", response="A")
+        assert rendered == "Query: Q\nResponse: A"
+
+    def test_leaves_unknown_placeholders(self, evaluator):
+        """A placeholder with no matching keyword stays in the output."""
+        template = "Query: {{query}} Context: {{context}}"
+        rendered = evaluator._create_user_prompt(template, query="Q")
+        assert "{{context}}" in rendered
+
+    def test_no_placeholders(self, evaluator):
+        """A template without placeholders comes back unchanged."""
+        template = "Plain text with no placeholders"
+        rendered = evaluator._create_user_prompt(template, query="Q")
+        assert rendered == template
+
+
+# ---------------------------------------------------------------------------
+# _load_prompt_content
+# ---------------------------------------------------------------------------
+
+class TestLoadPromptContent:
+    def test_returns_fallback_on_missing_file(self):
+        """A prompty path that does not exist yields the fallback prompt."""
+        # __new__ never runs __init__, so no patching is needed to build the instance.
+        e = BaseCustomEvaluator.__new__(BaseCustomEvaluator)
+        e.prompty_path = "/nonexistent/path/test.prompty"
+        e.result_key = "test_score"
+        e.prompty_file_name = "test.prompty"
+
+        system, user = e._load_prompt_content()
+        assert "test" in user.lower() or "evaluate" in user.lower()
+
+    def test_parses_prompty_with_system_and_user(self, tmp_path):
+        """A prompty file with system/user sections yields non-empty prompt content."""
+        # Layout: '---', metadata, '---', prompt body; the parser is expected to split on '---'.
+        prompty_content = (
+            "---\nname: test\nmodel: gpt-4\n---\n"
+            "system:\nYou are an evaluator.\nuser:\nEvaluate this: {{query}}\n---\n"
+        )
+        prompty_file = tmp_path / "test.prompty"
+        prompty_file.write_text(prompty_content, encoding="utf-8")
+
+        e = BaseCustomEvaluator.__new__(BaseCustomEvaluator)
+        e.prompty_path = str(prompty_file)
+        e.result_key = "test_score"
+        e.prompty_file_name = "test.prompty"
+
+        system, user = e._load_prompt_content()
+        combined = system + user
+        assert len(combined) > 0
+
+
+# ---------------------------------------------------------------------------
+# evaluate
+# ---------------------------------------------------------------------------
+
+class TestEvaluate:
+    @patch("src.agent_evaluation.agentic_ops.base_evaluator.LLMClient")
+    def test_evaluate_returns_score(self, mock_client_cls):
+        """evaluate() returns {result_key: 4} when the mocked LLM replies <S2>4</S2>."""
+        mock_client = MagicMock()
+        mock_client.get_llm_response_with_prompty.return_value = "<S2>4</S2>"
+        mock_client_cls.return_value = mock_client
+
+        # __new__ never runs __init__, so only the prompt loader has to be patched.
+        with patch.object(BaseCustomEvaluator, "_load_prompt_content", return_value=("system", "{{query}}")):
+            e = BaseCustomEvaluator.__new__(BaseCustomEvaluator)
+            e.prompty_path = "test.prompty"
+            e.result_key = "test_score"
+            e.prompty_file_name = "test.prompty"
+
+            result = e.evaluate(query="What is AI?")
+        assert result == {"test_score": 4}
+
+    @patch("src.agent_evaluation.agentic_ops.base_evaluator.LLMClient")
+    def test_evaluate_returns_default_on_error(self, mock_client_cls):
+        """evaluate() returns {result_key: 3} when the mocked LLM call raises."""
+        mock_client = MagicMock()
+        mock_client.get_llm_response_with_prompty.side_effect = Exception("API error")
+        mock_client_cls.return_value = mock_client
+
+        # __new__ never runs __init__, so only the prompt loader has to be patched.
+        with patch.object(BaseCustomEvaluator, "_load_prompt_content", return_value=("system", "{{query}}")):
+            e = BaseCustomEvaluator.__new__(BaseCustomEvaluator)
+            e.prompty_path = "test.prompty"
+            e.result_key = "test_score"
+            e.prompty_file_name = "test.prompty"
+
+            result = e.evaluate(query="What is AI?")
+        assert result == {"test_score": 3}
+
+
+# ---------------------------------------------------------------------------
+# __call__
+# ---------------------------------------------------------------------------
+
+class TestCall:
+    @patch("src.agent_evaluation.agentic_ops.base_evaluator.LLMClient")
+    def test_call_delegates_to_evaluate(self, mock_client_cls):
+        """Calling the evaluator returns {result_key: 5} when the mocked LLM replies <S2>5</S2>."""
+        mock_client = MagicMock()
+        mock_client.get_llm_response_with_prompty.return_value = "<S2>5</S2>"
+        mock_client_cls.return_value = mock_client
+
+        # __new__ never runs __init__, so only the prompt loader has to be patched.
+        with patch.object(BaseCustomEvaluator, "_load_prompt_content", return_value=("sys", "{{query}}")):
+            e = BaseCustomEvaluator.__new__(BaseCustomEvaluator)
+            e.prompty_path = "test.prompty"
+            e.result_key = "test_score"
+            e.prompty_file_name = "test.prompty"
+
+            result = e(query="test")
+        assert result == {"test_score": 5}
diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py
new file mode 100644
index 0000000..ca4417c
--- /dev/null
+++ b/tests/unit/test_cli.py
@@ -0,0 +1,401 @@
+"""Unit tests for the CLI module (src/agent_evaluation/cli.py)."""
+
+import argparse
+from unittest.mock import MagicMock, mock_open, patch
+
+import pytest
+import yaml
+
+from src.agent_evaluation.cli import (EXCLUDE_DIRS, cmd_info, cmd_list,
+                                      cmd_run, cmd_run_all, discover_samples,
+                                      interactive_select, main,
+                                      print_samples_table, run_sample)
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+SAMPLE_YAML_CONTENT = {  # experiment.yaml payload the tests below yaml.dump to disk or feed to mock_open
+    "app_name": "TestApp",
+    "experiment_name": "test_experiment",
+    "version": "1.0",
+    "pipeline": [  # a single stage; the discovery test expects stages == ["evaluation"]
+        {"config_key": "evaluation", "base_path": "evaluator", "module": "eval_main.eval_main"},
+    ],
+    "evaluation": {  # section named after the stage's config_key
+        "input_path": "datasets/",
+        "input_file": "sample.jsonl",
+        "output_path": "reports/",
+        "evaluators": {"score": "relevance_evaluator"},
+    },
+}
+
+
+@pytest.fixture
+def mock_samples():
+    """Two sample records using the keys the discover_samples tests check, plus config_path."""
+    agentic = {
+        "name": "agentic_evaluation",
+        "app_name": "TestApp",
+        "experiment_name": "test_experiment",
+        "version": "1.0",
+        "config_path": "src/evaluations/offline/agentic_evaluation/experiment.yaml",
+        "stages": ["evaluation"],
+    }
+    rag = {
+        "name": "rag_evaluation_foundry",
+        "app_name": "RAGApp",
+        "experiment_name": "rag_experiment",
+        "version": "2.0",
+        "config_path": "src/evaluations/offline/rag_evaluation_foundry/experiment.yaml",
+        "stages": ["evaluation"],
+    }
+    return [agentic, rag]
+
+
+# ---------------------------------------------------------------------------
+# discover_samples
+# ---------------------------------------------------------------------------
+
+class TestDiscoverSamples:
+    def test_returns_list(self, tmp_path, monkeypatch):
+        """An empty samples directory still yields a list."""
+        monkeypatch.setattr("src.agent_evaluation.cli.SAMPLES_DIR", tmp_path)
+        found = discover_samples()
+        assert isinstance(found, list)
+
+    def test_skips_excluded_dirs(self, tmp_path, monkeypatch):
+        """Directories named in EXCLUDE_DIRS are ignored even with an experiment.yaml."""
+        monkeypatch.setattr("src.agent_evaluation.cli.SAMPLES_DIR", tmp_path)
+        payload = yaml.dump(SAMPLE_YAML_CONTENT)
+        for dir_name in EXCLUDE_DIRS:
+            (tmp_path / dir_name).mkdir()
+            (tmp_path / dir_name / "experiment.yaml").write_text(payload)
+
+        found = discover_samples()
+        assert found == []
+
+    def test_skips_dirs_without_experiment_yaml(self, tmp_path, monkeypatch):
+        """A directory lacking experiment.yaml is not reported as a sample."""
+        monkeypatch.setattr("src.agent_evaluation.cli.SAMPLES_DIR", tmp_path)
+        (tmp_path / "some_dir").mkdir()
+        found = discover_samples()
+        assert found == []
+
+    def test_discovers_valid_sample(self, tmp_path, monkeypatch):
+        """A directory holding experiment.yaml is reported with its YAML metadata."""
+        monkeypatch.setattr("src.agent_evaluation.cli.SAMPLES_DIR", tmp_path)
+        monkeypatch.setattr("src.agent_evaluation.cli.ROOT_DIR", tmp_path.parent)
+
+        config_dir = tmp_path / "my_sample"
+        config_dir.mkdir()
+        (config_dir / "experiment.yaml").write_text(yaml.dump(SAMPLE_YAML_CONTENT))
+
+        found = discover_samples()
+        assert len(found) == 1
+        sample = found[0]
+        assert sample["name"] == "my_sample"
+        assert sample["app_name"] == "TestApp"
+        assert (sample["experiment_name"], sample["version"]) == ("test_experiment", "1.0")
+        assert sample["stages"] == ["evaluation"]
+
+    def test_discovers_multiple_samples_sorted(self, tmp_path, monkeypatch):
+        """Samples created out of order are returned sorted by name."""
+        monkeypatch.setattr("src.agent_evaluation.cli.SAMPLES_DIR", tmp_path)
+        monkeypatch.setattr("src.agent_evaluation.cli.ROOT_DIR", tmp_path.parent)
+
+        for dir_name in ("z_sample", "a_sample", "m_sample"):
+            config_dir = tmp_path / dir_name
+            config_dir.mkdir()
+            (config_dir / "experiment.yaml").write_text(yaml.dump(SAMPLE_YAML_CONTENT))
+
+        names = [entry["name"] for entry in discover_samples()]
+        assert names == ["a_sample", "m_sample", "z_sample"]
+
+    def test_uses_defaults_for_missing_yaml_fields(self, tmp_path, monkeypatch):
+        """Absent app/experiment names fall back to the directory name; version to ''."""
+        monkeypatch.setattr("src.agent_evaluation.cli.SAMPLES_DIR", tmp_path)
+        monkeypatch.setattr("src.agent_evaluation.cli.ROOT_DIR", tmp_path.parent)
+
+        config_dir = tmp_path / "bare_sample"
+        config_dir.mkdir()
+        (config_dir / "experiment.yaml").write_text(yaml.dump({"pipeline": []}))
+
+        found = discover_samples()
+        assert len(found) == 1
+        sample = found[0]
+        assert sample["name"] == "bare_sample"
+        assert (sample["app_name"], sample["experiment_name"]) == ("bare_sample", "bare_sample")
+        assert sample["version"] == ""
+        assert sample["stages"] == []
+
+
+# ---------------------------------------------------------------------------
+# print_samples_table
+# ---------------------------------------------------------------------------
+
+class TestPrintSamplesTable:
+    def test_empty_samples(self, capsys):
+        """An empty list prints the 'No evaluation samples found.' notice."""
+        print_samples_table([])
+        printed = capsys.readouterr().out
+        assert "No evaluation samples found." in printed
+
+    def test_prints_sample_names(self, capsys, mock_samples):
+        """Every sample name shows up in the printed table."""
+        print_samples_table(mock_samples)
+        printed = capsys.readouterr().out
+        assert "agentic_evaluation" in printed
+        assert "rag_evaluation_foundry" in printed
+
+    def test_prints_stages(self, capsys, mock_samples):
+        """'evaluation' appears in the output (note: the sample names contain it as well)."""
+        print_samples_table(mock_samples)
+        printed = capsys.readouterr().out
+        assert "evaluation" in printed
+
+
+# ---------------------------------------------------------------------------
+# run_sample
+# ---------------------------------------------------------------------------
+
+class TestRunSample:
+    @patch("src.agent_evaluation.agentic_ops.runner.run_pipeline")
+    @patch("src.agent_evaluation.agentic_ops.runner.parse_args")
+    def test_run_sample_success(self, mock_parse, mock_run, mock_samples):
+        """A pipeline run that finishes normally makes run_sample return 0."""
+        mock_parse.return_value = MagicMock()
+        mock_run.return_value = None
+
+        first = mock_samples[0]
+        exit_code = run_sample(first)
+        assert exit_code == 0
+
+    @patch("src.agent_evaluation.agentic_ops.runner.run_pipeline")
+    @patch("src.agent_evaluation.agentic_ops.runner.parse_args")
+    def test_run_sample_system_exit(self, mock_parse, mock_run, mock_samples):
+        """A SystemExit(2) raised by the pipeline is returned as exit code 2."""
+        mock_parse.return_value = MagicMock()
+        mock_run.side_effect = SystemExit(2)
+
+        first = mock_samples[0]
+        exit_code = run_sample(first)
+        assert exit_code == 2
+
+    @patch("src.agent_evaluation.agentic_ops.runner.run_pipeline")
+    @patch("src.agent_evaluation.agentic_ops.runner.parse_args")
+    def test_run_sample_with_extra_args(self, mock_parse, mock_run, mock_samples):
+        """Passing extra_args still returns 0 (the forwarding itself is not asserted here)."""
+        mock_parse.return_value = MagicMock()
+        mock_run.return_value = None
+
+        first = mock_samples[0]
+        exit_code = run_sample(first, extra_args=["--sample", "5"])
+        assert exit_code == 0
+
+
+# ---------------------------------------------------------------------------
+# interactive_select
+# ---------------------------------------------------------------------------
+
+class TestInteractiveSelect:
+    @patch("builtins.input", return_value="1")
+    def test_select_by_number(self, mock_input, mock_samples):
+        """Entering '1' picks the first sample."""
+        picked = interactive_select(mock_samples)
+        assert picked == mock_samples[0]
+
+    @patch("builtins.input", return_value="2")
+    def test_select_by_second_number(self, mock_input, mock_samples):
+        """Entering '2' picks the second sample."""
+        picked = interactive_select(mock_samples)
+        assert picked == mock_samples[1]
+
+    @patch("builtins.input", return_value="q")
+    def test_quit(self, mock_input, mock_samples):
+        """Entering 'q' yields None."""
+        picked = interactive_select(mock_samples)
+        assert picked is None
+
+    @patch("builtins.input", return_value="exit")
+    def test_exit(self, mock_input, mock_samples):
+        """Entering 'exit' yields None."""
+        picked = interactive_select(mock_samples)
+        assert picked is None
+
+    @patch("builtins.input", return_value="agentic")
+    def test_select_by_partial_name(self, mock_input, mock_samples):
+        """A name fragment matching exactly one sample picks that sample."""
+        picked = interactive_select(mock_samples)
+        assert picked == mock_samples[0]
+
+    @patch("builtins.input", side_effect=EOFError)
+    def test_eof_returns_none(self, mock_input, mock_samples):
+        """An EOFError from input() yields None."""
+        picked = interactive_select(mock_samples)
+        assert picked is None
+
+    @patch("builtins.input", side_effect=KeyboardInterrupt)
+    def test_keyboard_interrupt_returns_none(self, mock_input, mock_samples):
+        """A KeyboardInterrupt from input() yields None."""
+        picked = interactive_select(mock_samples)
+        assert picked is None
+
+
+# ---------------------------------------------------------------------------
+# cmd_list
+# ---------------------------------------------------------------------------
+
+class TestCmdList:
+    @patch("src.agent_evaluation.cli.discover_samples")
+    def test_cmd_list_returns_zero(self, mock_discover, mock_samples):
+        """cmd_list returns 0 when samples are available."""
+        mock_discover.return_value = mock_samples
+        no_options = argparse.Namespace()
+        exit_code = cmd_list(no_options)
+        assert exit_code == 0
+
+    @patch("src.agent_evaluation.cli.discover_samples")
+    def test_cmd_list_empty(self, mock_discover):
+        """cmd_list returns 0 even when no samples are discovered."""
+        mock_discover.return_value = []
+        no_options = argparse.Namespace()
+        exit_code = cmd_list(no_options)
+        assert exit_code == 0
+
+
+# ---------------------------------------------------------------------------
+# cmd_run
+# ---------------------------------------------------------------------------
+
+class TestCmdRun:
+    @patch("src.agent_evaluation.cli.run_sample", return_value=0)
+    @patch("src.agent_evaluation.cli.discover_samples")
+    def test_run_by_exact_name(self, mock_discover, mock_run, mock_samples):
+        """An exact sample name is resolved and run_sample is invoked once."""
+        mock_discover.return_value = mock_samples
+        args = argparse.Namespace(name="agentic_evaluation", sample=0, index_fname=None)
+        result = cmd_run(args)
+        assert result == 0
+        mock_run.assert_called_once()
+
+    @patch("src.agent_evaluation.cli.run_sample", return_value=0)
+    @patch("src.agent_evaluation.cli.discover_samples")
+    def test_run_by_partial_name(self, mock_discover, mock_run, mock_samples):
+        """A unique name fragment is resolved and run_sample is invoked once."""
+        mock_discover.return_value = mock_samples
+        args = argparse.Namespace(name="agentic", sample=0, index_fname=None)
+        assert cmd_run(args) == 0
+        mock_run.assert_called_once()  # a 0 exit code alone would not prove the sample ran
+
+    @patch("src.agent_evaluation.cli.discover_samples")
+    def test_run_not_found(self, mock_discover, mock_samples):
+        """A name matching no sample makes cmd_run return 1."""
+        mock_discover.return_value = mock_samples
+        args = argparse.Namespace(name="nonexistent", sample=0, index_fname=None)
+        result = cmd_run(args)
+        assert result == 1
+
+    @patch("src.agent_evaluation.cli.discover_samples")
+    def test_run_no_samples(self, mock_discover):
+        """With no samples discovered, cmd_run returns 1."""
+        mock_discover.return_value = []
+        args = argparse.Namespace(name="anything", sample=0, index_fname=None)
+        result = cmd_run(args)
+        assert result == 1
+
+    @patch("src.agent_evaluation.cli.run_sample", return_value=0)
+    @patch("src.agent_evaluation.cli.discover_samples")
+    def test_run_by_number(self, mock_discover, mock_run, mock_samples):
+        """A numeric name ('1') is resolved and run_sample is invoked once."""
+        mock_discover.return_value = mock_samples
+        args = argparse.Namespace(name="1", sample=0, index_fname=None)
+        assert cmd_run(args) == 0
+        mock_run.assert_called_once()  # a 0 exit code alone would not prove the sample ran
+
+    @patch("src.agent_evaluation.cli.discover_samples")
+    def test_run_ambiguous_name(self, mock_discover, mock_samples):
+        """A fragment matching both samples ('evaluation') makes cmd_run return 1."""
+        mock_discover.return_value = mock_samples
+        args = argparse.Namespace(name="evaluation", sample=0, index_fname=None)
+        result = cmd_run(args)
+        assert result == 1
+
+
+# ---------------------------------------------------------------------------
+# cmd_run_all
+# ---------------------------------------------------------------------------
+
+class TestCmdRunAll:
+    @patch("src.agent_evaluation.cli.run_sample", return_value=0)
+    @patch("src.agent_evaluation.cli.discover_samples")
+    def test_all_pass(self, mock_discover, mock_run, mock_samples):
+        """When every sample returns 0, cmd_run_all returns 0 after running each one."""
+        mock_discover.return_value = mock_samples
+        no_options = argparse.Namespace()
+        exit_code = cmd_run_all(no_options)
+        assert exit_code == 0
+        assert mock_run.call_count == len(mock_samples)
+
+    @patch("src.agent_evaluation.cli.run_sample", return_value=1)
+    @patch("src.agent_evaluation.cli.discover_samples")
+    def test_any_failure_returns_one(self, mock_discover, mock_run, mock_samples):
+        """When samples return 1, cmd_run_all returns 1."""
+        mock_discover.return_value = mock_samples
+        no_options = argparse.Namespace()
+        exit_code = cmd_run_all(no_options)
+        assert exit_code == 1
+
+    @patch("src.agent_evaluation.cli.discover_samples")
+    def test_no_samples(self, mock_discover):
+        """With no samples discovered, cmd_run_all returns 1."""
+        mock_discover.return_value = []
+        no_options = argparse.Namespace()
+        exit_code = cmd_run_all(no_options)
+        assert exit_code == 1
+
+
+# ---------------------------------------------------------------------------
+# cmd_info
+# ---------------------------------------------------------------------------
+
+class TestCmdInfo:
+    @patch("src.agent_evaluation.cli.discover_samples")
+    def test_info_not_found(self, mock_discover, mock_samples):
+        """An unknown sample name makes cmd_info return 1."""
+        mock_discover.return_value = mock_samples
+        unknown = argparse.Namespace(name="nonexistent")
+        exit_code = cmd_info(unknown)
+        assert exit_code == 1
+
+    @patch("builtins.open", mock_open(read_data=yaml.dump(SAMPLE_YAML_CONTENT)))
+    @patch("src.agent_evaluation.cli.discover_samples")
+    def test_info_found(self, mock_discover, mock_samples):
+        """A known sample name makes cmd_info return 0; open() is replaced by mock_open."""
+        mock_discover.return_value = mock_samples
+        known = argparse.Namespace(name="agentic_evaluation")
+        exit_code = cmd_info(known)
+        assert exit_code == 0
+
+
+# ---------------------------------------------------------------------------
+# main
+# ---------------------------------------------------------------------------
+
+class TestMain:
+    @patch("src.agent_evaluation.cli.discover_samples", return_value=[])
+    def test_main_no_command_no_samples(self, mock_discover):
+        """With no sub-command and no samples, main() exits with code 1."""
+        argv = ["agent_evals"]
+        with patch("sys.argv", argv), pytest.raises(SystemExit) as raised:
+            main()
+        assert raised.value.code == 1
+
+    @patch("src.agent_evaluation.cli.cmd_list", return_value=0)
+    def test_main_list_command(self, mock_cmd):
+        """The 'list' sub-command is dispatched to cmd_list and main() exits with 0."""
+        argv = ["agent_evals", "list"]
+        with patch("sys.argv", argv), pytest.raises(SystemExit) as raised:
+            main()
+        assert raised.value.code == 0
+        mock_cmd.assert_called_once()
diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py
new file mode 100644
index 0000000..8fa5dc3
--- /dev/null
+++ b/tests/unit/test_client.py
@@ -0,0 +1,196 @@
+"""Unit tests for the LLM Client (src/agent_evaluation/agentic_ops/client.py)."""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from src.agent_evaluation.agentic_ops.client import LLMClient
+
+# ---------------------------------------------------------------------------
+# LLMClient._validate_messages
+# ---------------------------------------------------------------------------
+
+class TestValidateMessages:
+    @pytest.fixture
+    def client(self):
+        """LLMClient constructed while get_llm_client_instance is patched out."""
+        with patch("src.agent_evaluation.agentic_ops.client.get_llm_client_instance"):
+            return LLMClient(temperature=0.0)
+
+    def test_valid_messages(self, client):
+        """A system message followed by a user message passes validation."""
+        system_msg = {"role": "system", "content": "You are helpful."}
+        user_msg = {"role": "user", "content": "Hello"}
+        # No assertion needed: completing without an exception is the check.
+        conversation = [system_msg, user_msg]
+        client._validate_messages(conversation)
+
+    def test_not_a_list_raises(self, client):
+        """A plain string in place of the message list raises ValueError."""
+        with pytest.raises(ValueError, match="must be a list"):
+            client._validate_messages("not a list")
+
+    def test_non_dict_message_raises(self, client):
+        """A list whose item is a string rather than a dict raises ValueError."""
+        with pytest.raises(ValueError, match="must be a dictionary"):
+            client._validate_messages(["not a dict"])
+
+    def test_missing_role_raises(self, client):
+        """A message dict lacking the 'role' key raises ValueError."""
+        with pytest.raises(ValueError, match="must have 'role' and 'content'"):
+            client._validate_messages([{"content": "hello"}])
+
+    def test_missing_content_raises(self, client):
+        """A message dict lacking the 'content' key raises ValueError."""
+        with pytest.raises(ValueError, match="must have 'role' and 'content'"):
+            client._validate_messages([{"role": "user"}])
+
+    def test_invalid_role_raises(self, client):
+        """A role outside the accepted set raises ValueError."""
+        with pytest.raises(ValueError, match="invalid role"):
+            client._validate_messages([{"role": "invalid", "content": "hi"}])
+
+    def test_valid_roles(self, client):
+        """The system, user and assistant roles are all accepted together."""
+        system_msg = {"role": "system", "content": "sys"}
+        user_msg = {"role": "user", "content": "usr"}
+        assistant_msg = {"role": "assistant", "content": "asst"}
+        # Completing without an exception is the check.
+        client._validate_messages([system_msg, user_msg, assistant_msg])
+
+
+# ---------------------------------------------------------------------------
+# LLMClient._parse_json_response
+# ---------------------------------------------------------------------------
+
+class TestParseJsonResponse:
+    @pytest.fixture
+    def client(self):
+        with patch("src.agent_evaluation.agentic_ops.client.get_llm_client_instance"):
+            return LLMClient(temperature=0.0)
+
+    def test_valid_json(self, client):
+        """A JSON object string is decoded into the matching dict."""
+        payload = '{"key": "value"}'
+        assert client._parse_json_response(payload) == {"key": "value"}
+
+    def test_json_with_markdown_fencing(self, client):
+        """A ```json ... ``` fence around the payload is ignored when decoding."""
+        fenced = '```json\n{"key": "value"}\n```'
+        parsed = client._parse_json_response(fenced)
+        assert parsed == {"key": "value"}
+
+    def test_invalid_json_raises(self, client):
+        """Text that is not valid JSON raises ValueError mentioning 'Invalid JSON'."""
+        with pytest.raises(ValueError, match="Invalid JSON"):
+            client._parse_json_response("not valid json {")
+
+    def test_json_array(self, client):
+        """A top-level JSON array is decoded into a list."""
+        payload = '[1, 2, 3]'
+        assert client._parse_json_response(payload) == [1, 2, 3]
+
+    def test_whitespace_handling(self, client):
+        """Spaces and newlines around the payload do not affect decoding."""
+        padded = '  \n {"a": 1} \n  '
+        assert client._parse_json_response(padded) == {"a": 1}
+
+
+# ---------------------------------------------------------------------------
+# LLMClient.get_llm_raw_response
+# ---------------------------------------------------------------------------
+
+class TestGetLlmRawResponse:
+    @patch("src.agent_evaluation.agentic_ops.client.get_llm_response")
+    def test_builds_messages_correctly(self, mock_get_response):
+        """The system prompt and user input are sent as a system/user message pair."""
+        mock_get_response.return_value = "response text"
+        with patch("src.agent_evaluation.agentic_ops.client.get_llm_client_instance"):
+            llm = LLMClient(temperature=0.5)
+
+        reply = llm.get_llm_raw_response("system prompt", "user input")
+
+        # The stubbed reply must reach the caller unchanged.
+        assert reply == "response text"
+        # The first positional argument of the get_llm_response call is the message list.
+        sent_messages = mock_get_response.call_args.args[0]
+        assert sent_messages[0] == {"role": "system", "content": "system prompt"}
+        assert sent_messages[1] == {"role": "user", "content": "user input"}
+
+
+# ---------------------------------------------------------------------------
+# LLMClient.get_llm_response_json
+# ---------------------------------------------------------------------------
+
+class TestGetLlmResponseJson:
+    @patch("src.agent_evaluation.agentic_ops.client.get_llm_response")
+    def test_returns_parsed_json(self, mock_get_response):
+        """A JSON string from the stubbed LLM call comes back as a Python dict."""
+        # Raw text the stubbed module-level get_llm_response hands back.
+        mock_get_response.return_value = '{"score": 4}'
+        with patch("src.agent_evaluation.agentic_ops.client.get_llm_client_instance"):
+            llm = LLMClient()
+
+        parsed = llm.get_llm_response_json("sys", "usr")
+        assert parsed == {"score": 4}
+
+
+# ---------------------------------------------------------------------------
+# get_llm_response (module-level)
+# ---------------------------------------------------------------------------
+
+class TestGetLlmResponse:
+    @patch("src.agent_evaluation.agentic_ops.client.DEPLOYMENT_NAME", "test-model")
+    @patch("src.agent_evaluation.agentic_ops.client.get_llm_client_instance")
+    def test_successful_response(self, mock_get_client):
+        """The content of the first choice's message is returned."""
+        from src.agent_evaluation.agentic_ops.client import get_llm_response
+
+        # Fake completion object exposing choices[0].message.content == "Hello!".
+        completion = MagicMock(choices=[MagicMock()])
+        completion.choices[0].message.content = "Hello!"
+        backend = MagicMock()
+        backend.chat.completions.create.return_value = completion
+        mock_get_client.return_value = backend
+
+        reply = get_llm_response([{"role": "user", "content": "Hi"}])
+        assert reply == "Hello!"
+
+    @patch("src.agent_evaluation.agentic_ops.client.DEPLOYMENT_NAME", None)
+    def test_raises_on_missing_deployment(self):
+        """With DEPLOYMENT_NAME set to None, a ValueError naming EVAL_AZURE_OPENAI_MODEL is raised."""
+        from src.agent_evaluation.agentic_ops.client import get_llm_response
+        with pytest.raises(ValueError, match="EVAL_AZURE_OPENAI_MODEL"):
+            get_llm_response([{"role": "user", "content": "test"}])
+
+    @patch("src.agent_evaluation.agentic_ops.client.DEFAULT_RETRY_DELAY", 0)
+    @patch("src.agent_evaluation.agentic_ops.client.DEPLOYMENT_NAME", "test-model")
+    @patch("src.agent_evaluation.agentic_ops.client.get_llm_client_instance")
+    def test_retries_on_failure(self, mock_get_client):
+        """Two failing create() calls followed by a good one still yield the successful content."""
+        from src.agent_evaluation.agentic_ops.client import get_llm_response
+
+        final_completion = MagicMock(choices=[MagicMock(message=MagicMock(content="success"))])
+        backend = MagicMock()
+        backend.chat.completions.create.side_effect = [
+            Exception("transient error"),
+            Exception("transient error"),
+            final_completion,
+        ]
+        mock_get_client.return_value = backend
+        reply = get_llm_response([{"role": "user", "content": "Hi"}], max_retries=3)
+        assert reply == "success"
+
+    @patch("src.agent_evaluation.agentic_ops.client.DEFAULT_RETRY_DELAY", 0)
+    @patch("src.agent_evaluation.agentic_ops.client.DEPLOYMENT_NAME", "test-model")
+    @patch("src.agent_evaluation.agentic_ops.client.get_llm_client_instance")
+    def test_raises_after_max_retries(self, mock_get_client):
+        """A create() call that always fails ends in a 'Maximum retries' error."""
+        from src.agent_evaluation.agentic_ops.client import get_llm_response
+
+        backend = MagicMock()
+        backend.chat.completions.create.side_effect = Exception("permanent error")
+        mock_get_client.return_value = backend
+        # DEFAULT_RETRY_DELAY is patched to 0 above, presumably so retries do not sleep.
+        with pytest.raises(Exception, match="Maximum retries"):
+            get_llm_response([{"role": "user", "content": "Hi"}], max_retries=2)
diff --git a/tests/unit/test_eval_factories.py b/tests/unit/test_eval_factories.py
new file mode 100644
index 0000000..dc9c016
--- /dev/null
+++ b/tests/unit/test_eval_factories.py
@@ -0,0 +1,254 @@
+"""Unit tests for eval_factory modules across all evaluation samples."""
+
+import pytest
+
+# ---------------------------------------------------------------------------
+# Agentic Evaluation - EvaluatorFactory
+# ---------------------------------------------------------------------------
+
+class TestAgenticEvaluationFactory:
+    def test_get_relevance_evaluator(self):
+        """'relevance_evaluator' resolves to a factory whose name contains 'Relevance'."""
+        from src.evaluations.offline.agentic_evaluation.eval_factory import \
+            EvaluatorFactory
+        factory = EvaluatorFactory.get_evaluator_factory("relevance_evaluator")
+        assert factory is not None
+        assert "Relevance" in factory.__name__
+
+    def test_get_custom_agents_evaluator(self):
+        """'custom_agents_invoked_evaluator' resolves to EvaluateAgentsInvoked."""
+        from src.evaluations.offline.agentic_evaluation.eval_factory import \
+            EvaluatorFactory
+        factory = EvaluatorFactory.get_evaluator_factory("custom_agents_invoked_evaluator")
+        assert factory is not None
+        assert factory.__name__ == "EvaluateAgentsInvoked"
+
+    def test_get_task_adherence_evaluator(self):
+        """'task_adherence_evaluator' resolves to a factory whose name contains 'TaskAdherence'."""
+        from src.evaluations.offline.agentic_evaluation.eval_factory import \
+            EvaluatorFactory
+        factory = EvaluatorFactory.get_evaluator_factory("task_adherence_evaluator")
+        assert factory is not None
+        assert "TaskAdherence" in factory.__name__
+
+    def test_get_tool_call_accuracy_evaluator(self):
+        """'tool_call_accuracy_evaluator' resolves to a factory whose name contains 'ToolCallAccuracy'."""
+        from src.evaluations.offline.agentic_evaluation.eval_factory import \
+            EvaluatorFactory
+        factory = EvaluatorFactory.get_evaluator_factory("tool_call_accuracy_evaluator")
+        assert factory is not None
+        assert "ToolCallAccuracy" in factory.__name__
+
+    def test_invalid_evaluator_raises(self):
+        """An unregistered name raises ValueError with a 'not found' message."""
+        from src.evaluations.offline.agentic_evaluation.eval_factory import \
+            EvaluatorFactory
+        with pytest.raises(ValueError, match="not found"):
+            EvaluatorFactory.get_evaluator_factory("nonexistent_evaluator")
+
+    def test_all_registered_evaluators_are_callable(self):
+        """Every key in EVALUATOR_FACTORIES resolves to something callable."""
+        from src.evaluations.offline.agentic_evaluation.eval_factory import \
+            EvaluatorFactory
+        for key in EvaluatorFactory.EVALUATOR_FACTORIES:
+            factory = EvaluatorFactory.get_evaluator_factory(key)
+            assert callable(factory), f"{key} factory is not callable"
+
+
+# ---------------------------------------------------------------------------
+# AI Judge Evaluation Custom - EvaluatorFactory
+# ---------------------------------------------------------------------------
+
+class TestAiJudgeEvaluationFactory:
+    def test_get_custom_coherence(self):
+        """'custom_coherence_evaluator' resolves to CoherenceEvaluatorCustom."""
+        from src.evaluations.offline.ai_judge_evaluation_custom.eval_factory import \
+            EvaluatorFactory
+        factory = EvaluatorFactory.get_evaluator_factory("custom_coherence_evaluator")
+        assert factory.__name__ == "CoherenceEvaluatorCustom"
+
+    def test_get_custom_relevance(self):
+        """'custom_relevance_evaluator' resolves to RelevanceEvaluatorCustom."""
+        from src.evaluations.offline.ai_judge_evaluation_custom.eval_factory import \
+            EvaluatorFactory
+        factory = EvaluatorFactory.get_evaluator_factory("custom_relevance_evaluator")
+        assert factory.__name__ == "RelevanceEvaluatorCustom"
+
+    def test_get_custom_fluency(self):
+        """'custom_fluency_evaluator' resolves to FluencyEvaluatorCustom."""
+        from src.evaluations.offline.ai_judge_evaluation_custom.eval_factory import \
+            EvaluatorFactory
+        factory = EvaluatorFactory.get_evaluator_factory("custom_fluency_evaluator")
+        assert factory.__name__ == "FluencyEvaluatorCustom"
+
+    def test_get_custom_similarity(self):
+        """'custom_similarity_evaluator' resolves to SimilarityEvaluatorCustom."""
+        from src.evaluations.offline.ai_judge_evaluation_custom.eval_factory import \
+            EvaluatorFactory
+        factory = EvaluatorFactory.get_evaluator_factory("custom_similarity_evaluator")
+        assert factory.__name__ == "SimilarityEvaluatorCustom"
+
+    def test_get_builtin_relevance(self):
+        """'relevance_evaluator' resolves to a factory whose name contains 'Relevance'."""
+        from src.evaluations.offline.ai_judge_evaluation_custom.eval_factory import \
+            EvaluatorFactory
+        factory = EvaluatorFactory.get_evaluator_factory("relevance_evaluator")
+        assert "Relevance" in factory.__name__
+
+    def test_get_builtin_coherence(self):
+        """'coherence_evaluator' resolves to a factory whose name contains 'Coherence'."""
+        from src.evaluations.offline.ai_judge_evaluation_custom.eval_factory import \
+            EvaluatorFactory
+        factory = EvaluatorFactory.get_evaluator_factory("coherence_evaluator")
+        assert "Coherence" in factory.__name__
+
+    def test_invalid_evaluator_raises(self):
+        """An unregistered name raises ValueError with a 'not found' message."""
+        from src.evaluations.offline.ai_judge_evaluation_custom.eval_factory import \
+            EvaluatorFactory
+        with pytest.raises(ValueError, match="not found"):
+            EvaluatorFactory.get_evaluator_factory("invalid_evaluator")
+
+    def test_all_registered_evaluators_are_callable(self):
+        """Every key in EVALUATOR_FACTORIES resolves to something callable."""
+        from src.evaluations.offline.ai_judge_evaluation_custom.eval_factory import \
+            EvaluatorFactory
+        for key in EvaluatorFactory.EVALUATOR_FACTORIES:
+            factory = EvaluatorFactory.get_evaluator_factory(key)
+            assert callable(factory), f"{key} factory is not callable"
+
+
+# ---------------------------------------------------------------------------
+# Pipeline Experiment Evaluation - EvaluatorFactory
+# ---------------------------------------------------------------------------
+
+class TestPipelineExperimentFactory:
+    def test_get_relevance(self):
+        """'relevance_evaluator' resolves to a factory whose name contains 'Relevance'."""
+        from src.evaluations.offline.pipeline_experiment_evaluation.eval_factory import \
+            EvaluatorFactory
+        factory = EvaluatorFactory.get_evaluator_factory("relevance_evaluator")
+        assert "Relevance" in factory.__name__
+
+    def test_get_task_adherence(self):
+        """'task_adherence_evaluator' resolves to a factory whose name contains 'TaskAdherence'."""
+        from src.evaluations.offline.pipeline_experiment_evaluation.eval_factory import \
+            EvaluatorFactory
+        factory = EvaluatorFactory.get_evaluator_factory("task_adherence_evaluator")
+        assert "TaskAdherence" in factory.__name__
+
+    def test_get_tool_call_accuracy(self):
+        """'tool_call_accuracy_evaluator' resolves to a factory whose name contains 'ToolCallAccuracy'."""
+        from src.evaluations.offline.pipeline_experiment_evaluation.eval_factory import \
+            EvaluatorFactory
+        factory = EvaluatorFactory.get_evaluator_factory("tool_call_accuracy_evaluator")
+        assert "ToolCallAccuracy" in factory.__name__
+
+    def test_invalid_raises(self):
+        """An unregistered name raises ValueError with a 'not found' message."""
+        from src.evaluations.offline.pipeline_experiment_evaluation.eval_factory import \
+            EvaluatorFactory
+        with pytest.raises(ValueError, match="not found"):
+            EvaluatorFactory.get_evaluator_factory("does_not_exist")
+
+
+# ---------------------------------------------------------------------------
+# Pipeline Multi-Agent Evaluation - EvaluatorFactory
+# ---------------------------------------------------------------------------
+
+class TestPipelineMultiAgentFactory:
+    def test_get_relevance(self):
+        """'relevance_evaluator' resolves to a factory whose name contains 'Relevance'."""
+        from src.evaluations.offline.pipeline_multi_agent_evaluation.eval_factory import \
+            EvaluatorFactory
+        factory = EvaluatorFactory.get_evaluator_factory("relevance_evaluator")
+        assert "Relevance" in factory.__name__
+
+    def test_get_task_adherence(self):
+        """'task_adherence_evaluator' resolves to a factory whose name contains 'TaskAdherence'."""
+        from src.evaluations.offline.pipeline_multi_agent_evaluation.eval_factory import \
+            EvaluatorFactory
+        factory = EvaluatorFactory.get_evaluator_factory("task_adherence_evaluator")
+        assert "TaskAdherence" in factory.__name__
+
+    def test_get_agents_invoked(self):
+        """'evaluate_agents_invoked' resolves to EvaluateAgentsInvoked."""
+        from src.evaluations.offline.pipeline_multi_agent_evaluation.eval_factory import \
+            EvaluatorFactory
+        factory = EvaluatorFactory.get_evaluator_factory("evaluate_agents_invoked")
+        assert factory.__name__ == "EvaluateAgentsInvoked"
+
+    def test_get_custom_agents_invoked(self):
+        """'custom_agents_invoked_accuracy_eval' also resolves to EvaluateAgentsInvoked."""
+        from src.evaluations.offline.pipeline_multi_agent_evaluation.eval_factory import \
+            EvaluatorFactory
+        factory = EvaluatorFactory.get_evaluator_factory("custom_agents_invoked_accuracy_eval")
+        assert factory.__name__ == "EvaluateAgentsInvoked"
+
+    def test_invalid_raises(self):
+        """An unregistered name raises ValueError with a 'not found' message."""
+        from src.evaluations.offline.pipeline_multi_agent_evaluation.eval_factory import \
+            EvaluatorFactory
+        with pytest.raises(ValueError, match="not found"):
+            EvaluatorFactory.get_evaluator_factory("bogus")
+
+
+# ---------------------------------------------------------------------------
+# Pipeline Multi-Tool Agent Evaluation - EvaluatorFactory
+# ---------------------------------------------------------------------------
+
+class TestPipelineMultiToolFactory:
+    def test_get_relevance(self):
+        """'relevance_evaluator' resolves to a factory whose name contains 'Relevance'."""
+        from src.evaluations.offline.pipeline_multi_tool_agent_evaluation.eval_factory import \
+            EvaluatorFactory
+        factory = EvaluatorFactory.get_evaluator_factory("relevance_evaluator")
+        assert "Relevance" in factory.__name__
+
+    def test_get_task_adherence(self):
+        """'task_adherence_evaluator' resolves to a factory whose name contains 'TaskAdherence'."""
+        from src.evaluations.offline.pipeline_multi_tool_agent_evaluation.eval_factory import \
+            EvaluatorFactory
+        factory = EvaluatorFactory.get_evaluator_factory("task_adherence_evaluator")
+        assert "TaskAdherence" in factory.__name__
+
+    def test_get_tool_call_accuracy(self):
+        """'tool_call_accuracy_evaluator' resolves to a factory whose name contains 'ToolCallAccuracy'."""
+        from src.evaluations.offline.pipeline_multi_tool_agent_evaluation.eval_factory import \
+            EvaluatorFactory
+        factory = EvaluatorFactory.get_evaluator_factory("tool_call_accuracy_evaluator")
+        assert "ToolCallAccuracy" in factory.__name__
+
+    def test_invalid_raises(self):
+        """An unregistered name raises ValueError with a 'not found' message."""
+        from src.evaluations.offline.pipeline_multi_tool_agent_evaluation.eval_factory import \
+            EvaluatorFactory
+        with pytest.raises(ValueError, match="not found"):
+            EvaluatorFactory.get_evaluator_factory("unknown")
+
+
+# ---------------------------------------------------------------------------
+# RAG Evaluation Foundry - EvaluatorFactory
+# ---------------------------------------------------------------------------
+
+class TestRagEvaluationFoundryFactory:
+    def test_get_relevance(self):
+        """'relevance_evaluator' resolves to a factory whose name contains 'Relevance'."""
+        from src.evaluations.offline.rag_evaluation_foundry.eval_factory import \
+            EvaluatorFactory
+        factory = EvaluatorFactory.get_evaluator_factory("relevance_evaluator")
+        assert "Relevance" in factory.__name__
+
+    def test_invalid_raises(self):
+        """An unregistered name raises ValueError with a 'not found' message."""
+        from src.evaluations.offline.rag_evaluation_foundry.eval_factory import \
+            EvaluatorFactory
+        with pytest.raises(ValueError, match="not found"):
+            EvaluatorFactory.get_evaluator_factory("missing")
+
+    def test_only_has_relevance(self):
+        """EVALUATOR_FACTORIES holds exactly one entry, keyed 'relevance_evaluator'."""
+        from src.evaluations.offline.rag_evaluation_foundry.eval_factory import \
+            EvaluatorFactory
+        assert "relevance_evaluator" in EvaluatorFactory.EVALUATOR_FACTORIES
+        assert len(EvaluatorFactory.EVALUATOR_FACTORIES) == 1
diff --git a/tests/unit/test_evaluation_utils.py b/tests/unit/test_evaluation_utils.py
new file mode 100644
index 0000000..f28a6c7
--- /dev/null
+++ b/tests/unit/test_evaluation_utils.py
@@ -0,0 +1,130 @@
+"""Unit tests for the evaluation_utils helpers and the EvaluateAgentsInvoked evaluator."""
+
+import pytest
+
+from src.evaluations.offline.pipeline_multi_agent_evaluation.evaluator.evaluator_repo.eval_utils.evaluation_utils import (
+    agent_invoked_accuracy, calculate_match_percentage)
+
+# ---------------------------------------------------------------------------
+# agent_invoked_accuracy
+# ---------------------------------------------------------------------------
+
+class TestAgentInvokedAccuracy:
+    def test_exact_match(self):
+        """Identical predicted and expected lists return True."""
+        assert agent_invoked_accuracy(["AgentA", "AgentB"], ["AgentA", "AgentB"]) is True
+
+    def test_same_agents_different_order(self):
+        """Order is ignored: the same agents listed in a different order still return True."""
+        assert agent_invoked_accuracy(["AgentB", "AgentA"], ["AgentA", "AgentB"]) is True
+
+    def test_missing_agent(self):
+        """A prediction (first argument) lacking one of the expected agents returns False."""
+        assert agent_invoked_accuracy(["AgentA"], ["AgentA", "AgentB"]) is False
+
+    def test_extra_agent(self):
+        """A prediction containing an agent that was not expected returns False."""
+        assert agent_invoked_accuracy(["AgentA", "AgentB", "AgentC"], ["AgentA", "AgentB"]) is False
+
+    def test_empty_both(self):
+        """Two empty lists count as a match and return True."""
+        assert agent_invoked_accuracy([], []) is True
+
+    def test_empty_predicted(self):
+        """An empty prediction against a non-empty expectation returns False."""
+        assert agent_invoked_accuracy([], ["AgentA"]) is False
+
+    def test_empty_expected(self):
+        """A non-empty prediction against an empty expectation returns False."""
+        assert agent_invoked_accuracy(["AgentA"], []) is False
+
+    def test_single_agent_match(self):
+        """The same single agent on both sides returns True."""
+        assert agent_invoked_accuracy(["AgentA"], ["AgentA"]) is True
+
+    def test_duplicate_agents(self):
+        """Repeating an agent in the prediction does not break the match."""
+        assert agent_invoked_accuracy(["AgentA", "AgentA"], ["AgentA"]) is True
+
+
+# ---------------------------------------------------------------------------
+# calculate_match_percentage
+# ---------------------------------------------------------------------------
+
+class TestCalculateMatchPercentage:
+    def test_full_match(self):
+        """Every expected item (first argument) appears in the prediction, giving 1.0."""
+        assert calculate_match_percentage(["A", "B"], ["A", "B"]) == 1.0
+
+    def test_partial_match(self):
+        """One of the two expected items is predicted, giving 0.5."""
+        assert calculate_match_percentage(["A", "B"], ["A"]) == 0.5
+
+    def test_no_match(self):
+        """Disjoint expected and predicted lists give 0.0."""
+        assert calculate_match_percentage(["A", "B"], ["C", "D"]) == 0.0
+
+    def test_empty_expected(self):
+        """An empty expected list gives 0.0 instead of failing with a division by zero."""
+        assert calculate_match_percentage([], ["A"]) == 0.0
+
+    def test_empty_predicted(self):
+        """An empty prediction against non-empty expectations gives 0.0."""
+        assert calculate_match_percentage(["A", "B"], []) == 0.0
+
+    def test_extra_predicted_agents(self):
+        """Predicted items beyond the expected ones do not lower the result."""
+        assert calculate_match_percentage(["A"], ["A", "B", "C"]) == 1.0
+
+    def test_three_of_four(self):
+        """Three of the four expected items are predicted, giving 0.75."""
+        assert calculate_match_percentage(["A", "B", "C", "D"], ["A", "B", "C"]) == 0.75
+
+
+# ---------------------------------------------------------------------------
+# EvaluateAgentsInvoked
+# ---------------------------------------------------------------------------
+
+class TestEvaluateAgentsInvoked:
+    @pytest.fixture
+    def evaluator(self):
+        from src.evaluations.offline.pipeline_multi_agent_evaluation.evaluator.evaluator_repo.evaluate_agent_invoked import \
+            EvaluateAgentsInvoked
+        return EvaluateAgentsInvoked()
+
+    def test_exact_match_returns_accuracy_1(self, evaluator):
+        """Identical expected and predicted agents give accuracy 1.0, exact match and 1.0 percentage."""
+        expected = ["ACAgent", "TVAgent"]
+        predicted = ["ACAgent", "TVAgent"]
+        # The evaluator is invoked as a callable with both lists passed by keyword.
+        scores = evaluator(expected_agents_to_invoke=expected, predicted_agents_to_invoke=predicted)
+        assert scores["agents_invoke_accuracy"] == 1.0
+        assert scores["agents_invoke_exact_match"] is True
+        assert scores["agents_invoke_match_percentage"] == 1.0
+
+    def test_orchestrator_filtered(self, evaluator):
+        """A predicted 'OrchestratorAgent' entry does not prevent an exact match."""
+        expected = ["ACAgent"]
+        predicted = ["OrchestratorAgent", "ACAgent"]
+        # "OrchestratorAgent" is not in the expected list, yet the match must still be exact.
+        scores = evaluator(expected_agents_to_invoke=expected, predicted_agents_to_invoke=predicted)
+        assert scores["agents_invoke_accuracy"] == 1.0
+        assert scores["agents_invoke_exact_match"] is True
+
+    def test_mismatch(self, evaluator):
+        """A prediction sharing no agent with the expectation gives accuracy 0.0."""
+        expected = ["ACAgent", "TVAgent"]
+        predicted = ["DishwasherAgent"]
+        # Neither expected agent appears in the prediction.
+        scores = evaluator(expected_agents_to_invoke=expected, predicted_agents_to_invoke=predicted)
+        assert scores["agents_invoke_accuracy"] == 0.0
+        assert scores["agents_invoke_exact_match"] is False
+
+    def test_partial_match_percentage(self, evaluator):
+        """Predicting one of two expected agents gives a 0.5 match percentage and no exact match."""
+        expected = ["ACAgent", "TVAgent"]
+        predicted = ["ACAgent"]
+        # Only "ACAgent" overlaps, i.e. half of the expected agents.
+        scores = evaluator(expected_agents_to_invoke=expected, predicted_agents_to_invoke=predicted)
+        assert scores["agents_invoke_match_percentage"] == 0.5
+        assert scores["agents_invoke_exact_match"] is False
diff --git a/tests/unit/test_run_eval.py b/tests/unit/test_run_eval.py
new file mode 100644
index 0000000..1455a5f
--- /dev/null
+++ b/tests/unit/test_run_eval.py
@@ -0,0 +1,152 @@
+"""Unit tests for run_eval module (src/agent_evaluation/agentic_ops/run_eval.py)."""
+
+from unittest.mock import MagicMock, patch
+
+from src.agent_evaluation.agentic_ops.run_eval import (setup_evaluation,
+                                                       should_pass_config)
+
+# ---------------------------------------------------------------------------
+# should_pass_config
+# ---------------------------------------------------------------------------
+
+class TestShouldPassConfig:
+    def test_function_with_required_arg(self):
+        """A callable with one required positional parameter yields True."""
+        def needs_config(config):
+            pass
+        assert should_pass_config(needs_config) is True
+
+    def test_function_no_args(self):
+        """A callable declaring no parameters at all yields False."""
+        def takes_nothing():
+            pass
+        assert should_pass_config(takes_nothing) is False
+
+    def test_function_only_defaults(self):
+        """A callable whose parameters all carry defaults yields False."""
+        def all_optional(x=10, y=20):
+            pass
+        assert should_pass_config(all_optional) is False
+
+    def test_function_with_kwargs_only(self):
+        """A callable declaring only **kwargs yields False."""
+        def kwargs_sink(**kwargs):
+            pass
+        assert should_pass_config(kwargs_sink) is False
+
+    def test_function_keyword_only_required(self):
+        """A callable with a required keyword-only parameter yields True."""
+        def needs_config_kw(*, config):
+            pass
+        assert should_pass_config(needs_config_kw) is True
+
+
+# ---------------------------------------------------------------------------
+# setup_evaluation
+# ---------------------------------------------------------------------------
+
+class TestSetupEvaluation:
+    """Tests for setup_evaluation driven by stub evaluator factories.
+
+    Each test hands setup_evaluation a MagicMock registry whose
+    get_evaluator_factory always returns a locally defined factory function.
+    """
+
+    # Azure OpenAI settings patched into os.environ around every call;
+    # presumably read by setup_evaluation when building the model config.
+    EVAL_ENV = {
+        "EVAL_AZURE_OPENAI_ENDPOINT": "https://test.openai.azure.com",
+        "EVAL_AZURE_OPENAI_MODEL": "gpt-4",
+        "EVAL_AZURE_OPENAI_VERSION": "2024-01-01",
+    }
+
+    @staticmethod
+    def _factory_lookup(factory_func):
+        """Return a stand-in registry whose get_evaluator_factory yields factory_func."""
+        lookup = MagicMock()
+        lookup.get_evaluator_factory.return_value = factory_func
+        return lookup
+
+    def test_setup_with_model_config_param(self):
+        """A factory declaring a model_config parameter is built and returned under its key."""
+        mock_evaluator = MagicMock()
+
+        def factory_func(model_config=None):
+            return mock_evaluator
+
+        mock_eval_factory = self._factory_lookup(factory_func)
+        config = {
+            "evaluators": {"test_eval": "test_factory"},
+            "evaluator_config": {},
+        }
+
+        with patch.dict("os.environ", self.EVAL_ENV):
+            evaluators, _ = setup_evaluation(config, mock_eval_factory)
+
+        # NOTE(review): this only checks which evaluator came back; the value
+        # received as model_config is not asserted.
+        assert "test_eval" in evaluators
+        assert evaluators["test_eval"] == mock_evaluator
+
+    def test_setup_with_no_params_factory(self):
+        """A factory declaring no parameters is still built and returned under its key."""
+        mock_evaluator = MagicMock()
+
+        # A zero-parameter factory: any argument passed to it would raise TypeError.
+        def factory_func():
+            return mock_evaluator
+
+        mock_eval_factory = self._factory_lookup(factory_func)
+        config = {
+            "evaluators": {"simple_eval": "simple_factory"},
+            "evaluator_config": {},
+        }
+
+        with patch.dict("os.environ", self.EVAL_ENV):
+            evaluators, _ = setup_evaluation(config, mock_eval_factory)
+
+        assert "simple_eval" in evaluators
+        assert evaluators["simple_eval"] == mock_evaluator
+
+    def test_setup_with_azure_ai_project_param(self):
+        """A factory declaring an azure_ai_project parameter is built when a project is supplied."""
+        mock_evaluator = MagicMock()
+
+        def factory_func(azure_ai_project=None):
+            return mock_evaluator
+
+        mock_eval_factory = self._factory_lookup(factory_func)
+        mock_project = MagicMock()
+        config = {
+            "evaluators": {"proj_eval": "proj_factory"},
+            "evaluator_config": {},
+        }
+
+        with patch.dict("os.environ", self.EVAL_ENV):
+            evaluators, _ = setup_evaluation(config, mock_eval_factory, azure_ai_project=mock_project)
+
+        # NOTE(review): only the presence of the key is checked; that mock_project
+        # actually reached factory_func is not asserted.
+        assert "proj_eval" in evaluators
+
+    def test_setup_resolves_column_mapping_placeholder(self):
+        """The 'use_column_mapping' placeholder is replaced by the top-level column_mapping."""
+        mock_evaluator = MagicMock()
+
+        def factory_func():
+            return mock_evaluator
+
+        mock_eval_factory = self._factory_lookup(factory_func)
+        config = {
+            "evaluators": {"test_eval": "test_factory"},
+            "column_mapping": {"query": "${data.query}"},
+            "evaluator_config": {
+                "test_eval": {"column_mapping": "use_column_mapping"},
+            },
+        }
+
+        with patch.dict("os.environ", self.EVAL_ENV):
+            _, evaluator_config = setup_evaluation(config, mock_eval_factory)
+
+        # The placeholder string must have been swapped for config["column_mapping"].
+        assert evaluator_config["test_eval"]["column_mapping"] == {"query": "${data.query}"}
diff --git a/tests/unit/test_runner.py b/tests/unit/test_runner.py
new file mode 100644
index 0000000..bb6287d
--- /dev/null
+++ b/tests/unit/test_runner.py
@@ -0,0 +1,99 @@
+"""Unit tests for the pipeline runner (src/agent_evaluation/agentic_ops/runner.py)."""
+
+import argparse
+from unittest.mock import patch
+
+import pytest
+import yaml
+
+from src.agent_evaluation.agentic_ops.runner import load_config, parse_args
+
+# ---------------------------------------------------------------------------
+# load_config
+# ---------------------------------------------------------------------------
+
+class TestLoadConfig:
+    def test_loads_valid_yaml(self, tmp_path):
+        """Should load and return config dict from valid YAML."""
+        config_data = {"app_name": "TestApp", "pipeline": []}
+        config_file = tmp_path / "experiment.yaml"
+        config_file.write_text(yaml.dump(config_data))
+
+        result = load_config(config_file)
+        assert result == config_data
+
+    def test_raises_on_missing_file(self, tmp_path):
+        """Should raise FileNotFoundError for missing config."""
+        missing = tmp_path / "missing.yaml"
+        with pytest.raises(FileNotFoundError):
+            load_config(missing)
+
+    def test_returns_none_for_empty_yaml(self, tmp_path):
+        """Empty YAML file should return None."""
+        config_file = tmp_path / "empty.yaml"
+        config_file.write_text("")
+        result = load_config(config_file)
+        assert result is None
+
+
+# ---------------------------------------------------------------------------
+# parse_args
+# ---------------------------------------------------------------------------
+
+class TestParseArgs:
+    def test_defaults(self):
+        """Default args should have expected values."""
+        with patch("sys.argv", ["runner"]):
+            args = parse_args()
+        assert args.config_file == "experiment.yaml"
+        assert args.index_fname is None
+        assert args.sample == 0
+
+    def test_custom_config_file(self):
+        """Should accept --config_file argument."""
+        with patch("sys.argv", ["runner", "--config_file", "custom.yaml"]):
+            args = parse_args()
+        assert args.config_file == "custom.yaml"
+
+    def test_sample_arg(self):
+        """Should accept --sample argument."""
+        with patch("sys.argv", ["runner", "--sample", "10"]):
+            args = parse_args()
+        assert args.sample == 10
+
+    def test_index_fname_arg(self):
+        """Should accept --index_fname argument."""
+        with patch("sys.argv", ["runner", "--index_fname", "file_001"]):
+            args = parse_args()
+        assert args.index_fname == "file_001"
+
+
+# ---------------------------------------------------------------------------
+# run_pipeline
+# ---------------------------------------------------------------------------
+
+class TestRunPipeline:
+    @patch("src.agent_evaluation.agentic_ops.runner.importlib.import_module")
+    @patch("src.agent_evaluation.agentic_ops.runner.load_config")
+    def test_run_pipeline_exits_on_invalid_step(self, mock_load_config, mock_import, tmp_path):
+        """Pipeline with an invalid step (missing base_path/module) should exit."""
+        from src.agent_evaluation.agentic_ops.runner import run_pipeline
+
+        mock_load_config.return_value = {
+            "experiment_name": "test",
+            "pipeline": [{"config_key": "evaluation"}],  # missing base_path and module
+            "evaluation": {},
+        }
+
+        with pytest.raises(SystemExit):
+            run_pipeline("test/experiment.yaml", argparse.Namespace(sample=0, index_fname=None))
+
+    @patch("src.agent_evaluation.agentic_ops.runner.load_config")
+    def test_run_pipeline_exits_on_empty_pipeline(self, mock_load_config, tmp_path):
+        """Pipeline with no steps should exit."""
+        from src.agent_evaluation.agentic_ops.runner import run_pipeline
+
+        mock_load_config.return_value = {"pipeline": []}
+
+        with pytest.raises(SystemExit):
+            run_pipeline("test/experiment.yaml", argparse.Namespace(sample=0, index_fname=None))
diff --git a/tests/unit/test_trace_to_jsonl.py b/tests/unit/test_trace_to_jsonl.py
new file mode 100644
index 0000000..596a9a4
--- /dev/null
+++ b/tests/unit/test_trace_to_jsonl.py
@@ -0,0 +1,138 @@
+"""Unit tests for trace_to_jsonl shared module."""
+
+import json
+
+from src.evaluations.offline.utils.trace_to_jsonl import (
+    extract_tool_call_from_span, extract_tool_definitions,
+    merge_tool_definitions)
+
+# ---------------------------------------------------------------------------
+# extract_tool_definitions
+# ---------------------------------------------------------------------------
+
+class TestExtractToolDefinitions:
+    def test_extracts_function_tools(self):
+        """Should extract function-type tool definitions."""
+        tool_defs = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "get_weather",
+                    "description": "Get weather data",
+                    "parameters": {"type": "object"},
+                },
+            }
+        ]
+        custom_dims = {"gen_ai.tool.definitions": json.dumps(tool_defs)}
+
+        result = extract_tool_definitions(custom_dims)
+        assert len(result) == 1
+        assert result[0]["name"] == "get_weather"
+        assert result[0]["description"] == "Get weather data"
+
+    def test_empty_string(self):
+        """Empty tool definitions string should return empty list."""
+        result = extract_tool_definitions({"gen_ai.tool.definitions": ""})
+        assert result == []
+
+    def test_missing_key(self):
+        """Missing key should return empty list."""
+        result = extract_tool_definitions({})
+        assert result == []
+
+    def test_invalid_json(self):
+        """Invalid JSON should return empty list."""
+        result = extract_tool_definitions({"gen_ai.tool.definitions": "not json"})
+        assert result == []
+
+    def test_non_list_json(self):
+        """Non-list JSON should return empty list."""
+        result = extract_tool_definitions({"gen_ai.tool.definitions": '{"key": "value"}'})
+        assert result == []
+
+    def test_skips_non_function_type(self):
+        """Non-function type tools should be skipped."""
+        tool_defs = [{"type": "retrieval", "name": "search"}]
+        custom_dims = {"gen_ai.tool.definitions": json.dumps(tool_defs)}
+
+        result = extract_tool_definitions(custom_dims)
+        assert result == []
+
+    def test_multiple_tools(self):
+        """Multiple function tools should all be extracted."""
+        tool_defs = [
+            {"type": "function", "function": {"name": "tool_a", "description": "A", "parameters": {}}},
+            {"type": "function", "function": {"name": "tool_b", "description": "B", "parameters": {}}},
+        ]
+        custom_dims = {"gen_ai.tool.definitions": json.dumps(tool_defs)}
+
+        result = extract_tool_definitions(custom_dims)
+        assert len(result) == 2
+        assert result[0]["name"] == "tool_a"
+        assert result[1]["name"] == "tool_b"
+
+
+# ---------------------------------------------------------------------------
+# merge_tool_definitions
+# ---------------------------------------------------------------------------
+
+class TestMergeToolDefinitions:
+    def test_merge_new_tools(self):
+        """New tools should be added."""
+        existing = [{"name": "tool_a", "id": "tool_a"}]
+        new = [{"name": "tool_b", "id": "tool_b"}]
+
+        result = merge_tool_definitions(existing, new)
+        names = {t["name"] for t in result}
+        assert names == {"tool_a", "tool_b"}
+
+    def test_deduplicates_by_name(self):
+        """Duplicate names should not be added."""
+        existing = [{"name": "tool_a", "id": "1", "description": "first"}]
+        new = [{"name": "tool_a", "id": "2", "description": "second"}]
+
+        result = merge_tool_definitions(existing, new)
+        assert len(result) == 1
+        assert result[0]["description"] == "first"  # Keeps existing
+
+    def test_empty_new(self):
+        """Empty new list should return existing unchanged."""
+        existing = [{"name": "tool_a"}]
+        result = merge_tool_definitions(existing, [])
+        assert result == existing
+
+    def test_empty_existing(self):
+        """Empty existing should return new tools."""
+        new = [{"name": "tool_a"}, {"name": "tool_b"}]
+        result = merge_tool_definitions([], new)
+        assert len(result) == 2
+
+    def test_both_empty(self):
+        """Both empty should return empty list."""
+        result = merge_tool_definitions([], [])
+        assert result == []
+
+
+# ---------------------------------------------------------------------------
+# extract_tool_call_from_span
+# ---------------------------------------------------------------------------
+
+class TestExtractToolCallFromSpan:
+    def test_extracts_from_operation_name(self):
+        """Should extract tool name from 'execute_tool <name>' format."""
+        result = extract_tool_call_from_span({}, "execute_tool get_weather")
+        assert result["type"] == "tool_call"
+        assert result["name"] == "get_weather"
+
+    def test_falls_back_to_custom_dims(self):
+        """Should fall back to gen_ai.tool.name from custom dims."""
+        custom_dims = {"gen_ai.tool.name": "search_tool"}
+        result = extract_tool_call_from_span(custom_dims, "some_other_operation")
+        assert result["type"] == "tool_call"
+        assert result["name"] == "search_tool"
+
+    def test_empty_name(self):
+        """Should handle missing tool name gracefully."""
+        result = extract_tool_call_from_span({}, "other_span")
+        assert result["type"] == "tool_call"
+        assert result["name"] == ""
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
new file mode 100644
index 0000000..81306b0
--- /dev/null
+++ b/tests/unit/test_utils.py
@@ -0,0 +1,151 @@
+"""Unit tests for utility modules (file_operations, constants)."""
+
+import json
+
+from src.evaluations.offline.utils.constants import EVAL_NAME
+from src.evaluations.offline.utils.file_operations import (
+    append_to_jsonl, get_next_run_id, load_queries_from_jsonl, save_to_jsonl)
+
+# ---------------------------------------------------------------------------
+# constants
+# ---------------------------------------------------------------------------
+
+class TestConstants:
+    def test_eval_name_value(self):
+        """EVAL_NAME is pinned to the literal string 'experiment_name'."""
+        assert EVAL_NAME == "experiment_name"
+
+
+# ---------------------------------------------------------------------------
+# load_queries_from_jsonl
+# ---------------------------------------------------------------------------
+
+class TestLoadQueriesFromJsonl:
+    def test_loads_valid_jsonl(self, tmp_path):
+        """Should load all lines from valid JSONL file."""
+        data = [{"query": "q1"}, {"query": "q2"}, {"query": "q3"}]
+        f = tmp_path / "test.jsonl"
+        f.write_text("\n".join(json.dumps(d) for d in data))
+
+        result = load_queries_from_jsonl(str(f))
+        assert result == data
+
+    def test_skips_blank_lines(self, tmp_path):
+        """Blank lines should be skipped."""
+        f = tmp_path / "test.jsonl"
+        f.write_text('{"a":1}\n\n{"b":2}\n\n')
+
+        result = load_queries_from_jsonl(str(f))
+        assert len(result) == 2
+
+    def test_empty_file(self, tmp_path):
+        """Empty file should return empty list."""
+        f = tmp_path / "empty.jsonl"
+        f.write_text("")
+
+        result = load_queries_from_jsonl(str(f))
+        assert result == []
+
+    def test_preserves_unicode(self, tmp_path):
+        """Unicode content should be preserved."""
+        data = [{"query": "Hello world"}]
+        f = tmp_path / "unicode.jsonl"
+        f.write_text(json.dumps(data[0], ensure_ascii=True) + "\n")
+
+        result = load_queries_from_jsonl(str(f))
+        assert result[0]["query"] == "Hello world"
+
+
+# ---------------------------------------------------------------------------
+# save_to_jsonl
+# ---------------------------------------------------------------------------
+
+class TestSaveToJsonl:
+    def test_saves_data(self, tmp_path):
+        """Should save list of dicts to JSONL."""
+        data = [{"a": 1}, {"b": 2}]
+        f = tmp_path / "out.jsonl"
+
+        save_to_jsonl(str(f), data)
+
+        lines = f.read_text().strip().split("\n")
+        assert len(lines) == 2
+        assert json.loads(lines[0]) == {"a": 1}
+        assert json.loads(lines[1]) == {"b": 2}
+
+    def test_empty_list(self, tmp_path):
+        """Empty list should create empty file."""
+        f = tmp_path / "empty.jsonl"
+        save_to_jsonl(str(f), [])
+        assert f.read_text() == ""
+
+    def test_overwrites_existing(self, tmp_path):
+        """Should overwrite existing file."""
+        f = tmp_path / "overwrite.jsonl"
+        f.write_text("old content")
+
+        save_to_jsonl(str(f), [{"new": True}])
+        lines = f.read_text().strip().split("\n")
+        assert json.loads(lines[0]) == {"new": True}
+
+
+# ---------------------------------------------------------------------------
+# append_to_jsonl
+# ---------------------------------------------------------------------------
+
+class TestAppendToJsonl:
+    def test_appends_single_record(self, tmp_path):
+        """Should append one record to file."""
+        f = tmp_path / "append.jsonl"
+        f.write_text('{"a":1}\n')
+
+        append_to_jsonl(str(f), {"b": 2})
+
+        lines = f.read_text().strip().split("\n")
+        assert len(lines) == 2
+        assert json.loads(lines[1]) == {"b": 2}
+
+    def test_creates_file_if_missing(self, tmp_path):
+        """Should create file if it doesn't exist."""
+        f = tmp_path / "new.jsonl"
+        append_to_jsonl(str(f), {"first": True})
+
+        assert f.exists()
+        assert json.loads(f.read_text().strip()) == {"first": True}
+
+
+# ---------------------------------------------------------------------------
+# get_next_run_id
+# ---------------------------------------------------------------------------
+
+class TestGetNextRunId:
+    def test_empty_directory(self, tmp_path):
+        """Empty dir should return 1."""
+        assert get_next_run_id(str(tmp_path)) == 1
+
+    def test_nonexistent_directory(self, tmp_path):
+        """Non-existent dir should return 1."""
+        assert get_next_run_id(str(tmp_path / "nonexistent")) == 1
+
+    def test_sequential_numbering(self, tmp_path):
+        """Should return one more than the highest existing number."""
+        (tmp_path / "1_eval_result.json").write_text("{}")
+        (tmp_path / "2_eval_result.json").write_text("{}")
+        (tmp_path / "3_eval_result.json").write_text("{}")
+
+        assert get_next_run_id(str(tmp_path)) == 4
+
+    def test_ignores_non_matching_files(self, tmp_path):
+        """Files not matching the pattern should be ignored."""
+        (tmp_path / "2_eval_result.json").write_text("{}")
+        (tmp_path / "readme.md").write_text("# Hi")
+        (tmp_path / "config.yaml").write_text("")
+
+        assert get_next_run_id(str(tmp_path)) == 3
+
+    def test_handles_gaps(self, tmp_path):
+        """Should use the max, not count."""
+        (tmp_path / "1_a.json").write_text("{}")
+        (tmp_path / "5_b.json").write_text("{}")
+
+        assert get_next_run_id(str(tmp_path)) == 6