diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
new file mode 100644
index 0000000..986f4fb
--- /dev/null
+++ b/.github/workflows/codeql.yml
@@ -0,0 +1,34 @@
+name: "CodeQL"
+
+on:  # triggers: pushes and pull requests targeting main, plus a weekly schedule
+ push:
+ branches: [main]
+ pull_request:
+ branches: [main]
+ schedule:
+ - cron: "25 14 * * 1"  # minute 25, hour 14, day-of-week 1 (Monday)
+
+jobs:
+ analyze:
+ name: Analyze (Python)
+ runs-on: ubuntu-latest
+ permissions:  # token scopes declared for this job
+ security-events: write  # the only write scope; presumably used to upload analysis results - confirm
+ packages: read
+ actions: read
+ contents: read
+
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+
+ - name: Initialize CodeQL
+ uses: github/codeql-action/init@v3
+ with:
+ languages: python  # only Python is analyzed by this workflow
+ queries: security-and-quality  # query suite name passed to the init action
+
+ - name: Perform CodeQL Analysis
+ uses: github/codeql-action/analyze@v3
+ with:
+ category: "/language:python"  # mirrors the single language configured in the init step
diff --git a/.github/workflows/pr-checks.yml b/.github/workflows/pr-checks.yml
new file mode 100644
index 0000000..df39cd3
--- /dev/null
+++ b/.github/workflows/pr-checks.yml
@@ -0,0 +1,53 @@
+# Lint, import-order, and unit-test gate for pull requests and pushes to main.
+name: PR Checks
+
+on:
+  pull_request:
+    branches: [main]
+  push:
+    branches: [main]
+
+# Least privilege: these jobs only need to read the repository contents.
+permissions:
+  contents: read
+
+jobs:
+  checks:
+    runs-on: ubuntu-latest
+    strategy:
+      # Let every Python version finish so one failure does not hide the others.
+      fail-fast: false
+      matrix:
+        python-version: ["3.11", "3.12", "3.13"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: pip
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install flake8 isort
+
+      - name: Lint with flake8
+        run: |
+          # Hard failures: syntax errors and undefined names fail the build.
+          flake8 src/ tests/ --count --select=E9,F63,F7,F82 --show-source --statistics
+          # Advisory pass: --exit-zero reports remaining findings without failing.
+          flake8 src/ tests/ --count --max-line-length=120 --statistics --exit-zero
+
+      - name: Check import ordering with isort
+        run: |
+          isort --check-only --diff src/ tests/
+
+      - name: Run unit tests
+        run: |
+          # -v and --tb=short come from addopts in pytest.ini; repeating -v here
+          # would stack to -vv.
+          python -m pytest tests/
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..9855d94
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,6 @@
+[pytest]
+testpaths = tests
+python_files = test_*.py
+python_classes = Test*
+python_functions = test_*
+addopts = -v --tb=short
diff --git a/requirements.txt b/requirements.txt
index cd6ed8e..713b8b2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
# Azure SDK dependencies
azure-identity>=1.25.3
azure-ai-projects>=2.2.0
-azure-ai-evaluation==1.16.9
+azure-ai-evaluation==1.17.0
azure-ai-inference>=1.0.0b9
# Core Python packages
python-dotenv>=1.2.2
@@ -9,8 +9,11 @@ pyyaml>=6.0.3
pip-system-certs>=5.3
azure-monitor-query>=2.0.0
azure-monitor-opentelemetry>=1.8.8
-aiohttp>=3.13.5
-agent-framework==1.7.0
+aiohttp>=3.14.1
+agent-framework>=1.9.0
streamlit>=1.58.0
pandas==2.3.3
-plotly>=6.7.0
+plotly>=6.8.0
+# Test dependencies
+pytest>=9.0.0
+pytest-asyncio>=1.3.0
diff --git a/src/agent_evaluation/agentic_ops/base_evaluator.py b/src/agent_evaluation/agentic_ops/base_evaluator.py
index ba6e9f3..2ba3971 100644
--- a/src/agent_evaluation/agentic_ops/base_evaluator.py
+++ b/src/agent_evaluation/agentic_ops/base_evaluator.py
@@ -1,9 +1,9 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
+import logging
import os
import re
-import logging
from typing import Dict, Union
from .client import LLMClient
diff --git a/src/agent_evaluation/agentic_ops/client.py b/src/agent_evaluation/agentic_ops/client.py
index c34aeb1..199f127 100644
--- a/src/agent_evaluation/agentic_ops/client.py
+++ b/src/agent_evaluation/agentic_ops/client.py
@@ -8,12 +8,13 @@
import json
import logging
-from openai import AzureOpenAI
-from azure.identity import DefaultAzureCredential, get_bearer_token_provider
import os
-from dotenv import load_dotenv
import time
+from azure.identity import DefaultAzureCredential, get_bearer_token_provider
+from dotenv import load_dotenv
+from openai import AzureOpenAI
+
load_dotenv()
# Configure logging
diff --git a/src/agent_evaluation/agentic_ops/run_eval.py b/src/agent_evaluation/agentic_ops/run_eval.py
index f06928e..ca4dd4e 100644
--- a/src/agent_evaluation/agentic_ops/run_eval.py
+++ b/src/agent_evaluation/agentic_ops/run_eval.py
@@ -1,11 +1,13 @@
-import os
import inspect
+import logging
+import os
import uuid
-from dotenv import load_dotenv
+
from azure.ai.evaluation import evaluate
from azure.ai.projects import AIProjectClient
-import logging
from azure.identity import DefaultAzureCredential
+from dotenv import load_dotenv
+
def get_logger(name: str):
level = os.environ.get("LOG_LEVEL", "INFO").upper()
diff --git a/src/agent_evaluation/agentic_ops/runner.py b/src/agent_evaluation/agentic_ops/runner.py
index 5126171..a6719dd 100644
--- a/src/agent_evaluation/agentic_ops/runner.py
+++ b/src/agent_evaluation/agentic_ops/runner.py
@@ -1,12 +1,14 @@
import argparse
import importlib
+import logging
+import os
import sys
import time
from pathlib import Path
-from typing import Optional, Any, Dict
+from typing import Any, Dict, Optional
+
import yaml
-import os
-import logging
+
def get_logger(name: str):
level = os.environ.get("LOG_LEVEL", "INFO").upper()
diff --git a/src/agent_evaluation/cli.py b/src/agent_evaluation/cli.py
index 495a8fd..013efb5 100644
--- a/src/agent_evaluation/cli.py
+++ b/src/agent_evaluation/cli.py
@@ -9,11 +9,10 @@
import argparse
import sys
from pathlib import Path
-from typing import List, Dict, Optional
+from typing import Dict, List, Optional
import yaml
-
# Root of the project (two levels up from this file)
ROOT_DIR = Path(__file__).resolve().parents[2]
SAMPLES_DIR = ROOT_DIR / "src" / "evaluations" / "offline"
@@ -74,7 +73,8 @@ def print_samples_table(samples: List[Dict[str, str]]) -> None:
def run_sample(sample: Dict[str, str], extra_args: Optional[List[str]] = None) -> int:
"""Run a selected evaluation sample."""
- from src.agent_evaluation.agentic_ops.runner import run_pipeline, parse_args
+ from src.agent_evaluation.agentic_ops.runner import (parse_args,
+ run_pipeline)
config_path = sample["config_path"]
print(f"\n{'='*70}")
diff --git a/src/evaluations/offline/agentic_evaluation/eval_factory.py b/src/evaluations/offline/agentic_evaluation/eval_factory.py
index 19f9c89..440fff9 100644
--- a/src/evaluations/offline/agentic_evaluation/eval_factory.py
+++ b/src/evaluations/offline/agentic_evaluation/eval_factory.py
@@ -1,8 +1,12 @@
-from azure.ai.evaluation import RelevanceEvaluator, TaskAdherenceEvaluator, ToolCallAccuracyEvaluator
-from .evaluator.evaluator_repo.evaluate_agent_invoked import EvaluateAgentsInvoked
-
-import os
import logging
+import os
+
+from azure.ai.evaluation import (RelevanceEvaluator, TaskAdherenceEvaluator,
+ ToolCallAccuracyEvaluator)
+
+from .evaluator.evaluator_repo.evaluate_agent_invoked import \
+ EvaluateAgentsInvoked
+
def get_logger(name: str):
level = os.environ.get("LOG_LEVEL", "INFO").upper()
diff --git a/src/evaluations/offline/agentic_evaluation/evaluator/eval_main.py b/src/evaluations/offline/agentic_evaluation/evaluator/eval_main.py
index 62254d2..919f28e 100644
--- a/src/evaluations/offline/agentic_evaluation/evaluator/eval_main.py
+++ b/src/evaluations/offline/agentic_evaluation/evaluator/eval_main.py
@@ -1,9 +1,11 @@
+import logging
import os
from pathlib import Path
+
from src.agent_evaluation.agentic_ops.run_eval import execute_eval
-import logging
from src.evaluations.offline.utils.constants import EVAL_NAME
from src.evaluations.offline.utils.file_operations import get_next_run_id
+
from ..eval_factory import EvaluatorFactory
diff --git a/src/evaluations/offline/agentic_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py b/src/evaluations/offline/agentic_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py
index 1158b21..1eb437f 100644
--- a/src/evaluations/offline/agentic_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py
+++ b/src/evaluations/offline/agentic_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py
@@ -1,5 +1,6 @@
from .eval_utils.evaluation_utils import agent_invoked_accuracy, compute_recall
+
class EvaluateAgentsInvoked:
def __init__(self):
pass
diff --git a/src/evaluations/offline/ai_judge_evaluation_custom/eval_factory.py b/src/evaluations/offline/ai_judge_evaluation_custom/eval_factory.py
index 81545b1..312fea0 100644
--- a/src/evaluations/offline/ai_judge_evaluation_custom/eval_factory.py
+++ b/src/evaluations/offline/ai_judge_evaluation_custom/eval_factory.py
@@ -1,11 +1,13 @@
-from azure.ai.evaluation import RelevanceEvaluator, CoherenceEvaluator
+import logging
+import os
+
+from azure.ai.evaluation import CoherenceEvaluator, RelevanceEvaluator
+
from .evaluator.evaluator_repo.coherence import CoherenceEvaluatorCustom
-from .evaluator.evaluator_repo.relevance import RelevanceEvaluatorCustom
from .evaluator.evaluator_repo.fluency import FluencyEvaluatorCustom
+from .evaluator.evaluator_repo.relevance import RelevanceEvaluatorCustom
from .evaluator.evaluator_repo.similarity import SimilarityEvaluatorCustom
-import os
-import logging
def get_logger(name: str):
level = os.environ.get("LOG_LEVEL", "INFO").upper()
diff --git a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/eval_main.py b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/eval_main.py
index 14245ef..141df04 100644
--- a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/eval_main.py
+++ b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/eval_main.py
@@ -1,9 +1,11 @@
+import logging
import os
from pathlib import Path
+
from src.agent_evaluation.agentic_ops.run_eval import execute_eval
-import logging
from src.evaluations.offline.utils.constants import EVAL_NAME
from src.evaluations.offline.utils.file_operations import get_next_run_id
+
from ..eval_factory import EvaluatorFactory
diff --git a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/coherence.py b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/coherence.py
index 3d06c39..7fec400 100644
--- a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/coherence.py
+++ b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/coherence.py
@@ -1,5 +1,7 @@
from typing import Dict, Union
-from ......agent_evaluation.agentic_ops.base_evaluator import BaseCustomEvaluator
+
+from ......agent_evaluation.agentic_ops.base_evaluator import \
+ BaseCustomEvaluator
class CoherenceEvaluatorCustom(BaseCustomEvaluator):
diff --git a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/fluency.py b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/fluency.py
index 5a03f3d..aeff3a1 100644
--- a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/fluency.py
+++ b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/fluency.py
@@ -2,7 +2,9 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from typing import Dict, Union
-from ......agent_evaluation.agentic_ops.base_evaluator import BaseCustomEvaluator
+
+from ......agent_evaluation.agentic_ops.base_evaluator import \
+ BaseCustomEvaluator
class FluencyEvaluatorCustom(BaseCustomEvaluator):
diff --git a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/relevance.py b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/relevance.py
index 71cd9f3..3cd9b73 100644
--- a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/relevance.py
+++ b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/relevance.py
@@ -1,5 +1,7 @@
from typing import Dict, Union
-from ......agent_evaluation.agentic_ops.base_evaluator import BaseCustomEvaluator
+
+from ......agent_evaluation.agentic_ops.base_evaluator import \
+ BaseCustomEvaluator
class RelevanceEvaluatorCustom(BaseCustomEvaluator):
diff --git a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/similarity.py b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/similarity.py
index b2a34bd..3dd147f 100644
--- a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/similarity.py
+++ b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/similarity.py
@@ -2,7 +2,9 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from typing import Dict, Union
-from ......agent_evaluation.agentic_ops.base_evaluator import BaseCustomEvaluator
+
+from ......agent_evaluation.agentic_ops.base_evaluator import \
+ BaseCustomEvaluator
class SimilarityEvaluatorCustom(BaseCustomEvaluator):
diff --git a/src/evaluations/offline/pipeline_experiment_evaluation/eval_factory.py b/src/evaluations/offline/pipeline_experiment_evaluation/eval_factory.py
index 1fd459c..1fa5560 100644
--- a/src/evaluations/offline/pipeline_experiment_evaluation/eval_factory.py
+++ b/src/evaluations/offline/pipeline_experiment_evaluation/eval_factory.py
@@ -1,7 +1,9 @@
-from azure.ai.evaluation import RelevanceEvaluator, TaskAdherenceEvaluator, ToolCallAccuracyEvaluator
-
-import os
import logging
+import os
+
+from azure.ai.evaluation import (RelevanceEvaluator, TaskAdherenceEvaluator,
+ ToolCallAccuracyEvaluator)
+
def get_logger(name: str):
level = os.environ.get("LOG_LEVEL", "INFO").upper()
diff --git a/src/evaluations/offline/pipeline_experiment_evaluation/evaluator/eval_main.py b/src/evaluations/offline/pipeline_experiment_evaluation/evaluator/eval_main.py
index 14245ef..141df04 100644
--- a/src/evaluations/offline/pipeline_experiment_evaluation/evaluator/eval_main.py
+++ b/src/evaluations/offline/pipeline_experiment_evaluation/evaluator/eval_main.py
@@ -1,9 +1,11 @@
+import logging
import os
from pathlib import Path
+
from src.agent_evaluation.agentic_ops.run_eval import execute_eval
-import logging
from src.evaluations.offline.utils.constants import EVAL_NAME
from src.evaluations.offline.utils.file_operations import get_next_run_id
+
from ..eval_factory import EvaluatorFactory
diff --git a/src/evaluations/offline/pipeline_experiment_evaluation/experiment/agent_inference.py b/src/evaluations/offline/pipeline_experiment_evaluation/experiment/agent_inference.py
index ddb1e73..8def77a 100644
--- a/src/evaluations/offline/pipeline_experiment_evaluation/experiment/agent_inference.py
+++ b/src/evaluations/offline/pipeline_experiment_evaluation/experiment/agent_inference.py
@@ -11,11 +11,14 @@
FLOW:
Load Queries → Run Inference → Save Responses
"""
-import os
import logging
+import os
import random
from pathlib import Path
-from src.evaluations.offline.utils.file_operations import load_queries_from_jsonl
+
+from src.evaluations.offline.utils.file_operations import \
+ load_queries_from_jsonl
+
from .experiment_utils import get_file_paths, prepare_output_file, save_result
@@ -101,7 +104,7 @@ def inference_main(config: dict, args=None) -> None:
if __name__ == "__main__":
# For standalone execution, load config from experiment.yaml
import yaml
-
+
# Get project root (go up 5 levels from this file)
current_file = Path(__file__) # .../experiment/agent_inference.py
project_root = current_file.parent.parent.parent.parent.parent.parent # Go up to project root
diff --git a/src/evaluations/offline/pipeline_experiment_evaluation/experiment/experiment_utils/file_utils.py b/src/evaluations/offline/pipeline_experiment_evaluation/experiment/experiment_utils/file_utils.py
index 0f27fc0..379241b 100644
--- a/src/evaluations/offline/pipeline_experiment_evaluation/experiment/experiment_utils/file_utils.py
+++ b/src/evaluations/offline/pipeline_experiment_evaluation/experiment/experiment_utils/file_utils.py
@@ -3,11 +3,11 @@
==============
Helper functions for file management in inference pipelines.
"""
-import os
import logging
+import os
from pathlib import Path
-from src.evaluations.offline.utils.file_operations import append_to_jsonl
+from src.evaluations.offline.utils.file_operations import append_to_jsonl
logger = logging.getLogger(__name__)
diff --git a/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/agent_tools.py b/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/agent_tools.py
index 24d2636..dd14b67 100644
--- a/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/agent_tools.py
+++ b/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/agent_tools.py
@@ -1,9 +1,10 @@
"""Tool functions for the device agents in the Multi-Agent system."""
+from random import choice, randint
from typing import Annotated
-from pydantic import Field
-from random import randint, choice
+
from agent_framework import tool
+from pydantic import Field
# =============================================================================
diff --git a/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/multi_agent_orchestrator.py b/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/multi_agent_orchestrator.py
index b7f7de5..6efccf7 100644
--- a/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/multi_agent_orchestrator.py
+++ b/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/multi_agent_orchestrator.py
@@ -22,60 +22,36 @@
"""
import asyncio
import json
-import os
import logging
-from typing import Annotated
+import os
from pathlib import Path
+from typing import Annotated
+
from agent_framework import Agent, tool
from agent_framework.observability import enable_instrumentation, get_tracer
+from agent_framework.openai import OpenAIChatClient
+from azure.identity import AzureCliCredential
from azure.monitor.opentelemetry import configure_azure_monitor
from opentelemetry import context as otel_context
from opentelemetry.trace import SpanKind
from opentelemetry.trace.span import format_trace_id
-from agent_framework.openai import OpenAIChatClient
-from azure.identity import AzureCliCredential
from pydantic import Field
# Handle both standalone and package execution
try:
# When run as part of pipeline (package import)
- from .agent_tools import (
- # AC tools
- set_ac_temperature,
- turn_ac_on,
- turn_ac_off,
- set_ac_mode,
- get_ac_status,
- # TV tools
- turn_tv_on,
- turn_tv_off,
- set_tv_channel,
- set_tv_volume,
- get_tv_status,
- # Dishwasher tools
- start_dishwasher,
- stop_dishwasher,
- get_dishwasher_status,
- set_dishwasher_delay,
- )
+ from .agent_tools import ( # AC tools; TV tools; Dishwasher tools
+ get_ac_status, get_dishwasher_status, get_tv_status, set_ac_mode,
+ set_ac_temperature, set_dishwasher_delay, set_tv_channel,
+ set_tv_volume, start_dishwasher, stop_dishwasher, turn_ac_off,
+ turn_ac_on, turn_tv_off, turn_tv_on)
except ImportError:
# When run standalone
- from agent_tools import (
- set_ac_temperature,
- turn_ac_on,
- turn_ac_off,
- set_ac_mode,
- get_ac_status,
- turn_tv_on,
- turn_tv_off,
- set_tv_channel,
- set_tv_volume,
- get_tv_status,
- start_dishwasher,
- stop_dishwasher,
- get_dishwasher_status,
- set_dishwasher_delay,
- )
+ from agent_tools import (get_ac_status, get_dishwasher_status,
+ get_tv_status, set_ac_mode, set_ac_temperature,
+ set_dishwasher_delay, set_tv_channel,
+ set_tv_volume, start_dishwasher, stop_dishwasher,
+ turn_ac_off, turn_ac_on, turn_tv_off, turn_tv_on)
try:
from src.evaluations.offline.utils.file_operations import append_to_jsonl
@@ -86,6 +62,7 @@
from src.evaluations.offline.utils.file_operations import append_to_jsonl
from dotenv import load_dotenv
+
load_dotenv()
diff --git a/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py b/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py
index 11c82b7..eef116d 100644
--- a/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py
+++ b/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py
@@ -3,12 +3,14 @@
Delegates to the shared trace_to_jsonl module in utils/.
"""
-from src.evaluations.offline.utils.trace_to_jsonl import get_trace_main # noqa: F401
+from src.evaluations.offline.utils.trace_to_jsonl import \
+ get_trace_main # noqa: F401
if __name__ == "__main__":
- import yaml
from pathlib import Path
+ import yaml
+
script_dir = Path(__file__).parent
config_path = script_dir.parent / "experiment.yaml"
diff --git a/src/evaluations/offline/pipeline_multi_agent_evaluation/eval_factory.py b/src/evaluations/offline/pipeline_multi_agent_evaluation/eval_factory.py
index 8ae2cb8..fbd30ad 100644
--- a/src/evaluations/offline/pipeline_multi_agent_evaluation/eval_factory.py
+++ b/src/evaluations/offline/pipeline_multi_agent_evaluation/eval_factory.py
@@ -1,8 +1,12 @@
-from azure.ai.evaluation import RelevanceEvaluator, TaskAdherenceEvaluator, ToolCallAccuracyEvaluator
-from .evaluator.evaluator_repo.evaluate_agent_invoked import EvaluateAgentsInvoked
-
-import os
import logging
+import os
+
+from azure.ai.evaluation import (RelevanceEvaluator, TaskAdherenceEvaluator,
+ ToolCallAccuracyEvaluator)
+
+from .evaluator.evaluator_repo.evaluate_agent_invoked import \
+ EvaluateAgentsInvoked
+
def get_logger(name: str):
level = os.environ.get("LOG_LEVEL", "INFO").upper()
diff --git a/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/eval_main.py b/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/eval_main.py
index 7e3748c..141df04 100644
--- a/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/eval_main.py
+++ b/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/eval_main.py
@@ -1,11 +1,12 @@
+import logging
import os
from pathlib import Path
+
from src.agent_evaluation.agentic_ops.run_eval import execute_eval
-import logging
from src.evaluations.offline.utils.constants import EVAL_NAME
from src.evaluations.offline.utils.file_operations import get_next_run_id
-from ..eval_factory import EvaluatorFactory
+from ..eval_factory import EvaluatorFactory
def get_logger(name: str):
diff --git a/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py b/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py
index 1102f3e..77a61df 100644
--- a/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py
+++ b/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py
@@ -1,4 +1,6 @@
-from .eval_utils.evaluation_utils import agent_invoked_accuracy, calculate_match_percentage
+from .eval_utils.evaluation_utils import (agent_invoked_accuracy,
+ calculate_match_percentage)
+
class EvaluateAgentsInvoked:
def __init__(self):
diff --git a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/agent_tools.py b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/agent_tools.py
index 7eec989..1d80b7b 100644
--- a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/agent_tools.py
+++ b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/agent_tools.py
@@ -1,8 +1,10 @@
"""Tool functions for the Multi-Tool Agent."""
+from random import randint
from typing import Annotated
+
from pydantic import Field
-from random import randint
+
def get_weather(
location: Annotated[str, Field(description="The location to get the weather for.")],
diff --git a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/multi_tool_agent.py b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/multi_tool_agent.py
index 4382c79..faac3a9 100644
--- a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/multi_tool_agent.py
+++ b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/multi_tool_agent.py
@@ -9,44 +9,31 @@
"""
import asyncio
import json
-import os
import logging
+import os
from datetime import datetime
from pathlib import Path
+
from agent_framework import Agent, ChatOptions
from agent_framework.observability import enable_instrumentation, get_tracer
+from agent_framework.openai import OpenAIChatClient
+from azure.identity import AzureCliCredential
from azure.monitor.opentelemetry import configure_azure_monitor
from opentelemetry import context as otel_context
from opentelemetry.trace import SpanKind
from opentelemetry.trace.span import format_trace_id
-from agent_framework.openai import OpenAIChatClient
-from azure.identity import AzureCliCredential
# Handle both standalone and package execution
try:
# When run as part of pipeline (package import)
- from .agent_tools import (
- get_current_datetime,
- calculate_sum,
- calculate_product,
- convert_temperature,
- count_words,
- generate_uuid,
- format_json,
- get_weather
- )
+ from .agent_tools import (calculate_product, calculate_sum,
+ convert_temperature, count_words, format_json,
+ generate_uuid, get_current_datetime, get_weather)
except ImportError:
# When run standalone
- from agent_tools import (
- get_current_datetime,
- calculate_sum,
- calculate_product,
- convert_temperature,
- count_words,
- generate_uuid,
- format_json,
- get_weather
- )
+ from agent_tools import (calculate_product, calculate_sum,
+ convert_temperature, count_words, format_json,
+ generate_uuid, get_current_datetime, get_weather)
try:
from src.evaluations.offline.utils.file_operations import append_to_jsonl
@@ -57,6 +44,7 @@
from src.evaluations.offline.utils.file_operations import append_to_jsonl
from dotenv import load_dotenv
+
load_dotenv()
@@ -193,7 +181,8 @@ async def run_inference_async(config: dict) -> None:
# agent_framework's own spans, so disabling just this instrumentor
# is safe.
try:
- from azure.ai.projects.telemetry._responses_instrumentor import ResponsesInstrumentor
+ from azure.ai.projects.telemetry._responses_instrumentor import \
+ ResponsesInstrumentor
if ResponsesInstrumentor().is_instrumented():
ResponsesInstrumentor().uninstrument()
logger.info("[AGENT] Disabled azure-ai-projects ResponsesInstrumentor (parallel-tool-call bug workaround)")
@@ -275,7 +264,7 @@ def inference_main(config: dict, args=None) -> None:
# =============================================================================
if __name__ == "__main__":
import yaml
-
+
# Get config path relative to this file
script_dir = Path(__file__).parent
config_path = script_dir.parent / "experiment.yaml"
diff --git a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py
index 74eaee7..79b0532 100644
--- a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py
+++ b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py
@@ -4,12 +4,14 @@
Delegates to the shared trace_to_jsonl module in utils/.
"""
-from src.evaluations.offline.utils.trace_to_jsonl import get_trace_main # noqa: F401
+from src.evaluations.offline.utils.trace_to_jsonl import \
+ get_trace_main # noqa: F401
if __name__ == "__main__":
- import yaml
from pathlib import Path
+ import yaml
+
script_dir = Path(__file__).parent
config_path = script_dir.parent / "experiment.yaml"
diff --git a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/eval_factory.py b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/eval_factory.py
index 1fd459c..1fa5560 100644
--- a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/eval_factory.py
+++ b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/eval_factory.py
@@ -1,7 +1,9 @@
-from azure.ai.evaluation import RelevanceEvaluator, TaskAdherenceEvaluator, ToolCallAccuracyEvaluator
-
-import os
import logging
+import os
+
+from azure.ai.evaluation import (RelevanceEvaluator, TaskAdherenceEvaluator,
+ ToolCallAccuracyEvaluator)
+
def get_logger(name: str):
level = os.environ.get("LOG_LEVEL", "INFO").upper()
diff --git a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/evaluator/eval_main.py b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/evaluator/eval_main.py
index 7e3748c..141df04 100644
--- a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/evaluator/eval_main.py
+++ b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/evaluator/eval_main.py
@@ -1,11 +1,12 @@
+import logging
import os
from pathlib import Path
+
from src.agent_evaluation.agentic_ops.run_eval import execute_eval
-import logging
from src.evaluations.offline.utils.constants import EVAL_NAME
from src.evaluations.offline.utils.file_operations import get_next_run_id
-from ..eval_factory import EvaluatorFactory
+from ..eval_factory import EvaluatorFactory
def get_logger(name: str):
diff --git a/src/evaluations/offline/rag_evaluation_foundry/eval_factory.py b/src/evaluations/offline/rag_evaluation_foundry/eval_factory.py
index a4af652..949a336 100644
--- a/src/evaluations/offline/rag_evaluation_foundry/eval_factory.py
+++ b/src/evaluations/offline/rag_evaluation_foundry/eval_factory.py
@@ -1,7 +1,8 @@
+import logging
+import os
+
from azure.ai.evaluation import RelevanceEvaluator
-import os
-import logging
def get_logger(name: str):
level = os.environ.get("LOG_LEVEL", "INFO").upper()
diff --git a/src/evaluations/offline/rag_evaluation_foundry/evaluator/eval_main.py b/src/evaluations/offline/rag_evaluation_foundry/evaluator/eval_main.py
index 14245ef..141df04 100644
--- a/src/evaluations/offline/rag_evaluation_foundry/evaluator/eval_main.py
+++ b/src/evaluations/offline/rag_evaluation_foundry/evaluator/eval_main.py
@@ -1,9 +1,11 @@
+import logging
import os
from pathlib import Path
+
from src.agent_evaluation.agentic_ops.run_eval import execute_eval
-import logging
from src.evaluations.offline.utils.constants import EVAL_NAME
from src.evaluations.offline.utils.file_operations import get_next_run_id
+
from ..eval_factory import EvaluatorFactory
diff --git a/src/evaluations/offline/utils/trace_to_jsonl.py b/src/evaluations/offline/utils/trace_to_jsonl.py
index fbe0696..b56f6ff 100644
--- a/src/evaluations/offline/utils/trace_to_jsonl.py
+++ b/src/evaluations/offline/utils/trace_to_jsonl.py
@@ -12,16 +12,18 @@
Used by both pipeline_multi_agent_evaluation and
pipeline_multi_tool_agent_evaluation.
"""
-import os
import json
import logging
+import os
import time
-from pathlib import Path
-from typing import Dict, List, Any
from datetime import timedelta
-from azure.monitor.query import LogsQueryClient, LogsQueryStatus
+from pathlib import Path
+from typing import Any, Dict, List
+
+from azure.core.exceptions import (HttpResponseError, ServiceRequestError,
+ ServiceResponseError)
from azure.identity import DefaultAzureCredential
-from azure.core.exceptions import HttpResponseError, ServiceRequestError, ServiceResponseError
+from azure.monitor.query import LogsQueryClient, LogsQueryStatus
from dotenv import load_dotenv
load_dotenv()
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/unit/test_agent_tools.py b/tests/unit/test_agent_tools.py
new file mode 100644
index 0000000..8844ab1
--- /dev/null
+++ b/tests/unit/test_agent_tools.py
@@ -0,0 +1,157 @@
+"""Unit tests for the agent_tools module (device agent tools)."""
+
+from src.evaluations.offline.pipeline_multi_agent_evaluation.agent_inference.agent_tools import (
+ get_ac_status, get_dishwasher_status, get_tv_status, set_ac_mode,
+ set_ac_temperature, set_dishwasher_delay, set_tv_channel, set_tv_volume,
+ start_dishwasher, stop_dishwasher, turn_ac_off, turn_ac_on, turn_tv_off,
+ turn_tv_on)
+
+# ---------------------------------------------------------------------------
+# AC Tools
+# ---------------------------------------------------------------------------
+
+class TestACTools:
+ def test_set_temperature_valid(self):
+ """A mid-range temperature (72) should be echoed back in a success message."""
+ result = set_ac_temperature(temperature=72)
+ assert "72" in result
+ assert "set to" in result.lower() or "72°F" in result
+
+ def test_set_temperature_too_low(self):
+ """50 is below the accepted range and should produce an error / out-of-range message."""
+ result = set_ac_temperature(temperature=50)
+ assert "error" in result.lower() or "out of range" in result.lower()
+
+ def test_set_temperature_too_high(self):
+ """90 is above the accepted range and should produce an error / out-of-range message."""
+ result = set_ac_temperature(temperature=90)
+ assert "error" in result.lower() or "out of range" in result.lower()
+
+ def test_set_temperature_boundary_low(self):
+ """Lower boundary: 60°F should be accepted."""
+ result = set_ac_temperature(temperature=60)
+ assert "60" in result
+
+ def test_set_temperature_boundary_high(self):
+ """Upper boundary: 85°F should be accepted."""
+ result = set_ac_temperature(temperature=85)
+ assert "85" in result
+
+ def test_turn_ac_on(self):
+ """Should confirm AC turned on."""
+ result = turn_ac_on()
+ assert "on" in result.lower()
+
+ def test_turn_ac_off(self):
+ """Should confirm AC turned off."""
+ result = turn_ac_off()
+ assert "off" in result.lower()
+
+ def test_set_ac_mode_valid(self):
+ """Each of the modes cool, heat, fan and auto should be echoed back in the result."""
+ for mode in ["cool", "heat", "fan", "auto"]:
+ result = set_ac_mode(mode=mode)
+ assert mode in result.lower()
+
+ def test_set_ac_mode_invalid(self):
+ """An unsupported mode ("turbo") should produce an error / invalid message."""
+ result = set_ac_mode(mode="turbo")
+ assert "error" in result.lower() or "invalid" in result.lower()
+
+ def test_get_ac_status(self):
+ """Status should be a string that mentions the AC (case-insensitive check)."""
+ result = get_ac_status()
+ assert isinstance(result, str)
+ assert "AC" in result or "ac" in result.lower()
+
+
+# ---------------------------------------------------------------------------
+# TV Tools
+# ---------------------------------------------------------------------------
+
+class TestTVTools:
+ def test_turn_tv_on(self):
+ """Should confirm TV turned on."""
+ result = turn_tv_on()
+ assert "on" in result.lower()
+
+ def test_turn_tv_off(self):
+ """Should confirm TV turned off."""
+ result = turn_tv_off()
+ assert "off" in result.lower()
+
+ def test_set_channel_valid(self):
+ """A mid-range channel (42) should be echoed back in the result."""
+ result = set_tv_channel(channel=42)
+ assert "42" in result
+
+ def test_set_channel_too_low(self):
+ """Channel 0 should return error."""
+ result = set_tv_channel(channel=0)
+ assert "error" in result.lower() or "out of range" in result.lower()
+
+ def test_set_channel_too_high(self):
+ """Channel 1000 should return error."""
+ result = set_tv_channel(channel=1000)
+ assert "error" in result.lower() or "out of range" in result.lower()
+
+ def test_set_channel_boundary(self):
+ """Boundary channels 1 and 999 should be accepted; note this only checks the digits appear."""
+ assert "1" in set_tv_channel(channel=1)
+ assert "999" in set_tv_channel(channel=999)
+
+ def test_set_volume_valid(self):
+ """A mid-range volume (50) should be echoed back in the result."""
+ result = set_tv_volume(volume=50)
+ assert "50" in result
+
+ def test_set_volume_too_low(self):
+ """Volume -1 should return error."""
+ result = set_tv_volume(volume=-1)
+ assert "error" in result.lower() or "out of range" in result.lower()
+
+ def test_set_volume_too_high(self):
+ """Volume 101 should return error."""
+ result = set_tv_volume(volume=101)
+ assert "error" in result.lower() or "out of range" in result.lower()
+
+ def test_set_volume_boundaries(self):
+ """Boundary volumes 0 and 100 should be accepted; note this only checks the digits appear."""
+ assert "0" in set_tv_volume(volume=0)
+ assert "100" in set_tv_volume(volume=100)
+
+ def test_get_tv_status(self):
+ """Status should be a string that mentions the TV (case-insensitive check)."""
+ result = get_tv_status()
+ assert isinstance(result, str)
+ assert "TV" in result or "tv" in result.lower()
+
+
+# ---------------------------------------------------------------------------
+# Dishwasher Tools
+# ---------------------------------------------------------------------------
+
+class TestDishwasherTools:
+ def test_start_dishwasher(self):
+ """start_dishwasher should return a non-empty string (message content is not checked)."""
+ result = start_dishwasher()
+ assert isinstance(result, str)
+ assert len(result) > 0
+
+ def test_stop_dishwasher(self):
+ """stop_dishwasher should return a non-empty string (message content is not checked)."""
+ result = stop_dishwasher()
+ assert isinstance(result, str)
+ assert len(result) > 0
+
+ def test_get_dishwasher_status(self):
+ """get_dishwasher_status should return a non-empty string."""
+ result = get_dishwasher_status()
+ assert isinstance(result, str)
+ assert len(result) > 0
+
+ def test_set_dishwasher_delay(self):
+ """set_dishwasher_delay(hours=2) should return a non-empty string (content is not checked)."""
+ result = set_dishwasher_delay(hours=2)
+ assert isinstance(result, str)
+ assert len(result) > 0
diff --git a/tests/unit/test_base_evaluator.py b/tests/unit/test_base_evaluator.py
new file mode 100644
index 0000000..97a7b83
--- /dev/null
+++ b/tests/unit/test_base_evaluator.py
@@ -0,0 +1,207 @@
+"""Unit tests for the BaseCustomEvaluator (src/agent_evaluation/agentic_ops/base_evaluator.py)."""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from src.agent_evaluation.agentic_ops.base_evaluator import BaseCustomEvaluator
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+class ConcreteEvaluator(BaseCustomEvaluator):
+ """Minimal BaseCustomEvaluator subclass; NOTE(review): not referenced by the tests in this file."""
+
+ def __init__(self, model_config=None):
+ super().__init__(
+ prompty_file_name="test.prompty",
+ result_key="test_score",
+ model_config=model_config,
+ )
+
+ def __call__(self, query: str, response: str, **kwargs):  # forwards straight to evaluate()
+ return self.evaluate(query=query, response=response, **kwargs)
+
+
+# ---------------------------------------------------------------------------
+# _extract_score
+# ---------------------------------------------------------------------------
+
+class TestExtractScore:
+ @pytest.fixture
+ def evaluator(self):  # bare instance: __init__ is patched to a no-op and attributes are set by hand
+ with patch.object(BaseCustomEvaluator, "__init__", lambda self, *a, **kw: None):
+ e = BaseCustomEvaluator.__new__(BaseCustomEvaluator)
+ e.result_key = "test_score"
+ e.prompty_file_name = "test.prompty"
+ return e
+
+ def test_extracts_structured_s2_tag(self, evaluator):
+ """Should extract the score from a bare "4" response."""
+ assert evaluator._extract_score("4") == 4
+
+ def test_extracts_score_colon_format(self, evaluator):
+ """Should extract from 'Score: 5' format."""
+ assert evaluator._extract_score("The evaluation is complete. Score: 5") == 5
+
+ def test_extracts_rating_format(self, evaluator):
+ """Should extract from 'Rating: 3' format."""
+ assert evaluator._extract_score("Rating: 3 - The response is adequate") == 3
+
+ def test_returns_default_on_no_match(self, evaluator):
+ """Should fall back to the default score (3) when no score is found."""
+ assert evaluator._extract_score("No numeric content here at all") == 3
+
+ def test_custom_default(self, evaluator):
+ """Should return the caller-supplied default_score when no score is found."""
+ assert evaluator._extract_score("no score", default_score=1) == 1
+
+ def test_ignores_out_of_range_scores(self, evaluator):
+ """Scores outside 1-5 from pattern matching should fall through."""
+ # The S2 tag extraction doesn't validate range, but pattern matching does
+ result = evaluator._extract_score("Score: 9")
+ # 9 is out of range for pattern matching fallback, should use default
+ assert result == 3
+
+ def test_s2_tag_any_value(self, evaluator):
+ """Bare digit responses 1 and 5 should be returned unchanged (presumably the S2 path; confirm)."""
+ assert evaluator._extract_score("1") == 1
+ assert evaluator._extract_score("5") == 5
+
+
+# ---------------------------------------------------------------------------
+# _create_user_prompt
+# ---------------------------------------------------------------------------
+
+class TestCreateUserPrompt:
+ @pytest.fixture
+ def evaluator(self):  # bare instance: __init__ is patched to a no-op and attributes are set by hand
+ with patch.object(BaseCustomEvaluator, "__init__", lambda self, *a, **kw: None):
+ e = BaseCustomEvaluator.__new__(BaseCustomEvaluator)
+ e.result_key = "test_score"
+ e.prompty_file_name = "test.prompty"
+ return e
+
+ def test_replaces_single_placeholder(self, evaluator):
+ """Should substitute the query keyword argument for {{query}}."""
+ template = "Evaluate: {{query}}"
+ result = evaluator._create_user_prompt(template, query="What is AI?")
+ assert result == "Evaluate: What is AI?"
+
+ def test_replaces_multiple_placeholders(self, evaluator):
+ """Should substitute every placeholder that has a matching keyword argument."""
+ template = "Query: {{query}}\nResponse: {{response}}"
+ result = evaluator._create_user_prompt(template, query="Q", response="A")
+ assert result == "Query: Q\nResponse: A"
+
+ def test_leaves_unknown_placeholders(self, evaluator):
+ """Placeholders without a matching keyword argument should be left in the output."""
+ template = "Query: {{query}} Context: {{context}}"
+ result = evaluator._create_user_prompt(template, query="Q")
+ assert "{{context}}" in result
+
+ def test_no_placeholders(self, evaluator):
+ """Template without placeholders should remain unchanged."""
+ template = "Plain text with no placeholders"
+ result = evaluator._create_user_prompt(template, query="Q")
+ assert result == template
+
+
+# ---------------------------------------------------------------------------
+# _load_prompt_content
+# ---------------------------------------------------------------------------
+
+class TestLoadPromptContent:
+ def test_returns_fallback_on_missing_file(self):
+ """Missing prompty file should yield a fallback user prompt mentioning 'test' or 'evaluate'."""
+ with patch.object(BaseCustomEvaluator, "__init__", lambda self, *a, **kw: None):
+ e = BaseCustomEvaluator.__new__(BaseCustomEvaluator)
+ e.prompty_path = "/nonexistent/path/test.prompty"
+ e.result_key = "test_score"
+ e.prompty_file_name = "test.prompty"
+
+ system, user = e._load_prompt_content()
+ assert "test" in user.lower() or "evaluate" in user.lower()
+
+ def test_parses_prompty_with_system_and_user(self, tmp_path):
+ """A well-formed prompty file on disk should yield non-empty prompt content."""
+ # Prompty files have: ---\nmetadata\n---\nprompt content
+ # The parser splits on '---' and expects at least 3 parts
+ prompty_content = "---\nname: test\nmodel: gpt-4\n---\nsystem:\nYou are an evaluator.\nuser:\nEvaluate this: {{query}}\n---\n"
+ prompty_file = tmp_path / "test.prompty"
+ prompty_file.write_text(prompty_content, encoding="utf-8")
+
+ with patch.object(BaseCustomEvaluator, "__init__", lambda self, *a, **kw: None):
+ e = BaseCustomEvaluator.__new__(BaseCustomEvaluator)
+ e.prompty_path = str(prompty_file)
+ e.result_key = "test_score"
+ e.prompty_file_name = "test.prompty"
+
+ system, user = e._load_prompt_content()
+ # Only checks that something was extracted; the system/user split itself is not asserted
+ combined = system + user
+ assert len(combined) > 0
+
+
+# ---------------------------------------------------------------------------
+# evaluate
+# ---------------------------------------------------------------------------
+
+class TestEvaluate:
+ @patch("src.agent_evaluation.agentic_ops.base_evaluator.LLMClient")
+ def test_evaluate_returns_score(self, mock_client_cls):
+ """evaluate() should return {result_key: score} parsed from the mocked LLM reply."""
+ mock_client = MagicMock()
+ mock_client.get_llm_response_with_prompty.return_value = "4"
+ mock_client_cls.return_value = mock_client
+
+ with patch.object(BaseCustomEvaluator, "_load_prompt_content", return_value=("system", "{{query}}")):
+ with patch.object(BaseCustomEvaluator, "__init__", lambda self, *a, **kw: None):
+ e = BaseCustomEvaluator.__new__(BaseCustomEvaluator)
+ e.prompty_path = "test.prompty"
+ e.result_key = "test_score"
+ e.prompty_file_name = "test.prompty"
+
+ result = e.evaluate(query="What is AI?")
+ assert result == {"test_score": 4}
+
+ @patch("src.agent_evaluation.agentic_ops.base_evaluator.LLMClient")
+ def test_evaluate_returns_default_on_error(self, mock_client_cls):
+ """evaluate() should return the default score (3) when the mocked LLM call raises."""
+ mock_client = MagicMock()
+ mock_client.get_llm_response_with_prompty.side_effect = Exception("API error")
+ mock_client_cls.return_value = mock_client
+
+ with patch.object(BaseCustomEvaluator, "_load_prompt_content", return_value=("system", "{{query}}")):
+ with patch.object(BaseCustomEvaluator, "__init__", lambda self, *a, **kw: None):
+ e = BaseCustomEvaluator.__new__(BaseCustomEvaluator)
+ e.prompty_path = "test.prompty"
+ e.result_key = "test_score"
+ e.prompty_file_name = "test.prompty"
+
+ result = e.evaluate(query="What is AI?")
+ assert result == {"test_score": 3}
+
+
+# ---------------------------------------------------------------------------
+# __call__
+# ---------------------------------------------------------------------------
+
+class TestCall:
+ @patch("src.agent_evaluation.agentic_ops.base_evaluator.LLMClient")
+ def test_call_delegates_to_evaluate(self, mock_client_cls):
+ """Calling the instance directly should return {result_key: score} from the mocked LLM reply."""
+ mock_client = MagicMock()
+ mock_client.get_llm_response_with_prompty.return_value = "5"
+ mock_client_cls.return_value = mock_client
+
+ with patch.object(BaseCustomEvaluator, "_load_prompt_content", return_value=("sys", "{{query}}")):
+ with patch.object(BaseCustomEvaluator, "__init__", lambda self, *a, **kw: None):
+ e = BaseCustomEvaluator.__new__(BaseCustomEvaluator)
+ e.prompty_path = "test.prompty"
+ e.result_key = "test_score"
+ e.prompty_file_name = "test.prompty"
+
+ result = e(query="test")
+ assert result == {"test_score": 5}
diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py
new file mode 100644
index 0000000..ca4417c
--- /dev/null
+++ b/tests/unit/test_cli.py
@@ -0,0 +1,401 @@
+"""Unit tests for the CLI module (src/agent_evaluation/cli.py)."""
+
+import argparse
+from unittest.mock import MagicMock, mock_open, patch
+
+import pytest
+import yaml
+
+from src.agent_evaluation.cli import (EXCLUDE_DIRS, cmd_info, cmd_list,
+ cmd_run, cmd_run_all, discover_samples,
+ interactive_select, main,
+ print_samples_table, run_sample)
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+SAMPLE_YAML_CONTENT = {  # experiment.yaml payload that the tests dump into temporary sample directories
+ "app_name": "TestApp",
+ "experiment_name": "test_experiment",
+ "version": "1.0",
+ "pipeline": [
+ {"config_key": "evaluation", "base_path": "evaluator", "module": "eval_main.eval_main"},
+ ],
+ "evaluation": {
+ "input_path": "datasets/",
+ "input_file": "sample.jsonl",
+ "output_path": "reports/",
+ "evaluators": {"score": "relevance_evaluator"},
+ },
+}
+
+
+@pytest.fixture
+def mock_samples():  # two sample records used as the mocked discover_samples() result
+ return [
+ {
+ "name": "agentic_evaluation",
+ "app_name": "TestApp",
+ "experiment_name": "test_experiment",
+ "version": "1.0",
+ "config_path": "src/evaluations/offline/agentic_evaluation/experiment.yaml",
+ "stages": ["evaluation"],
+ },
+ {
+ "name": "rag_evaluation_foundry",
+ "app_name": "RAGApp",
+ "experiment_name": "rag_experiment",
+ "version": "2.0",
+ "config_path": "src/evaluations/offline/rag_evaluation_foundry/experiment.yaml",
+ "stages": ["evaluation"],
+ },
+ ]
+
+
+# ---------------------------------------------------------------------------
+# discover_samples
+# ---------------------------------------------------------------------------
+
+class TestDiscoverSamples:
+ def test_returns_list(self, tmp_path, monkeypatch):
+ """discover_samples should return a list even when SAMPLES_DIR is empty."""
+ monkeypatch.setattr("src.agent_evaluation.cli.SAMPLES_DIR", tmp_path)
+ result = discover_samples()
+ assert isinstance(result, list)
+
+ def test_skips_excluded_dirs(self, tmp_path, monkeypatch):
+ """Directories in EXCLUDE_DIRS should be skipped even if they contain experiment.yaml."""
+ monkeypatch.setattr("src.agent_evaluation.cli.SAMPLES_DIR", tmp_path)
+ for excluded in EXCLUDE_DIRS:
+ d = tmp_path / excluded
+ d.mkdir()
+ (d / "experiment.yaml").write_text(yaml.dump(SAMPLE_YAML_CONTENT))
+
+ result = discover_samples()
+ assert result == []
+
+ def test_skips_dirs_without_experiment_yaml(self, tmp_path, monkeypatch):
+ """Directories without experiment.yaml should be skipped."""
+ monkeypatch.setattr("src.agent_evaluation.cli.SAMPLES_DIR", tmp_path)
+ (tmp_path / "some_dir").mkdir()
+ result = discover_samples()
+ assert result == []
+
+ def test_discovers_valid_sample(self, tmp_path, monkeypatch):
+ """A valid sample directory with experiment.yaml should be discovered."""
+ monkeypatch.setattr("src.agent_evaluation.cli.SAMPLES_DIR", tmp_path)
+ monkeypatch.setattr("src.agent_evaluation.cli.ROOT_DIR", tmp_path.parent)
+
+ sample_dir = tmp_path / "my_sample"
+ sample_dir.mkdir()
+ (sample_dir / "experiment.yaml").write_text(yaml.dump(SAMPLE_YAML_CONTENT))
+
+ result = discover_samples()
+ assert len(result) == 1
+ assert result[0]["name"] == "my_sample"
+ assert result[0]["app_name"] == "TestApp"
+ assert result[0]["experiment_name"] == "test_experiment"
+ assert result[0]["version"] == "1.0"
+ assert result[0]["stages"] == ["evaluation"]
+
+ def test_discovers_multiple_samples_sorted(self, tmp_path, monkeypatch):
+ """Multiple samples should be returned sorted by name."""
+ monkeypatch.setattr("src.agent_evaluation.cli.SAMPLES_DIR", tmp_path)
+ monkeypatch.setattr("src.agent_evaluation.cli.ROOT_DIR", tmp_path.parent)
+
+ for name in ["z_sample", "a_sample", "m_sample"]:
+ d = tmp_path / name
+ d.mkdir()
+ (d / "experiment.yaml").write_text(yaml.dump(SAMPLE_YAML_CONTENT))
+
+ result = discover_samples()
+ assert [s["name"] for s in result] == ["a_sample", "m_sample", "z_sample"]
+
+ def test_uses_defaults_for_missing_yaml_fields(self, tmp_path, monkeypatch):
+ """Missing YAML fields should default to the directory name, '' for version and [] for stages."""
+ monkeypatch.setattr("src.agent_evaluation.cli.SAMPLES_DIR", tmp_path)
+ monkeypatch.setattr("src.agent_evaluation.cli.ROOT_DIR", tmp_path.parent)
+
+ sample_dir = tmp_path / "bare_sample"
+ sample_dir.mkdir()
+ (sample_dir / "experiment.yaml").write_text(yaml.dump({"pipeline": []}))
+
+ result = discover_samples()
+ assert len(result) == 1
+ assert result[0]["name"] == "bare_sample"
+ assert result[0]["app_name"] == "bare_sample"
+ assert result[0]["experiment_name"] == "bare_sample"
+ assert result[0]["version"] == ""
+ assert result[0]["stages"] == []
+
+
+# ---------------------------------------------------------------------------
+# print_samples_table
+# ---------------------------------------------------------------------------
+
+class TestPrintSamplesTable:
+ def test_empty_samples(self, capsys):
+ """Empty list should print 'No evaluation samples found.'"""
+ print_samples_table([])
+ captured = capsys.readouterr()
+ assert "No evaluation samples found." in captured.out
+
+ def test_prints_sample_names(self, capsys, mock_samples):
+ """Both sample names should appear in the printed table."""
+ print_samples_table(mock_samples)
+ captured = capsys.readouterr()
+ assert "agentic_evaluation" in captured.out
+ assert "rag_evaluation_foundry" in captured.out
+
+ def test_prints_stages(self, capsys, mock_samples):
+ """Should print stage info; NOTE(review): "evaluation" also occurs in the sample names, so this is weak."""
+ print_samples_table(mock_samples)
+ captured = capsys.readouterr()
+ assert "evaluation" in captured.out
+
+
+# ---------------------------------------------------------------------------
+# run_sample
+# ---------------------------------------------------------------------------
+
+class TestRunSample:
+ @patch("src.agent_evaluation.agentic_ops.runner.run_pipeline")
+ @patch("src.agent_evaluation.agentic_ops.runner.parse_args")
+ def test_run_sample_success(self, mock_parse, mock_run, mock_samples):
+ """run_sample should return 0 when the mocked run_pipeline completes normally."""
+ mock_args = MagicMock()
+ mock_parse.return_value = mock_args
+ mock_run.return_value = None
+
+ result = run_sample(mock_samples[0])
+ assert result == 0
+
+ @patch("src.agent_evaluation.agentic_ops.runner.run_pipeline")
+ @patch("src.agent_evaluation.agentic_ops.runner.parse_args")
+ def test_run_sample_system_exit(self, mock_parse, mock_run, mock_samples):
+ """SystemExit raised by run_pipeline should be converted into its exit code."""
+ mock_args = MagicMock()
+ mock_parse.return_value = mock_args
+ mock_run.side_effect = SystemExit(2)
+
+ result = run_sample(mock_samples[0])
+ assert result == 2
+
+ @patch("src.agent_evaluation.agentic_ops.runner.run_pipeline")
+ @patch("src.agent_evaluation.agentic_ops.runner.parse_args")
+ def test_run_sample_with_extra_args(self, mock_parse, mock_run, mock_samples):
+ """run_sample should return 0 with extra_args; NOTE(review): forwarding to sys.argv is not asserted."""
+ mock_args = MagicMock()
+ mock_parse.return_value = mock_args
+ mock_run.return_value = None
+
+ result = run_sample(mock_samples[0], extra_args=["--sample", "5"])
+ assert result == 0
+
+
+# ---------------------------------------------------------------------------
+# interactive_select
+# ---------------------------------------------------------------------------
+
+class TestInteractiveSelect:
+ @patch("builtins.input", return_value="1")
+ def test_select_by_number(self, mock_input, mock_samples):
+ """Entering "1" should return the first sample (selection is 1-based)."""
+ result = interactive_select(mock_samples)
+ assert result == mock_samples[0]
+
+ @patch("builtins.input", return_value="2")
+ def test_select_by_second_number(self, mock_input, mock_samples):
+ """Entering "2" should return the second sample."""
+ result = interactive_select(mock_samples)
+ assert result == mock_samples[1]
+
+ @patch("builtins.input", return_value="q")
+ def test_quit(self, mock_input, mock_samples):
+ """Typing 'q' should return None."""
+ result = interactive_select(mock_samples)
+ assert result is None
+
+ @patch("builtins.input", return_value="exit")
+ def test_exit(self, mock_input, mock_samples):
+ """Typing 'exit' should return None."""
+ result = interactive_select(mock_samples)
+ assert result is None
+
+ @patch("builtins.input", return_value="agentic")
+ def test_select_by_partial_name(self, mock_input, mock_samples):
+ """A partial name matching exactly one sample ("agentic") should select it."""
+ result = interactive_select(mock_samples)
+ assert result == mock_samples[0]
+
+ @patch("builtins.input", side_effect=EOFError)
+ def test_eof_returns_none(self, mock_input, mock_samples):
+ """EOFError raised by input() should be handled and return None."""
+ result = interactive_select(mock_samples)
+ assert result is None
+
+ @patch("builtins.input", side_effect=KeyboardInterrupt)
+ def test_keyboard_interrupt_returns_none(self, mock_input, mock_samples):
+ """KeyboardInterrupt raised by input() should be handled and return None."""
+ result = interactive_select(mock_samples)
+ assert result is None
+
+
+# ---------------------------------------------------------------------------
+# cmd_list
+# ---------------------------------------------------------------------------
+
+class TestCmdList:
+ @patch("src.agent_evaluation.cli.discover_samples")
+ def test_cmd_list_returns_zero(self, mock_discover, mock_samples):
+ """cmd_list should return 0 when samples are discovered."""
+ mock_discover.return_value = mock_samples
+ args = argparse.Namespace()
+ result = cmd_list(args)
+ assert result == 0
+
+ @patch("src.agent_evaluation.cli.discover_samples")
+ def test_cmd_list_empty(self, mock_discover):
+ """cmd_list with no samples should still return 0."""
+ mock_discover.return_value = []
+ args = argparse.Namespace()
+ result = cmd_list(args)
+ assert result == 0
+
+
+# ---------------------------------------------------------------------------
+# cmd_run
+# ---------------------------------------------------------------------------
+
+class TestCmdRun:
+ @patch("src.agent_evaluation.cli.run_sample", return_value=0)
+ @patch("src.agent_evaluation.cli.discover_samples")
+ def test_run_by_exact_name(self, mock_discover, mock_run, mock_samples):
+ """Running by exact name should call run_sample exactly once and return 0."""
+ mock_discover.return_value = mock_samples
+ args = argparse.Namespace(name="agentic_evaluation", sample=0, index_fname=None)
+ result = cmd_run(args)
+ assert result == 0
+ mock_run.assert_called_once()
+
+ @patch("src.agent_evaluation.cli.run_sample", return_value=0)
+ @patch("src.agent_evaluation.cli.discover_samples")
+ def test_run_by_partial_name(self, mock_discover, mock_run, mock_samples):
+ """Running by a unique partial name ("agentic") should return 0."""
+ mock_discover.return_value = mock_samples
+ args = argparse.Namespace(name="agentic", sample=0, index_fname=None)
+ result = cmd_run(args)
+ assert result == 0
+
+ @patch("src.agent_evaluation.cli.discover_samples")
+ def test_run_not_found(self, mock_discover, mock_samples):
+ """Running with a non-matching name should return 1."""
+ mock_discover.return_value = mock_samples
+ args = argparse.Namespace(name="nonexistent", sample=0, index_fname=None)
+ result = cmd_run(args)
+ assert result == 1
+
+ @patch("src.agent_evaluation.cli.discover_samples")
+ def test_run_no_samples(self, mock_discover):
+ """Running with no samples available returns 1."""
+ mock_discover.return_value = []
+ args = argparse.Namespace(name="anything", sample=0, index_fname=None)
+ result = cmd_run(args)
+ assert result == 1
+
+ @patch("src.agent_evaluation.cli.run_sample", return_value=0)
+ @patch("src.agent_evaluation.cli.discover_samples")
+ def test_run_by_number(self, mock_discover, mock_run, mock_samples):
+ """Passing "1" as the name should run a sample and return 0 (which one is not asserted)."""
+ mock_discover.return_value = mock_samples
+ args = argparse.Namespace(name="1", sample=0, index_fname=None)
+ result = cmd_run(args)
+ assert result == 0
+
+ @patch("src.agent_evaluation.cli.discover_samples")
+ def test_run_ambiguous_name(self, mock_discover, mock_samples):
+ """The partial name "evaluation" matches both samples, so cmd_run should return 1."""
+ mock_discover.return_value = mock_samples
+ args = argparse.Namespace(name="evaluation", sample=0, index_fname=None)
+ result = cmd_run(args)
+ assert result == 1
+
+
+# ---------------------------------------------------------------------------
+# cmd_run_all
+# ---------------------------------------------------------------------------
+
+class TestCmdRunAll:
+ @patch("src.agent_evaluation.cli.run_sample", return_value=0)
+ @patch("src.agent_evaluation.cli.discover_samples")
+ def test_all_pass(self, mock_discover, mock_run, mock_samples):
+ """All samples passing should return 0, with run_sample called once per sample."""
+ mock_discover.return_value = mock_samples
+ args = argparse.Namespace()
+ result = cmd_run_all(args)
+ assert result == 0
+ assert mock_run.call_count == len(mock_samples)
+
+ @patch("src.agent_evaluation.cli.run_sample", return_value=1)
+ @patch("src.agent_evaluation.cli.discover_samples")
+ def test_any_failure_returns_one(self, mock_discover, mock_run, mock_samples):
+ """cmd_run_all should return 1 when run_sample reports failure (here for every sample)."""
+ mock_discover.return_value = mock_samples
+ args = argparse.Namespace()
+ result = cmd_run_all(args)
+ assert result == 1
+
+ @patch("src.agent_evaluation.cli.discover_samples")
+ def test_no_samples(self, mock_discover):
+ """No samples available should return 1."""
+ mock_discover.return_value = []
+ args = argparse.Namespace()
+ result = cmd_run_all(args)
+ assert result == 1
+
+
+# ---------------------------------------------------------------------------
+# cmd_info
+# ---------------------------------------------------------------------------
+
+class TestCmdInfo:
+ @patch("src.agent_evaluation.cli.discover_samples")
+ def test_info_not_found(self, mock_discover, mock_samples):
+ """Non-existent sample should return 1."""
+ mock_discover.return_value = mock_samples
+ args = argparse.Namespace(name="nonexistent")
+ result = cmd_info(args)
+ assert result == 1
+
+ @patch("builtins.open", mock_open(read_data=yaml.dump(SAMPLE_YAML_CONTENT)))
+ @patch("src.agent_evaluation.cli.discover_samples")
+ def test_info_found(self, mock_discover, mock_samples):
+ """Exact-name lookup should return 0; builtins.open is mocked to serve the sample YAML."""
+ mock_discover.return_value = mock_samples
+ args = argparse.Namespace(name="agentic_evaluation")
+ result = cmd_info(args)
+ assert result == 0
+
+
+# ---------------------------------------------------------------------------
+# main
+# ---------------------------------------------------------------------------
+
+class TestMain:
+ @patch("src.agent_evaluation.cli.discover_samples", return_value=[])
+ def test_main_no_command_no_samples(self, mock_discover):
+ """With no sub-command on argv and no samples discovered, main() should exit with code 1."""
+ with patch("sys.argv", ["agent_evals"]):
+ with pytest.raises(SystemExit) as exc_info:
+ main()
+ assert exc_info.value.code == 1
+
+ @patch("src.agent_evaluation.cli.cmd_list", return_value=0)
+ def test_main_list_command(self, mock_cmd):
+ """'list' command should dispatch to cmd_list and exit with its return value (0)."""
+ with patch("sys.argv", ["agent_evals", "list"]):
+ with pytest.raises(SystemExit) as exc_info:
+ main()
+ assert exc_info.value.code == 0
+ mock_cmd.assert_called_once()
diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py
new file mode 100644
index 0000000..8fa5dc3
--- /dev/null
+++ b/tests/unit/test_client.py
@@ -0,0 +1,196 @@
+"""Unit tests for the LLM Client (src/agent_evaluation/agentic_ops/client.py)."""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from src.agent_evaluation.agentic_ops.client import LLMClient
+
+# ---------------------------------------------------------------------------
+# LLMClient._validate_messages
+# ---------------------------------------------------------------------------
+
+class TestValidateMessages:
+    """Accept/reject cases for the private ``LLMClient._validate_messages``."""
+
+    @pytest.fixture
+    def client(self):
+        with patch("src.agent_evaluation.agentic_ops.client.get_llm_client_instance"):
+            return LLMClient(temperature=0.0)
+
+    def test_valid_messages(self, client):
+        """A system message followed by a user message passes validation."""
+        client._validate_messages([
+            {"role": "system", "content": "You are helpful."},
+            {"role": "user", "content": "Hello"},
+        ])
+
+    def test_not_a_list_raises(self, client):
+        """A bare string in place of the list is rejected ("must be a list")."""
+        with pytest.raises(ValueError, match="must be a list"):
+            client._validate_messages("not a list")
+
+    def test_non_dict_message_raises(self, client):
+        """A list item that is not a dict is rejected ("must be a dictionary")."""
+        with pytest.raises(ValueError, match="must be a dictionary"):
+            client._validate_messages(["not a dict"])
+
+    def test_missing_role_raises(self, client):
+        """A message with no ``role`` key is rejected."""
+        with pytest.raises(ValueError, match="must have 'role' and 'content'"):
+            client._validate_messages([{"content": "hello"}])
+
+    def test_missing_content_raises(self, client):
+        """A message with no ``content`` key is rejected."""
+        with pytest.raises(ValueError, match="must have 'role' and 'content'"):
+            client._validate_messages([{"role": "user"}])
+
+    def test_invalid_role_raises(self, client):
+        """A role value of ``invalid`` is rejected ("invalid role")."""
+        with pytest.raises(ValueError, match="invalid role"):
+            client._validate_messages([{"role": "invalid", "content": "hi"}])
+
+    def test_valid_roles(self, client):
+        """``system``, ``user`` and ``assistant`` are all accepted roles."""
+        client._validate_messages([
+            {"role": "system", "content": "sys"},
+            {"role": "user", "content": "usr"},
+            {"role": "assistant", "content": "asst"},
+        ])
+
+
+# ---------------------------------------------------------------------------
+# LLMClient._parse_json_response
+# ---------------------------------------------------------------------------
+
+class TestParseJsonResponse:
+    """Parsing cases for the private ``LLMClient._parse_json_response``."""
+
+    @pytest.fixture
+    def client(self):
+        with patch("src.agent_evaluation.agentic_ops.client.get_llm_client_instance"):
+            return LLMClient(temperature=0.0)
+
+    def test_valid_json(self, client):
+        """A plain JSON object string is decoded to the matching dict."""
+        assert client._parse_json_response('{"key": "value"}') == {"key": "value"}
+
+    def test_json_with_markdown_fencing(self, client):
+        """JSON inside a ```json fence is parsed to the enclosed object."""
+        parsed = client._parse_json_response('```json\n{"key": "value"}\n```')
+        assert parsed == {"key": "value"}
+
+    def test_invalid_json_raises(self, client):
+        """Undecodable text raises a ValueError mentioning 'Invalid JSON'."""
+        with pytest.raises(ValueError, match="Invalid JSON"):
+            client._parse_json_response("not valid json {")
+
+    def test_json_array(self, client):
+        """A top-level JSON array is decoded to a list."""
+        parsed = client._parse_json_response('[1, 2, 3]')
+        assert parsed == [1, 2, 3]
+
+    def test_whitespace_handling(self, client):
+        """Surrounding spaces and newlines do not prevent decoding."""
+        parsed = client._parse_json_response(' \n {"a": 1} \n ')
+        assert parsed == {"a": 1}
+
+
+# ---------------------------------------------------------------------------
+# LLMClient.get_llm_raw_response
+# ---------------------------------------------------------------------------
+
+class TestGetLlmRawResponse:
+    @patch("src.agent_evaluation.agentic_ops.client.get_llm_response")
+    def test_builds_messages_correctly(self, mock_get_response):
+        """Prompts are sent as a system/user message pair; the reply is returned."""
+        mock_get_response.return_value = "response text"
+
+        with patch("src.agent_evaluation.agentic_ops.client.get_llm_client_instance"):
+            llm = LLMClient(temperature=0.5)
+
+        reply = llm.get_llm_raw_response("system prompt", "user input")
+        assert reply == "response text"
+
+        # The message list is the first positional argument of the patched call.
+        sent = mock_get_response.call_args.args[0]
+        assert sent[0] == {"role": "system", "content": "system prompt"}
+        assert sent[1] == {"role": "user", "content": "user input"}
+
+
+# ---------------------------------------------------------------------------
+# LLMClient.get_llm_response_json
+# ---------------------------------------------------------------------------
+
+class TestGetLlmResponseJson:
+    @patch("src.agent_evaluation.agentic_ops.client.get_llm_response")
+    def test_returns_parsed_json(self, mock_get_response):
+        """The JSON text produced by the patched LLM call comes back as a dict."""
+        mock_get_response.return_value = '{"score": 4}'
+
+        with patch("src.agent_evaluation.agentic_ops.client.get_llm_client_instance"):
+            llm = LLMClient()
+
+        payload = llm.get_llm_response_json("sys", "usr")
+        assert payload == {"score": 4}
+
+
+# ---------------------------------------------------------------------------
+# get_llm_response (module-level)
+# ---------------------------------------------------------------------------
+
+class TestGetLlmResponse:
+    """Module-level ``get_llm_response`` exercised against a mocked client."""
+
+    @patch("src.agent_evaluation.agentic_ops.client.DEPLOYMENT_NAME", "test-model")
+    @patch("src.agent_evaluation.agentic_ops.client.get_llm_client_instance")
+    def test_successful_response(self, mock_get_client):
+        """The content of the first choice's message is returned."""
+        from src.agent_evaluation.agentic_ops.client import get_llm_response
+
+        create = mock_get_client.return_value.chat.completions.create
+        create.return_value = MagicMock(choices=[MagicMock(message=MagicMock(content="Hello!"))])
+
+        messages = [{"role": "user", "content": "Hi"}]
+        result = get_llm_response(messages)
+        assert result == "Hello!"
+
+    @patch("src.agent_evaluation.agentic_ops.client.DEPLOYMENT_NAME", None)
+    def test_raises_on_missing_deployment(self):
+        """With the deployment name patched to None, a ValueError names the env var."""
+        from src.agent_evaluation.agentic_ops.client import get_llm_response
+
+        with pytest.raises(ValueError, match="EVAL_AZURE_OPENAI_MODEL"):
+            get_llm_response([{"role": "user", "content": "test"}])
+
+    @patch("src.agent_evaluation.agentic_ops.client.DEFAULT_RETRY_DELAY", 0)
+    @patch("src.agent_evaluation.agentic_ops.client.DEPLOYMENT_NAME", "test-model")
+    @patch("src.agent_evaluation.agentic_ops.client.get_llm_client_instance")
+    def test_retries_on_failure(self, mock_get_client):
+        """Two failing calls followed by a good one return the good reply."""
+        from src.agent_evaluation.agentic_ops.client import get_llm_response
+
+        create = mock_get_client.return_value.chat.completions.create
+        create.side_effect = [
+            Exception("transient error"),
+            Exception("transient error"),
+            MagicMock(choices=[MagicMock(message=MagicMock(content="success"))]),
+        ]
+
+        result = get_llm_response([{"role": "user", "content": "Hi"}], max_retries=3)
+        assert result == "success"
+        # All three queued outcomes were consumed: two failures, then the success.
+        assert create.call_count == 3
+
+    @patch("src.agent_evaluation.agentic_ops.client.DEFAULT_RETRY_DELAY", 0)
+    @patch("src.agent_evaluation.agentic_ops.client.DEPLOYMENT_NAME", "test-model")
+    @patch("src.agent_evaluation.agentic_ops.client.get_llm_client_instance")
+    def test_raises_after_max_retries(self, mock_get_client):
+        """A client that fails on every call ends in a 'Maximum retries' error."""
+        from src.agent_evaluation.agentic_ops.client import get_llm_response
+
+        create = mock_get_client.return_value.chat.completions.create
+        create.side_effect = Exception("permanent error")
+
+        with pytest.raises(Exception, match="Maximum retries"):
+            get_llm_response([{"role": "user", "content": "Hi"}], max_retries=2)
diff --git a/tests/unit/test_eval_factories.py b/tests/unit/test_eval_factories.py
new file mode 100644
index 0000000..dc9c016
--- /dev/null
+++ b/tests/unit/test_eval_factories.py
@@ -0,0 +1,254 @@
+"""Unit tests for eval_factory modules across all evaluation samples."""
+
+import pytest
+
+# ---------------------------------------------------------------------------
+# Agentic Evaluation - EvaluatorFactory
+# ---------------------------------------------------------------------------
+
+class TestAgenticEvaluationFactory:
+    """Lookup tests for the agentic_evaluation sample's ``EvaluatorFactory``."""
+
+    @staticmethod
+    def _factory_cls():
+        """Import this sample's ``EvaluatorFactory`` at call time and return it."""
+        from src.evaluations.offline.agentic_evaluation.eval_factory import \
+            EvaluatorFactory
+
+        return EvaluatorFactory
+
+    def test_get_relevance_evaluator(self):
+        """``relevance_evaluator`` maps to something with 'Relevance' in its name."""
+        found = self._factory_cls().get_evaluator_factory("relevance_evaluator")
+        assert found is not None
+        assert "Relevance" in found.__name__
+
+    def test_get_custom_agents_evaluator(self):
+        """``custom_agents_invoked_evaluator`` maps to ``EvaluateAgentsInvoked``."""
+        found = self._factory_cls().get_evaluator_factory("custom_agents_invoked_evaluator")
+        assert found is not None
+        assert found.__name__ == "EvaluateAgentsInvoked"
+
+    def test_get_task_adherence_evaluator(self):
+        """``task_adherence_evaluator`` maps to a 'TaskAdherence'-named object."""
+        found = self._factory_cls().get_evaluator_factory("task_adherence_evaluator")
+        assert found is not None
+        assert "TaskAdherence" in found.__name__
+
+    def test_get_tool_call_accuracy_evaluator(self):
+        """``tool_call_accuracy_evaluator`` maps to a 'ToolCallAccuracy'-named object."""
+        found = self._factory_cls().get_evaluator_factory("tool_call_accuracy_evaluator")
+        assert found is not None
+        assert "ToolCallAccuracy" in found.__name__
+
+    def test_invalid_evaluator_raises(self):
+        """An unregistered key raises a ValueError containing 'not found'."""
+        factory_cls = self._factory_cls()
+        with pytest.raises(ValueError, match="not found"):
+            factory_cls.get_evaluator_factory("nonexistent_evaluator")
+
+    def test_all_registered_evaluators_are_callable(self):
+        """Every key in ``EVALUATOR_FACTORIES`` resolves to a callable."""
+        factory_cls = self._factory_cls()
+        for name in factory_cls.EVALUATOR_FACTORIES:
+            built = factory_cls.get_evaluator_factory(name)
+            assert callable(built), f"{name} factory is not callable"
+
+
+# ---------------------------------------------------------------------------
+# AI Judge Evaluation Custom - EvaluatorFactory
+# ---------------------------------------------------------------------------
+
+class TestAiJudgeEvaluationFactory:
+    """Lookup tests for the ai_judge_evaluation_custom ``EvaluatorFactory``.
+
+    Covers the four ``custom_*`` keys, two non-custom keys, the unknown-key
+    error, and a sweep over every registered key.
+    """
+
+    @staticmethod
+    def _factory_cls():
+        """Import this sample's ``EvaluatorFactory`` at call time and return it."""
+        from src.evaluations.offline.ai_judge_evaluation_custom.eval_factory import \
+            EvaluatorFactory
+
+        return EvaluatorFactory
+
+    def test_get_custom_coherence(self):
+        """``custom_coherence_evaluator`` maps to ``CoherenceEvaluatorCustom``."""
+        found = self._factory_cls().get_evaluator_factory("custom_coherence_evaluator")
+        assert found.__name__ == "CoherenceEvaluatorCustom"
+
+    def test_get_custom_relevance(self):
+        """``custom_relevance_evaluator`` maps to ``RelevanceEvaluatorCustom``."""
+        found = self._factory_cls().get_evaluator_factory("custom_relevance_evaluator")
+        assert found.__name__ == "RelevanceEvaluatorCustom"
+
+    def test_get_custom_fluency(self):
+        """``custom_fluency_evaluator`` maps to ``FluencyEvaluatorCustom``."""
+        found = self._factory_cls().get_evaluator_factory("custom_fluency_evaluator")
+        assert found.__name__ == "FluencyEvaluatorCustom"
+
+    def test_get_custom_similarity(self):
+        """``custom_similarity_evaluator`` maps to ``SimilarityEvaluatorCustom``."""
+        found = self._factory_cls().get_evaluator_factory("custom_similarity_evaluator")
+        assert found.__name__ == "SimilarityEvaluatorCustom"
+
+    def test_get_builtin_relevance(self):
+        """``relevance_evaluator`` maps to something with 'Relevance' in its name."""
+        found = self._factory_cls().get_evaluator_factory("relevance_evaluator")
+        assert "Relevance" in found.__name__
+
+    def test_get_builtin_coherence(self):
+        """``coherence_evaluator`` maps to something with 'Coherence' in its name."""
+        found = self._factory_cls().get_evaluator_factory("coherence_evaluator")
+        assert "Coherence" in found.__name__
+
+    def test_invalid_evaluator_raises(self):
+        """An unregistered key raises a ValueError containing 'not found'."""
+        factory_cls = self._factory_cls()
+        with pytest.raises(ValueError, match="not found"):
+            factory_cls.get_evaluator_factory("invalid_evaluator")
+
+    def test_all_registered_evaluators_are_callable(self):
+        """Every key in ``EVALUATOR_FACTORIES`` resolves to a callable."""
+        factory_cls = self._factory_cls()
+        for name in factory_cls.EVALUATOR_FACTORIES:
+            built = factory_cls.get_evaluator_factory(name)
+            assert callable(built), f"{name} factory is not callable"
+
+
+# ---------------------------------------------------------------------------
+# Pipeline Experiment Evaluation - EvaluatorFactory
+# ---------------------------------------------------------------------------
+
+class TestPipelineExperimentFactory:
+    @staticmethod
+    def _factory_cls():
+        """Import this sample's ``EvaluatorFactory`` at call time and return it."""
+        from src.evaluations.offline.pipeline_experiment_evaluation.eval_factory import \
+            EvaluatorFactory
+        return EvaluatorFactory
+
+    def test_get_relevance(self):
+        """``relevance_evaluator`` maps to something with 'Relevance' in its name."""
+        found = self._factory_cls().get_evaluator_factory("relevance_evaluator")
+        assert "Relevance" in found.__name__
+
+    def test_get_task_adherence(self):
+        """``task_adherence_evaluator`` maps to a 'TaskAdherence'-named object."""
+        found = self._factory_cls().get_evaluator_factory("task_adherence_evaluator")
+        assert "TaskAdherence" in found.__name__
+
+    def test_get_tool_call_accuracy(self):
+        """``tool_call_accuracy_evaluator`` maps to a 'ToolCallAccuracy'-named object."""
+        found = self._factory_cls().get_evaluator_factory("tool_call_accuracy_evaluator")
+        assert "ToolCallAccuracy" in found.__name__
+
+    def test_invalid_raises(self):
+        """An unregistered key raises a ValueError containing 'not found'."""
+        factory_cls = self._factory_cls()
+        with pytest.raises(ValueError, match="not found"):
+            factory_cls.get_evaluator_factory("does_not_exist")
+
+
+# ---------------------------------------------------------------------------
+# Pipeline Multi-Agent Evaluation - EvaluatorFactory
+# ---------------------------------------------------------------------------
+
+class TestPipelineMultiAgentFactory:
+    """Lookup tests for the pipeline_multi_agent_evaluation ``EvaluatorFactory``."""
+
+    @staticmethod
+    def _factory_cls():
+        """Import this sample's ``EvaluatorFactory`` at call time and return it."""
+        from src.evaluations.offline.pipeline_multi_agent_evaluation.eval_factory import \
+            EvaluatorFactory
+        return EvaluatorFactory
+
+    def test_get_relevance(self):
+        """``relevance_evaluator`` maps to something with 'Relevance' in its name."""
+        found = self._factory_cls().get_evaluator_factory("relevance_evaluator")
+        assert "Relevance" in found.__name__
+
+    def test_get_task_adherence(self):
+        """``task_adherence_evaluator`` maps to a 'TaskAdherence'-named object."""
+        found = self._factory_cls().get_evaluator_factory("task_adherence_evaluator")
+        assert "TaskAdherence" in found.__name__
+
+    def test_get_agents_invoked(self):
+        """``evaluate_agents_invoked`` maps to ``EvaluateAgentsInvoked``."""
+        found = self._factory_cls().get_evaluator_factory("evaluate_agents_invoked")
+        assert found.__name__ == "EvaluateAgentsInvoked"
+
+    def test_get_custom_agents_invoked(self):
+        """``custom_agents_invoked_accuracy_eval`` maps to ``EvaluateAgentsInvoked`` too."""
+        found = self._factory_cls().get_evaluator_factory("custom_agents_invoked_accuracy_eval")
+        assert found.__name__ == "EvaluateAgentsInvoked"
+
+    def test_invalid_raises(self):
+        """An unregistered key raises a ValueError containing 'not found'."""
+        factory_cls = self._factory_cls()
+        with pytest.raises(ValueError, match="not found"):
+            factory_cls.get_evaluator_factory("bogus")
+
+
+# ---------------------------------------------------------------------------
+# Pipeline Multi-Tool Agent Evaluation - EvaluatorFactory
+# ---------------------------------------------------------------------------
+
+class TestPipelineMultiToolFactory:
+    @staticmethod
+    def _factory_cls():
+        """Import this sample's ``EvaluatorFactory`` at call time and return it."""
+        from src.evaluations.offline.pipeline_multi_tool_agent_evaluation.eval_factory import \
+            EvaluatorFactory
+        return EvaluatorFactory
+
+    def test_get_relevance(self):
+        """``relevance_evaluator`` maps to something with 'Relevance' in its name."""
+        found = self._factory_cls().get_evaluator_factory("relevance_evaluator")
+        assert "Relevance" in found.__name__
+
+    def test_get_task_adherence(self):
+        """``task_adherence_evaluator`` maps to a 'TaskAdherence'-named object."""
+        found = self._factory_cls().get_evaluator_factory("task_adherence_evaluator")
+        assert "TaskAdherence" in found.__name__
+
+    def test_get_tool_call_accuracy(self):
+        """``tool_call_accuracy_evaluator`` maps to a 'ToolCallAccuracy'-named object."""
+        found = self._factory_cls().get_evaluator_factory("tool_call_accuracy_evaluator")
+        assert "ToolCallAccuracy" in found.__name__
+
+    def test_invalid_raises(self):
+        """An unregistered key raises a ValueError containing 'not found'."""
+        factory_cls = self._factory_cls()
+        with pytest.raises(ValueError, match="not found"):
+            factory_cls.get_evaluator_factory("unknown")
+
+
+# ---------------------------------------------------------------------------
+# RAG Evaluation Foundry - EvaluatorFactory
+# ---------------------------------------------------------------------------
+
+class TestRagEvaluationFoundryFactory:
+    @staticmethod
+    def _factory_cls():  # imports the sample's EvaluatorFactory at call time
+        from src.evaluations.offline.rag_evaluation_foundry.eval_factory import \
+            EvaluatorFactory
+        return EvaluatorFactory
+
+    def test_get_relevance(self):
+        """``relevance_evaluator`` maps to something with 'Relevance' in its name."""
+        found = self._factory_cls().get_evaluator_factory("relevance_evaluator")
+        assert "Relevance" in found.__name__
+
+    def test_invalid_raises(self):
+        """An unregistered key raises a ValueError containing 'not found'."""
+        factory_cls = self._factory_cls()
+        with pytest.raises(ValueError, match="not found"):
+            factory_cls.get_evaluator_factory("missing")
+
+    def test_only_has_relevance(self):
+        """The registry holds exactly one key, ``relevance_evaluator``."""
+        assert list(self._factory_cls().EVALUATOR_FACTORIES) == ["relevance_evaluator"]
diff --git a/tests/unit/test_evaluation_utils.py b/tests/unit/test_evaluation_utils.py
new file mode 100644
index 0000000..f28a6c7
--- /dev/null
+++ b/tests/unit/test_evaluation_utils.py
@@ -0,0 +1,130 @@
+"""Unit tests for agent evaluation utility functions (evaluation_utils, agent_tools)."""
+
+import pytest
+
+from src.evaluations.offline.pipeline_multi_agent_evaluation.evaluator.evaluator_repo.eval_utils.evaluation_utils import (
+ agent_invoked_accuracy, calculate_match_percentage)
+
+# ---------------------------------------------------------------------------
+# agent_invoked_accuracy
+# ---------------------------------------------------------------------------
+
+class TestAgentInvokedAccuracy:
+    def test_exact_match(self):
+        """Predicted and expected lists that are identical, order included, give True."""
+        assert agent_invoked_accuracy(["AgentA", "AgentB"], ["AgentA", "AgentB"]) is True
+
+    def test_same_agents_different_order(self):
+        """The same two agents listed in swapped order still give True."""
+        assert agent_invoked_accuracy(["AgentB", "AgentA"], ["AgentA", "AgentB"]) is True
+
+    def test_missing_agent(self):
+        """Predicting only one of the two expected agents gives False."""
+        assert agent_invoked_accuracy(["AgentA"], ["AgentA", "AgentB"]) is False
+
+    def test_extra_agent(self):
+        """Predicting one agent beyond the two expected ones gives False."""
+        assert agent_invoked_accuracy(["AgentA", "AgentB", "AgentC"], ["AgentA", "AgentB"]) is False
+
+    def test_empty_both(self):
+        """Two empty lists count as a match (True)."""
+        assert agent_invoked_accuracy([], []) is True
+
+    def test_empty_predicted(self):
+        """No predicted agents (first argument) against one expected agent gives False."""
+        assert agent_invoked_accuracy([], ["AgentA"]) is False
+
+    def test_empty_expected(self):
+        """One predicted agent against no expected agents (second argument) gives False."""
+        assert agent_invoked_accuracy(["AgentA"], []) is False
+
+    def test_single_agent_match(self):
+        """The same single agent on both sides gives True."""
+        assert agent_invoked_accuracy(["AgentA"], ["AgentA"]) is True
+
+    def test_duplicate_agents(self):
+        """A predicted agent listed twice still matches a single expected entry."""
+        assert agent_invoked_accuracy(["AgentA", "AgentA"], ["AgentA"]) is True
+
+
+# ---------------------------------------------------------------------------
+# calculate_match_percentage
+# ---------------------------------------------------------------------------
+
+class TestCalculateMatchPercentage:
+    def test_full_match(self):
+        """Both expected agents (first argument) appear in predicted: result 1.0."""
+        assert calculate_match_percentage(["A", "B"], ["A", "B"]) == 1.0
+
+    def test_partial_match(self):
+        """One of two expected agents appears in predicted: result 0.5."""
+        assert calculate_match_percentage(["A", "B"], ["A"]) == 0.5
+
+    def test_no_match(self):
+        """Expected and predicted share no agent: result 0.0."""
+        assert calculate_match_percentage(["A", "B"], ["C", "D"]) == 0.0
+
+    def test_empty_expected(self):
+        """An empty expected list gives 0.0 rather than an error."""
+        assert calculate_match_percentage([], ["A"]) == 0.0
+
+    def test_empty_predicted(self):
+        """An empty predicted list (second argument) gives 0.0."""
+        assert calculate_match_percentage(["A", "B"], []) == 0.0
+
+    def test_extra_predicted_agents(self):
+        """Predicted agents beyond the expected one leave the result at 1.0."""
+        assert calculate_match_percentage(["A"], ["A", "B", "C"]) == 1.0
+
+    def test_three_of_four(self):
+        """Three of four expected agents appear in predicted: result 0.75."""
+        assert calculate_match_percentage(["A", "B", "C", "D"], ["A", "B", "C"]) == 0.75
+
+
+# ---------------------------------------------------------------------------
+# EvaluateAgentsInvoked
+# ---------------------------------------------------------------------------
+
+class TestEvaluateAgentsInvoked:
+    """Score-dictionary checks for the ``EvaluateAgentsInvoked`` evaluator."""
+
+    @pytest.fixture
+    def evaluator(self):
+        """Import the evaluator class when the fixture runs and instantiate it."""
+        from src.evaluations.offline.pipeline_multi_agent_evaluation.evaluator.evaluator_repo.evaluate_agent_invoked import \
+            EvaluateAgentsInvoked
+
+        return EvaluateAgentsInvoked()
+
+    def test_exact_match_returns_accuracy_1(self, evaluator):
+        """Identical lists score 1.0 accuracy, exact match, and 1.0 match percentage."""
+        expected = ["ACAgent", "TVAgent"]
+        predicted = ["ACAgent", "TVAgent"]
+        scores = evaluator(expected_agents_to_invoke=expected, predicted_agents_to_invoke=predicted)
+        assert scores["agents_invoke_accuracy"] == 1.0
+        assert scores["agents_invoke_exact_match"] is True
+        assert scores["agents_invoke_match_percentage"] == 1.0
+
+    def test_orchestrator_filtered(self, evaluator):
+        """A predicted ``OrchestratorAgent`` entry does not break the exact match."""
+        expected = ["ACAgent"]
+        predicted = ["OrchestratorAgent", "ACAgent"]
+        scores = evaluator(expected_agents_to_invoke=expected, predicted_agents_to_invoke=predicted)
+        assert scores["agents_invoke_accuracy"] == 1.0
+        assert scores["agents_invoke_exact_match"] is True
+
+    def test_mismatch(self, evaluator):
+        """Disjoint expected and predicted lists score 0.0 accuracy, no exact match."""
+        expected = ["ACAgent", "TVAgent"]
+        predicted = ["DishwasherAgent"]
+        scores = evaluator(expected_agents_to_invoke=expected, predicted_agents_to_invoke=predicted)
+        assert scores["agents_invoke_accuracy"] == 0.0
+        assert scores["agents_invoke_exact_match"] is False
+
+    def test_partial_match_percentage(self, evaluator):
+        """One of two expected agents predicted gives a 0.5 match percentage."""
+        expected = ["ACAgent", "TVAgent"]
+        predicted = ["ACAgent"]
+        scores = evaluator(expected_agents_to_invoke=expected, predicted_agents_to_invoke=predicted)
+        assert scores["agents_invoke_match_percentage"] == 0.5
+        assert scores["agents_invoke_exact_match"] is False
diff --git a/tests/unit/test_run_eval.py b/tests/unit/test_run_eval.py
new file mode 100644
index 0000000..1455a5f
--- /dev/null
+++ b/tests/unit/test_run_eval.py
@@ -0,0 +1,152 @@
+"""Unit tests for run_eval module (src/agent_evaluation/agentic_ops/run_eval.py)."""
+
+from unittest.mock import MagicMock, patch
+
+from src.agent_evaluation.agentic_ops.run_eval import (setup_evaluation,
+ should_pass_config)
+
+# ---------------------------------------------------------------------------
+# should_pass_config
+# ---------------------------------------------------------------------------
+
+class TestShouldPassConfig:
+    def test_function_with_required_arg(self):
+        """A callable with one required positional parameter yields True."""
+        def needs_config(config):
+            return None
+        assert should_pass_config(needs_config) is True
+
+    def test_function_no_args(self):
+        """A callable that takes no parameters yields False."""
+        def takes_nothing():
+            return None
+        assert should_pass_config(takes_nothing) is False
+
+    def test_function_only_defaults(self):
+        """A callable whose parameters all have defaults yields False."""
+        def all_defaulted(x=10, y=20):
+            return None
+        assert should_pass_config(all_defaulted) is False
+
+    def test_function_with_kwargs_only(self):
+        """A callable accepting only ``**kwargs`` yields False."""
+        def catch_all(**kwargs):
+            return None
+        assert should_pass_config(catch_all) is False
+
+    def test_function_keyword_only_required(self):
+        """A callable with a required keyword-only parameter yields True."""
+        def needs_keyword(*, config):
+            return None
+        assert should_pass_config(needs_keyword) is True
+
+
+# ---------------------------------------------------------------------------
+# setup_evaluation
+# ---------------------------------------------------------------------------
+
+class TestSetupEvaluation:
+    """Tests for ``setup_evaluation`` driven by stub evaluator factories.
+
+    Each test hands ``setup_evaluation`` a mock factory whose
+    ``get_evaluator_factory`` returns a local ``factory_func``, and makes the
+    call with the ``_AZURE_ENV`` variables patched into ``os.environ``.
+    """
+
+    # Environment variables patched in around every ``setup_evaluation`` call.
+    _AZURE_ENV = {
+        "EVAL_AZURE_OPENAI_ENDPOINT": "https://test.openai.azure.com",
+        "EVAL_AZURE_OPENAI_MODEL": "gpt-4",
+        "EVAL_AZURE_OPENAI_VERSION": "2024-01-01",
+    }
+
+    def test_setup_with_model_config_param(self):
+        """A factory taking an optional ``model_config`` ends up registered."""
+        mock_evaluator = MagicMock()
+
+        def factory_func(model_config=None):
+            return mock_evaluator
+
+        mock_eval_factory = MagicMock()
+        mock_eval_factory.get_evaluator_factory.return_value = factory_func
+
+        config = {
+            "evaluators": {"test_eval": "test_factory"},
+            "evaluator_config": {},
+        }
+
+        with patch.dict("os.environ", self._AZURE_ENV):
+            evaluators, _ = setup_evaluation(config, mock_eval_factory)
+
+        # NOTE(review): this checks the registered evaluator only; the value
+        # passed as ``model_config`` is not inspected.
+        assert "test_eval" in evaluators
+        assert evaluators["test_eval"] == mock_evaluator
+
+    def test_setup_with_no_params_factory(self):
+        """A factory taking no parameters is called and its result registered."""
+        mock_evaluator = MagicMock()
+
+        def factory_func():
+            return mock_evaluator
+
+        mock_eval_factory = MagicMock()
+        mock_eval_factory.get_evaluator_factory.return_value = factory_func
+
+        config = {
+            "evaluators": {"simple_eval": "simple_factory"},
+            "evaluator_config": {},
+        }
+
+        with patch.dict("os.environ", self._AZURE_ENV):
+            evaluators, _ = setup_evaluation(config, mock_eval_factory)
+
+        assert "simple_eval" in evaluators
+        assert evaluators["simple_eval"] == mock_evaluator
+
+    def test_setup_with_azure_ai_project_param(self):
+        """A factory taking an optional ``azure_ai_project`` ends up registered."""
+        mock_evaluator = MagicMock()
+
+        def factory_func(azure_ai_project=None):
+            return mock_evaluator
+
+        mock_eval_factory = MagicMock()
+        mock_eval_factory.get_evaluator_factory.return_value = factory_func
+
+        config = {
+            "evaluators": {"proj_eval": "proj_factory"},
+            "evaluator_config": {},
+        }
+
+        mock_project = MagicMock()
+        with patch.dict("os.environ", self._AZURE_ENV):
+            evaluators, _ = setup_evaluation(config, mock_eval_factory, azure_ai_project=mock_project)
+
+        # NOTE(review): whether ``mock_project`` actually reaches ``factory_func``
+        # is not asserted; only the registration is.
+        assert "proj_eval" in evaluators
+
+    def test_setup_resolves_column_mapping_placeholder(self):
+        """A ``use_column_mapping`` placeholder is replaced by ``column_mapping``."""
+        mock_evaluator = MagicMock()
+
+        def factory_func():
+            return mock_evaluator
+
+        mock_eval_factory = MagicMock()
+        mock_eval_factory.get_evaluator_factory.return_value = factory_func
+
+        config = {
+            "evaluators": {"test_eval": "test_factory"},
+            "column_mapping": {"query": "${data.query}"},
+            "evaluator_config": {
+                "test_eval": {"column_mapping": "use_column_mapping"},
+            },
+        }
+
+        with patch.dict("os.environ", self._AZURE_ENV):
+            _, evaluator_config = setup_evaluation(config, mock_eval_factory)
+
+        # The placeholder string must come back as the top-level mapping itself.
+        assert evaluator_config["test_eval"]["column_mapping"] == {"query": "${data.query}"}
diff --git a/tests/unit/test_runner.py b/tests/unit/test_runner.py
new file mode 100644
index 0000000..bb6287d
--- /dev/null
+++ b/tests/unit/test_runner.py
@@ -0,0 +1,99 @@
+"""Unit tests for the pipeline runner (src/agent_evaluation/agentic_ops/runner.py)."""
+
+import argparse
+from unittest.mock import patch
+
+import pytest
+import yaml
+
+from src.agent_evaluation.agentic_ops.runner import load_config, parse_args
+
+# ---------------------------------------------------------------------------
+# load_config
+# ---------------------------------------------------------------------------
+
+class TestLoadConfig:
+    """Success and failure cases for ``load_config`` using ``tmp_path`` files."""
+
+    def test_loads_valid_yaml(self, tmp_path):
+        """A dict dumped to YAML on disk is loaded back unchanged."""
+        expected = {"app_name": "TestApp", "pipeline": []}
+        path = tmp_path / "experiment.yaml"
+        path.write_text(yaml.dump(expected))
+
+        assert load_config(path) == expected
+
+    def test_raises_on_missing_file(self, tmp_path):
+        """A path that was never created raises FileNotFoundError."""
+        absent = tmp_path / "missing.yaml"
+        with pytest.raises(FileNotFoundError):
+            load_config(absent)
+
+    def test_returns_none_for_empty_yaml(self, tmp_path):
+        """A zero-length YAML file loads as None."""
+        path = tmp_path / "empty.yaml"
+        path.write_text("")
+        assert load_config(path) is None
+
+
+# ---------------------------------------------------------------------------
+# parse_args
+# ---------------------------------------------------------------------------
+
+class TestParseArgs:
+    """Defaults and overrides of ``parse_args``; ``_parse`` runs it as ``runner <cli>``."""
+
+    @staticmethod
+    def _parse(*cli):
+        with patch("sys.argv", ["runner", *cli]):
+            return parse_args()
+
+    def test_defaults(self):
+        """With no options, config_file, index_fname and sample take their defaults."""
+        args = self._parse()
+        assert args.config_file == "experiment.yaml"
+        assert args.index_fname is None
+        assert args.sample == 0
+
+    def test_custom_config_file(self):
+        """``--config_file`` overrides the config path."""
+        assert self._parse("--config_file", "custom.yaml").config_file == "custom.yaml"
+
+    def test_sample_arg(self):
+        """``--sample 10`` is stored as the number 10."""
+        assert self._parse("--sample", "10").sample == 10
+
+    def test_index_fname_arg(self):
+        """``--index_fname`` is stored as given."""
+        assert self._parse("--index_fname", "file_001").index_fname == "file_001"
+
+
+# ---------------------------------------------------------------------------
+# run_pipeline
+# ---------------------------------------------------------------------------
+
+class TestRunPipeline:
+    """``run_pipeline`` is expected to raise ``SystemExit`` on unusable configs."""
+
+    @patch("src.agent_evaluation.agentic_ops.runner.importlib.import_module")
+    @patch("src.agent_evaluation.agentic_ops.runner.load_config")
+    def test_run_pipeline_exits_on_invalid_step(self, mock_load_config, mock_import):
+        """A step that lacks ``base_path`` and ``module`` makes the run exit."""
+        from src.agent_evaluation.agentic_ops.runner import run_pipeline
+
+        mock_load_config.return_value = {
+            "experiment_name": "test",
+            "pipeline": [{"config_key": "evaluation"}],  # missing base_path and module
+            "evaluation": {},
+        }
+        with pytest.raises(SystemExit):
+            run_pipeline("test/experiment.yaml", argparse.Namespace(sample=0, index_fname=None))
+
+    @patch("src.agent_evaluation.agentic_ops.runner.load_config")
+    def test_run_pipeline_exits_on_empty_pipeline(self, mock_load_config):
+        """A config whose ``pipeline`` list is empty makes the run exit."""
+        from src.agent_evaluation.agentic_ops.runner import run_pipeline
+
+        mock_load_config.return_value = {"pipeline": []}
+        with pytest.raises(SystemExit):
+            run_pipeline("test/experiment.yaml", argparse.Namespace(sample=0, index_fname=None))
diff --git a/tests/unit/test_trace_to_jsonl.py b/tests/unit/test_trace_to_jsonl.py
new file mode 100644
index 0000000..596a9a4
--- /dev/null
+++ b/tests/unit/test_trace_to_jsonl.py
@@ -0,0 +1,138 @@
+"""Unit tests for trace_to_jsonl shared module."""
+
+import json
+
+from src.evaluations.offline.utils.trace_to_jsonl import (
+ extract_tool_call_from_span, extract_tool_definitions,
+ merge_tool_definitions)
+
+# ---------------------------------------------------------------------------
+# extract_tool_definitions
+# ---------------------------------------------------------------------------
+
+class TestExtractToolDefinitions:
+    def test_extracts_function_tools(self):
+        """A lone function-type definition yields one entry carrying its name and description."""
+        weather_fn = {
+            "name": "get_weather",
+            "description": "Get weather data",
+            "parameters": {"type": "object"},
+        }
+        # The attribute value is handed over JSON-encoded, not as a Python list.
+        payload = json.dumps([{"type": "function", "function": weather_fn}])
+
+        extracted = extract_tool_definitions({"gen_ai.tool.definitions": payload})
+
+        assert len(extracted) == 1
+        only = extracted[0]
+        assert only["name"] == "get_weather"
+        assert only["description"] == "Get weather data"
+
+    def test_empty_string(self):
+        """An empty definitions string produces an empty list."""
+        extracted = extract_tool_definitions({"gen_ai.tool.definitions": ""})
+        assert extracted == []
+
+    def test_missing_key(self):
+        """A dict without the ``gen_ai.tool.definitions`` key produces an empty list."""
+        extracted = extract_tool_definitions({})
+        assert extracted == []
+
+    def test_invalid_json(self):
+        """A value that is not parseable JSON produces an empty list."""
+        raw = "not json"
+        assert extract_tool_definitions({"gen_ai.tool.definitions": raw}) == []
+
+    def test_non_list_json(self):
+        """Valid JSON that is not a list produces an empty list."""
+        # A JSON object parses without error but is the wrong container type.
+        raw = '{"key": "value"}'
+        assert extract_tool_definitions({"gen_ai.tool.definitions": raw}) == []
+
+    def test_skips_non_function_type(self):
+        """An entry whose type is not "function" is left out of the result."""
+        payload = json.dumps([{"type": "retrieval", "name": "search"}])
+
+        extracted = extract_tool_definitions({"gen_ai.tool.definitions": payload})
+
+        assert extracted == []
+
+    def test_multiple_tools(self):
+        """Every function-type entry is extracted, in input order."""
+        tool_defs = [
+            {"type": "function", "function": {"name": label, "description": blurb, "parameters": {}}}
+            for label, blurb in (("tool_a", "A"), ("tool_b", "B"))
+        ]
+        payload = json.dumps(tool_defs)
+
+        extracted = extract_tool_definitions({"gen_ai.tool.definitions": payload})
+
+        assert len(extracted) == 2
+        names = [entry["name"] for entry in extracted]
+        assert names == ["tool_a", "tool_b"]
+
+
+# ---------------------------------------------------------------------------
+# merge_tool_definitions
+# ---------------------------------------------------------------------------
+
+class TestMergeToolDefinitions:
+    def test_merge_new_tools(self):
+        """A tool with a previously unseen name appears alongside the existing one."""
+        current = [{"name": "tool_a", "id": "tool_a"}]
+        incoming = [{"name": "tool_b", "id": "tool_b"}]
+
+        merged = merge_tool_definitions(current, incoming)
+
+        assert {tool["name"] for tool in merged} == {"tool_a", "tool_b"}
+
+    def test_deduplicates_by_name(self):
+        """A tool whose name is already present is not added a second time."""
+        current = [{"name": "tool_a", "id": "1", "description": "first"}]
+        incoming = [{"name": "tool_a", "id": "2", "description": "second"}]
+
+        merged = merge_tool_definitions(current, incoming)
+
+        assert len(merged) == 1
+        assert merged[0]["description"] == "first"  # the pre-existing entry wins
+
+    def test_empty_new(self):
+        """Merging in an empty list gives a result equal to the existing tools."""
+        current = [{"name": "tool_a"}]
+        merged = merge_tool_definitions(current, [])
+        assert merged == current
+
+    def test_empty_existing(self):
+        """With nothing existing, both new tools end up in the result."""
+        incoming = [{"name": "tool_a"}, {"name": "tool_b"}]
+        assert len(merge_tool_definitions([], incoming)) == 2
+
+    def test_both_empty(self):
+        """Two empty inputs merge into an empty list."""
+        merged = merge_tool_definitions([], [])
+        assert merged == []
+
+
+# ---------------------------------------------------------------------------
+# extract_tool_call_from_span
+# ---------------------------------------------------------------------------
+
+class TestExtractToolCallFromSpan:
+    def test_extracts_from_operation_name(self):
+        """The tool name is taken from an 'execute_tool <name>' operation name."""
+        call = extract_tool_call_from_span({}, "execute_tool get_weather")
+
+        assert (call["type"], call["name"]) == ("tool_call", "get_weather")
+
+    def test_falls_back_to_custom_dims(self):
+        """For another operation name, ``gen_ai.tool.name`` from the dims supplies the name."""
+        dims = {"gen_ai.tool.name": "search_tool"}
+        call = extract_tool_call_from_span(dims, "some_other_operation")
+
+        assert (call["type"], call["name"]) == ("tool_call", "search_tool")
+
+    def test_empty_name(self):
+        """With no name available from either source, the name is an empty string."""
+        call = extract_tool_call_from_span({}, "other_span")
+
+        assert (call["type"], call["name"]) == ("tool_call", "")
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
new file mode 100644
index 0000000..81306b0
--- /dev/null
+++ b/tests/unit/test_utils.py
@@ -0,0 +1,151 @@
+"""Unit tests for utility modules (file_operations, constants)."""
+
+import json
+
+from src.evaluations.offline.utils.constants import EVAL_NAME
+from src.evaluations.offline.utils.file_operations import (
+ append_to_jsonl, get_next_run_id, load_queries_from_jsonl, save_to_jsonl)
+
+# ---------------------------------------------------------------------------
+# constants
+# ---------------------------------------------------------------------------
+
+class TestConstants:
+    def test_eval_name_value(self):
+        """EVAL_NAME is pinned to the literal string 'experiment_name'."""
+        assert "experiment_name" == EVAL_NAME
+
+
+# ---------------------------------------------------------------------------
+# load_queries_from_jsonl
+# ---------------------------------------------------------------------------
+
+class TestLoadQueriesFromJsonl:
+    def test_loads_valid_jsonl(self, tmp_path):
+        """Every record of a well-formed JSONL file is returned, in file order."""
+        data = [{"query": "q1"}, {"query": "q2"}, {"query": "q3"}]
+        f = tmp_path / "test.jsonl"
+        f.write_text("\n".join(json.dumps(d) for d in data))
+
+        loaded = load_queries_from_jsonl(str(f))
+        assert loaded == data
+
+    def test_skips_blank_lines(self, tmp_path):
+        """Blank lines between and after records are ignored."""
+        f = tmp_path / "test.jsonl"
+        f.write_text('{"a":1}\n\n{"b":2}\n\n')
+
+        # Compare contents, not just the count, so a wrongly parsed record is caught too.
+        assert load_queries_from_jsonl(str(f)) == [{"a": 1}, {"b": 2}]
+
+    def test_empty_file(self, tmp_path):
+        """A zero-byte file yields an empty list."""
+        f = tmp_path / "empty.jsonl"
+        f.write_text("")
+
+        assert load_queries_from_jsonl(str(f)) == []
+
+    def test_preserves_unicode(self, tmp_path):
+        """Non-ASCII text stored as ASCII-escaped JSON loads back as the original string."""
+        text = "H\u00e9llo w\u00f6rld \u4e16\u754c"  # accented Latin plus CJK characters
+        f = tmp_path / "unicode.jsonl"
+        # ensure_ascii=True keeps the file pure ASCII, so the platform default encoding cannot interfere.
+        f.write_text(json.dumps({"query": text}, ensure_ascii=True) + "\n")
+
+        loaded = load_queries_from_jsonl(str(f))
+        assert loaded[0]["query"] == text
+
+
+# ---------------------------------------------------------------------------
+# save_to_jsonl
+# ---------------------------------------------------------------------------
+
+class TestSaveToJsonl:
+    def test_saves_data(self, tmp_path):
+        """Each dict in the list is written as its own JSON line, in order."""
+        out_file = tmp_path / "out.jsonl"
+        records = [{"a": 1}, {"b": 2}]
+
+        save_to_jsonl(str(out_file), records)
+
+        written = out_file.read_text().strip().split("\n")
+        assert len(written) == 2
+        assert [json.loads(row) for row in written] == records
+
+    def test_empty_list(self, tmp_path):
+        """Saving an empty list leaves a file with no content."""
+        out_file = tmp_path / "empty.jsonl"
+        save_to_jsonl(str(out_file), [])
+        assert out_file.read_text() == ""
+
+    def test_overwrites_existing(self, tmp_path):
+        """Content already in the file is replaced, not kept."""
+        out_file = tmp_path / "overwrite.jsonl"
+        out_file.write_text("old content")
+
+        save_to_jsonl(str(out_file), [{"new": True}])
+
+        first_line = out_file.read_text().strip().split("\n")[0]
+        assert json.loads(first_line) == {"new": True}
+
+
+# ---------------------------------------------------------------------------
+# append_to_jsonl
+# ---------------------------------------------------------------------------
+
+class TestAppendToJsonl:
+    def test_appends_single_record(self, tmp_path):
+        """Appending to a one-line file leaves two lines, the last being the new record."""
+        target = tmp_path / "append.jsonl"
+        target.write_text('{"a":1}\n')
+
+        append_to_jsonl(str(target), {"b": 2})
+
+        rows = target.read_text().strip().split("\n")
+        assert len(rows) == 2
+        assert json.loads(rows[-1]) == {"b": 2}
+
+    def test_creates_file_if_missing(self, tmp_path):
+        """Appending to a path that does not exist yet creates the file with that record."""
+        target = tmp_path / "new.jsonl"
+        append_to_jsonl(str(target), {"first": True})
+
+        assert target.exists()
+        assert json.loads(target.read_text().strip()) == {"first": True}
+
+
+# ---------------------------------------------------------------------------
+# get_next_run_id
+# ---------------------------------------------------------------------------
+
+class TestGetNextRunId:
+    def test_empty_directory(self, tmp_path):
+        """A directory with no files starts numbering at 1."""
+        assert get_next_run_id(str(tmp_path)) == 1
+
+    def test_nonexistent_directory(self, tmp_path):
+        """A path that does not exist also starts numbering at 1."""
+        missing_dir = tmp_path / "nonexistent"
+        assert get_next_run_id(str(missing_dir)) == 1
+
+    def test_sequential_numbering(self, tmp_path):
+        """With result files numbered 1 to 3 present, the next id is 4."""
+        for run_no in (1, 2, 3):
+            (tmp_path / f"{run_no}_eval_result.json").write_text("{}")
+
+        assert get_next_run_id(str(tmp_path)) == 4
+
+    def test_ignores_non_matching_files(self, tmp_path):
+        """Files such as readme.md or config.yaml do not influence the next id."""
+        seeded = {"2_eval_result.json": "{}", "readme.md": "# Hi", "config.yaml": ""}
+        for file_name, content in seeded.items():
+            (tmp_path / file_name).write_text(content)
+
+        assert get_next_run_id(str(tmp_path)) == 3
+
+    def test_handles_gaps(self, tmp_path):
+        """The next id follows the highest existing number (5 -> 6), not the file count."""
+        for file_name in ("1_a.json", "5_b.json"):
+            (tmp_path / file_name).write_text("{}")
+
+        assert get_next_run_id(str(tmp_path)) == 6