diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000..986f4fb --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,34 @@ +name: "CodeQL" + +on: + push: + branches: [main] + pull_request: + branches: [main] + schedule: + - cron: "25 14 * * 1" + +jobs: + analyze: + name: Analyze (Python) + runs-on: ubuntu-latest + permissions: + security-events: write + packages: read + actions: read + contents: read + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: python + queries: security-and-quality + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:python" diff --git a/.github/workflows/pr-checks.yml b/.github/workflows/pr-checks.yml new file mode 100644 index 0000000..df39cd3 --- /dev/null +++ b/.github/workflows/pr-checks.yml @@ -0,0 +1,42 @@ +name: PR Checks + +on: + pull_request: + branches: [main] + push: + branches: [main] + +jobs: + checks: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.11", "3.12", "3.13"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install flake8 isort + + - name: Lint with flake8 + run: | + flake8 src/ tests/ --count --select=E9,F63,F7,F82 --show-source --statistics + flake8 src/ tests/ --count --max-line-length=120 --statistics --exit-zero + + - name: Check import ordering with isort + run: | + isort --check-only --diff src/ tests/ + + - name: Run unit tests + run: | + python -m pytest tests/ -v --tb=short diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..9855d94 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,6 @@ +[pytest] 
+testpaths = tests +python_files = test_*.py +python_classes = Test* +python_functions = test_* +addopts = -v --tb=short diff --git a/requirements.txt b/requirements.txt index cd6ed8e..713b8b2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Azure SDK dependencies azure-identity>=1.25.3 azure-ai-projects>=2.2.0 -azure-ai-evaluation==1.16.9 +azure-ai-evaluation==1.17.0 azure-ai-inference>=1.0.0b9 # Core Python packages python-dotenv>=1.2.2 @@ -9,8 +9,11 @@ pyyaml>=6.0.3 pip-system-certs>=5.3 azure-monitor-query>=2.0.0 azure-monitor-opentelemetry>=1.8.8 -aiohttp>=3.13.5 -agent-framework==1.7.0 +aiohttp>=3.14.1 +agent-framework>=1.9.0 streamlit>=1.58.0 pandas==2.3.3 -plotly>=6.7.0 +plotly>=6.8.0 +# Test dependencies +pytest>=9.0.0 +pytest-asyncio>=1.3.0 diff --git a/src/agent_evaluation/agentic_ops/base_evaluator.py b/src/agent_evaluation/agentic_ops/base_evaluator.py index ba6e9f3..2ba3971 100644 --- a/src/agent_evaluation/agentic_ops/base_evaluator.py +++ b/src/agent_evaluation/agentic_ops/base_evaluator.py @@ -1,9 +1,9 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- +import logging import os import re -import logging from typing import Dict, Union from .client import LLMClient diff --git a/src/agent_evaluation/agentic_ops/client.py b/src/agent_evaluation/agentic_ops/client.py index c34aeb1..199f127 100644 --- a/src/agent_evaluation/agentic_ops/client.py +++ b/src/agent_evaluation/agentic_ops/client.py @@ -8,12 +8,13 @@ import json import logging -from openai import AzureOpenAI -from azure.identity import DefaultAzureCredential, get_bearer_token_provider import os -from dotenv import load_dotenv import time +from azure.identity import DefaultAzureCredential, get_bearer_token_provider +from dotenv import load_dotenv +from openai import AzureOpenAI + load_dotenv() # Configure logging diff --git a/src/agent_evaluation/agentic_ops/run_eval.py b/src/agent_evaluation/agentic_ops/run_eval.py index f06928e..ca4dd4e 100644 --- a/src/agent_evaluation/agentic_ops/run_eval.py +++ b/src/agent_evaluation/agentic_ops/run_eval.py @@ -1,11 +1,13 @@ -import os import inspect +import logging +import os import uuid -from dotenv import load_dotenv + from azure.ai.evaluation import evaluate from azure.ai.projects import AIProjectClient -import logging from azure.identity import DefaultAzureCredential +from dotenv import load_dotenv + def get_logger(name: str): level = os.environ.get("LOG_LEVEL", "INFO").upper() diff --git a/src/agent_evaluation/agentic_ops/runner.py b/src/agent_evaluation/agentic_ops/runner.py index 5126171..a6719dd 100644 --- a/src/agent_evaluation/agentic_ops/runner.py +++ b/src/agent_evaluation/agentic_ops/runner.py @@ -1,12 +1,14 @@ import argparse import importlib +import logging +import os import sys import time from pathlib import Path -from typing import Optional, Any, Dict +from typing import Any, Dict, Optional + import yaml -import os -import logging + def get_logger(name: str): level = os.environ.get("LOG_LEVEL", "INFO").upper() diff --git 
a/src/agent_evaluation/cli.py b/src/agent_evaluation/cli.py index 495a8fd..013efb5 100644 --- a/src/agent_evaluation/cli.py +++ b/src/agent_evaluation/cli.py @@ -9,11 +9,10 @@ import argparse import sys from pathlib import Path -from typing import List, Dict, Optional +from typing import Dict, List, Optional import yaml - # Root of the project (two levels up from this file) ROOT_DIR = Path(__file__).resolve().parents[2] SAMPLES_DIR = ROOT_DIR / "src" / "evaluations" / "offline" @@ -74,7 +73,8 @@ def print_samples_table(samples: List[Dict[str, str]]) -> None: def run_sample(sample: Dict[str, str], extra_args: Optional[List[str]] = None) -> int: """Run a selected evaluation sample.""" - from src.agent_evaluation.agentic_ops.runner import run_pipeline, parse_args + from src.agent_evaluation.agentic_ops.runner import (parse_args, + run_pipeline) config_path = sample["config_path"] print(f"\n{'='*70}") diff --git a/src/evaluations/offline/agentic_evaluation/eval_factory.py b/src/evaluations/offline/agentic_evaluation/eval_factory.py index 19f9c89..440fff9 100644 --- a/src/evaluations/offline/agentic_evaluation/eval_factory.py +++ b/src/evaluations/offline/agentic_evaluation/eval_factory.py @@ -1,8 +1,12 @@ -from azure.ai.evaluation import RelevanceEvaluator, TaskAdherenceEvaluator, ToolCallAccuracyEvaluator -from .evaluator.evaluator_repo.evaluate_agent_invoked import EvaluateAgentsInvoked - -import os import logging +import os + +from azure.ai.evaluation import (RelevanceEvaluator, TaskAdherenceEvaluator, + ToolCallAccuracyEvaluator) + +from .evaluator.evaluator_repo.evaluate_agent_invoked import \ + EvaluateAgentsInvoked + def get_logger(name: str): level = os.environ.get("LOG_LEVEL", "INFO").upper() diff --git a/src/evaluations/offline/agentic_evaluation/evaluator/eval_main.py b/src/evaluations/offline/agentic_evaluation/evaluator/eval_main.py index 62254d2..919f28e 100644 --- a/src/evaluations/offline/agentic_evaluation/evaluator/eval_main.py +++ 
b/src/evaluations/offline/agentic_evaluation/evaluator/eval_main.py @@ -1,9 +1,11 @@ +import logging import os from pathlib import Path + from src.agent_evaluation.agentic_ops.run_eval import execute_eval -import logging from src.evaluations.offline.utils.constants import EVAL_NAME from src.evaluations.offline.utils.file_operations import get_next_run_id + from ..eval_factory import EvaluatorFactory diff --git a/src/evaluations/offline/agentic_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py b/src/evaluations/offline/agentic_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py index 1158b21..1eb437f 100644 --- a/src/evaluations/offline/agentic_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py +++ b/src/evaluations/offline/agentic_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py @@ -1,5 +1,6 @@ from .eval_utils.evaluation_utils import agent_invoked_accuracy, compute_recall + class EvaluateAgentsInvoked: def __init__(self): pass diff --git a/src/evaluations/offline/ai_judge_evaluation_custom/eval_factory.py b/src/evaluations/offline/ai_judge_evaluation_custom/eval_factory.py index 81545b1..312fea0 100644 --- a/src/evaluations/offline/ai_judge_evaluation_custom/eval_factory.py +++ b/src/evaluations/offline/ai_judge_evaluation_custom/eval_factory.py @@ -1,11 +1,13 @@ -from azure.ai.evaluation import RelevanceEvaluator, CoherenceEvaluator +import logging +import os + +from azure.ai.evaluation import CoherenceEvaluator, RelevanceEvaluator + from .evaluator.evaluator_repo.coherence import CoherenceEvaluatorCustom -from .evaluator.evaluator_repo.relevance import RelevanceEvaluatorCustom from .evaluator.evaluator_repo.fluency import FluencyEvaluatorCustom +from .evaluator.evaluator_repo.relevance import RelevanceEvaluatorCustom from .evaluator.evaluator_repo.similarity import SimilarityEvaluatorCustom -import os -import logging def get_logger(name: str): level = os.environ.get("LOG_LEVEL", "INFO").upper() diff --git 
a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/eval_main.py b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/eval_main.py index 14245ef..141df04 100644 --- a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/eval_main.py +++ b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/eval_main.py @@ -1,9 +1,11 @@ +import logging import os from pathlib import Path + from src.agent_evaluation.agentic_ops.run_eval import execute_eval -import logging from src.evaluations.offline.utils.constants import EVAL_NAME from src.evaluations.offline.utils.file_operations import get_next_run_id + from ..eval_factory import EvaluatorFactory diff --git a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/coherence.py b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/coherence.py index 3d06c39..7fec400 100644 --- a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/coherence.py +++ b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/coherence.py @@ -1,5 +1,7 @@ from typing import Dict, Union -from ......agent_evaluation.agentic_ops.base_evaluator import BaseCustomEvaluator + +from ......agent_evaluation.agentic_ops.base_evaluator import \ + BaseCustomEvaluator class CoherenceEvaluatorCustom(BaseCustomEvaluator): diff --git a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/fluency.py b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/fluency.py index 5a03f3d..aeff3a1 100644 --- a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/fluency.py +++ b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/fluency.py @@ -2,7 +2,9 @@ # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- from typing import Dict, Union -from ......agent_evaluation.agentic_ops.base_evaluator import BaseCustomEvaluator + +from ......agent_evaluation.agentic_ops.base_evaluator import \ + BaseCustomEvaluator class FluencyEvaluatorCustom(BaseCustomEvaluator): diff --git a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/relevance.py b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/relevance.py index 71cd9f3..3cd9b73 100644 --- a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/relevance.py +++ b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/relevance.py @@ -1,5 +1,7 @@ from typing import Dict, Union -from ......agent_evaluation.agentic_ops.base_evaluator import BaseCustomEvaluator + +from ......agent_evaluation.agentic_ops.base_evaluator import \ + BaseCustomEvaluator class RelevanceEvaluatorCustom(BaseCustomEvaluator): diff --git a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/similarity.py b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/similarity.py index b2a34bd..3dd147f 100644 --- a/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/similarity.py +++ b/src/evaluations/offline/ai_judge_evaluation_custom/evaluator/evaluator_repo/similarity.py @@ -2,7 +2,9 @@ # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- from typing import Dict, Union -from ......agent_evaluation.agentic_ops.base_evaluator import BaseCustomEvaluator + +from ......agent_evaluation.agentic_ops.base_evaluator import \ + BaseCustomEvaluator class SimilarityEvaluatorCustom(BaseCustomEvaluator): diff --git a/src/evaluations/offline/pipeline_experiment_evaluation/eval_factory.py b/src/evaluations/offline/pipeline_experiment_evaluation/eval_factory.py index 1fd459c..1fa5560 100644 --- a/src/evaluations/offline/pipeline_experiment_evaluation/eval_factory.py +++ b/src/evaluations/offline/pipeline_experiment_evaluation/eval_factory.py @@ -1,7 +1,9 @@ -from azure.ai.evaluation import RelevanceEvaluator, TaskAdherenceEvaluator, ToolCallAccuracyEvaluator - -import os import logging +import os + +from azure.ai.evaluation import (RelevanceEvaluator, TaskAdherenceEvaluator, + ToolCallAccuracyEvaluator) + def get_logger(name: str): level = os.environ.get("LOG_LEVEL", "INFO").upper() diff --git a/src/evaluations/offline/pipeline_experiment_evaluation/evaluator/eval_main.py b/src/evaluations/offline/pipeline_experiment_evaluation/evaluator/eval_main.py index 14245ef..141df04 100644 --- a/src/evaluations/offline/pipeline_experiment_evaluation/evaluator/eval_main.py +++ b/src/evaluations/offline/pipeline_experiment_evaluation/evaluator/eval_main.py @@ -1,9 +1,11 @@ +import logging import os from pathlib import Path + from src.agent_evaluation.agentic_ops.run_eval import execute_eval -import logging from src.evaluations.offline.utils.constants import EVAL_NAME from src.evaluations.offline.utils.file_operations import get_next_run_id + from ..eval_factory import EvaluatorFactory diff --git a/src/evaluations/offline/pipeline_experiment_evaluation/experiment/agent_inference.py b/src/evaluations/offline/pipeline_experiment_evaluation/experiment/agent_inference.py index ddb1e73..8def77a 100644 --- 
a/src/evaluations/offline/pipeline_experiment_evaluation/experiment/agent_inference.py +++ b/src/evaluations/offline/pipeline_experiment_evaluation/experiment/agent_inference.py @@ -11,11 +11,14 @@ FLOW: Load Queries → Run Inference → Save Responses """ -import os import logging +import os import random from pathlib import Path -from src.evaluations.offline.utils.file_operations import load_queries_from_jsonl + +from src.evaluations.offline.utils.file_operations import \ + load_queries_from_jsonl + from .experiment_utils import get_file_paths, prepare_output_file, save_result @@ -101,7 +104,7 @@ def inference_main(config: dict, args=None) -> None: if __name__ == "__main__": # For standalone execution, load config from experiment.yaml import yaml - + # Get project root (go up 5 levels from this file) current_file = Path(__file__) # .../experiment/agent_inference.py project_root = current_file.parent.parent.parent.parent.parent.parent # Go up to project root diff --git a/src/evaluations/offline/pipeline_experiment_evaluation/experiment/experiment_utils/file_utils.py b/src/evaluations/offline/pipeline_experiment_evaluation/experiment/experiment_utils/file_utils.py index 0f27fc0..379241b 100644 --- a/src/evaluations/offline/pipeline_experiment_evaluation/experiment/experiment_utils/file_utils.py +++ b/src/evaluations/offline/pipeline_experiment_evaluation/experiment/experiment_utils/file_utils.py @@ -3,11 +3,11 @@ ============== Helper functions for file management in inference pipelines. 
""" -import os import logging +import os from pathlib import Path -from src.evaluations.offline.utils.file_operations import append_to_jsonl +from src.evaluations.offline.utils.file_operations import append_to_jsonl logger = logging.getLogger(__name__) diff --git a/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/agent_tools.py b/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/agent_tools.py index 24d2636..dd14b67 100644 --- a/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/agent_tools.py +++ b/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/agent_tools.py @@ -1,9 +1,10 @@ """Tool functions for the device agents in the Multi-Agent system.""" +from random import choice, randint from typing import Annotated -from pydantic import Field -from random import randint, choice + from agent_framework import tool +from pydantic import Field # ============================================================================= diff --git a/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/multi_agent_orchestrator.py b/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/multi_agent_orchestrator.py index b7f7de5..6efccf7 100644 --- a/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/multi_agent_orchestrator.py +++ b/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_inference/multi_agent_orchestrator.py @@ -22,60 +22,36 @@ """ import asyncio import json -import os import logging -from typing import Annotated +import os from pathlib import Path +from typing import Annotated + from agent_framework import Agent, tool from agent_framework.observability import enable_instrumentation, get_tracer +from agent_framework.openai import OpenAIChatClient +from azure.identity import AzureCliCredential from azure.monitor.opentelemetry import configure_azure_monitor from opentelemetry import context as otel_context from 
opentelemetry.trace import SpanKind from opentelemetry.trace.span import format_trace_id -from agent_framework.openai import OpenAIChatClient -from azure.identity import AzureCliCredential from pydantic import Field # Handle both standalone and package execution try: # When run as part of pipeline (package import) - from .agent_tools import ( - # AC tools - set_ac_temperature, - turn_ac_on, - turn_ac_off, - set_ac_mode, - get_ac_status, - # TV tools - turn_tv_on, - turn_tv_off, - set_tv_channel, - set_tv_volume, - get_tv_status, - # Dishwasher tools - start_dishwasher, - stop_dishwasher, - get_dishwasher_status, - set_dishwasher_delay, - ) + from .agent_tools import ( # AC tools; TV tools; Dishwasher tools + get_ac_status, get_dishwasher_status, get_tv_status, set_ac_mode, + set_ac_temperature, set_dishwasher_delay, set_tv_channel, + set_tv_volume, start_dishwasher, stop_dishwasher, turn_ac_off, + turn_ac_on, turn_tv_off, turn_tv_on) except ImportError: # When run standalone - from agent_tools import ( - set_ac_temperature, - turn_ac_on, - turn_ac_off, - set_ac_mode, - get_ac_status, - turn_tv_on, - turn_tv_off, - set_tv_channel, - set_tv_volume, - get_tv_status, - start_dishwasher, - stop_dishwasher, - get_dishwasher_status, - set_dishwasher_delay, - ) + from agent_tools import (get_ac_status, get_dishwasher_status, + get_tv_status, set_ac_mode, set_ac_temperature, + set_dishwasher_delay, set_tv_channel, + set_tv_volume, start_dishwasher, stop_dishwasher, + turn_ac_off, turn_ac_on, turn_tv_off, turn_tv_on) try: from src.evaluations.offline.utils.file_operations import append_to_jsonl @@ -86,6 +62,7 @@ from src.evaluations.offline.utils.file_operations import append_to_jsonl from dotenv import load_dotenv + load_dotenv() diff --git a/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py b/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py index 11c82b7..eef116d 100644 
--- a/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py +++ b/src/evaluations/offline/pipeline_multi_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py @@ -3,12 +3,14 @@ Delegates to the shared trace_to_jsonl module in utils/. """ -from src.evaluations.offline.utils.trace_to_jsonl import get_trace_main # noqa: F401 +from src.evaluations.offline.utils.trace_to_jsonl import \ + get_trace_main # noqa: F401 if __name__ == "__main__": - import yaml from pathlib import Path + import yaml + script_dir = Path(__file__).parent config_path = script_dir.parent / "experiment.yaml" diff --git a/src/evaluations/offline/pipeline_multi_agent_evaluation/eval_factory.py b/src/evaluations/offline/pipeline_multi_agent_evaluation/eval_factory.py index 8ae2cb8..fbd30ad 100644 --- a/src/evaluations/offline/pipeline_multi_agent_evaluation/eval_factory.py +++ b/src/evaluations/offline/pipeline_multi_agent_evaluation/eval_factory.py @@ -1,8 +1,12 @@ -from azure.ai.evaluation import RelevanceEvaluator, TaskAdherenceEvaluator, ToolCallAccuracyEvaluator -from .evaluator.evaluator_repo.evaluate_agent_invoked import EvaluateAgentsInvoked - -import os import logging +import os + +from azure.ai.evaluation import (RelevanceEvaluator, TaskAdherenceEvaluator, + ToolCallAccuracyEvaluator) + +from .evaluator.evaluator_repo.evaluate_agent_invoked import \ + EvaluateAgentsInvoked + def get_logger(name: str): level = os.environ.get("LOG_LEVEL", "INFO").upper() diff --git a/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/eval_main.py b/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/eval_main.py index 7e3748c..141df04 100644 --- a/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/eval_main.py +++ b/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/eval_main.py @@ -1,11 +1,12 @@ +import logging import os from pathlib import Path + from src.agent_evaluation.agentic_ops.run_eval 
import execute_eval -import logging from src.evaluations.offline.utils.constants import EVAL_NAME from src.evaluations.offline.utils.file_operations import get_next_run_id -from ..eval_factory import EvaluatorFactory +from ..eval_factory import EvaluatorFactory def get_logger(name: str): diff --git a/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py b/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py index 1102f3e..77a61df 100644 --- a/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py +++ b/src/evaluations/offline/pipeline_multi_agent_evaluation/evaluator/evaluator_repo/evaluate_agent_invoked.py @@ -1,4 +1,6 @@ -from .eval_utils.evaluation_utils import agent_invoked_accuracy, calculate_match_percentage +from .eval_utils.evaluation_utils import (agent_invoked_accuracy, + calculate_match_percentage) + class EvaluateAgentsInvoked: def __init__(self): diff --git a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/agent_tools.py b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/agent_tools.py index 7eec989..1d80b7b 100644 --- a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/agent_tools.py +++ b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/agent_tools.py @@ -1,8 +1,10 @@ """Tool functions for the Multi-Tool Agent.""" +from random import randint from typing import Annotated + from pydantic import Field -from random import randint + def get_weather( location: Annotated[str, Field(description="The location to get the weather for.")], diff --git a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/multi_tool_agent.py b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/multi_tool_agent.py index 4382c79..faac3a9 100644 --- 
a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/multi_tool_agent.py +++ b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_inference/multi_tool_agent.py @@ -9,44 +9,31 @@ """ import asyncio import json -import os import logging +import os from datetime import datetime from pathlib import Path + from agent_framework import Agent, ChatOptions from agent_framework.observability import enable_instrumentation, get_tracer +from agent_framework.openai import OpenAIChatClient +from azure.identity import AzureCliCredential from azure.monitor.opentelemetry import configure_azure_monitor from opentelemetry import context as otel_context from opentelemetry.trace import SpanKind from opentelemetry.trace.span import format_trace_id -from agent_framework.openai import OpenAIChatClient -from azure.identity import AzureCliCredential # Handle both standalone and package execution try: # When run as part of pipeline (package import) - from .agent_tools import ( - get_current_datetime, - calculate_sum, - calculate_product, - convert_temperature, - count_words, - generate_uuid, - format_json, - get_weather - ) + from .agent_tools import (calculate_product, calculate_sum, + convert_temperature, count_words, format_json, + generate_uuid, get_current_datetime, get_weather) except ImportError: # When run standalone - from agent_tools import ( - get_current_datetime, - calculate_sum, - calculate_product, - convert_temperature, - count_words, - generate_uuid, - format_json, - get_weather - ) + from agent_tools import (calculate_product, calculate_sum, + convert_temperature, count_words, format_json, + generate_uuid, get_current_datetime, get_weather) try: from src.evaluations.offline.utils.file_operations import append_to_jsonl @@ -57,6 +44,7 @@ from src.evaluations.offline.utils.file_operations import append_to_jsonl from dotenv import load_dotenv + load_dotenv() @@ -193,7 +181,8 @@ async def run_inference_async(config: dict) -> None: # 
agent_framework's own spans, so disabling just this instrumentor # is safe. try: - from azure.ai.projects.telemetry._responses_instrumentor import ResponsesInstrumentor + from azure.ai.projects.telemetry._responses_instrumentor import \ + ResponsesInstrumentor if ResponsesInstrumentor().is_instrumented(): ResponsesInstrumentor().uninstrument() logger.info("[AGENT] Disabled azure-ai-projects ResponsesInstrumentor (parallel-tool-call bug workaround)") @@ -275,7 +264,7 @@ def inference_main(config: dict, args=None) -> None: # ============================================================================= if __name__ == "__main__": import yaml - + # Get config path relative to this file script_dir = Path(__file__).parent config_path = script_dir.parent / "experiment.yaml" diff --git a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py index 74eaee7..79b0532 100644 --- a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py +++ b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/agent_telemetry_extraction/trace_to_jsonl.py @@ -4,12 +4,14 @@ Delegates to the shared trace_to_jsonl module in utils/. 
""" -from src.evaluations.offline.utils.trace_to_jsonl import get_trace_main # noqa: F401 +from src.evaluations.offline.utils.trace_to_jsonl import \ + get_trace_main # noqa: F401 if __name__ == "__main__": - import yaml from pathlib import Path + import yaml + script_dir = Path(__file__).parent config_path = script_dir.parent / "experiment.yaml" diff --git a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/eval_factory.py b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/eval_factory.py index 1fd459c..1fa5560 100644 --- a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/eval_factory.py +++ b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/eval_factory.py @@ -1,7 +1,9 @@ -from azure.ai.evaluation import RelevanceEvaluator, TaskAdherenceEvaluator, ToolCallAccuracyEvaluator - -import os import logging +import os + +from azure.ai.evaluation import (RelevanceEvaluator, TaskAdherenceEvaluator, + ToolCallAccuracyEvaluator) + def get_logger(name: str): level = os.environ.get("LOG_LEVEL", "INFO").upper() diff --git a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/evaluator/eval_main.py b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/evaluator/eval_main.py index 7e3748c..141df04 100644 --- a/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/evaluator/eval_main.py +++ b/src/evaluations/offline/pipeline_multi_tool_agent_evaluation/evaluator/eval_main.py @@ -1,11 +1,12 @@ +import logging import os from pathlib import Path + from src.agent_evaluation.agentic_ops.run_eval import execute_eval -import logging from src.evaluations.offline.utils.constants import EVAL_NAME from src.evaluations.offline.utils.file_operations import get_next_run_id -from ..eval_factory import EvaluatorFactory +from ..eval_factory import EvaluatorFactory def get_logger(name: str): diff --git a/src/evaluations/offline/rag_evaluation_foundry/eval_factory.py 
b/src/evaluations/offline/rag_evaluation_foundry/eval_factory.py index a4af652..949a336 100644 --- a/src/evaluations/offline/rag_evaluation_foundry/eval_factory.py +++ b/src/evaluations/offline/rag_evaluation_foundry/eval_factory.py @@ -1,7 +1,8 @@ +import logging +import os + from azure.ai.evaluation import RelevanceEvaluator -import os -import logging def get_logger(name: str): level = os.environ.get("LOG_LEVEL", "INFO").upper() diff --git a/src/evaluations/offline/rag_evaluation_foundry/evaluator/eval_main.py b/src/evaluations/offline/rag_evaluation_foundry/evaluator/eval_main.py index 14245ef..141df04 100644 --- a/src/evaluations/offline/rag_evaluation_foundry/evaluator/eval_main.py +++ b/src/evaluations/offline/rag_evaluation_foundry/evaluator/eval_main.py @@ -1,9 +1,11 @@ +import logging import os from pathlib import Path + from src.agent_evaluation.agentic_ops.run_eval import execute_eval -import logging from src.evaluations.offline.utils.constants import EVAL_NAME from src.evaluations.offline.utils.file_operations import get_next_run_id + from ..eval_factory import EvaluatorFactory diff --git a/src/evaluations/offline/utils/trace_to_jsonl.py b/src/evaluations/offline/utils/trace_to_jsonl.py index fbe0696..b56f6ff 100644 --- a/src/evaluations/offline/utils/trace_to_jsonl.py +++ b/src/evaluations/offline/utils/trace_to_jsonl.py @@ -12,16 +12,18 @@ Used by both pipeline_multi_agent_evaluation and pipeline_multi_tool_agent_evaluation. 
""" -import os import json import logging +import os import time -from pathlib import Path -from typing import Dict, List, Any from datetime import timedelta -from azure.monitor.query import LogsQueryClient, LogsQueryStatus +from pathlib import Path +from typing import Any, Dict, List + +from azure.core.exceptions import (HttpResponseError, ServiceRequestError, + ServiceResponseError) from azure.identity import DefaultAzureCredential -from azure.core.exceptions import HttpResponseError, ServiceRequestError, ServiceResponseError +from azure.monitor.query import LogsQueryClient, LogsQueryStatus from dotenv import load_dotenv load_dotenv() diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/test_agent_tools.py b/tests/unit/test_agent_tools.py new file mode 100644 index 0000000..8844ab1 --- /dev/null +++ b/tests/unit/test_agent_tools.py @@ -0,0 +1,157 @@ +"""Unit tests for the agent_tools module (device agent tools).""" + +from src.evaluations.offline.pipeline_multi_agent_evaluation.agent_inference.agent_tools import ( + get_ac_status, get_dishwasher_status, get_tv_status, set_ac_mode, + set_ac_temperature, set_dishwasher_delay, set_tv_channel, set_tv_volume, + start_dishwasher, stop_dishwasher, turn_ac_off, turn_ac_on, turn_tv_off, + turn_tv_on) + +# --------------------------------------------------------------------------- +# AC Tools +# --------------------------------------------------------------------------- + +class TestACTools: + def test_set_temperature_valid(self): + """Valid temperature should succeed.""" + result = set_ac_temperature(temperature=72) + assert "72" in result + assert "set to" in result.lower() or "72°F" in result + + def test_set_temperature_too_low(self): + """Temperature below range should return error.""" + result = set_ac_temperature(temperature=50) + assert "error" in 
result.lower() or "out of range" in result.lower() + + def test_set_temperature_too_high(self): + """Temperature above range should return error.""" + result = set_ac_temperature(temperature=90) + assert "error" in result.lower() or "out of range" in result.lower() + + def test_set_temperature_boundary_low(self): + """60°F should be accepted.""" + result = set_ac_temperature(temperature=60) + assert "60" in result + + def test_set_temperature_boundary_high(self): + """85°F should be accepted.""" + result = set_ac_temperature(temperature=85) + assert "85" in result + + def test_turn_ac_on(self): + """Should confirm AC turned on.""" + result = turn_ac_on() + assert "on" in result.lower() + + def test_turn_ac_off(self): + """Should confirm AC turned off.""" + result = turn_ac_off() + assert "off" in result.lower() + + def test_set_ac_mode_valid(self): + """Valid modes should succeed.""" + for mode in ["cool", "heat", "fan", "auto"]: + result = set_ac_mode(mode=mode) + assert mode in result.lower() + + def test_set_ac_mode_invalid(self): + """Invalid mode should return error.""" + result = set_ac_mode(mode="turbo") + assert "error" in result.lower() or "invalid" in result.lower() + + def test_get_ac_status(self): + """Should return a status string.""" + result = get_ac_status() + assert isinstance(result, str) + assert "AC" in result or "ac" in result.lower() + + +# --------------------------------------------------------------------------- +# TV Tools +# --------------------------------------------------------------------------- + +class TestTVTools: + def test_turn_tv_on(self): + """Should confirm TV turned on.""" + result = turn_tv_on() + assert "on" in result.lower() + + def test_turn_tv_off(self): + """Should confirm TV turned off.""" + result = turn_tv_off() + assert "off" in result.lower() + + def test_set_channel_valid(self): + """Valid channel should succeed.""" + result = set_tv_channel(channel=42) + assert "42" in result + + def 
test_set_channel_too_low(self): + """Channel 0 should return error.""" + result = set_tv_channel(channel=0) + assert "error" in result.lower() or "out of range" in result.lower() + + def test_set_channel_too_high(self): + """Channel 1000 should return error.""" + result = set_tv_channel(channel=1000) + assert "error" in result.lower() or "out of range" in result.lower() + + def test_set_channel_boundary(self): + """Channels 1 and 999 should be accepted.""" + assert "1" in set_tv_channel(channel=1) + assert "999" in set_tv_channel(channel=999) + + def test_set_volume_valid(self): + """Valid volume should succeed.""" + result = set_tv_volume(volume=50) + assert "50" in result + + def test_set_volume_too_low(self): + """Volume -1 should return error.""" + result = set_tv_volume(volume=-1) + assert "error" in result.lower() or "out of range" in result.lower() + + def test_set_volume_too_high(self): + """Volume 101 should return error.""" + result = set_tv_volume(volume=101) + assert "error" in result.lower() or "out of range" in result.lower() + + def test_set_volume_boundaries(self): + """Volume 0 and 100 should be accepted.""" + assert "0" in set_tv_volume(volume=0) + assert "100" in set_tv_volume(volume=100) + + def test_get_tv_status(self): + """Should return a status string.""" + result = get_tv_status() + assert isinstance(result, str) + assert "TV" in result or "tv" in result.lower() + + +# --------------------------------------------------------------------------- +# Dishwasher Tools +# --------------------------------------------------------------------------- + +class TestDishwasherTools: + def test_start_dishwasher(self): + """Should confirm dishwasher started.""" + result = start_dishwasher() + assert isinstance(result, str) + assert len(result) > 0 + + def test_stop_dishwasher(self): + """Should confirm dishwasher stopped.""" + result = stop_dishwasher() + assert isinstance(result, str) + assert len(result) > 0 + + def test_get_dishwasher_status(self): + 
"""Should return a status string.""" + result = get_dishwasher_status() + assert isinstance(result, str) + assert len(result) > 0 + + def test_set_dishwasher_delay(self): + """Should confirm delay set.""" + result = set_dishwasher_delay(hours=2) + assert isinstance(result, str) + assert len(result) > 0 diff --git a/tests/unit/test_base_evaluator.py b/tests/unit/test_base_evaluator.py new file mode 100644 index 0000000..97a7b83 --- /dev/null +++ b/tests/unit/test_base_evaluator.py @@ -0,0 +1,207 @@ +"""Unit tests for the BaseCustomEvaluator (src/agent_evaluation/agentic_ops/base_evaluator.py).""" + +from unittest.mock import MagicMock, patch + +import pytest + +from src.agent_evaluation.agentic_ops.base_evaluator import BaseCustomEvaluator + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +class ConcreteEvaluator(BaseCustomEvaluator): + """Concrete test subclass of BaseCustomEvaluator.""" + + def __init__(self, model_config=None): + super().__init__( + prompty_file_name="test.prompty", + result_key="test_score", + model_config=model_config, + ) + + def __call__(self, query: str, response: str, **kwargs): + return self.evaluate(query=query, response=response, **kwargs) + + +# --------------------------------------------------------------------------- +# _extract_score +# --------------------------------------------------------------------------- + +class TestExtractScore: + @pytest.fixture + def evaluator(self): + with patch.object(BaseCustomEvaluator, "__init__", lambda self, *a, **kw: None): + e = BaseCustomEvaluator.__new__(BaseCustomEvaluator) + e.result_key = "test_score" + e.prompty_file_name = "test.prompty" + return e + + def test_extracts_structured_s2_tag(self, evaluator): + """Should extract score from 4 format.""" + assert evaluator._extract_score("4") == 4 + + def test_extracts_score_colon_format(self, evaluator): + """Should 
extract from 'Score: 5' format.""" + assert evaluator._extract_score("The evaluation is complete. Score: 5") == 5 + + def test_extracts_rating_format(self, evaluator): + """Should extract from 'Rating: 3' format.""" + assert evaluator._extract_score("Rating: 3 - The response is adequate") == 3 + + def test_returns_default_on_no_match(self, evaluator): + """Should return default when no score is found.""" + assert evaluator._extract_score("No numeric content here at all") == 3 + + def test_custom_default(self, evaluator): + """Should use custom default score.""" + assert evaluator._extract_score("no score", default_score=1) == 1 + + def test_ignores_out_of_range_scores(self, evaluator): + """Scores outside 1-5 from pattern matching should fall through.""" + # The S2 tag extraction doesn't validate range, but pattern matching does + result = evaluator._extract_score("Score: 9") + # 9 is out of range for pattern matching fallback, should use default + assert result == 3 + + def test_s2_tag_any_value(self, evaluator): + """S2 tag should accept any digit value.""" + assert evaluator._extract_score("1") == 1 + assert evaluator._extract_score("5") == 5 + + +# --------------------------------------------------------------------------- +# _create_user_prompt +# --------------------------------------------------------------------------- + +class TestCreateUserPrompt: + @pytest.fixture + def evaluator(self): + with patch.object(BaseCustomEvaluator, "__init__", lambda self, *a, **kw: None): + e = BaseCustomEvaluator.__new__(BaseCustomEvaluator) + e.result_key = "test_score" + e.prompty_file_name = "test.prompty" + return e + + def test_replaces_single_placeholder(self, evaluator): + """Should replace {{query}} with value.""" + template = "Evaluate: {{query}}" + result = evaluator._create_user_prompt(template, query="What is AI?") + assert result == "Evaluate: What is AI?" 
+ + def test_replaces_multiple_placeholders(self, evaluator): + """Should replace multiple placeholders.""" + template = "Query: {{query}}\nResponse: {{response}}" + result = evaluator._create_user_prompt(template, query="Q", response="A") + assert result == "Query: Q\nResponse: A" + + def test_leaves_unknown_placeholders(self, evaluator): + """Unresolved placeholders should remain.""" + template = "Query: {{query}} Context: {{context}}" + result = evaluator._create_user_prompt(template, query="Q") + assert "{{context}}" in result + + def test_no_placeholders(self, evaluator): + """Template without placeholders should remain unchanged.""" + template = "Plain text with no placeholders" + result = evaluator._create_user_prompt(template, query="Q") + assert result == template + + +# --------------------------------------------------------------------------- +# _load_prompt_content +# --------------------------------------------------------------------------- + +class TestLoadPromptContent: + def test_returns_fallback_on_missing_file(self): + """Missing prompty file should return fallback prompt.""" + with patch.object(BaseCustomEvaluator, "__init__", lambda self, *a, **kw: None): + e = BaseCustomEvaluator.__new__(BaseCustomEvaluator) + e.prompty_path = "/nonexistent/path/test.prompty" + e.result_key = "test_score" + e.prompty_file_name = "test.prompty" + + system, user = e._load_prompt_content() + assert "test" in user.lower() or "evaluate" in user.lower() + + def test_parses_prompty_with_system_and_user(self, tmp_path): + """Should parse system/user sections from prompty file.""" + # Prompty files have: ---\nmetadata\n---\nprompt content + # The parser splits on '---' and expects at least 3 parts + prompty_content = "---\nname: test\nmodel: gpt-4\n---\nsystem:\nYou are an evaluator.\nuser:\nEvaluate this: {{query}}\n---\n" + prompty_file = tmp_path / "test.prompty" + prompty_file.write_text(prompty_content, encoding="utf-8") + + with patch.object(BaseCustomEvaluator, 
"__init__", lambda self, *a, **kw: None): + e = BaseCustomEvaluator.__new__(BaseCustomEvaluator) + e.prompty_path = str(prompty_file) + e.result_key = "test_score" + e.prompty_file_name = "test.prompty" + + system, user = e._load_prompt_content() + # The parser should extract content from the prompty file + combined = system + user + assert len(combined) > 0 + + +# --------------------------------------------------------------------------- +# evaluate +# --------------------------------------------------------------------------- + +class TestEvaluate: + @patch("src.agent_evaluation.agentic_ops.base_evaluator.LLMClient") + def test_evaluate_returns_score(self, mock_client_cls): + """evaluate() should return dict with result_key and score.""" + mock_client = MagicMock() + mock_client.get_llm_response_with_prompty.return_value = "4" + mock_client_cls.return_value = mock_client + + with patch.object(BaseCustomEvaluator, "_load_prompt_content", return_value=("system", "{{query}}")): + with patch.object(BaseCustomEvaluator, "__init__", lambda self, *a, **kw: None): + e = BaseCustomEvaluator.__new__(BaseCustomEvaluator) + e.prompty_path = "test.prompty" + e.result_key = "test_score" + e.prompty_file_name = "test.prompty" + + result = e.evaluate(query="What is AI?") + assert result == {"test_score": 4} + + @patch("src.agent_evaluation.agentic_ops.base_evaluator.LLMClient") + def test_evaluate_returns_default_on_error(self, mock_client_cls): + """evaluate() should return default score on LLM error.""" + mock_client = MagicMock() + mock_client.get_llm_response_with_prompty.side_effect = Exception("API error") + mock_client_cls.return_value = mock_client + + with patch.object(BaseCustomEvaluator, "_load_prompt_content", return_value=("system", "{{query}}")): + with patch.object(BaseCustomEvaluator, "__init__", lambda self, *a, **kw: None): + e = BaseCustomEvaluator.__new__(BaseCustomEvaluator) + e.prompty_path = "test.prompty" + e.result_key = "test_score" + 
e.prompty_file_name = "test.prompty" + + result = e.evaluate(query="What is AI?") + assert result == {"test_score": 3} + + +# --------------------------------------------------------------------------- +# __call__ +# --------------------------------------------------------------------------- + +class TestCall: + @patch("src.agent_evaluation.agentic_ops.base_evaluator.LLMClient") + def test_call_delegates_to_evaluate(self, mock_client_cls): + """__call__ should delegate to evaluate.""" + mock_client = MagicMock() + mock_client.get_llm_response_with_prompty.return_value = "5" + mock_client_cls.return_value = mock_client + + with patch.object(BaseCustomEvaluator, "_load_prompt_content", return_value=("sys", "{{query}}")): + with patch.object(BaseCustomEvaluator, "__init__", lambda self, *a, **kw: None): + e = BaseCustomEvaluator.__new__(BaseCustomEvaluator) + e.prompty_path = "test.prompty" + e.result_key = "test_score" + e.prompty_file_name = "test.prompty" + + result = e(query="test") + assert result == {"test_score": 5} diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py new file mode 100644 index 0000000..ca4417c --- /dev/null +++ b/tests/unit/test_cli.py @@ -0,0 +1,401 @@ +"""Unit tests for the CLI module (src/agent_evaluation/cli.py).""" + +import argparse +from unittest.mock import MagicMock, mock_open, patch + +import pytest +import yaml + +from src.agent_evaluation.cli import (EXCLUDE_DIRS, cmd_info, cmd_list, + cmd_run, cmd_run_all, discover_samples, + interactive_select, main, + print_samples_table, run_sample) + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +SAMPLE_YAML_CONTENT = { + "app_name": "TestApp", + "experiment_name": "test_experiment", + "version": "1.0", + "pipeline": [ + {"config_key": "evaluation", "base_path": "evaluator", "module": "eval_main.eval_main"}, + ], + "evaluation": { + "input_path": "datasets/", + 
"input_file": "sample.jsonl", + "output_path": "reports/", + "evaluators": {"score": "relevance_evaluator"}, + }, +} + + +@pytest.fixture +def mock_samples(): + return [ + { + "name": "agentic_evaluation", + "app_name": "TestApp", + "experiment_name": "test_experiment", + "version": "1.0", + "config_path": "src/evaluations/offline/agentic_evaluation/experiment.yaml", + "stages": ["evaluation"], + }, + { + "name": "rag_evaluation_foundry", + "app_name": "RAGApp", + "experiment_name": "rag_experiment", + "version": "2.0", + "config_path": "src/evaluations/offline/rag_evaluation_foundry/experiment.yaml", + "stages": ["evaluation"], + }, + ] + + +# --------------------------------------------------------------------------- +# discover_samples +# --------------------------------------------------------------------------- + +class TestDiscoverSamples: + def test_returns_list(self, tmp_path, monkeypatch): + """discover_samples should return a list.""" + monkeypatch.setattr("src.agent_evaluation.cli.SAMPLES_DIR", tmp_path) + result = discover_samples() + assert isinstance(result, list) + + def test_skips_excluded_dirs(self, tmp_path, monkeypatch): + """Directories in EXCLUDE_DIRS should be skipped.""" + monkeypatch.setattr("src.agent_evaluation.cli.SAMPLES_DIR", tmp_path) + for excluded in EXCLUDE_DIRS: + d = tmp_path / excluded + d.mkdir() + (d / "experiment.yaml").write_text(yaml.dump(SAMPLE_YAML_CONTENT)) + + result = discover_samples() + assert result == [] + + def test_skips_dirs_without_experiment_yaml(self, tmp_path, monkeypatch): + """Directories without experiment.yaml should be skipped.""" + monkeypatch.setattr("src.agent_evaluation.cli.SAMPLES_DIR", tmp_path) + (tmp_path / "some_dir").mkdir() + result = discover_samples() + assert result == [] + + def test_discovers_valid_sample(self, tmp_path, monkeypatch): + """A valid sample directory with experiment.yaml should be discovered.""" + monkeypatch.setattr("src.agent_evaluation.cli.SAMPLES_DIR", tmp_path) + 
monkeypatch.setattr("src.agent_evaluation.cli.ROOT_DIR", tmp_path.parent) + + sample_dir = tmp_path / "my_sample" + sample_dir.mkdir() + (sample_dir / "experiment.yaml").write_text(yaml.dump(SAMPLE_YAML_CONTENT)) + + result = discover_samples() + assert len(result) == 1 + assert result[0]["name"] == "my_sample" + assert result[0]["app_name"] == "TestApp" + assert result[0]["experiment_name"] == "test_experiment" + assert result[0]["version"] == "1.0" + assert result[0]["stages"] == ["evaluation"] + + def test_discovers_multiple_samples_sorted(self, tmp_path, monkeypatch): + """Multiple samples should be returned in sorted order.""" + monkeypatch.setattr("src.agent_evaluation.cli.SAMPLES_DIR", tmp_path) + monkeypatch.setattr("src.agent_evaluation.cli.ROOT_DIR", tmp_path.parent) + + for name in ["z_sample", "a_sample", "m_sample"]: + d = tmp_path / name + d.mkdir() + (d / "experiment.yaml").write_text(yaml.dump(SAMPLE_YAML_CONTENT)) + + result = discover_samples() + assert [s["name"] for s in result] == ["a_sample", "m_sample", "z_sample"] + + def test_uses_defaults_for_missing_yaml_fields(self, tmp_path, monkeypatch): + """Missing fields in experiment.yaml should use directory name as default.""" + monkeypatch.setattr("src.agent_evaluation.cli.SAMPLES_DIR", tmp_path) + monkeypatch.setattr("src.agent_evaluation.cli.ROOT_DIR", tmp_path.parent) + + sample_dir = tmp_path / "bare_sample" + sample_dir.mkdir() + (sample_dir / "experiment.yaml").write_text(yaml.dump({"pipeline": []})) + + result = discover_samples() + assert len(result) == 1 + assert result[0]["name"] == "bare_sample" + assert result[0]["app_name"] == "bare_sample" + assert result[0]["experiment_name"] == "bare_sample" + assert result[0]["version"] == "" + assert result[0]["stages"] == [] + + +# --------------------------------------------------------------------------- +# print_samples_table +# --------------------------------------------------------------------------- + +class TestPrintSamplesTable: + def 
test_empty_samples(self, capsys): + """Empty list should print 'No evaluation samples found.'""" + print_samples_table([]) + captured = capsys.readouterr() + assert "No evaluation samples found." in captured.out + + def test_prints_sample_names(self, capsys, mock_samples): + """Should print sample names in the table.""" + print_samples_table(mock_samples) + captured = capsys.readouterr() + assert "agentic_evaluation" in captured.out + assert "rag_evaluation_foundry" in captured.out + + def test_prints_stages(self, capsys, mock_samples): + """Should print stage info in the table.""" + print_samples_table(mock_samples) + captured = capsys.readouterr() + assert "evaluation" in captured.out + + +# --------------------------------------------------------------------------- +# run_sample +# --------------------------------------------------------------------------- + +class TestRunSample: + @patch("src.agent_evaluation.agentic_ops.runner.run_pipeline") + @patch("src.agent_evaluation.agentic_ops.runner.parse_args") + def test_run_sample_success(self, mock_parse, mock_run, mock_samples): + """Successful run should return 0.""" + mock_args = MagicMock() + mock_parse.return_value = mock_args + mock_run.return_value = None + + result = run_sample(mock_samples[0]) + assert result == 0 + + @patch("src.agent_evaluation.agentic_ops.runner.run_pipeline") + @patch("src.agent_evaluation.agentic_ops.runner.parse_args") + def test_run_sample_system_exit(self, mock_parse, mock_run, mock_samples): + """SystemExit with code should be returned.""" + mock_args = MagicMock() + mock_parse.return_value = mock_args + mock_run.side_effect = SystemExit(2) + + result = run_sample(mock_samples[0]) + assert result == 2 + + @patch("src.agent_evaluation.agentic_ops.runner.run_pipeline") + @patch("src.agent_evaluation.agentic_ops.runner.parse_args") + def test_run_sample_with_extra_args(self, mock_parse, mock_run, mock_samples): + """Extra args should be passed to sys.argv.""" + mock_args = 
MagicMock() + mock_parse.return_value = mock_args + mock_run.return_value = None + + result = run_sample(mock_samples[0], extra_args=["--sample", "5"]) + assert result == 0 + + +# --------------------------------------------------------------------------- +# interactive_select +# --------------------------------------------------------------------------- + +class TestInteractiveSelect: + @patch("builtins.input", return_value="1") + def test_select_by_number(self, mock_input, mock_samples): + """Selecting by number should return the correct sample.""" + result = interactive_select(mock_samples) + assert result == mock_samples[0] + + @patch("builtins.input", return_value="2") + def test_select_by_second_number(self, mock_input, mock_samples): + """Selecting second item returns second sample.""" + result = interactive_select(mock_samples) + assert result == mock_samples[1] + + @patch("builtins.input", return_value="q") + def test_quit(self, mock_input, mock_samples): + """Typing 'q' should return None.""" + result = interactive_select(mock_samples) + assert result is None + + @patch("builtins.input", return_value="exit") + def test_exit(self, mock_input, mock_samples): + """Typing 'exit' should return None.""" + result = interactive_select(mock_samples) + assert result is None + + @patch("builtins.input", return_value="agentic") + def test_select_by_partial_name(self, mock_input, mock_samples): + """Partial name matching should work for unique matches.""" + result = interactive_select(mock_samples) + assert result == mock_samples[0] + + @patch("builtins.input", side_effect=EOFError) + def test_eof_returns_none(self, mock_input, mock_samples): + """EOFError should return None.""" + result = interactive_select(mock_samples) + assert result is None + + @patch("builtins.input", side_effect=KeyboardInterrupt) + def test_keyboard_interrupt_returns_none(self, mock_input, mock_samples): + """KeyboardInterrupt should return None.""" + result = interactive_select(mock_samples) 
+ assert result is None + + +# --------------------------------------------------------------------------- +# cmd_list +# --------------------------------------------------------------------------- + +class TestCmdList: + @patch("src.agent_evaluation.cli.discover_samples") + def test_cmd_list_returns_zero(self, mock_discover, mock_samples): + """cmd_list should always return 0.""" + mock_discover.return_value = mock_samples + args = argparse.Namespace() + result = cmd_list(args) + assert result == 0 + + @patch("src.agent_evaluation.cli.discover_samples") + def test_cmd_list_empty(self, mock_discover): + """cmd_list with no samples should still return 0.""" + mock_discover.return_value = [] + args = argparse.Namespace() + result = cmd_list(args) + assert result == 0 + + +# --------------------------------------------------------------------------- +# cmd_run +# --------------------------------------------------------------------------- + +class TestCmdRun: + @patch("src.agent_evaluation.cli.run_sample", return_value=0) + @patch("src.agent_evaluation.cli.discover_samples") + def test_run_by_exact_name(self, mock_discover, mock_run, mock_samples): + """Running by exact name should find and run the sample.""" + mock_discover.return_value = mock_samples + args = argparse.Namespace(name="agentic_evaluation", sample=0, index_fname=None) + result = cmd_run(args) + assert result == 0 + mock_run.assert_called_once() + + @patch("src.agent_evaluation.cli.run_sample", return_value=0) + @patch("src.agent_evaluation.cli.discover_samples") + def test_run_by_partial_name(self, mock_discover, mock_run, mock_samples): + """Running by partial name should match the sample.""" + mock_discover.return_value = mock_samples + args = argparse.Namespace(name="agentic", sample=0, index_fname=None) + result = cmd_run(args) + assert result == 0 + + @patch("src.agent_evaluation.cli.discover_samples") + def test_run_not_found(self, mock_discover, mock_samples): + """Running with a non-matching 
name should return 1.""" + mock_discover.return_value = mock_samples + args = argparse.Namespace(name="nonexistent", sample=0, index_fname=None) + result = cmd_run(args) + assert result == 1 + + @patch("src.agent_evaluation.cli.discover_samples") + def test_run_no_samples(self, mock_discover): + """Running with no samples available returns 1.""" + mock_discover.return_value = [] + args = argparse.Namespace(name="anything", sample=0, index_fname=None) + result = cmd_run(args) + assert result == 1 + + @patch("src.agent_evaluation.cli.run_sample", return_value=0) + @patch("src.agent_evaluation.cli.discover_samples") + def test_run_by_number(self, mock_discover, mock_run, mock_samples): + """Running by number index should work.""" + mock_discover.return_value = mock_samples + args = argparse.Namespace(name="1", sample=0, index_fname=None) + result = cmd_run(args) + assert result == 0 + + @patch("src.agent_evaluation.cli.discover_samples") + def test_run_ambiguous_name(self, mock_discover, mock_samples): + """Ambiguous partial name should return 1.""" + mock_discover.return_value = mock_samples + args = argparse.Namespace(name="evaluation", sample=0, index_fname=None) + result = cmd_run(args) + assert result == 1 + + +# --------------------------------------------------------------------------- +# cmd_run_all +# --------------------------------------------------------------------------- + +class TestCmdRunAll: + @patch("src.agent_evaluation.cli.run_sample", return_value=0) + @patch("src.agent_evaluation.cli.discover_samples") + def test_all_pass(self, mock_discover, mock_run, mock_samples): + """All samples passing should return 0.""" + mock_discover.return_value = mock_samples + args = argparse.Namespace() + result = cmd_run_all(args) + assert result == 0 + assert mock_run.call_count == len(mock_samples) + + @patch("src.agent_evaluation.cli.run_sample", return_value=1) + @patch("src.agent_evaluation.cli.discover_samples") + def test_any_failure_returns_one(self, 
mock_discover, mock_run, mock_samples): + """Any sample failing should return 1.""" + mock_discover.return_value = mock_samples + args = argparse.Namespace() + result = cmd_run_all(args) + assert result == 1 + + @patch("src.agent_evaluation.cli.discover_samples") + def test_no_samples(self, mock_discover): + """No samples available should return 1.""" + mock_discover.return_value = [] + args = argparse.Namespace() + result = cmd_run_all(args) + assert result == 1 + + +# --------------------------------------------------------------------------- +# cmd_info +# --------------------------------------------------------------------------- + +class TestCmdInfo: + @patch("src.agent_evaluation.cli.discover_samples") + def test_info_not_found(self, mock_discover, mock_samples): + """Non-existent sample should return 1.""" + mock_discover.return_value = mock_samples + args = argparse.Namespace(name="nonexistent") + result = cmd_info(args) + assert result == 1 + + @patch("builtins.open", mock_open(read_data=yaml.dump(SAMPLE_YAML_CONTENT))) + @patch("src.agent_evaluation.cli.discover_samples") + def test_info_found(self, mock_discover, mock_samples): + """Found sample should return 0.""" + mock_discover.return_value = mock_samples + args = argparse.Namespace(name="agentic_evaluation") + result = cmd_info(args) + assert result == 0 + + +# --------------------------------------------------------------------------- +# main +# --------------------------------------------------------------------------- + +class TestMain: + @patch("src.agent_evaluation.cli.discover_samples", return_value=[]) + def test_main_no_command_no_samples(self, mock_discover): + """No command and no samples should exit with 1.""" + with patch("sys.argv", ["agent_evals"]): + with pytest.raises(SystemExit) as exc_info: + main() + assert exc_info.value.code == 1 + + @patch("src.agent_evaluation.cli.cmd_list", return_value=0) + def test_main_list_command(self, mock_cmd): + """'list' command should dispatch to 
cmd_list.""" + with patch("sys.argv", ["agent_evals", "list"]): + with pytest.raises(SystemExit) as exc_info: + main() + assert exc_info.value.code == 0 + mock_cmd.assert_called_once() diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py new file mode 100644 index 0000000..8fa5dc3 --- /dev/null +++ b/tests/unit/test_client.py @@ -0,0 +1,196 @@ +"""Unit tests for the LLM Client (src/agent_evaluation/agentic_ops/client.py).""" + +from unittest.mock import MagicMock, patch + +import pytest + +from src.agent_evaluation.agentic_ops.client import LLMClient + +# --------------------------------------------------------------------------- +# LLMClient._validate_messages +# --------------------------------------------------------------------------- + +class TestValidateMessages: + @pytest.fixture + def client(self): + with patch("src.agent_evaluation.agentic_ops.client.get_llm_client_instance"): + return LLMClient(temperature=0.0) + + def test_valid_messages(self, client): + """Valid messages should not raise.""" + messages = [ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": "Hello"}, + ] + client._validate_messages(messages) # Should not raise + + def test_not_a_list_raises(self, client): + """Non-list messages should raise ValueError.""" + with pytest.raises(ValueError, match="must be a list"): + client._validate_messages("not a list") + + def test_non_dict_message_raises(self, client): + """Non-dict message items should raise ValueError.""" + with pytest.raises(ValueError, match="must be a dictionary"): + client._validate_messages(["not a dict"]) + + def test_missing_role_raises(self, client): + """Message without 'role' should raise ValueError.""" + with pytest.raises(ValueError, match="must have 'role' and 'content'"): + client._validate_messages([{"content": "hello"}]) + + def test_missing_content_raises(self, client): + """Message without 'content' should raise ValueError.""" + with pytest.raises(ValueError, match="must 
have 'role' and 'content'"): + client._validate_messages([{"role": "user"}]) + + def test_invalid_role_raises(self, client): + """Invalid role value should raise ValueError.""" + with pytest.raises(ValueError, match="invalid role"): + client._validate_messages([{"role": "invalid", "content": "hi"}]) + + def test_valid_roles(self, client): + """All valid roles should pass.""" + messages = [ + {"role": "system", "content": "sys"}, + {"role": "user", "content": "usr"}, + {"role": "assistant", "content": "asst"}, + ] + client._validate_messages(messages) # Should not raise + + +# --------------------------------------------------------------------------- +# LLMClient._parse_json_response +# --------------------------------------------------------------------------- + +class TestParseJsonResponse: + @pytest.fixture + def client(self): + with patch("src.agent_evaluation.agentic_ops.client.get_llm_client_instance"): + return LLMClient(temperature=0.0) + + def test_valid_json(self, client): + """Valid JSON string should be parsed.""" + result = client._parse_json_response('{"key": "value"}') + assert result == {"key": "value"} + + def test_json_with_markdown_fencing(self, client): + """JSON wrapped in ```json ... 
``` should be parsed.""" + raw = '```json\n{"key": "value"}\n```' + result = client._parse_json_response(raw) + assert result == {"key": "value"} + + def test_invalid_json_raises(self, client): + """Invalid JSON should raise ValueError.""" + with pytest.raises(ValueError, match="Invalid JSON"): + client._parse_json_response("not valid json {") + + def test_json_array(self, client): + """JSON arrays should be parsed.""" + result = client._parse_json_response('[1, 2, 3]') + assert result == [1, 2, 3] + + def test_whitespace_handling(self, client): + """Extra whitespace should be handled.""" + result = client._parse_json_response(' \n {"a": 1} \n ') + assert result == {"a": 1} + + +# --------------------------------------------------------------------------- +# LLMClient.get_llm_raw_response +# --------------------------------------------------------------------------- + +class TestGetLlmRawResponse: + @patch("src.agent_evaluation.agentic_ops.client.get_llm_response") + def test_builds_messages_correctly(self, mock_get_response): + """Should construct messages with system and user roles.""" + mock_get_response.return_value = "response text" + + with patch("src.agent_evaluation.agentic_ops.client.get_llm_client_instance"): + client = LLMClient(temperature=0.5) + + result = client.get_llm_raw_response("system prompt", "user input") + assert result == "response text" + + call_args = mock_get_response.call_args + messages = call_args[0][0] + assert messages[0] == {"role": "system", "content": "system prompt"} + assert messages[1] == {"role": "user", "content": "user input"} + + +# --------------------------------------------------------------------------- +# LLMClient.get_llm_response_json +# --------------------------------------------------------------------------- + +class TestGetLlmResponseJson: + @patch("src.agent_evaluation.agentic_ops.client.get_llm_response") + def test_returns_parsed_json(self, mock_get_response): + """Should return parsed JSON from LLM 
response.""" + mock_get_response.return_value = '{"score": 4}' + + with patch("src.agent_evaluation.agentic_ops.client.get_llm_client_instance"): + client = LLMClient() + + result = client.get_llm_response_json("sys", "usr") + assert result == {"score": 4} + + +# --------------------------------------------------------------------------- +# get_llm_response (module-level) +# --------------------------------------------------------------------------- + +class TestGetLlmResponse: + @patch("src.agent_evaluation.agentic_ops.client.DEPLOYMENT_NAME", "test-model") + @patch("src.agent_evaluation.agentic_ops.client.get_llm_client_instance") + def test_successful_response(self, mock_get_client): + from src.agent_evaluation.agentic_ops.client import get_llm_response + + mock_client = MagicMock() + mock_response = MagicMock() + mock_response.choices = [MagicMock()] + mock_response.choices[0].message.content = "Hello!" + mock_client.chat.completions.create.return_value = mock_response + mock_get_client.return_value = mock_client + + messages = [{"role": "user", "content": "Hi"}] + result = get_llm_response(messages) + assert result == "Hello!" 
+ + @patch("src.agent_evaluation.agentic_ops.client.DEPLOYMENT_NAME", None) + def test_raises_on_missing_deployment(self): + from src.agent_evaluation.agentic_ops.client import get_llm_response + + with pytest.raises(ValueError, match="EVAL_AZURE_OPENAI_MODEL"): + get_llm_response([{"role": "user", "content": "test"}]) + + @patch("src.agent_evaluation.agentic_ops.client.DEFAULT_RETRY_DELAY", 0) + @patch("src.agent_evaluation.agentic_ops.client.DEPLOYMENT_NAME", "test-model") + @patch("src.agent_evaluation.agentic_ops.client.get_llm_client_instance") + def test_retries_on_failure(self, mock_get_client): + from src.agent_evaluation.agentic_ops.client import get_llm_response + + mock_client = MagicMock() + mock_client.chat.completions.create.side_effect = [ + Exception("transient error"), + Exception("transient error"), + MagicMock(choices=[MagicMock(message=MagicMock(content="success"))]), + ] + mock_get_client.return_value = mock_client + + messages = [{"role": "user", "content": "Hi"}] + result = get_llm_response(messages, max_retries=3) + assert result == "success" + + @patch("src.agent_evaluation.agentic_ops.client.DEFAULT_RETRY_DELAY", 0) + @patch("src.agent_evaluation.agentic_ops.client.DEPLOYMENT_NAME", "test-model") + @patch("src.agent_evaluation.agentic_ops.client.get_llm_client_instance") + def test_raises_after_max_retries(self, mock_get_client): + from src.agent_evaluation.agentic_ops.client import get_llm_response + + mock_client = MagicMock() + mock_client.chat.completions.create.side_effect = Exception("permanent error") + mock_get_client.return_value = mock_client + + messages = [{"role": "user", "content": "Hi"}] + with pytest.raises(Exception, match="Maximum retries"): + get_llm_response(messages, max_retries=2) diff --git a/tests/unit/test_eval_factories.py b/tests/unit/test_eval_factories.py new file mode 100644 index 0000000..dc9c016 --- /dev/null +++ b/tests/unit/test_eval_factories.py @@ -0,0 +1,254 @@ +"""Unit tests for eval_factory modules 
across all evaluation samples.""" + +import pytest + +# --------------------------------------------------------------------------- +# Agentic Evaluation - EvaluatorFactory +# --------------------------------------------------------------------------- + +class TestAgenticEvaluationFactory: + def test_get_relevance_evaluator(self): + from src.evaluations.offline.agentic_evaluation.eval_factory import \ + EvaluatorFactory + + result = EvaluatorFactory.get_evaluator_factory("relevance_evaluator") + assert result is not None + assert "Relevance" in result.__name__ + + def test_get_custom_agents_evaluator(self): + from src.evaluations.offline.agentic_evaluation.eval_factory import \ + EvaluatorFactory + + result = EvaluatorFactory.get_evaluator_factory("custom_agents_invoked_evaluator") + assert result is not None + assert result.__name__ == "EvaluateAgentsInvoked" + + def test_get_task_adherence_evaluator(self): + from src.evaluations.offline.agentic_evaluation.eval_factory import \ + EvaluatorFactory + + result = EvaluatorFactory.get_evaluator_factory("task_adherence_evaluator") + assert result is not None + assert "TaskAdherence" in result.__name__ + + def test_get_tool_call_accuracy_evaluator(self): + from src.evaluations.offline.agentic_evaluation.eval_factory import \ + EvaluatorFactory + + result = EvaluatorFactory.get_evaluator_factory("tool_call_accuracy_evaluator") + assert result is not None + assert "ToolCallAccuracy" in result.__name__ + + def test_invalid_evaluator_raises(self): + from src.evaluations.offline.agentic_evaluation.eval_factory import \ + EvaluatorFactory + + with pytest.raises(ValueError, match="not found"): + EvaluatorFactory.get_evaluator_factory("nonexistent_evaluator") + + def test_all_registered_evaluators_are_callable(self): + from src.evaluations.offline.agentic_evaluation.eval_factory import \ + EvaluatorFactory + + for name in EvaluatorFactory.EVALUATOR_FACTORIES: + result = EvaluatorFactory.get_evaluator_factory(name) + assert 
callable(result), f"{name} factory is not callable" + + +# --------------------------------------------------------------------------- +# AI Judge Evaluation Custom - EvaluatorFactory +# --------------------------------------------------------------------------- + +class TestAiJudgeEvaluationFactory: + def test_get_custom_coherence(self): + from src.evaluations.offline.ai_judge_evaluation_custom.eval_factory import \ + EvaluatorFactory + + result = EvaluatorFactory.get_evaluator_factory("custom_coherence_evaluator") + assert result.__name__ == "CoherenceEvaluatorCustom" + + def test_get_custom_relevance(self): + from src.evaluations.offline.ai_judge_evaluation_custom.eval_factory import \ + EvaluatorFactory + + result = EvaluatorFactory.get_evaluator_factory("custom_relevance_evaluator") + assert result.__name__ == "RelevanceEvaluatorCustom" + + def test_get_custom_fluency(self): + from src.evaluations.offline.ai_judge_evaluation_custom.eval_factory import \ + EvaluatorFactory + + result = EvaluatorFactory.get_evaluator_factory("custom_fluency_evaluator") + assert result.__name__ == "FluencyEvaluatorCustom" + + def test_get_custom_similarity(self): + from src.evaluations.offline.ai_judge_evaluation_custom.eval_factory import \ + EvaluatorFactory + + result = EvaluatorFactory.get_evaluator_factory("custom_similarity_evaluator") + assert result.__name__ == "SimilarityEvaluatorCustom" + + def test_get_builtin_relevance(self): + from src.evaluations.offline.ai_judge_evaluation_custom.eval_factory import \ + EvaluatorFactory + + result = EvaluatorFactory.get_evaluator_factory("relevance_evaluator") + assert "Relevance" in result.__name__ + + def test_get_builtin_coherence(self): + from src.evaluations.offline.ai_judge_evaluation_custom.eval_factory import \ + EvaluatorFactory + + result = EvaluatorFactory.get_evaluator_factory("coherence_evaluator") + assert "Coherence" in result.__name__ + + def test_invalid_evaluator_raises(self): + from 
src.evaluations.offline.ai_judge_evaluation_custom.eval_factory import \ + EvaluatorFactory + + with pytest.raises(ValueError, match="not found"): + EvaluatorFactory.get_evaluator_factory("invalid_evaluator") + + def test_all_registered_evaluators_are_callable(self): + from src.evaluations.offline.ai_judge_evaluation_custom.eval_factory import \ + EvaluatorFactory + + for name in EvaluatorFactory.EVALUATOR_FACTORIES: + result = EvaluatorFactory.get_evaluator_factory(name) + assert callable(result), f"{name} factory is not callable" + + +# --------------------------------------------------------------------------- +# Pipeline Experiment Evaluation - EvaluatorFactory +# --------------------------------------------------------------------------- + +class TestPipelineExperimentFactory: + def test_get_relevance(self): + from src.evaluations.offline.pipeline_experiment_evaluation.eval_factory import \ + EvaluatorFactory + + result = EvaluatorFactory.get_evaluator_factory("relevance_evaluator") + assert "Relevance" in result.__name__ + + def test_get_task_adherence(self): + from src.evaluations.offline.pipeline_experiment_evaluation.eval_factory import \ + EvaluatorFactory + + result = EvaluatorFactory.get_evaluator_factory("task_adherence_evaluator") + assert "TaskAdherence" in result.__name__ + + def test_get_tool_call_accuracy(self): + from src.evaluations.offline.pipeline_experiment_evaluation.eval_factory import \ + EvaluatorFactory + + result = EvaluatorFactory.get_evaluator_factory("tool_call_accuracy_evaluator") + assert "ToolCallAccuracy" in result.__name__ + + def test_invalid_raises(self): + from src.evaluations.offline.pipeline_experiment_evaluation.eval_factory import \ + EvaluatorFactory + + with pytest.raises(ValueError, match="not found"): + EvaluatorFactory.get_evaluator_factory("does_not_exist") + + +# --------------------------------------------------------------------------- +# Pipeline Multi-Agent Evaluation - EvaluatorFactory +# 
--------------------------------------------------------------------------- + +class TestPipelineMultiAgentFactory: + def test_get_relevance(self): + from src.evaluations.offline.pipeline_multi_agent_evaluation.eval_factory import \ + EvaluatorFactory + + result = EvaluatorFactory.get_evaluator_factory("relevance_evaluator") + assert "Relevance" in result.__name__ + + def test_get_task_adherence(self): + from src.evaluations.offline.pipeline_multi_agent_evaluation.eval_factory import \ + EvaluatorFactory + + result = EvaluatorFactory.get_evaluator_factory("task_adherence_evaluator") + assert "TaskAdherence" in result.__name__ + + def test_get_agents_invoked(self): + from src.evaluations.offline.pipeline_multi_agent_evaluation.eval_factory import \ + EvaluatorFactory + + result = EvaluatorFactory.get_evaluator_factory("evaluate_agents_invoked") + assert result.__name__ == "EvaluateAgentsInvoked" + + def test_get_custom_agents_invoked(self): + from src.evaluations.offline.pipeline_multi_agent_evaluation.eval_factory import \ + EvaluatorFactory + + result = EvaluatorFactory.get_evaluator_factory("custom_agents_invoked_accuracy_eval") + assert result.__name__ == "EvaluateAgentsInvoked" + + def test_invalid_raises(self): + from src.evaluations.offline.pipeline_multi_agent_evaluation.eval_factory import \ + EvaluatorFactory + + with pytest.raises(ValueError, match="not found"): + EvaluatorFactory.get_evaluator_factory("bogus") + + +# --------------------------------------------------------------------------- +# Pipeline Multi-Tool Agent Evaluation - EvaluatorFactory +# --------------------------------------------------------------------------- + +class TestPipelineMultiToolFactory: + def test_get_relevance(self): + from src.evaluations.offline.pipeline_multi_tool_agent_evaluation.eval_factory import \ + EvaluatorFactory + + result = EvaluatorFactory.get_evaluator_factory("relevance_evaluator") + assert "Relevance" in result.__name__ + + def test_get_task_adherence(self): 
+ from src.evaluations.offline.pipeline_multi_tool_agent_evaluation.eval_factory import \ + EvaluatorFactory + + result = EvaluatorFactory.get_evaluator_factory("task_adherence_evaluator") + assert "TaskAdherence" in result.__name__ + + def test_get_tool_call_accuracy(self): + from src.evaluations.offline.pipeline_multi_tool_agent_evaluation.eval_factory import \ + EvaluatorFactory + + result = EvaluatorFactory.get_evaluator_factory("tool_call_accuracy_evaluator") + assert "ToolCallAccuracy" in result.__name__ + + def test_invalid_raises(self): + from src.evaluations.offline.pipeline_multi_tool_agent_evaluation.eval_factory import \ + EvaluatorFactory + + with pytest.raises(ValueError, match="not found"): + EvaluatorFactory.get_evaluator_factory("unknown") + + +# --------------------------------------------------------------------------- +# RAG Evaluation Foundry - EvaluatorFactory +# --------------------------------------------------------------------------- + +class TestRagEvaluationFoundryFactory: + def test_get_relevance(self): + from src.evaluations.offline.rag_evaluation_foundry.eval_factory import \ + EvaluatorFactory + + result = EvaluatorFactory.get_evaluator_factory("relevance_evaluator") + assert "Relevance" in result.__name__ + + def test_invalid_raises(self): + from src.evaluations.offline.rag_evaluation_foundry.eval_factory import \ + EvaluatorFactory + + with pytest.raises(ValueError, match="not found"): + EvaluatorFactory.get_evaluator_factory("missing") + + def test_only_has_relevance(self): + from src.evaluations.offline.rag_evaluation_foundry.eval_factory import \ + EvaluatorFactory + + assert len(EvaluatorFactory.EVALUATOR_FACTORIES) == 1 + assert "relevance_evaluator" in EvaluatorFactory.EVALUATOR_FACTORIES diff --git a/tests/unit/test_evaluation_utils.py b/tests/unit/test_evaluation_utils.py new file mode 100644 index 0000000..f28a6c7 --- /dev/null +++ b/tests/unit/test_evaluation_utils.py @@ -0,0 +1,130 @@ +"""Unit tests for agent evaluation 
utility functions (evaluation_utils, evaluate_agent_invoked).""" + +import pytest + +from src.evaluations.offline.pipeline_multi_agent_evaluation.evaluator.evaluator_repo.eval_utils.evaluation_utils import ( + agent_invoked_accuracy, calculate_match_percentage) + +# --------------------------------------------------------------------------- +# agent_invoked_accuracy +# --------------------------------------------------------------------------- + +class TestAgentInvokedAccuracy: + def test_exact_match(self): + """Same agents in same order should return True.""" + assert agent_invoked_accuracy(["AgentA", "AgentB"], ["AgentA", "AgentB"]) is True + + def test_same_agents_different_order(self): + """Same agents in different order should return True (set comparison).""" + assert agent_invoked_accuracy(["AgentB", "AgentA"], ["AgentA", "AgentB"]) is True + + def test_missing_agent(self): + """Missing expected agent should return False.""" + assert agent_invoked_accuracy(["AgentA"], ["AgentA", "AgentB"]) is False + + def test_extra_agent(self): + """Extra predicted agent should return False.""" + assert agent_invoked_accuracy(["AgentA", "AgentB", "AgentC"], ["AgentA", "AgentB"]) is False + + def test_empty_both(self): + """Both empty should return True.""" + assert agent_invoked_accuracy([], []) is True + + def test_empty_predicted(self): + """Empty predicted with non-empty expected should return False.""" + assert agent_invoked_accuracy([], ["AgentA"]) is False + + def test_empty_expected(self): + """Non-empty predicted with empty expected should return False.""" + assert agent_invoked_accuracy(["AgentA"], []) is False + + def test_single_agent_match(self): + """Single agent match should return True.""" + assert agent_invoked_accuracy(["AgentA"], ["AgentA"]) is True + + def test_duplicate_agents(self): + """Duplicate agents should be treated as a set.""" + assert agent_invoked_accuracy(["AgentA", "AgentA"], ["AgentA"]) is True + + +# 
--------------------------------------------------------------------------- +# calculate_match_percentage +# --------------------------------------------------------------------------- + +class TestCalculateMatchPercentage: + def test_full_match(self): + """All expected in predicted should return 1.0.""" + assert calculate_match_percentage(["A", "B"], ["A", "B"]) == 1.0 + + def test_partial_match(self): + """Half expected in predicted should return 0.5.""" + assert calculate_match_percentage(["A", "B"], ["A"]) == 0.5 + + def test_no_match(self): + """No overlap should return 0.0.""" + assert calculate_match_percentage(["A", "B"], ["C", "D"]) == 0.0 + + def test_empty_expected(self): + """Empty expected should return 0.0 (avoid division by zero).""" + assert calculate_match_percentage([], ["A"]) == 0.0 + + def test_empty_predicted(self): + """Empty predicted should return 0.0.""" + assert calculate_match_percentage(["A", "B"], []) == 0.0 + + def test_extra_predicted_agents(self): + """Extra predicted agents don't affect match percentage.""" + assert calculate_match_percentage(["A"], ["A", "B", "C"]) == 1.0 + + def test_three_of_four(self): + """3 out of 4 expected should return 0.75.""" + assert calculate_match_percentage(["A", "B", "C", "D"], ["A", "B", "C"]) == 0.75 + + +# --------------------------------------------------------------------------- +# EvaluateAgentsInvoked +# --------------------------------------------------------------------------- + +class TestEvaluateAgentsInvoked: + @pytest.fixture + def evaluator(self): + from src.evaluations.offline.pipeline_multi_agent_evaluation.evaluator.evaluator_repo.evaluate_agent_invoked import \ + EvaluateAgentsInvoked + return EvaluateAgentsInvoked() + + def test_exact_match_returns_accuracy_1(self, evaluator): + """Exact match should set accuracy to 1.0.""" + result = evaluator( + expected_agents_to_invoke=["ACAgent", "TVAgent"], + predicted_agents_to_invoke=["ACAgent", "TVAgent"], + ) + assert 
result["agents_invoke_accuracy"] == 1.0 + assert result["agents_invoke_exact_match"] is True + assert result["agents_invoke_match_percentage"] == 1.0 + + def test_orchestrator_filtered(self, evaluator): + """OrchestratorAgent should be filtered from predicted.""" + result = evaluator( + expected_agents_to_invoke=["ACAgent"], + predicted_agents_to_invoke=["OrchestratorAgent", "ACAgent"], + ) + assert result["agents_invoke_accuracy"] == 1.0 + assert result["agents_invoke_exact_match"] is True + + def test_mismatch(self, evaluator): + """Mismatch should set accuracy to 0.0.""" + result = evaluator( + expected_agents_to_invoke=["ACAgent", "TVAgent"], + predicted_agents_to_invoke=["DishwasherAgent"], + ) + assert result["agents_invoke_accuracy"] == 0.0 + assert result["agents_invoke_exact_match"] is False + + def test_partial_match_percentage(self, evaluator): + """Partial match percentage should be calculated.""" + result = evaluator( + expected_agents_to_invoke=["ACAgent", "TVAgent"], + predicted_agents_to_invoke=["ACAgent"], + ) + assert result["agents_invoke_match_percentage"] == 0.5 + assert result["agents_invoke_exact_match"] is False diff --git a/tests/unit/test_run_eval.py b/tests/unit/test_run_eval.py new file mode 100644 index 0000000..1455a5f --- /dev/null +++ b/tests/unit/test_run_eval.py @@ -0,0 +1,152 @@ +"""Unit tests for run_eval module (src/agent_evaluation/agentic_ops/run_eval.py).""" + +from unittest.mock import MagicMock, patch + +from src.agent_evaluation.agentic_ops.run_eval import (setup_evaluation, + should_pass_config) + +# --------------------------------------------------------------------------- +# should_pass_config +# --------------------------------------------------------------------------- + +class TestShouldPassConfig: + def test_function_with_required_arg(self): + """Function with required arg should return True.""" + def func(config): + pass + assert should_pass_config(func) is True + + def test_function_no_args(self): + """Function 
with no args should return False.""" + def func(): + pass + assert should_pass_config(func) is False + + def test_function_only_defaults(self): + """Function with only default args should return False.""" + def func(x=10, y=20): + pass + assert should_pass_config(func) is False + + def test_function_with_kwargs_only(self): + """Function with **kwargs only should return False.""" + def func(**kwargs): + pass + assert should_pass_config(func) is False + + def test_function_keyword_only_required(self): + """Function with keyword-only required param should return True.""" + def func(*, config): + pass + assert should_pass_config(func) is True + + +# --------------------------------------------------------------------------- +# setup_evaluation +# --------------------------------------------------------------------------- + +class TestSetupEvaluation: + def test_setup_with_model_config_param(self): + """Evaluator factory accepting model_config should get it passed.""" + mock_factory_cls = MagicMock() + mock_evaluator = MagicMock() + mock_factory_cls.return_value = mock_evaluator + + # Create a factory function that accepts a model_config parameter + def factory_func(model_config=None): + return mock_evaluator + + mock_eval_factory = MagicMock() + mock_eval_factory.get_evaluator_factory.return_value = factory_func + + config = { + "evaluators": {"test_eval": "test_factory"}, + "evaluator_config": {}, + } + + with patch.dict("os.environ", { + "EVAL_AZURE_OPENAI_ENDPOINT": "https://test.openai.azure.com", + "EVAL_AZURE_OPENAI_MODEL": "gpt-4", + "EVAL_AZURE_OPENAI_VERSION": "2024-01-01", + }): + evaluators, evaluator_config = setup_evaluation(config, mock_eval_factory) + + assert "test_eval" in evaluators + assert evaluators["test_eval"] == mock_evaluator + + def test_setup_with_no_params_factory(self): + """Evaluator factory accepting no params should be called without args.""" + mock_evaluator = MagicMock() + + def factory_func(): + return mock_evaluator + + mock_eval_factory = 
MagicMock() + mock_eval_factory.get_evaluator_factory.return_value = factory_func + + config = { + "evaluators": {"simple_eval": "simple_factory"}, + "evaluator_config": {}, + } + + with patch.dict("os.environ", { + "EVAL_AZURE_OPENAI_ENDPOINT": "https://test.openai.azure.com", + "EVAL_AZURE_OPENAI_MODEL": "gpt-4", + "EVAL_AZURE_OPENAI_VERSION": "2024-01-01", + }): + evaluators, evaluator_config = setup_evaluation(config, mock_eval_factory) + + assert "simple_eval" in evaluators + assert evaluators["simple_eval"] == mock_evaluator + + def test_setup_with_azure_ai_project_param(self): + """Factory accepting azure_ai_project should get it.""" + mock_evaluator = MagicMock() + + def factory_func(azure_ai_project=None): + return mock_evaluator + + mock_eval_factory = MagicMock() + mock_eval_factory.get_evaluator_factory.return_value = factory_func + + config = { + "evaluators": {"proj_eval": "proj_factory"}, + "evaluator_config": {}, + } + + mock_project = MagicMock() + with patch.dict("os.environ", { + "EVAL_AZURE_OPENAI_ENDPOINT": "https://test.openai.azure.com", + "EVAL_AZURE_OPENAI_MODEL": "gpt-4", + "EVAL_AZURE_OPENAI_VERSION": "2024-01-01", + }): + evaluators, _ = setup_evaluation(config, mock_eval_factory, azure_ai_project=mock_project) + + assert "proj_eval" in evaluators + + def test_setup_resolves_column_mapping_placeholder(self): + """evaluator_config with 'use_column_mapping' should resolve.""" + mock_evaluator = MagicMock() + + def factory_func(): + return mock_evaluator + + mock_eval_factory = MagicMock() + mock_eval_factory.get_evaluator_factory.return_value = factory_func + + config = { + "evaluators": {"test_eval": "test_factory"}, + "column_mapping": {"query": "${data.query}"}, + "evaluator_config": { + "test_eval": {"column_mapping": "use_column_mapping"}, + }, + } + + with patch.dict("os.environ", { + "EVAL_AZURE_OPENAI_ENDPOINT": "https://test.openai.azure.com", + "EVAL_AZURE_OPENAI_MODEL": "gpt-4", + "EVAL_AZURE_OPENAI_VERSION": "2024-01-01", + }): 
+ _, evaluator_config = setup_evaluation(config, mock_eval_factory) + + assert evaluator_config["test_eval"]["column_mapping"] == {"query": "${data.query}"} diff --git a/tests/unit/test_runner.py b/tests/unit/test_runner.py new file mode 100644 index 0000000..bb6287d --- /dev/null +++ b/tests/unit/test_runner.py @@ -0,0 +1,99 @@ +"""Unit tests for the pipeline runner (src/agent_evaluation/agentic_ops/runner.py).""" + +import argparse +from unittest.mock import patch + +import pytest +import yaml + +from src.agent_evaluation.agentic_ops.runner import load_config, parse_args + +# --------------------------------------------------------------------------- +# load_config +# --------------------------------------------------------------------------- + +class TestLoadConfig: + def test_loads_valid_yaml(self, tmp_path): + """Should load and return config dict from valid YAML.""" + config_data = {"app_name": "TestApp", "pipeline": []} + config_file = tmp_path / "experiment.yaml" + config_file.write_text(yaml.dump(config_data)) + + result = load_config(config_file) + assert result == config_data + + def test_raises_on_missing_file(self, tmp_path): + """Should raise FileNotFoundError for missing config.""" + missing = tmp_path / "missing.yaml" + with pytest.raises(FileNotFoundError): + load_config(missing) + + def test_returns_none_for_empty_yaml(self, tmp_path): + """Empty YAML file should return None.""" + config_file = tmp_path / "empty.yaml" + config_file.write_text("") + result = load_config(config_file) + assert result is None + + +# --------------------------------------------------------------------------- +# parse_args +# --------------------------------------------------------------------------- + +class TestParseArgs: + def test_defaults(self): + """Default args should have expected values.""" + with patch("sys.argv", ["runner"]): + args = parse_args() + assert args.config_file == "experiment.yaml" + assert args.index_fname is None + assert args.sample == 0 + + 
def test_custom_config_file(self): + """Should accept --config_file argument.""" + with patch("sys.argv", ["runner", "--config_file", "custom.yaml"]): + args = parse_args() + assert args.config_file == "custom.yaml" + + def test_sample_arg(self): + """Should accept --sample argument.""" + with patch("sys.argv", ["runner", "--sample", "10"]): + args = parse_args() + assert args.sample == 10 + + def test_index_fname_arg(self): + """Should accept --index_fname argument.""" + with patch("sys.argv", ["runner", "--index_fname", "file_001"]): + args = parse_args() + assert args.index_fname == "file_001" + + +# --------------------------------------------------------------------------- +# run_pipeline +# --------------------------------------------------------------------------- + +class TestRunPipeline: + @patch("src.agent_evaluation.agentic_ops.runner.importlib.import_module") + @patch("src.agent_evaluation.agentic_ops.runner.load_config") + def test_run_pipeline_exits_on_invalid_step(self, mock_load_config, mock_import, tmp_path): + """Pipeline with an invalid step (missing base_path/module) should exit.""" + from src.agent_evaluation.agentic_ops.runner import run_pipeline + + mock_load_config.return_value = { + "experiment_name": "test", + "pipeline": [{"config_key": "evaluation"}], # missing base_path and module + "evaluation": {}, + } + + with pytest.raises(SystemExit): + run_pipeline("test/experiment.yaml", argparse.Namespace(sample=0, index_fname=None)) + + @patch("src.agent_evaluation.agentic_ops.runner.load_config") + def test_run_pipeline_exits_on_empty_pipeline(self, mock_load_config, tmp_path): + """Pipeline with no steps should exit.""" + from src.agent_evaluation.agentic_ops.runner import run_pipeline + + mock_load_config.return_value = {"pipeline": []} + + with pytest.raises(SystemExit): + run_pipeline("test/experiment.yaml", argparse.Namespace(sample=0, index_fname=None)) diff --git a/tests/unit/test_trace_to_jsonl.py b/tests/unit/test_trace_to_jsonl.py 
new file mode 100644 index 0000000..596a9a4 --- /dev/null +++ b/tests/unit/test_trace_to_jsonl.py @@ -0,0 +1,138 @@ +"""Unit tests for trace_to_jsonl shared module.""" + +import json + +from src.evaluations.offline.utils.trace_to_jsonl import ( + extract_tool_call_from_span, extract_tool_definitions, + merge_tool_definitions) + +# --------------------------------------------------------------------------- +# extract_tool_definitions +# --------------------------------------------------------------------------- + +class TestExtractToolDefinitions: + def test_extracts_function_tools(self): + """Should extract function-type tool definitions.""" + tool_defs = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather data", + "parameters": {"type": "object"}, + }, + } + ] + custom_dims = {"gen_ai.tool.definitions": json.dumps(tool_defs)} + + result = extract_tool_definitions(custom_dims) + assert len(result) == 1 + assert result[0]["name"] == "get_weather" + assert result[0]["description"] == "Get weather data" + + def test_empty_string(self): + """Empty tool definitions string should return empty list.""" + result = extract_tool_definitions({"gen_ai.tool.definitions": ""}) + assert result == [] + + def test_missing_key(self): + """Missing key should return empty list.""" + result = extract_tool_definitions({}) + assert result == [] + + def test_invalid_json(self): + """Invalid JSON should return empty list.""" + result = extract_tool_definitions({"gen_ai.tool.definitions": "not json"}) + assert result == [] + + def test_non_list_json(self): + """Non-list JSON should return empty list.""" + result = extract_tool_definitions({"gen_ai.tool.definitions": '{"key": "value"}'}) + assert result == [] + + def test_skips_non_function_type(self): + """Non-function type tools should be skipped.""" + tool_defs = [{"type": "retrieval", "name": "search"}] + custom_dims = {"gen_ai.tool.definitions": json.dumps(tool_defs)} + + result = 
extract_tool_definitions(custom_dims) + assert result == [] + + def test_multiple_tools(self): + """Multiple function tools should all be extracted.""" + tool_defs = [ + {"type": "function", "function": {"name": "tool_a", "description": "A", "parameters": {}}}, + {"type": "function", "function": {"name": "tool_b", "description": "B", "parameters": {}}}, + ] + custom_dims = {"gen_ai.tool.definitions": json.dumps(tool_defs)} + + result = extract_tool_definitions(custom_dims) + assert len(result) == 2 + assert result[0]["name"] == "tool_a" + assert result[1]["name"] == "tool_b" + + +# --------------------------------------------------------------------------- +# merge_tool_definitions +# --------------------------------------------------------------------------- + +class TestMergeToolDefinitions: + def test_merge_new_tools(self): + """New tools should be added.""" + existing = [{"name": "tool_a", "id": "tool_a"}] + new = [{"name": "tool_b", "id": "tool_b"}] + + result = merge_tool_definitions(existing, new) + names = {t["name"] for t in result} + assert names == {"tool_a", "tool_b"} + + def test_deduplicates_by_name(self): + """Duplicate names should not be added.""" + existing = [{"name": "tool_a", "id": "1", "description": "first"}] + new = [{"name": "tool_a", "id": "2", "description": "second"}] + + result = merge_tool_definitions(existing, new) + assert len(result) == 1 + assert result[0]["description"] == "first" # Keeps existing + + def test_empty_new(self): + """Empty new list should return existing unchanged.""" + existing = [{"name": "tool_a"}] + result = merge_tool_definitions(existing, []) + assert result == existing + + def test_empty_existing(self): + """Empty existing should return new tools.""" + new = [{"name": "tool_a"}, {"name": "tool_b"}] + result = merge_tool_definitions([], new) + assert len(result) == 2 + + def test_both_empty(self): + """Both empty should return empty list.""" + result = merge_tool_definitions([], []) + assert result == [] + + 
+# --------------------------------------------------------------------------- +# extract_tool_call_from_span +# --------------------------------------------------------------------------- + +class TestExtractToolCallFromSpan: + def test_extracts_from_operation_name(self): + """Should extract tool name from 'execute_tool <tool_name>' format.""" + result = extract_tool_call_from_span({}, "execute_tool get_weather") + assert result["type"] == "tool_call" + assert result["name"] == "get_weather" + + def test_falls_back_to_custom_dims(self): + """Should fall back to gen_ai.tool.name from custom dims.""" + custom_dims = {"gen_ai.tool.name": "search_tool"} + result = extract_tool_call_from_span(custom_dims, "some_other_operation") + assert result["type"] == "tool_call" + assert result["name"] == "search_tool" + + def test_empty_name(self): + """Should handle missing tool name gracefully.""" + result = extract_tool_call_from_span({}, "other_span") + assert result["type"] == "tool_call" + assert result["name"] == "" diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py new file mode 100644 index 0000000..81306b0 --- /dev/null +++ b/tests/unit/test_utils.py @@ -0,0 +1,151 @@ +"""Unit tests for utility modules (file_operations, constants, trace_to_jsonl).""" + +import json + +from src.evaluations.offline.utils.constants import EVAL_NAME +from src.evaluations.offline.utils.file_operations import ( + append_to_jsonl, get_next_run_id, load_queries_from_jsonl, save_to_jsonl) + +# --------------------------------------------------------------------------- +# constants +# --------------------------------------------------------------------------- + +class TestConstants: + def test_eval_name_value(self): + """EVAL_NAME should be 'experiment_name'.""" + assert EVAL_NAME == "experiment_name" + + +# --------------------------------------------------------------------------- +# load_queries_from_jsonl +# --------------------------------------------------------------------------- + 
class TestLoadQueriesFromJsonl:
    """Checks for ``load_queries_from_jsonl(path)``."""

    def test_loads_valid_jsonl(self, tmp_path):
        """Every record of a well-formed JSONL file is returned, in file order."""
        records = [{"query": "q1"}, {"query": "q2"}, {"query": "q3"}]
        path = tmp_path / "test.jsonl"
        # Deliberately no trailing newline after the last record.
        path.write_text("\n".join(json.dumps(record) for record in records))

        loaded = load_queries_from_jsonl(str(path))

        assert loaded == records

    def test_skips_blank_lines(self, tmp_path):
        """Empty lines between and after records do not produce entries."""
        path = tmp_path / "test.jsonl"
        path.write_text('{"a":1}\n\n{"b":2}\n\n')

        loaded = load_queries_from_jsonl(str(path))

        assert len(loaded) == 2

    def test_empty_file(self, tmp_path):
        """A zero-byte file loads as an empty list."""
        path = tmp_path / "empty.jsonl"
        path.write_text("")

        loaded = load_queries_from_jsonl(str(path))

        assert loaded == []

    def test_preserves_unicode(self, tmp_path):
        """Non-ASCII text should come back from the loader unchanged."""
        # The value must actually contain non-ASCII characters, otherwise the
        # test passes vacuously. It is spelled with escapes only to keep this
        # source file ASCII.
        text = "H\u00e9llo w\u00f6rld \u4e16\u754c"
        path = tmp_path / "unicode.jsonl"
        # ensure_ascii=True writes the text as \uXXXX escapes, so the file on
        # disk is pure ASCII and the test does not depend on which encoding
        # the loader uses to open it.
        path.write_text(json.dumps({"query": text}, ensure_ascii=True) + "\n")

        loaded = load_queries_from_jsonl(str(path))

        assert loaded[0]["query"] == text


# ---------------------------------------------------------------------------
# save_to_jsonl
# ---------------------------------------------------------------------------

class TestSaveToJsonl:
    """Checks for ``save_to_jsonl(path, records)``."""

    def test_saves_data(self, tmp_path):
        """Each dict is written as its own JSON line, in input order."""
        records = [{"a": 1}, {"b": 2}]
        path = tmp_path / "out.jsonl"

        save_to_jsonl(str(path), records)

        written = path.read_text().strip().split("\n")
        assert len(written) == 2
        assert json.loads(written[0]) == {"a": 1}
        assert json.loads(written[1]) == {"b": 2}

    def test_empty_list(self, tmp_path):
        """Saving no records leaves an empty file behind."""
        path = tmp_path / "empty.jsonl"

        save_to_jsonl(str(path), [])

        assert path.read_text() == ""

    def test_overwrites_existing(self, tmp_path):
        """Previous file content is replaced rather than appended to."""
        path = tmp_path / "overwrite.jsonl"
        path.write_text("old content")

        save_to_jsonl(str(path), [{"new": True}])

        written = path.read_text().strip().split("\n")
        assert json.loads(written[0]) == {"new": True}


# ---------------------------------------------------------------------------
# append_to_jsonl
# ---------------------------------------------------------------------------

class TestAppendToJsonl:
    """Checks for ``append_to_jsonl(path, record)``."""

    def test_appends_single_record(self, tmp_path):
        """The new record lands on a second line after the existing one."""
        path = tmp_path / "append.jsonl"
        path.write_text('{"a":1}\n')

        append_to_jsonl(str(path), {"b": 2})

        written = path.read_text().strip().split("\n")
        assert len(written) == 2
        assert json.loads(written[1]) == {"b": 2}

    def test_creates_file_if_missing(self, tmp_path):
        """Appending to a path that does not exist yet creates the file."""
        path = tmp_path / "new.jsonl"

        append_to_jsonl(str(path), {"first": True})

        assert path.exists()
        assert json.loads(path.read_text().strip()) == {"first": True}


# ---------------------------------------------------------------------------
# get_next_run_id
# ---------------------------------------------------------------------------

class TestGetNextRunId:
    """Checks for ``get_next_run_id(directory)``."""

    def test_empty_directory(self, tmp_path):
        """A directory with no files yields run id 1."""
        assert get_next_run_id(str(tmp_path)) == 1

    def test_nonexistent_directory(self, tmp_path):
        """A directory that does not exist also yields run id 1."""
        missing = tmp_path / "nonexistent"

        assert get_next_run_id(str(missing)) == 1

    def test_sequential_numbering(self, tmp_path):
        """With runs 1, 2 and 3 present, the next id is 4."""
        for run_number in (1, 2, 3):
            (tmp_path / f"{run_number}_eval_result.json").write_text("{}")

        assert get_next_run_id(str(tmp_path)) == 4

    def test_ignores_non_matching_files(self, tmp_path):
        """Files without a numeric run prefix do not influence the result."""
        fixtures = {
            "2_eval_result.json": "{}",
            "readme.md": "# Hi",
            "config.yaml": "",
        }
        for file_name, content in fixtures.items():
            (tmp_path / file_name).write_text(content)

        assert get_next_run_id(str(tmp_path)) == 3

    def test_handles_gaps(self, tmp_path):
        """The next id follows the highest prefix, not the number of files."""
        for file_name in ("1_a.json", "5_b.json"):
            (tmp_path / file_name).write_text("{}")

        assert get_next_run_id(str(tmp_path)) == 6