From 580b5840145f0029ec1c816d7fac4fb8e7b264d3 Mon Sep 17 00:00:00 2001 From: Brodey Newman Date: Wed, 18 Mar 2026 22:50:03 -0400 Subject: [PATCH 1/3] chore: evals --- sdk/python/src/p95/__init__.py | 19 + sdk/python/src/p95/client.py | 10 + sdk/python/src/p95/evaluation.py | 818 +++++++++++++++++++++++++++++++ sdk/python/src/p95/run.py | 56 +++ 4 files changed, 903 insertions(+) create mode 100644 sdk/python/src/p95/evaluation.py diff --git a/sdk/python/src/p95/__init__.py b/sdk/python/src/p95/__init__.py index b7597bd..bfcc9cd 100644 --- a/sdk/python/src/p95/__init__.py +++ b/sdk/python/src/p95/__init__.py @@ -41,6 +41,16 @@ from p95.server import start_server, stop_server from p95.sweep import sweep, agent, should_prune, SweepConfig, ParameterSpec from p95.worker import Worker, WorkerCapabilities, Job, start_worker +from p95.evaluation import ( + Dataset, + Scorer, + Evaluation, + EvaluationConfig, + EvaluationTarget, + EvaluationResult, + EvaluationClient, + evaluate, +) __version__ = "0.1.0" __all__ = [ @@ -62,6 +72,15 @@ "WorkerCapabilities", "Job", "start_worker", + # Evaluations + "Dataset", + "Scorer", + "Evaluation", + "EvaluationConfig", + "EvaluationTarget", + "EvaluationResult", + "EvaluationClient", + "evaluate", # Exceptions "P95Error", "AuthenticationError", diff --git a/sdk/python/src/p95/client.py b/sdk/python/src/p95/client.py index 8b1c33f..b95aba3 100644 --- a/sdk/python/src/p95/client.py +++ b/sdk/python/src/p95/client.py @@ -277,3 +277,13 @@ def link_run_to_job(self, job_id: str, run_id: str) -> Dict[str, Any]: return self._request( "POST", f"/jobs/{job_id}/link-run", data={"run_id": run_id} ) + + def log_eval(self, run_id: str, eval_data: Dict[str, Any]) -> None: + """ + Log a qualitative evaluation annotation. + + Args: + run_id: The run ID + eval_data: Evaluation data containing message, step, timestamp, etc. 
+ """ + self._request("POST", f"/runs/{run_id}/evals", data=eval_data) diff --git a/sdk/python/src/p95/evaluation.py b/sdk/python/src/p95/evaluation.py new file mode 100644 index 0000000..988dae0 --- /dev/null +++ b/sdk/python/src/p95/evaluation.py @@ -0,0 +1,818 @@ +"""Evaluation module for p95 SDK. + +This module provides functionality for running evaluations against +models or endpoints using datasets and scorers. +""" + +import json +import time +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List, Optional, Union +from pathlib import Path + +from p95.client import P95Client +from p95.config import SDKConfig + + +@dataclass +class Dataset: + """Represents an evaluation dataset. + + Can be created from: + - A local file (JSON, JSONL, CSV) + - A pandas DataFrame + - An external URL + - Inline data (list of dicts) + """ + name: str + data: Optional[List[Dict[str, Any]]] = None + source_url: Optional[str] = None + format: str = "json" + has_expected: bool = False + input_field: str = "input" + expected_field: Optional[str] = None + + # Set after upload + id: Optional[str] = None + + @classmethod + def from_file(cls, path: str, name: Optional[str] = None) -> "Dataset": + """Create a dataset from a local file. 
+ + Args: + path: Path to the file (JSON, JSONL, or CSV) + name: Optional name for the dataset + + Returns: + Dataset instance with loaded data + """ + filepath = Path(path) + if not filepath.exists(): + raise FileNotFoundError(f"File not found: {path}") + + name = name or filepath.stem + + # Determine format from extension + ext = filepath.suffix.lower() + if ext == ".json": + with open(filepath) as f: + data = json.load(f) + if not isinstance(data, list): + data = [data] + fmt = "json" + elif ext == ".jsonl": + data = [] + with open(filepath) as f: + for line in f: + if line.strip(): + data.append(json.loads(line)) + fmt = "jsonl" + elif ext == ".csv": + import csv + data = [] + with open(filepath) as f: + reader = csv.DictReader(f) + for row in reader: + data.append(row) + fmt = "csv" + else: + raise ValueError(f"Unsupported file format: {ext}") + + # Detect if dataset has expected field + has_expected = False + expected_field = None + if data: + sample = data[0] + for field_name in ["expected", "ground_truth", "answer", "label", "target"]: + if field_name in sample: + has_expected = True + expected_field = field_name + break + + return cls( + name=name, + data=data, + format=fmt, + has_expected=has_expected, + expected_field=expected_field, + ) + + @classmethod + def from_dataframe(cls, df: "pandas.DataFrame", name: str) -> "Dataset": + """Create a dataset from a pandas DataFrame. 
+ + Args: + df: pandas DataFrame + name: Name for the dataset + + Returns: + Dataset instance with DataFrame data + """ + data = df.to_dict(orient="records") + + # Detect expected field + has_expected = False + expected_field = None + for field_name in ["expected", "ground_truth", "answer", "label", "target"]: + if field_name in df.columns: + has_expected = True + expected_field = field_name + break + + return cls( + name=name, + data=data, + format="json", + has_expected=has_expected, + expected_field=expected_field, + ) + + @classmethod + def from_url(cls, url: str, name: str, format: str = "json") -> "Dataset": + """Create a dataset from an external URL. + + Args: + url: URL to the dataset + name: Name for the dataset + format: Data format (json, jsonl, csv) + + Returns: + Dataset instance referencing the URL + """ + return cls( + name=name, + source_url=url, + format=format, + ) + + @classmethod + def from_list( + cls, + data: List[Dict[str, Any]], + name: str, + input_field: str = "input", + expected_field: Optional[str] = None, + ) -> "Dataset": + """Create a dataset from a list of dictionaries. + + Args: + data: List of dictionaries with input/output pairs + name: Name for the dataset + input_field: Field name for input data + expected_field: Field name for expected output + + Returns: + Dataset instance + """ + return cls( + name=name, + data=data, + format="json", + has_expected=expected_field is not None, + input_field=input_field, + expected_field=expected_field, + ) + + +@dataclass +class Scorer: + """Represents a scorer for evaluating model outputs. + + Scorers can be: + - Builtin (accuracy, bleu, rouge, etc.) 
+ - LLM-as-judge (uses an LLM to evaluate) + - Custom (user-defined Python function) + """ + name: str + type: str # "builtin", "llm_judge", "custom" + config: Dict[str, Any] = field(default_factory=dict) + requires_expected: bool = False + + # Set after creation on server + id: Optional[str] = None + + @classmethod + def builtin(cls, name: str, **params) -> "Scorer": + """Create a builtin scorer. + + Available builtin scorers: + - exact_match: Exact string match + - contains: Substring match + - bleu: BLEU score for text generation + - rouge-1, rouge-2, rouge-l: ROUGE scores + - accuracy: Classification accuracy + - f1, precision, recall: Classification metrics + - length: Response character length + - word_count: Response word count + - json_valid: Check if output is valid JSON + - toxicity: Basic toxicity detection + + Args: + name: Name of the builtin scorer + **params: Additional parameters for the scorer + + Returns: + Scorer instance + """ + # Scorers that require expected output + requires_expected = name in [ + "exact_match", "contains", "bleu", + "rouge-1", "rouge-2", "rouge-l", + "accuracy", "f1", "precision", "recall", + ] + + return cls( + name=name, + type="builtin", + config={ + "builtin_name": name, + "parameters": params, + }, + requires_expected=requires_expected, + ) + + @classmethod + def llm_judge( + cls, + name: str, + model: str = "gpt-4o-mini", + system_prompt: str = "", + user_prompt: str = "", + output_parser: str = "numeric", + requires_expected: bool = False, + ) -> "Scorer": + """Create an LLM-as-judge scorer. 
+ + The user_prompt can contain template variables: + - {input}: The input from the dataset + - {output}: The model's output + - {expected}: The expected output (if available) + + Args: + name: Name for this scorer + model: LLM model to use (e.g., "gpt-4", "claude-3-opus") + system_prompt: System prompt for the judge + user_prompt: User prompt template + output_parser: How to parse the response ("numeric", "boolean", "json") + requires_expected: Whether this scorer needs ground truth + + Returns: + Scorer instance + """ + return cls( + name=name, + type="llm_judge", + config={ + "model": model, + "system_prompt": system_prompt, + "user_prompt": user_prompt, + "output_parser": output_parser, + }, + requires_expected=requires_expected, + ) + + @classmethod + def custom( + cls, + name: str, + fn: Callable[[Any, Any, Any], float], + requires_expected: bool = False, + ) -> "Scorer": + """Create a custom scorer from a Python function. + + The function signature should be: + fn(input, output, expected) -> float + + Note: Custom scorers run locally, not on the server. 
+ + Args: + name: Name for this scorer + fn: Scoring function + requires_expected: Whether this scorer needs ground truth + + Returns: + Scorer instance + """ + return cls( + name=name, + type="custom", + config={ + "_local_fn": fn, + }, + requires_expected=requires_expected, + ) + + +@dataclass +class EvaluationTarget: + """Specifies what to evaluate.""" + run_id: Optional[str] = None + endpoint: Optional[str] = None + config: Dict[str, Any] = field(default_factory=dict) + + @classmethod + def from_run(cls, run_id: str, **config) -> "EvaluationTarget": + """Evaluate a trained model from a run.""" + return cls(run_id=run_id, config=config) + + @classmethod + def from_endpoint(cls, url: str, **config) -> "EvaluationTarget": + """Evaluate an external API endpoint.""" + return cls(endpoint=url, config=config) + + +@dataclass +class EvaluationConfig: + """Configuration for an evaluation.""" + name: str + dataset: Union[Dataset, str] # Dataset object or ID + target: EvaluationTarget + scorers: List[Union[Scorer, str]] = field(default_factory=list) # Scorer objects or IDs + description: Optional[str] = None + config: Dict[str, Any] = field(default_factory=dict) + + +@dataclass +class EvaluationResult: + """Result of a single evaluation row.""" + row_index: int + input: Any + model_output: Any + expected: Any + scores: Dict[str, float] + scorer_outputs: Dict[str, Any] + latency_ms: Optional[float] = None + error: Optional[str] = None + + +@dataclass +class Evaluation: + """Represents an evaluation run.""" + id: str + name: str + status: str + dataset_id: str + scorer_ids: List[str] + target: Dict[str, Any] + overall_scores: Optional[Dict[str, float]] = None + rows_processed: int = 0 + rows_failed: int = 0 + created_at: Optional[str] = None + + def is_complete(self) -> bool: + return self.status in ["completed", "failed", "canceled"] + + def is_running(self) -> bool: + return self.status == "running" + + +class EvaluationClient: + """Client for managing evaluations.""" + + 
def __init__(self, client: P95Client, team_slug: str, app_slug: str): + """ + Initialize the evaluation client. + + Args: + client: P95 API client + team_slug: Team slug + app_slug: App slug + """ + self.client = client + self.team_slug = team_slug + self.app_slug = app_slug + self._base_path = f"/teams/{team_slug}/apps/{app_slug}" + + def upload_dataset(self, dataset: Dataset) -> str: + """ + Upload a dataset to the server. + + Args: + dataset: Dataset to upload + + Returns: + Dataset ID + """ + data = { + "name": dataset.name, + "format": dataset.format, + "has_expected": dataset.has_expected, + "input_field": dataset.input_field, + } + + if dataset.expected_field: + data["expected_field"] = dataset.expected_field + + if dataset.source_url: + data["source_type"] = "url" + data["source_url"] = dataset.source_url + else: + data["source_type"] = "inline" + data["data"] = dataset.data + + response = self.client._request( + "POST", + f"{self._base_path}/datasets", + data=data, + ) + + dataset.id = response["id"] + return response["id"] + + def create_scorer(self, scorer: Scorer) -> str: + """ + Create a scorer on the server. + + Args: + scorer: Scorer to create + + Returns: + Scorer ID + """ + if scorer.type == "custom": + raise ValueError("Custom scorers run locally and cannot be uploaded") + + data = { + "name": scorer.name, + "type": scorer.type, + "config": scorer.config, + "requires_expected": scorer.requires_expected, + } + + response = self.client._request( + "POST", + f"{self._base_path}/scorers", + data=data, + ) + + scorer.id = response["id"] + return response["id"] + + def create_evaluation(self, config: EvaluationConfig, start: bool = False) -> Evaluation: + """ + Create an evaluation. 
+ + Args: + config: Evaluation configuration + start: Whether to start the evaluation immediately + + Returns: + Evaluation instance + """ + # Upload dataset if needed + dataset_id = config.dataset if isinstance(config.dataset, str) else config.dataset.id + if not dataset_id: + if isinstance(config.dataset, Dataset): + dataset_id = self.upload_dataset(config.dataset) + else: + raise ValueError("Dataset must be uploaded or provide an ID") + + # Create scorers if needed + scorer_ids = [] + custom_scorers = [] + for scorer in config.scorers: + if isinstance(scorer, str): + scorer_ids.append(scorer) + elif scorer.type == "custom": + custom_scorers.append(scorer) + elif not scorer.id: + scorer_id = self.create_scorer(scorer) + scorer_ids.append(scorer_id) + else: + scorer_ids.append(scorer.id) + + # Build target + target = {} + if config.target.run_id: + target["run_id"] = config.target.run_id + if config.target.endpoint: + target["endpoint"] = config.target.endpoint + if config.target.config: + target["config"] = config.target.config + + data = { + "name": config.name, + "description": config.description, + "dataset_id": dataset_id, + "target": target, + "scorer_ids": scorer_ids, + "config": config.config, + } + + response = self.client._request( + "POST", + f"{self._base_path}/evaluations", + data=data, + ) + + evaluation = Evaluation( + id=response["id"], + name=response["name"], + status=response["status"], + dataset_id=response["dataset_id"], + scorer_ids=response["scorer_ids"], + target=response["target"], + created_at=response.get("created_at"), + ) + + if start: + return self.start(evaluation.id) + + return evaluation + + def start(self, evaluation_id: str) -> Evaluation: + """ + Start an evaluation. 
+ + Args: + evaluation_id: Evaluation ID + + Returns: + Updated Evaluation instance + """ + response = self.client._request( + "POST", + f"{self._base_path}/evaluations/{evaluation_id}/start", + ) + + return Evaluation( + id=response["id"], + name=response["name"], + status=response["status"], + dataset_id=response["dataset_id"], + scorer_ids=response["scorer_ids"], + target=response["target"], + overall_scores=response.get("overall_scores"), + rows_processed=response.get("rows_processed", 0), + rows_failed=response.get("rows_failed", 0), + created_at=response.get("created_at"), + ) + + def get(self, evaluation_id: str) -> Evaluation: + """ + Get an evaluation by ID. + + Args: + evaluation_id: Evaluation ID + + Returns: + Evaluation instance + """ + response = self.client._request( + "GET", + f"{self._base_path}/evaluations/{evaluation_id}", + ) + + return Evaluation( + id=response["id"], + name=response["name"], + status=response["status"], + dataset_id=response["dataset_id"], + scorer_ids=response["scorer_ids"], + target=response["target"], + overall_scores=response.get("overall_scores"), + rows_processed=response.get("rows_processed", 0), + rows_failed=response.get("rows_failed", 0), + created_at=response.get("created_at"), + ) + + def wait(self, evaluation_id: str, poll_interval: float = 2.0, timeout: Optional[float] = None) -> Evaluation: + """ + Wait for an evaluation to complete. 
+ + Args: + evaluation_id: Evaluation ID + poll_interval: Seconds between status checks + timeout: Maximum seconds to wait (None = no timeout) + + Returns: + Completed Evaluation instance + """ + start_time = time.time() + + while True: + evaluation = self.get(evaluation_id) + + if evaluation.is_complete(): + return evaluation + + if timeout and (time.time() - start_time) > timeout: + raise TimeoutError(f"Evaluation {evaluation_id} did not complete within {timeout} seconds") + + time.sleep(poll_interval) + + def get_results(self, evaluation_id: str, limit: int = 100, offset: int = 0) -> List[EvaluationResult]: + """ + Get results for an evaluation. + + Args: + evaluation_id: Evaluation ID + limit: Maximum results to return + offset: Offset for pagination + + Returns: + List of EvaluationResult instances + """ + response = self.client._request( + "GET", + f"{self._base_path}/evaluations/{evaluation_id}/results", + params={"limit": limit, "offset": offset}, + ) + + results = [] + for item in response.get("results", []): + results.append(EvaluationResult( + row_index=item["row_index"], + input=item["input"], + model_output=item.get("model_output"), + expected=item.get("expected"), + scores=item.get("scores", {}), + scorer_outputs=item.get("scorer_outputs", {}), + latency_ms=item.get("latency_ms"), + error=item.get("error"), + )) + + return results + + def get_scores_summary(self, evaluation_id: str) -> Dict[str, Dict[str, float]]: + """ + Get aggregated scores for an evaluation. + + Args: + evaluation_id: Evaluation ID + + Returns: + Dictionary mapping scorer names to summary stats + """ + response = self.client._request( + "GET", + f"{self._base_path}/evaluations/{evaluation_id}/scores", + ) + + return response.get("scorer_summaries", {}) + + def cancel(self, evaluation_id: str) -> Evaluation: + """ + Cancel a running evaluation. 
+ + Args: + evaluation_id: Evaluation ID + + Returns: + Updated Evaluation instance + """ + response = self.client._request( + "POST", + f"{self._base_path}/evaluations/{evaluation_id}/cancel", + ) + + return Evaluation( + id=response["id"], + name=response["name"], + status=response["status"], + dataset_id=response["dataset_id"], + scorer_ids=response["scorer_ids"], + target=response["target"], + ) + + def list_datasets(self, limit: int = 50, offset: int = 0) -> List[Dict[str, Any]]: + """List datasets in the app.""" + response = self.client._request( + "GET", + f"{self._base_path}/datasets", + params={"limit": limit, "offset": offset}, + ) + return response.get("datasets", []) + + def list_scorers(self, limit: int = 50, offset: int = 0) -> List[Dict[str, Any]]: + """List scorers in the app.""" + response = self.client._request( + "GET", + f"{self._base_path}/scorers", + params={"limit": limit, "offset": offset}, + ) + return response.get("scorers", []) + + def list_evaluations(self, limit: int = 50, offset: int = 0, status: Optional[str] = None) -> List[Evaluation]: + """List evaluations in the app.""" + params = {"limit": limit, "offset": offset} + if status: + params["status"] = status + + response = self.client._request( + "GET", + f"{self._base_path}/evaluations", + params=params, + ) + + evaluations = [] + for item in response.get("evaluations", []): + evaluations.append(Evaluation( + id=item["id"], + name=item["name"], + status=item["status"], + dataset_id=item["dataset_id"], + scorer_ids=item["scorer_ids"], + target=item["target"], + overall_scores=item.get("overall_scores"), + rows_processed=item.get("rows_processed", 0), + rows_failed=item.get("rows_failed", 0), + created_at=item.get("created_at"), + )) + + return evaluations + + def get_builtin_scorers(self) -> List[Dict[str, Any]]: + """Get list of available builtin scorers.""" + response = self.client._request( + "GET", + f"{self._base_path}/scorers/builtin", + ) + return response.get("scorers", []) + 
+ +# Convenience functions for quick evaluations + + +def evaluate( + project: str, + dataset: Union[Dataset, str, List[Dict[str, Any]]], + target: Union[EvaluationTarget, str], + scorers: List[Union[Scorer, str]], + name: Optional[str] = None, + wait: bool = True, + api_key: Optional[str] = None, +) -> Evaluation: + """ + Run an evaluation. + + This is a convenience function for quick evaluations. + + Args: + project: Project in format "team/app" + dataset: Dataset object, ID, or list of dicts + target: EvaluationTarget or endpoint URL + scorers: List of Scorer objects or builtin scorer names + name: Optional evaluation name + wait: Whether to wait for completion + api_key: Optional API key + + Returns: + Completed Evaluation instance + + Example: + result = p95.evaluate( + project="my-team/my-app", + dataset=[ + {"input": "What is 2+2?", "expected": "4"}, + {"input": "What is 3+3?", "expected": "6"}, + ], + target="https://api.openai.com/v1/chat/completions", + scorers=["exact_match", "contains"], + ) + print(result.overall_scores) + """ + # Parse project + parts = project.split("/") + if len(parts) != 2: + raise ValueError("Project must be in format 'team/app'") + team_slug, app_slug = parts + + # Create client + config = SDKConfig.from_env() + if api_key: + config.api_key = api_key + client = P95Client(config) + eval_client = EvaluationClient(client, team_slug, app_slug) + + # Convert dataset if needed + if isinstance(dataset, list): + dataset = Dataset.from_list(dataset, name or "inline-dataset") + + # Convert target if needed + if isinstance(target, str): + target = EvaluationTarget.from_endpoint(target) + + # Convert scorer names to Scorer objects + processed_scorers = [] + for scorer in scorers: + if isinstance(scorer, str): + processed_scorers.append(Scorer.builtin(scorer)) + else: + processed_scorers.append(scorer) + + # Create evaluation config + eval_config = EvaluationConfig( + name=name or f"evaluation-{int(time.time())}", + dataset=dataset, + 
target=target, + scorers=processed_scorers, + ) + + # Run evaluation + evaluation = eval_client.create_evaluation(eval_config, start=True) + + if wait: + return eval_client.wait(evaluation.id) + + return evaluation diff --git a/sdk/python/src/p95/run.py b/sdk/python/src/p95/run.py index d907b37..2bef4b7 100644 --- a/sdk/python/src/p95/run.py +++ b/sdk/python/src/p95/run.py @@ -367,6 +367,62 @@ def log(self, name: str, value: float, step: Optional[int] = None) -> None: """ self.log_metrics({name: value}, step=step) + def log_eval( + self, + message: str, + rating: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> None: + """ + Log a qualitative evaluation annotation at the current step. + + Use this to record human judgments, observations, or notes about + model outputs during training or inference. + + Args: + message: The evaluation message or annotation + rating: Optional rating (e.g., "good", "bad", "neutral") + metadata: Optional additional metadata + + Example: + # Simple annotation + run.log_eval("Output looks coherent and relevant") + + # With rating + run.log_eval("Response was off-topic", rating="bad") + + # With metadata + run.log_eval( + "Great creative writing sample", + rating="good", + metadata={"sample_id": "abc123", "category": "creativity"} + ) + """ + with self._lock: + step = self._step # Always use current step + ts = time.time() + + eval_data = { + "message": message, + "step": step, + "timestamp": ts, + } + if rating: + eval_data["rating"] = rating + if metadata: + eval_data["metadata"] = metadata + + if self._config.mode == "local": + # For local mode, append to eval_logs in meta + meta = self._local_writer._read_meta() + eval_logs = meta.get("eval_logs", []) + eval_logs.append(eval_data) + meta["eval_logs"] = eval_logs + self._local_writer._write_meta(meta) + else: + # For remote mode, send to API + self._remote_client.log_eval(self._run_id, eval_data) + def flush(self) -> None: """Force flush all buffered metrics.""" 
if self._config.mode == "local": From 1908c623fc50db32a5947d37565879531c8b0de5 Mon Sep 17 00:00:00 2001 From: Brodey Newman Date: Wed, 18 Mar 2026 22:56:41 -0400 Subject: [PATCH 2/3] chore: tests --- sdk/python/tests/test_evaluation.py | 644 ++++++++++++++++++++++++++++ sdk/python/tests/test_log_eval.py | 276 ++++++++++++ sdk/python/uv.lock | 2 +- 3 files changed, 921 insertions(+), 1 deletion(-) create mode 100644 sdk/python/tests/test_evaluation.py create mode 100644 sdk/python/tests/test_log_eval.py diff --git a/sdk/python/tests/test_evaluation.py b/sdk/python/tests/test_evaluation.py new file mode 100644 index 0000000..cb604a3 --- /dev/null +++ b/sdk/python/tests/test_evaluation.py @@ -0,0 +1,644 @@ +"""Tests for p95 evaluation module.""" + +import json +import os +import tempfile +from unittest import mock + +import pytest + +from p95.evaluation import ( + Dataset, + Scorer, + Evaluation, + EvaluationConfig, + EvaluationTarget, + EvaluationResult, + EvaluationClient, +) + + +class TestDataset: + """Tests for Dataset dataclass.""" + + def test_dataset_creation(self): + """Test creating a dataset with inline data.""" + data = [ + {"input": "What is 2+2?", "expected": "4"}, + {"input": "What is 3+3?", "expected": "6"}, + ] + dataset = Dataset(name="test-dataset", data=data) + + assert dataset.name == "test-dataset" + assert dataset.data == data + assert dataset.format == "json" + assert dataset.id is None + + def test_dataset_from_list(self): + """Test creating a dataset from a list.""" + data = [ + {"prompt": "Hello", "answer": "Hi"}, + {"prompt": "Bye", "answer": "Goodbye"}, + ] + dataset = Dataset.from_list( + data, + name="greeting-dataset", + input_field="prompt", + expected_field="answer", + ) + + assert dataset.name == "greeting-dataset" + assert dataset.data == data + assert dataset.input_field == "prompt" + assert dataset.expected_field == "answer" + assert dataset.has_expected is True + + def test_dataset_from_list_no_expected(self): + """Test creating 
a dataset without expected field.""" + data = [{"input": "Hello"}, {"input": "World"}] + dataset = Dataset.from_list(data, name="simple-dataset") + + assert dataset.has_expected is False + assert dataset.expected_field is None + + def test_dataset_from_file_json(self): + """Test loading dataset from JSON file.""" + data = [ + {"input": "test1", "expected": "result1"}, + {"input": "test2", "expected": "result2"}, + ] + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(data, f) + f.flush() + + try: + dataset = Dataset.from_file(f.name) + + assert len(dataset.data) == 2 + assert dataset.format == "json" + assert dataset.has_expected is True + assert dataset.expected_field == "expected" + finally: + os.unlink(f.name) + + def test_dataset_from_file_jsonl(self): + """Test loading dataset from JSONL file.""" + lines = [ + '{"input": "line1", "label": "a"}', + '{"input": "line2", "label": "b"}', + ] + + with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f: + f.write("\n".join(lines)) + f.flush() + + try: + dataset = Dataset.from_file(f.name) + + assert len(dataset.data) == 2 + assert dataset.format == "jsonl" + assert dataset.has_expected is True + assert dataset.expected_field == "label" + finally: + os.unlink(f.name) + + def test_dataset_from_file_csv(self): + """Test loading dataset from CSV file.""" + csv_content = "input,target\ntest1,result1\ntest2,result2" + + with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: + f.write(csv_content) + f.flush() + + try: + dataset = Dataset.from_file(f.name) + + assert len(dataset.data) == 2 + assert dataset.format == "csv" + assert dataset.has_expected is True + assert dataset.expected_field == "target" + finally: + os.unlink(f.name) + + def test_dataset_from_file_not_found(self): + """Test that FileNotFoundError is raised for missing files.""" + with pytest.raises(FileNotFoundError): + Dataset.from_file("/nonexistent/path/data.json") + + 
def test_dataset_from_file_unsupported_format(self): + """Test that ValueError is raised for unsupported formats.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f: + f.write("test content") + f.flush() + + try: + with pytest.raises(ValueError) as exc: + Dataset.from_file(f.name) + assert "Unsupported file format" in str(exc.value) + finally: + os.unlink(f.name) + + def test_dataset_from_url(self): + """Test creating a dataset from URL reference.""" + dataset = Dataset.from_url( + "https://example.com/data.json", + name="remote-dataset", + format="json", + ) + + assert dataset.name == "remote-dataset" + assert dataset.source_url == "https://example.com/data.json" + assert dataset.data is None + assert dataset.format == "json" + + def test_dataset_name_from_filename(self): + """Test that dataset name is inferred from filename.""" + data = [{"input": "test"}] + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".json", prefix="my_dataset_", delete=False + ) as f: + json.dump(data, f) + f.flush() + + try: + dataset = Dataset.from_file(f.name) + # Name should be the stem of the filename + assert "my_dataset_" in dataset.name + finally: + os.unlink(f.name) + + +class TestScorer: + """Tests for Scorer dataclass.""" + + def test_builtin_scorer_exact_match(self): + """Test creating an exact_match builtin scorer.""" + scorer = Scorer.builtin("exact_match") + + assert scorer.name == "exact_match" + assert scorer.type == "builtin" + assert scorer.config["builtin_name"] == "exact_match" + assert scorer.requires_expected is True + + def test_builtin_scorer_no_expected(self): + """Test builtin scorer that doesn't require expected.""" + scorer = Scorer.builtin("length") + + assert scorer.name == "length" + assert scorer.requires_expected is False + + def test_builtin_scorer_with_params(self): + """Test builtin scorer with parameters.""" + scorer = Scorer.builtin("bleu", n_gram=4, smooth=True) + + assert scorer.config["parameters"]["n_gram"] == 
4 + assert scorer.config["parameters"]["smooth"] is True + + def test_llm_judge_scorer(self): + """Test creating an LLM-as-judge scorer.""" + scorer = Scorer.llm_judge( + name="quality-judge", + model="gpt-4", + system_prompt="You are a quality judge.", + user_prompt="Rate this output: {output}", + output_parser="numeric", + ) + + assert scorer.name == "quality-judge" + assert scorer.type == "llm_judge" + assert scorer.config["model"] == "gpt-4" + assert scorer.config["system_prompt"] == "You are a quality judge." + assert scorer.config["output_parser"] == "numeric" + + def test_llm_judge_scorer_defaults(self): + """Test LLM judge with default values.""" + scorer = Scorer.llm_judge( + name="simple-judge", + user_prompt="Is this good? {output}", + ) + + assert scorer.config["model"] == "gpt-4o-mini" + assert scorer.config["system_prompt"] == "" + assert scorer.config["output_parser"] == "numeric" + assert scorer.requires_expected is False + + def test_custom_scorer(self): + """Test creating a custom scorer.""" + + def my_scorer(input, output, expected): + return 1.0 if output == expected else 0.0 + + scorer = Scorer.custom("my-scorer", my_scorer, requires_expected=True) + + assert scorer.name == "my-scorer" + assert scorer.type == "custom" + assert scorer.config["_local_fn"] == my_scorer + assert scorer.requires_expected is True + + +class TestEvaluationTarget: + """Tests for EvaluationTarget dataclass.""" + + def test_target_from_run(self): + """Test creating target from a run ID.""" + target = EvaluationTarget.from_run("run-abc123", temperature=0.7) + + assert target.run_id == "run-abc123" + assert target.endpoint is None + assert target.config["temperature"] == 0.7 + + def test_target_from_endpoint(self): + """Test creating target from an endpoint URL.""" + target = EvaluationTarget.from_endpoint( + "https://api.openai.com/v1/chat/completions", + model="gpt-4", + max_tokens=100, + ) + + assert target.endpoint == "https://api.openai.com/v1/chat/completions" + 
class TestEvaluationConfig:
    """Checks that EvaluationConfig keeps the values it is constructed with."""

    def test_evaluation_config(self):
        """Object-valued dataset, target and scorers are stored as passed."""
        ds = Dataset.from_list([{"input": "test"}], name="test-ds")
        endpoint_target = EvaluationTarget.from_endpoint("https://example.com/api")
        scorer_list = [Scorer.builtin("exact_match")]

        fields = {
            "name": "test-eval",
            "dataset": ds,
            "target": endpoint_target,
            "scorers": scorer_list,
            "description": "A test evaluation",
        }
        cfg = EvaluationConfig(**fields)

        assert cfg.name == "test-eval"
        assert cfg.dataset == ds
        assert cfg.target == endpoint_target
        assert len(cfg.scorers) == 1
        assert cfg.description == "A test evaluation"

    def test_evaluation_config_with_ids(self):
        """Plain string IDs are accepted in place of Dataset / Scorer objects."""
        scorer_ids = ["scorer-id-1", "scorer-id-2"]
        cfg = EvaluationConfig(
            name="test-eval",
            dataset="dataset-id-123",
            target=EvaluationTarget.from_endpoint("https://example.com"),
            scorers=list(scorer_ids),
        )

        assert cfg.dataset == "dataset-id-123"
        assert cfg.scorers == ["scorer-id-1", "scorer-id-2"]
+ assert result.scores["exact_match"] == 0.0 + assert result.scores["contains"] == 1.0 + assert result.latency_ms == 150.5 + assert result.error is None + + def test_evaluation_result_with_error(self): + """Test evaluation result with error.""" + result = EvaluationResult( + row_index=5, + input={"prompt": "test"}, + model_output=None, + expected="expected", + scores={}, + scorer_outputs={}, + error="API timeout", + ) + + assert result.error == "API timeout" + assert result.model_output is None + + +class TestEvaluation: + """Tests for Evaluation dataclass.""" + + def test_evaluation_creation(self): + """Test creating an evaluation.""" + evaluation = Evaluation( + id="eval-123", + name="test-evaluation", + status="pending", + dataset_id="ds-456", + scorer_ids=["scorer-1", "scorer-2"], + target={"endpoint": "https://example.com"}, + ) + + assert evaluation.id == "eval-123" + assert evaluation.name == "test-evaluation" + assert evaluation.status == "pending" + assert evaluation.is_complete() is False + assert evaluation.is_running() is False + + def test_evaluation_is_complete(self): + """Test is_complete for various statuses.""" + for status in ["completed", "failed", "canceled"]: + evaluation = Evaluation( + id="eval-123", + name="test", + status=status, + dataset_id="ds-1", + scorer_ids=[], + target={}, + ) + assert evaluation.is_complete() is True + + for status in ["pending", "running"]: + evaluation = Evaluation( + id="eval-123", + name="test", + status=status, + dataset_id="ds-1", + scorer_ids=[], + target={}, + ) + assert evaluation.is_complete() is False + + def test_evaluation_is_running(self): + """Test is_running for various statuses.""" + evaluation = Evaluation( + id="eval-123", + name="test", + status="running", + dataset_id="ds-1", + scorer_ids=[], + target={}, + ) + assert evaluation.is_running() is True + + evaluation.status = "pending" + assert evaluation.is_running() is False + + def test_evaluation_with_scores(self): + """Test evaluation with 
class TestEvaluationClient:
    """Tests for EvaluationClient.

    Every test injects a ``mock.MagicMock`` as the underlying client and
    inspects the calls made to its ``_request`` method; no network is used.
    """

    def test_client_initialization(self):
        """Slugs are stored and combined into the team/app base path."""
        mock_client = mock.MagicMock()
        eval_client = EvaluationClient(mock_client, "my-team", "my-app")

        assert eval_client.team_slug == "my-team"
        assert eval_client.app_slug == "my-app"
        assert eval_client._base_path == "/teams/my-team/apps/my-app"

    def test_upload_dataset_inline(self):
        """Inline data is POSTed to a datasets path and the returned id is stored."""
        mock_client = mock.MagicMock()
        mock_client._request.return_value = {"id": "ds-new-123"}

        eval_client = EvaluationClient(mock_client, "team", "app")
        dataset = Dataset.from_list(
            [{"input": "test", "expected": "result"}],
            name="test-ds",
            expected_field="expected",
        )

        result = eval_client.upload_dataset(dataset)

        assert result == "ds-new-123"
        assert dataset.id == "ds-new-123"
        mock_client._request.assert_called_once()
        call_args = mock_client._request.call_args
        assert call_args[0][0] == "POST"
        assert "/datasets" in call_args[0][1]
        assert call_args[1]["data"]["name"] == "test-ds"
        assert call_args[1]["data"]["source_type"] == "inline"

    def test_upload_dataset_url(self):
        """A URL-backed dataset is uploaded by reference (source_type 'url')."""
        mock_client = mock.MagicMock()
        mock_client._request.return_value = {"id": "ds-url-456"}

        eval_client = EvaluationClient(mock_client, "team", "app")
        dataset = Dataset.from_url(
            "https://example.com/data.json",
            name="remote-ds",
        )

        result = eval_client.upload_dataset(dataset)

        assert result == "ds-url-456"
        call_args = mock_client._request.call_args
        assert call_args[1]["data"]["source_type"] == "url"
        assert call_args[1]["data"]["source_url"] == "https://example.com/data.json"

    def test_create_scorer(self):
        """Creating a builtin scorer returns the server id and stores it on the scorer."""
        mock_client = mock.MagicMock()
        mock_client._request.return_value = {"id": "scorer-new-789"}

        eval_client = EvaluationClient(mock_client, "team", "app")
        scorer = Scorer.builtin("exact_match")

        result = eval_client.create_scorer(scorer)

        assert result == "scorer-new-789"
        assert scorer.id == "scorer-new-789"

    def test_create_scorer_custom_raises(self):
        """Custom (local-function) scorers cannot be created on the server."""
        mock_client = mock.MagicMock()
        eval_client = EvaluationClient(mock_client, "team", "app")

        scorer = Scorer.custom("my-scorer", lambda i, o, e: 1.0)

        # `match=` makes the message check part of the raises() contract, so
        # it cannot be silently skipped by sitting after the raising call.
        with pytest.raises(ValueError, match="Custom scorers run locally"):
            eval_client.create_scorer(scorer)

    def test_create_evaluation(self):
        """An evaluation built from existing dataset/scorer ids is parsed from the response."""
        mock_client = mock.MagicMock()
        mock_client._request.return_value = {
            "id": "eval-new-001",
            "name": "test-eval",
            "status": "pending",
            "dataset_id": "ds-123",
            "scorer_ids": ["scorer-1"],
            "target": {"endpoint": "https://example.com"},
        }

        eval_client = EvaluationClient(mock_client, "team", "app")

        # Create with existing IDs
        config = EvaluationConfig(
            name="test-eval",
            dataset="ds-123",
            target=EvaluationTarget.from_endpoint("https://example.com"),
            scorers=["scorer-1"],
        )

        result = eval_client.create_evaluation(config)

        assert result.id == "eval-new-001"
        assert result.status == "pending"

    def test_get_evaluation(self):
        """get() maps the response fields, including overall_scores, onto the result."""
        mock_client = mock.MagicMock()
        mock_client._request.return_value = {
            "id": "eval-123",
            "name": "test",
            "status": "completed",
            "dataset_id": "ds-1",
            "scorer_ids": ["s-1"],
            "target": {},
            "overall_scores": {"accuracy": 0.95},
            "rows_processed": 50,
        }

        eval_client = EvaluationClient(mock_client, "team", "app")
        result = eval_client.get("eval-123")

        assert result.id == "eval-123"
        assert result.status == "completed"
        assert result.overall_scores["accuracy"] == 0.95

    def test_get_results(self):
        """get_results() returns one result object per row under the 'results' key."""
        mock_client = mock.MagicMock()
        mock_client._request.return_value = {
            "results": [
                {
                    "row_index": 0,
                    "input": {"text": "hello"},
                    "model_output": "hi",
                    "expected": "hello",
                    "scores": {"exact_match": 0.0},
                    "scorer_outputs": {},
                    "latency_ms": 100.0,
                },
                {
                    "row_index": 1,
                    "input": {"text": "world"},
                    "model_output": "world",
                    "expected": "world",
                    "scores": {"exact_match": 1.0},
                    "scorer_outputs": {},
                    "latency_ms": 95.0,
                },
            ]
        }

        eval_client = EvaluationClient(mock_client, "team", "app")
        results = eval_client.get_results("eval-123")

        assert len(results) == 2
        assert results[0].row_index == 0
        assert results[0].scores["exact_match"] == 0.0
        assert results[1].scores["exact_match"] == 1.0

    def test_cancel_evaluation(self):
        """cancel() POSTs to the evaluation's cancel path and returns the updated evaluation."""
        mock_client = mock.MagicMock()
        mock_client._request.return_value = {
            "id": "eval-123",
            "name": "test",
            "status": "canceled",
            "dataset_id": "ds-1",
            "scorer_ids": [],
            "target": {},
        }

        eval_client = EvaluationClient(mock_client, "team", "app")
        result = eval_client.cancel("eval-123")

        assert result.status == "canceled"
        mock_client._request.assert_called_with(
            "POST",
            "/teams/team/apps/app/evaluations/eval-123/cancel",
        )

    def test_list_evaluations(self):
        """list_evaluations() returns one evaluation per entry under the 'evaluations' key."""
        mock_client = mock.MagicMock()
        mock_client._request.return_value = {
            "evaluations": [
                {
                    "id": "eval-1",
                    "name": "eval-one",
                    "status": "completed",
                    "dataset_id": "ds-1",
                    "scorer_ids": [],
                    "target": {},
                },
                {
                    "id": "eval-2",
                    "name": "eval-two",
                    "status": "running",
                    "dataset_id": "ds-2",
                    "scorer_ids": [],
                    "target": {},
                },
            ]
        }

        eval_client = EvaluationClient(mock_client, "team", "app")
        results = eval_client.list_evaluations()

        assert len(results) == 2
        assert results[0].name == "eval-one"
        assert results[1].status == "running"

    def test_get_builtin_scorers(self):
        """get_builtin_scorers() returns the raw scorer dicts from the 'scorers' key."""
        mock_client = mock.MagicMock()
        mock_client._request.return_value = {
            "scorers": [
                {"name": "exact_match", "description": "Exact string match"},
                {"name": "bleu", "description": "BLEU score"},
            ]
        }

        eval_client = EvaluationClient(mock_client, "team", "app")
        result = eval_client.get_builtin_scorers()

        assert len(result) == 2
        assert result[0]["name"] == "exact_match"
rating.""" + from p95.run import Run + + with tempfile.TemporaryDirectory() as tmpdir: + with Run(project="test-project", mode="local", logdir=tmpdir) as run: + run.log_eval("Bad response", rating="bad") + + run_dir = os.path.join(tmpdir, "test-project", run.name) + meta_path = os.path.join(run_dir, "meta.json") + + with open(meta_path) as f: + meta = json.load(f) + + assert meta["eval_logs"][0]["rating"] == "bad" + + def test_log_eval_with_metadata(self): + """Test log_eval with metadata.""" + from p95.run import Run + + with tempfile.TemporaryDirectory() as tmpdir: + with Run(project="test-project", mode="local", logdir=tmpdir) as run: + run.log_eval( + "Interesting output", + metadata={"sample_id": "abc123", "category": "test"}, + ) + + run_dir = os.path.join(tmpdir, "test-project", run.name) + meta_path = os.path.join(run_dir, "meta.json") + + with open(meta_path) as f: + meta = json.load(f) + + assert meta["eval_logs"][0]["metadata"]["sample_id"] == "abc123" + assert meta["eval_logs"][0]["metadata"]["category"] == "test" + + def test_log_eval_multiple(self): + """Test logging multiple evals.""" + from p95.run import Run + + with tempfile.TemporaryDirectory() as tmpdir: + with Run(project="test-project", mode="local", logdir=tmpdir) as run: + run.log_metrics({"loss": 0.5}, step=0) + run.log_eval("First eval") + + run.log_metrics({"loss": 0.4}, step=1) + run.log_eval("Second eval") + + run.log_metrics({"loss": 0.3}, step=2) + run.log_eval("Third eval", rating="good") + + run_dir = os.path.join(tmpdir, "test-project", run.name) + meta_path = os.path.join(run_dir, "meta.json") + + with open(meta_path) as f: + meta = json.load(f) + + assert len(meta["eval_logs"]) == 3 + assert meta["eval_logs"][0]["message"] == "First eval" + assert meta["eval_logs"][1]["message"] == "Second eval" + assert meta["eval_logs"][2]["message"] == "Third eval" + assert meta["eval_logs"][2]["rating"] == "good" + + def test_log_eval_uses_current_step(self): + """Test that log_eval uses the 
class TestLogEvalRemote:
    """Tests for log_eval in remote mode.

    Each test builds a ``Run`` with ``object.__new__`` so that ``__init__``
    (and any remote setup it performs) is bypassed, then injects a mock
    client and inspects the call made to ``client.log_eval``.
    """

    @staticmethod
    def _make_remote_run(run_id, client, step=0):
        """Return a bare Run wired to *client*, with only the attributes these tests set."""
        import threading

        from p95.run import Run

        run = object.__new__(Run)
        run._config = mock.MagicMock()
        run._config.mode = "remote"
        run._run_id = run_id
        run._remote_client = client
        run._step = step
        run._closed = False
        run._lock = threading.Lock()
        return run

    def test_log_eval_remote_basic(self):
        """log_eval in remote mode forwards run id, message, rating, step and timestamp."""
        mock_client = mock.MagicMock()

        # No module-level patching is needed: the client is injected directly
        # and Run.__init__ never runs, so nothing constructs a real client.
        run = self._make_remote_run("run-123", mock_client)
        run._remote_batcher = mock.MagicMock()

        run.log_eval("Test message", rating="good")

        # Verify log_eval was called on the client
        mock_client.log_eval.assert_called_once()
        call_args = mock_client.log_eval.call_args
        assert call_args[0][0] == "run-123"  # run_id
        assert call_args[0][1]["message"] == "Test message"
        assert call_args[0][1]["rating"] == "good"
        assert "step" in call_args[0][1]
        assert "timestamp" in call_args[0][1]

    def test_log_eval_remote_with_metadata(self):
        """Metadata passed to log_eval is forwarded unchanged in the payload."""
        mock_client = mock.MagicMock()
        run = self._make_remote_run("run-456", mock_client)

        run.log_eval(
            "Complex eval",
            rating="neutral",
            metadata={"key": "value", "num": 42},
        )

        call_args = mock_client.log_eval.call_args
        assert call_args[0][1]["metadata"]["key"] == "value"
        assert call_args[0][1]["metadata"]["num"] == 42

    def test_log_eval_remote_no_rating(self):
        """Omitted rating/metadata are left out of the payload; step is the run's current step."""
        mock_client = mock.MagicMock()
        run = self._make_remote_run("run-789", mock_client, step=5)

        run.log_eval("Simple message")

        call_args = mock_client.log_eval.call_args
        assert "rating" not in call_args[0][1]
        assert "metadata" not in call_args[0][1]
        assert call_args[0][1]["step"] == 5
test_log_eval_thread_safe(self): + """Test that log_eval is thread-safe.""" + import threading + from p95.run import Run + + with tempfile.TemporaryDirectory() as tmpdir: + with Run(project="test-project", mode="local", logdir=tmpdir) as run: + errors = [] + + def log_evals(n): + try: + for i in range(n): + run.log_eval(f"Message from thread at {i}") + except Exception as e: + errors.append(e) + + threads = [ + threading.Thread(target=log_evals, args=(10,)) + for _ in range(5) + ] + + for t in threads: + t.start() + for t in threads: + t.join() + + assert len(errors) == 0 + + # Verify all evals were logged + run_dir = os.path.join(tmpdir, "test-project", run.name) + meta_path = os.path.join(run_dir, "meta.json") + + with open(meta_path) as f: + meta = json.load(f) + + # 5 threads * 10 evals each = 50 total + assert len(meta["eval_logs"]) == 50 diff --git a/sdk/python/uv.lock b/sdk/python/uv.lock index 511e1d0..70ca62c 100644 --- a/sdk/python/uv.lock +++ b/sdk/python/uv.lock @@ -569,7 +569,7 @@ wheels = [ [[package]] name = "p95" -version = "0.7.0" +version = "0.8.0" source = { editable = "." } dependencies = [ { name = "requests", version = "2.32.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, From d7005ad337b97f921df276e91ef184208fdea819 Mon Sep 17 00:00:00 2001 From: Brodey Newman Date: Wed, 18 Mar 2026 23:23:04 -0400 Subject: [PATCH 3/3] chore: tests --- sdk/python/examples/train_with_evals.py | 151 ++++++++++++++++++++++++ sdk/python/src/p95/evaluation.py | 5 +- sdk/python/tests/test_log_eval.py | 4 +- 3 files changed, 156 insertions(+), 4 deletions(-) create mode 100644 sdk/python/examples/train_with_evals.py diff --git a/sdk/python/examples/train_with_evals.py b/sdk/python/examples/train_with_evals.py new file mode 100644 index 0000000..4d987dc --- /dev/null +++ b/sdk/python/examples/train_with_evals.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 +"""Train a model with qualitative evaluation annotations. 
from typing import Tuple


def generate_text_output(epoch: int) -> str:
    """Simulate a model generating text output.

    The sentence is chosen by ``epoch`` modulo the number of canned outputs,
    then degraded according to how far "training" has progressed:

    * ``epoch > 20``: the sentence is returned unchanged.
    * ``10 < epoch <= 20``: "the" becomes "teh" and periods are removed.
    * ``epoch <= 10``: the words are shuffled and only the first half kept.

    Shuffling uses NumPy's global random state, so results for early epochs
    depend on the seed set by the caller.
    """
    outputs = [
        "The quick brown fox jumps over the lazy dog.",
        "Machine learning is transforming how we build software.",
        "The weather today is sunny with a chance of clouds.",
        "Python is a versatile programming language.",
        "Neural networks learn patterns from data.",
        "The cat sat on the mat and looked at the window.",
        "Artificial intelligence is advancing rapidly.",
        "Data science combines statistics and programming.",
        "Deep learning requires large amounts of data.",
        "Transfer learning helps with limited datasets.",
    ]
    # Add some variation based on epoch
    base = outputs[epoch % len(outputs)]
    if epoch > 20:
        # Outputs get more coherent as training progresses
        return base
    if epoch > 10:
        # Medium quality - some minor issues
        return base.replace("the", "teh").replace(".", "")

    # Early training - lower quality
    words = base.split()
    np.random.shuffle(words)
    return " ".join(words[:len(words) // 2])


def evaluate_output(output: str, epoch: int) -> Tuple[str, str]:
    """Simulate human evaluation of model output.

    Args:
        output: The generated text to judge.
        epoch: Accepted for call-site symmetry; not used by the heuristics.

    Returns:
        tuple of (message, rating), where rating is "bad", "neutral" or "good".
    """
    # typing.Tuple (rather than tuple[...]) keeps this importable on Python
    # 3.8, where subscripting the builtin raises TypeError at definition time.

    # Simple heuristics to simulate evaluation
    if len(output) < 20:
        return "Output too short, model not generating enough content", "bad"

    if "teh" in output or not output.endswith("."):
        return "Minor quality issues detected - typos or missing punctuation", "neutral"

    if len(output) > 40 and output[0].isupper():
        return "Good output quality - coherent and well-formed", "good"

    return "Acceptable output, room for improvement", "neutral"


def main():
    """Run a simulated training loop that logs metrics and qualitative evals.

    Configuration is read from the ``P95_CONFIG_EPOCHS``, ``P95_CONFIG_LR``,
    ``P95_CONFIG_BATCH_SIZE`` and ``P95_PROJECT`` environment variables, with
    defaults for each.
    """
    config = {
        "epochs": int(os.environ.get("P95_CONFIG_EPOCHS", "30")),
        "lr": float(os.environ.get("P95_CONFIG_LR", "0.001")),
        "batch_size": int(os.environ.get("P95_CONFIG_BATCH_SIZE", "32")),
        "model_type": "transformer",
        "eval_frequency": 5,  # Evaluate every N epochs
    }

    project = os.environ.get("P95_PROJECT", "text-generation")

    print("Training text generation model with qualitative evals")
    print(f"Config: {config}")

    with p95.Run(project=project, config=config) as run:
        print(f"Run ID: {run.id}")
        print(f"Mode: {run.mode}")
        # NOTE(review): the two UI hint strings below look like they lost
        # their placeholders (e.g. the logdir and the run URL parts) - confirm
        # the intended text before changing them.
        if run.mode == "local":
            print(f"Log dir: {run.logdir}")
            print("\nTo view in UI, run: pnf --logdir ")
        else:
            print("\nView in UI at: ///runs/")
            print("Look for the 'Notes' tab to see evaluation annotations")

        np.random.seed(42)

        # Stays None when epochs == 0, so the summary below cannot hit an
        # unbound name.
        loss = None

        for epoch in range(config["epochs"]):
            # Simulate training metrics
            base_loss = 2.0 * np.exp(-epoch / 10) + 0.1
            loss = base_loss + np.random.normal(0, 0.05)
            perplexity = np.exp(loss)

            run.log_metrics({
                "train/loss": loss,
                "train/perplexity": perplexity,
            }, step=epoch)

            # Periodically evaluate and log qualitative feedback
            if epoch % config["eval_frequency"] == 0:
                # Generate sample output
                output = generate_text_output(epoch)

                # Evaluate the output
                message, rating = evaluate_output(output, epoch)

                # Log the evaluation annotation
                run.log_eval(
                    message=f"Epoch {epoch}: {message}\nSample output: \"{output}\"",
                    rating=rating,
                    metadata={
                        "epoch": epoch,
                        "output_length": len(output),
                        "sample_output": output,
                    }
                )

                print(f"Epoch {epoch}: {rating.upper()} - {message}")

            # Print progress
            if (epoch + 1) % 10 == 0:
                print(f"Epoch {epoch + 1}/{config['epochs']} - loss: {loss:.4f}")

            time.sleep(0.1)  # Simulate training time

        # Final evaluation
        final_output = generate_text_output(config["epochs"])
        run.log_eval(
            message=f"Final model evaluation: Output quality is {'excellent' if len(final_output) > 40 else 'acceptable'}",
            rating="good" if len(final_output) > 40 else "neutral",
            metadata={
                "final_output": final_output,
                "total_epochs": config["epochs"],
            }
        )

        print("\nTraining complete!")
        if loss is not None:
            print(f"Final loss: {loss:.4f}")
        print("\nEvaluation annotations logged. View them in the 'Notes' tab.")


if __name__ == "__main__":
    main()
Create run with mocked internals run = object.__new__(Run) run._config = mock.MagicMock()