From 580b5840145f0029ec1c816d7fac4fb8e7b264d3 Mon Sep 17 00:00:00 2001
From: Brodey Newman <brodeynewman@gmail.com>
Date: Wed, 18 Mar 2026 22:50:03 -0400
Subject: [PATCH 1/3] chore: evals

---
 sdk/python/src/p95/__init__.py   |  19 +
 sdk/python/src/p95/client.py     |  10 +
 sdk/python/src/p95/evaluation.py | 818 +++++++++++++++++++++++++++++++
 sdk/python/src/p95/run.py        |  56 +++
 4 files changed, 903 insertions(+)
 create mode 100644 sdk/python/src/p95/evaluation.py

diff --git a/sdk/python/src/p95/__init__.py b/sdk/python/src/p95/__init__.py
index b7597bd..bfcc9cd 100644
--- a/sdk/python/src/p95/__init__.py
+++ b/sdk/python/src/p95/__init__.py
@@ -41,6 +41,16 @@
 from p95.server import start_server, stop_server
 from p95.sweep import sweep, agent, should_prune, SweepConfig, ParameterSpec
 from p95.worker import Worker, WorkerCapabilities, Job, start_worker
+from p95.evaluation import (
+    Dataset,
+    Scorer,
+    Evaluation,
+    EvaluationConfig,
+    EvaluationTarget,
+    EvaluationResult,
+    EvaluationClient,
+    evaluate,
+)
 
 __version__ = "0.1.0"
 __all__ = [
@@ -62,6 +72,15 @@
     "WorkerCapabilities",
     "Job",
     "start_worker",
+    # Evaluations
+    "Dataset",
+    "Scorer",
+    "Evaluation",
+    "EvaluationConfig",
+    "EvaluationTarget",
+    "EvaluationResult",
+    "EvaluationClient",
+    "evaluate",
     # Exceptions
     "P95Error",
     "AuthenticationError",
diff --git a/sdk/python/src/p95/client.py b/sdk/python/src/p95/client.py
index 8b1c33f..b95aba3 100644
--- a/sdk/python/src/p95/client.py
+++ b/sdk/python/src/p95/client.py
@@ -277,3 +277,13 @@ def link_run_to_job(self, job_id: str, run_id: str) -> Dict[str, Any]:
         return self._request(
             "POST", f"/jobs/{job_id}/link-run", data={"run_id": run_id}
         )
+
+    def log_eval(self, run_id: str, eval_data: Dict[str, Any]) -> None:
+        """
+        Log a qualitative evaluation annotation.
+
+        Args:
+            run_id: The run ID
+            eval_data: Evaluation data containing message, step, timestamp, etc.
+        """
+        self._request("POST", f"/runs/{run_id}/evals", data=eval_data)
diff --git a/sdk/python/src/p95/evaluation.py b/sdk/python/src/p95/evaluation.py
new file mode 100644
index 0000000..988dae0
--- /dev/null
+++ b/sdk/python/src/p95/evaluation.py
@@ -0,0 +1,818 @@
+"""Evaluation module for p95 SDK.
+
+This module provides functionality for running evaluations against
+models or endpoints using datasets and scorers.
+"""
+
+import json
+import time
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Optional, Union
+from pathlib import Path
+
+from p95.client import P95Client
+from p95.config import SDKConfig
+
+
+@dataclass
+class Dataset:
+    """Represents an evaluation dataset.
+
+    Can be created from:
+    - A local file (JSON, JSONL, CSV)
+    - A pandas DataFrame
+    - An external URL
+    - Inline data (list of dicts)
+    """
+    name: str
+    data: Optional[List[Dict[str, Any]]] = None
+    source_url: Optional[str] = None
+    format: str = "json"
+    has_expected: bool = False
+    input_field: str = "input"
+    expected_field: Optional[str] = None
+
+    # Set after upload
+    id: Optional[str] = None
+
+    @classmethod
+    def from_file(cls, path: str, name: Optional[str] = None) -> "Dataset":
+        """Create a dataset from a local file.
+
+        Args:
+            path: Path to the file (JSON, JSONL, or CSV)
+            name: Optional name for the dataset (defaults to the file stem)
+
+        Returns:
+            Dataset instance with loaded data
+
+        Raises:
+            FileNotFoundError: If ``path`` does not exist
+            ValueError: If the extension is not .json, .jsonl, or .csv
+        """
+        filepath = Path(path)
+        if not filepath.exists():
+            raise FileNotFoundError(f"File not found: {path}")
+
+        name = name or filepath.stem
+
+        # Determine format from extension; always decode as UTF-8, not the locale
+        ext = filepath.suffix.lower()
+        if ext == ".json":
+            with open(filepath, encoding="utf-8") as f:
+                data = json.load(f)
+            if not isinstance(data, list):
+                data = [data]
+            fmt = "json"
+        elif ext == ".jsonl":
+            with open(filepath, encoding="utf-8") as f:
+                # Blank lines are skipped rather than parsed
+                data = [json.loads(line) for line in f if line.strip()]
+            fmt = "jsonl"
+        elif ext == ".csv":
+            import csv
+            # newline="" lets csv handle line breaks inside quoted fields
+            with open(filepath, newline="", encoding="utf-8") as f:
+                data = list(csv.DictReader(f))
+            fmt = "csv"
+        else:
+            raise ValueError(f"Unsupported file format: {ext}")
+
+        # Detect the expected field from the first row (only if it is a dict)
+        has_expected = False
+        expected_field = None
+        if data and isinstance(data[0], dict):
+            sample = data[0]
+            for field_name in ["expected", "ground_truth", "answer", "label", "target"]:
+                if field_name in sample:
+                    has_expected = True
+                    expected_field = field_name
+                    break
+
+        return cls(
+            name=name,
+            data=data,
+            format=fmt,
+            has_expected=has_expected,
+            expected_field=expected_field,
+        )
+
+    @classmethod
+    def from_dataframe(cls, df: "pandas.DataFrame", name: str) -> "Dataset":
+        """Create a dataset from a pandas DataFrame.
+
+        Args:
+            df: pandas DataFrame
+            name: Name for the dataset
+
+        Returns:
+            Dataset instance with DataFrame data
+        """
+        data = df.to_dict(orient="records")
+
+        # Detect expected field
+        has_expected = False
+        expected_field = None
+        for field_name in ["expected", "ground_truth", "answer", "label", "target"]:
+            if field_name in df.columns:
+                has_expected = True
+                expected_field = field_name
+                break
+
+        return cls(
+            name=name,
+            data=data,
+            format="json",
+            has_expected=has_expected,
+            expected_field=expected_field,
+        )
+
+    @classmethod
+    def from_url(cls, url: str, name: str, format: str = "json") -> "Dataset":
+        """Create a dataset from an external URL.
+
+        Args:
+            url: URL to the dataset
+            name: Name for the dataset
+            format: Data format (json, jsonl, csv)
+
+        Returns:
+            Dataset instance referencing the URL
+        """
+        return cls(
+            name=name,
+            source_url=url,
+            format=format,
+        )
+
+    @classmethod
+    def from_list(
+        cls,
+        data: List[Dict[str, Any]],
+        name: str,
+        input_field: str = "input",
+        expected_field: Optional[str] = None,
+    ) -> "Dataset":
+        """Create a dataset from a list of dictionaries.
+
+        Args:
+            data: List of dictionaries with input/output pairs
+            name: Name for the dataset
+            input_field: Field name for input data
+            expected_field: Field name for expected output
+
+        Returns:
+            Dataset instance
+        """
+        return cls(
+            name=name,
+            data=data,
+            format="json",
+            has_expected=expected_field is not None,
+            input_field=input_field,
+            expected_field=expected_field,
+        )
+
+
+@dataclass
+class Scorer:
+    """Represents a scorer for evaluating model outputs.
+
+    Scorers can be:
+    - Builtin (accuracy, bleu, rouge, etc.)
+    - LLM-as-judge (uses an LLM to evaluate)
+    - Custom (user-defined Python function)
+    """
+    name: str
+    type: str  # "builtin", "llm_judge", "custom"
+    config: Dict[str, Any] = field(default_factory=dict)
+    requires_expected: bool = False
+
+    # Set after creation on server
+    id: Optional[str] = None
+
+    @classmethod
+    def builtin(cls, name: str, **params) -> "Scorer":
+        """Create a builtin scorer.
+
+        Available builtin scorers:
+        - exact_match: Exact string match
+        - contains: Substring match
+        - bleu: BLEU score for text generation
+        - rouge-1, rouge-2, rouge-l: ROUGE scores
+        - accuracy: Classification accuracy
+        - f1, precision, recall: Classification metrics
+        - length: Response character length
+        - word_count: Response word count
+        - json_valid: Check if output is valid JSON
+        - toxicity: Basic toxicity detection
+
+        Args:
+            name: Name of the builtin scorer
+            **params: Additional parameters for the scorer
+
+        Returns:
+            Scorer instance
+        """
+        # Scorers that require expected output
+        requires_expected = name in [
+            "exact_match", "contains", "bleu",
+            "rouge-1", "rouge-2", "rouge-l",
+            "accuracy", "f1", "precision", "recall",
+        ]
+
+        return cls(
+            name=name,
+            type="builtin",
+            config={
+                "builtin_name": name,
+                "parameters": params,
+            },
+            requires_expected=requires_expected,
+        )
+
+    @classmethod
+    def llm_judge(
+        cls,
+        name: str,
+        model: str = "gpt-4o-mini",
+        system_prompt: str = "",
+        user_prompt: str = "",
+        output_parser: str = "numeric",
+        requires_expected: bool = False,
+    ) -> "Scorer":
+        """Create an LLM-as-judge scorer.
+
+        The user_prompt can contain template variables:
+        - {input}: The input from the dataset
+        - {output}: The model's output
+        - {expected}: The expected output (if available)
+
+        Args:
+            name: Name for this scorer
+            model: LLM model to use (e.g., "gpt-4", "claude-3-opus")
+            system_prompt: System prompt for the judge
+            user_prompt: User prompt template
+            output_parser: How to parse the response ("numeric", "boolean", "json")
+            requires_expected: Whether this scorer needs ground truth
+
+        Returns:
+            Scorer instance
+        """
+        return cls(
+            name=name,
+            type="llm_judge",
+            config={
+                "model": model,
+                "system_prompt": system_prompt,
+                "user_prompt": user_prompt,
+                "output_parser": output_parser,
+            },
+            requires_expected=requires_expected,
+        )
+
+    @classmethod
+    def custom(
+        cls,
+        name: str,
+        fn: Callable[[Any, Any, Any], float],
+        requires_expected: bool = False,
+    ) -> "Scorer":
+        """Create a custom scorer from a Python function.
+
+        The function signature should be:
+            fn(input, output, expected) -> float
+
+        Note: Custom scorers run locally, not on the server.
+
+        Args:
+            name: Name for this scorer
+            fn: Scoring function
+            requires_expected: Whether this scorer needs ground truth
+
+        Returns:
+            Scorer instance
+        """
+        return cls(
+            name=name,
+            type="custom",
+            config={
+                "_local_fn": fn,
+            },
+            requires_expected=requires_expected,
+        )
+
+
+@dataclass
+class EvaluationTarget:
+    """Specifies what to evaluate."""
+    run_id: Optional[str] = None
+    endpoint: Optional[str] = None
+    config: Dict[str, Any] = field(default_factory=dict)
+
+    @classmethod
+    def from_run(cls, run_id: str, **config) -> "EvaluationTarget":
+        """Evaluate a trained model from a run."""
+        return cls(run_id=run_id, config=config)
+
+    @classmethod
+    def from_endpoint(cls, url: str, **config) -> "EvaluationTarget":
+        """Evaluate an external API endpoint."""
+        return cls(endpoint=url, config=config)
+
+
+@dataclass
+class EvaluationConfig:
+    """Configuration for an evaluation."""
+    name: str
+    dataset: Union[Dataset, str]  # Dataset object or ID
+    target: EvaluationTarget
+    scorers: List[Union[Scorer, str]] = field(default_factory=list)  # Scorer objects or IDs
+    description: Optional[str] = None
+    config: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class EvaluationResult:
+    """Result of a single evaluation row."""
+    row_index: int
+    input: Any
+    model_output: Any
+    expected: Any
+    scores: Dict[str, float]
+    scorer_outputs: Dict[str, Any]
+    latency_ms: Optional[float] = None
+    error: Optional[str] = None
+
+
+@dataclass
+class Evaluation:
+    """Represents an evaluation run."""
+    id: str
+    name: str
+    status: str
+    dataset_id: str
+    scorer_ids: List[str]
+    target: Dict[str, Any]
+    overall_scores: Optional[Dict[str, float]] = None
+    rows_processed: int = 0
+    rows_failed: int = 0
+    created_at: Optional[str] = None
+
+    def is_complete(self) -> bool:
+        return self.status in ["completed", "failed", "canceled"]
+
+    def is_running(self) -> bool:
+        return self.status == "running"
+
+
+class EvaluationClient:
+    """Client for managing evaluations."""
+
+    def __init__(self, client: P95Client, team_slug: str, app_slug: str):
+        """
+        Initialize the evaluation client.
+
+        Args:
+            client: P95 API client
+            team_slug: Team slug
+            app_slug: App slug
+        """
+        self.client = client
+        self.team_slug = team_slug
+        self.app_slug = app_slug
+        self._base_path = f"/teams/{team_slug}/apps/{app_slug}"
+
+    def upload_dataset(self, dataset: Dataset) -> str:
+        """
+        Upload a dataset to the server.
+
+        Args:
+            dataset: Dataset to upload
+
+        Returns:
+            Dataset ID
+        """
+        data = {
+            "name": dataset.name,
+            "format": dataset.format,
+            "has_expected": dataset.has_expected,
+            "input_field": dataset.input_field,
+        }
+
+        if dataset.expected_field:
+            data["expected_field"] = dataset.expected_field
+
+        if dataset.source_url:
+            data["source_type"] = "url"
+            data["source_url"] = dataset.source_url
+        else:
+            data["source_type"] = "inline"
+            data["data"] = dataset.data
+
+        response = self.client._request(
+            "POST",
+            f"{self._base_path}/datasets",
+            data=data,
+        )
+
+        dataset.id = response["id"]
+        return response["id"]
+
+    def create_scorer(self, scorer: Scorer) -> str:
+        """
+        Create a scorer on the server.
+
+        Args:
+            scorer: Scorer to create
+
+        Returns:
+            Scorer ID
+        """
+        if scorer.type == "custom":
+            raise ValueError("Custom scorers run locally and cannot be uploaded")
+
+        data = {
+            "name": scorer.name,
+            "type": scorer.type,
+            "config": scorer.config,
+            "requires_expected": scorer.requires_expected,
+        }
+
+        response = self.client._request(
+            "POST",
+            f"{self._base_path}/scorers",
+            data=data,
+        )
+
+        scorer.id = response["id"]
+        return response["id"]
+
+    def create_evaluation(self, config: EvaluationConfig, start: bool = False) -> Evaluation:
+        """
+        Create an evaluation.
+
+        Datasets and scorers that have no server ID yet are uploaded/created
+        first. Custom scorers cannot be uploaded; they are left out of the
+        request and a UserWarning names them instead of dropping them silently.
+
+        Args:
+            config: Evaluation configuration
+            start: Whether to start the evaluation immediately
+
+        Returns:
+            Evaluation instance
+        """
+        # Upload dataset if needed
+        dataset_id = config.dataset if isinstance(config.dataset, str) else config.dataset.id
+        if not dataset_id:
+            if isinstance(config.dataset, Dataset):
+                dataset_id = self.upload_dataset(config.dataset)
+            else:
+                raise ValueError("Dataset must be uploaded or provide an ID")
+
+        # Create scorers if needed
+        scorer_ids = []
+        custom_scorers = []
+        for scorer in config.scorers:
+            if isinstance(scorer, str):
+                scorer_ids.append(scorer)
+            elif scorer.type == "custom":
+                custom_scorers.append(scorer)
+            else:
+                scorer_ids.append(scorer.id or self.create_scorer(scorer))
+        if custom_scorers:
+            import warnings
+            skipped = ", ".join(scorer.name for scorer in custom_scorers)
+            warnings.warn(f"Custom scorers are not run by the server; skipped: {skipped}", stacklevel=2)
+
+        # Build target
+        target = {}
+        if config.target.run_id:
+            target["run_id"] = config.target.run_id
+        if config.target.endpoint:
+            target["endpoint"] = config.target.endpoint
+        if config.target.config:
+            target["config"] = config.target.config
+
+        data = {
+            "name": config.name,
+            "description": config.description,
+            "dataset_id": dataset_id,
+            "target": target,
+            "scorer_ids": scorer_ids,
+            "config": config.config,
+        }
+
+        response = self.client._request("POST", f"{self._base_path}/evaluations", data=data)
+
+        evaluation = Evaluation(
+            id=response["id"],
+            name=response["name"],
+            status=response["status"],
+            dataset_id=response["dataset_id"],
+            scorer_ids=response["scorer_ids"],
+            target=response["target"],
+            created_at=response.get("created_at"),
+        )
+
+        if start:
+            return self.start(evaluation.id)
+        return evaluation
+
+    def start(self, evaluation_id: str) -> Evaluation:
+        """
+        Start an evaluation.
+
+        Args:
+            evaluation_id: Evaluation ID
+
+        Returns:
+            Updated Evaluation instance
+        """
+        response = self.client._request(
+            "POST",
+            f"{self._base_path}/evaluations/{evaluation_id}/start",
+        )
+
+        return Evaluation(
+            id=response["id"],
+            name=response["name"],
+            status=response["status"],
+            dataset_id=response["dataset_id"],
+            scorer_ids=response["scorer_ids"],
+            target=response["target"],
+            overall_scores=response.get("overall_scores"),
+            rows_processed=response.get("rows_processed", 0),
+            rows_failed=response.get("rows_failed", 0),
+            created_at=response.get("created_at"),
+        )
+
+    def get(self, evaluation_id: str) -> Evaluation:
+        """
+        Get an evaluation by ID.
+
+        Args:
+            evaluation_id: Evaluation ID
+
+        Returns:
+            Evaluation instance
+        """
+        response = self.client._request(
+            "GET",
+            f"{self._base_path}/evaluations/{evaluation_id}",
+        )
+
+        return Evaluation(
+            id=response["id"],
+            name=response["name"],
+            status=response["status"],
+            dataset_id=response["dataset_id"],
+            scorer_ids=response["scorer_ids"],
+            target=response["target"],
+            overall_scores=response.get("overall_scores"),
+            rows_processed=response.get("rows_processed", 0),
+            rows_failed=response.get("rows_failed", 0),
+            created_at=response.get("created_at"),
+        )
+
+    def wait(self, evaluation_id: str, poll_interval: float = 2.0, timeout: Optional[float] = None) -> Evaluation:
+        """
+        Wait for an evaluation to complete.
+
+        Args:
+            evaluation_id: Evaluation ID
+            poll_interval: Seconds between status checks
+            timeout: Maximum seconds to wait (None = no timeout; 0 is a real limit)
+
+        Returns:
+            Evaluation in a terminal state (completed, failed, or canceled)
+
+        Raises:
+            TimeoutError: If the evaluation is not complete within ``timeout``
+        """
+        # Monotonic clock: wall-clock adjustments must not distort the timeout
+        start_time = time.monotonic()
+        while True:
+            evaluation = self.get(evaluation_id)
+            if evaluation.is_complete():
+                return evaluation
+            if timeout is not None and (time.monotonic() - start_time) > timeout:
+                raise TimeoutError(f"Evaluation {evaluation_id} did not complete within {timeout} seconds")
+            time.sleep(poll_interval)
+
+    def get_results(self, evaluation_id: str, limit: int = 100, offset: int = 0) -> List[EvaluationResult]:
+        """
+        Get results for an evaluation.
+
+        Args:
+            evaluation_id: Evaluation ID
+            limit: Maximum results to return
+            offset: Offset for pagination
+
+        Returns:
+            List of EvaluationResult instances
+        """
+        response = self.client._request(
+            "GET",
+            f"{self._base_path}/evaluations/{evaluation_id}/results",
+            params={"limit": limit, "offset": offset},
+        )
+
+        results = []
+        for item in response.get("results", []):
+            results.append(EvaluationResult(
+                row_index=item["row_index"],
+                input=item["input"],
+                model_output=item.get("model_output"),
+                expected=item.get("expected"),
+                scores=item.get("scores", {}),
+                scorer_outputs=item.get("scorer_outputs", {}),
+                latency_ms=item.get("latency_ms"),
+                error=item.get("error"),
+            ))
+
+        return results
+
+    def get_scores_summary(self, evaluation_id: str) -> Dict[str, Dict[str, float]]:
+        """
+        Get aggregated scores for an evaluation.
+
+        Args:
+            evaluation_id: Evaluation ID
+
+        Returns:
+            Dictionary mapping scorer names to summary stats
+        """
+        response = self.client._request(
+            "GET",
+            f"{self._base_path}/evaluations/{evaluation_id}/scores",
+        )
+
+        return response.get("scorer_summaries", {})
+
+    def cancel(self, evaluation_id: str) -> Evaluation:
+        """
+        Cancel a running evaluation.
+
+        Args:
+            evaluation_id: Evaluation ID
+
+        Returns:
+            Updated Evaluation instance, with scores/row counts when the response has them
+        """
+        response = self.client._request("POST", f"{self._base_path}/evaluations/{evaluation_id}/cancel")
+        return Evaluation(
+            id=response["id"],
+            name=response["name"],
+            status=response["status"],
+            dataset_id=response["dataset_id"],
+            scorer_ids=response["scorer_ids"],
+            target=response["target"],
+            overall_scores=response.get("overall_scores"),  # optional, same as get()/start()
+            rows_processed=response.get("rows_processed", 0),
+            rows_failed=response.get("rows_failed", 0),
+            created_at=response.get("created_at"),
+        )
+
+    def list_datasets(self, limit: int = 50, offset: int = 0) -> List[Dict[str, Any]]:
+        """List datasets in the app."""
+        response = self.client._request(
+            "GET",
+            f"{self._base_path}/datasets",
+            params={"limit": limit, "offset": offset},
+        )
+        return response.get("datasets", [])
+
+    def list_scorers(self, limit: int = 50, offset: int = 0) -> List[Dict[str, Any]]:
+        """List scorers in the app."""
+        response = self.client._request(
+            "GET",
+            f"{self._base_path}/scorers",
+            params={"limit": limit, "offset": offset},
+        )
+        return response.get("scorers", [])
+
+    def list_evaluations(self, limit: int = 50, offset: int = 0, status: Optional[str] = None) -> List[Evaluation]:
+        """List evaluations in the app."""
+        params = {"limit": limit, "offset": offset}
+        if status:
+            params["status"] = status
+
+        response = self.client._request(
+            "GET",
+            f"{self._base_path}/evaluations",
+            params=params,
+        )
+
+        evaluations = []
+        for item in response.get("evaluations", []):
+            evaluations.append(Evaluation(
+                id=item["id"],
+                name=item["name"],
+                status=item["status"],
+                dataset_id=item["dataset_id"],
+                scorer_ids=item["scorer_ids"],
+                target=item["target"],
+                overall_scores=item.get("overall_scores"),
+                rows_processed=item.get("rows_processed", 0),
+                rows_failed=item.get("rows_failed", 0),
+                created_at=item.get("created_at"),
+            ))
+
+        return evaluations
+
+    def get_builtin_scorers(self) -> List[Dict[str, Any]]:
+        """Get list of available builtin scorers."""
+        response = self.client._request(
+            "GET",
+            f"{self._base_path}/scorers/builtin",
+        )
+        return response.get("scorers", [])
+
+
+# Convenience functions for quick evaluations
+
+
+def evaluate(
+    project: str,
+    dataset: Union[Dataset, str, List[Dict[str, Any]]],
+    target: Union[EvaluationTarget, str],
+    scorers: List[Union[Scorer, str]],
+    name: Optional[str] = None,
+    wait: bool = True,
+    api_key: Optional[str] = None,
+) -> Evaluation:
+    """
+    Run an evaluation.
+
+    This is a convenience function for quick evaluations.
+
+    Args:
+        project: Project in format "team/app"
+        dataset: Dataset object, ID, or list of dicts (expected field auto-detected)
+        target: EvaluationTarget or endpoint URL
+        scorers: List of Scorer objects or builtin scorer names
+        name: Optional evaluation name
+        wait: Whether to wait for completion
+        api_key: Optional API key
+
+    Returns:
+        Evaluation instance (finished when wait=True, just started otherwise)
+
+    Example:
+        result = p95.evaluate(
+            project="my-team/my-app",
+            dataset=[
+                {"input": "What is 2+2?", "expected": "4"},
+                {"input": "What is 3+3?", "expected": "6"},
+            ],
+            target="https://api.openai.com/v1/chat/completions",
+            scorers=["exact_match", "contains"],
+        )
+        print(result.overall_scores)
+    """
+    # Parse project
+    parts = project.split("/")
+    if len(parts) != 2:
+        raise ValueError("Project must be in format 'team/app'")
+    team_slug, app_slug = parts
+
+    # Create client
+    config = SDKConfig.from_env()
+    if api_key:
+        config.api_key = api_key
+    client = P95Client(config)
+    eval_client = EvaluationClient(client, team_slug, app_slug)
+
+    # Convert inline rows, detecting the expected field like Dataset.from_file does
+    if isinstance(dataset, list):
+        first = dataset[0] if dataset and isinstance(dataset[0], dict) else {}
+        candidates = ("expected", "ground_truth", "answer", "label", "target")
+        expected_field = next((key for key in candidates if key in first), None)
+        dataset = Dataset.from_list(dataset, name or "inline-dataset", expected_field=expected_field)
+
+    # Convert target if needed
+    if isinstance(target, str):
+        target = EvaluationTarget.from_endpoint(target)
+
+    # Convert scorer names to Scorer objects
+    processed_scorers = [
+        Scorer.builtin(scorer) if isinstance(scorer, str) else scorer
+        for scorer in scorers
+    ]
+
+    # Create evaluation config
+    eval_config = EvaluationConfig(
+        name=name or f"evaluation-{int(time.time())}",
+        dataset=dataset,
+        target=target,
+        scorers=processed_scorers,
+    )
+
+    # Run evaluation
+    evaluation = eval_client.create_evaluation(eval_config, start=True)
+    if wait:
+        return eval_client.wait(evaluation.id)
+
+    return evaluation
diff --git a/sdk/python/src/p95/run.py b/sdk/python/src/p95/run.py
index d907b37..2bef4b7 100644
--- a/sdk/python/src/p95/run.py
+++ b/sdk/python/src/p95/run.py
@@ -367,6 +367,62 @@ def log(self, name: str, value: float, step: Optional[int] = None) -> None:
         """
         self.log_metrics({name: value}, step=step)
 
+    def log_eval(
+        self,
+        message: str,
+        rating: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        """
+        Log a qualitative evaluation annotation at the current step.
+
+        Use this to record human judgments, observations, or notes about
+        model outputs during training or inference.
+
+        Args:
+            message: The evaluation message or annotation
+            rating: Optional rating (e.g., "good", "bad", "neutral")
+            metadata: Optional additional metadata
+
+        Example:
+            # Simple annotation
+            run.log_eval("Output looks coherent and relevant")
+
+            # With rating
+            run.log_eval("Response was off-topic", rating="bad")
+
+            # With metadata
+            run.log_eval(
+                "Great creative writing sample",
+                rating="good",
+                metadata={"sample_id": "abc123", "category": "creativity"}
+            )
+        """
+        with self._lock:
+            step = self._step  # Always use current step
+            ts = time.time()
+
+            eval_data = {
+                "message": message,
+                "step": step,
+                "timestamp": ts,
+            }
+            if rating:
+                eval_data["rating"] = rating
+            if metadata:
+                eval_data["metadata"] = metadata
+
+            if self._config.mode == "local":
+                # For local mode, append to eval_logs in meta
+                meta = self._local_writer._read_meta()
+                eval_logs = meta.get("eval_logs", [])
+                eval_logs.append(eval_data)
+                meta["eval_logs"] = eval_logs
+                self._local_writer._write_meta(meta)
+            else:
+                # For remote mode, send to API
+                self._remote_client.log_eval(self._run_id, eval_data)
+
     def flush(self) -> None:
         """Force flush all buffered metrics."""
         if self._config.mode == "local":

From 1908c623fc50db32a5947d37565879531c8b0de5 Mon Sep 17 00:00:00 2001
From: Brodey Newman <brodeynewman@gmail.com>
Date: Wed, 18 Mar 2026 22:56:41 -0400
Subject: [PATCH 2/3] chore: tests

---
 sdk/python/tests/test_evaluation.py | 644 ++++++++++++++++++++++++++++
 sdk/python/tests/test_log_eval.py   | 276 ++++++++++++
 sdk/python/uv.lock                  |   2 +-
 3 files changed, 921 insertions(+), 1 deletion(-)
 create mode 100644 sdk/python/tests/test_evaluation.py
 create mode 100644 sdk/python/tests/test_log_eval.py

diff --git a/sdk/python/tests/test_evaluation.py b/sdk/python/tests/test_evaluation.py
new file mode 100644
index 0000000..cb604a3
--- /dev/null
+++ b/sdk/python/tests/test_evaluation.py
@@ -0,0 +1,644 @@
+"""Tests for p95 evaluation module."""
+
+import json
+import os
+import tempfile
+from unittest import mock
+
+import pytest
+
+from p95.evaluation import (
+    Dataset,
+    Scorer,
+    Evaluation,
+    EvaluationConfig,
+    EvaluationTarget,
+    EvaluationResult,
+    EvaluationClient,
+)
+
+
+class TestDataset:
+    """Tests for Dataset dataclass."""
+
+    def test_dataset_creation(self):
+        """Test creating a dataset with inline data."""
+        data = [
+            {"input": "What is 2+2?", "expected": "4"},
+            {"input": "What is 3+3?", "expected": "6"},
+        ]
+        dataset = Dataset(name="test-dataset", data=data)
+
+        assert dataset.name == "test-dataset"
+        assert dataset.data == data
+        assert dataset.format == "json"
+        assert dataset.id is None
+
+    def test_dataset_from_list(self):
+        """Test creating a dataset from a list."""
+        data = [
+            {"prompt": "Hello", "answer": "Hi"},
+            {"prompt": "Bye", "answer": "Goodbye"},
+        ]
+        dataset = Dataset.from_list(
+            data,
+            name="greeting-dataset",
+            input_field="prompt",
+            expected_field="answer",
+        )
+
+        assert dataset.name == "greeting-dataset"
+        assert dataset.data == data
+        assert dataset.input_field == "prompt"
+        assert dataset.expected_field == "answer"
+        assert dataset.has_expected is True
+
+    def test_dataset_from_list_no_expected(self):
+        """Test creating a dataset without expected field."""
+        data = [{"input": "Hello"}, {"input": "World"}]
+        dataset = Dataset.from_list(data, name="simple-dataset")
+
+        assert dataset.has_expected is False
+        assert dataset.expected_field is None
+
+    def test_dataset_from_file_json(self):
+        """Test loading dataset from JSON file."""
+        data = [
+            {"input": "test1", "expected": "result1"},
+            {"input": "test2", "expected": "result2"},
+        ]
+
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+            json.dump(data, f)
+
+        # Handle is closed here: Windows cannot reopen or unlink an open file.
+        try:
+            dataset = Dataset.from_file(f.name)
+
+            assert len(dataset.data) == 2
+            assert dataset.format == "json"
+            assert dataset.has_expected is True
+            assert dataset.expected_field == "expected"
+        finally:
+            os.unlink(f.name)
+
+    def test_dataset_from_file_jsonl(self):
+        """Test loading dataset from JSONL file."""
+        lines = [
+            '{"input": "line1", "label": "a"}',
+            '{"input": "line2", "label": "b"}',
+        ]
+
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
+            f.write("\n".join(lines))
+
+        # Handle is closed here: Windows cannot reopen or unlink an open file.
+        try:
+            dataset = Dataset.from_file(f.name)
+
+            assert len(dataset.data) == 2
+            assert dataset.format == "jsonl"
+            assert dataset.has_expected is True
+            assert dataset.expected_field == "label"
+        finally:
+            os.unlink(f.name)
+
+    def test_dataset_from_file_csv(self):
+        """Test loading dataset from CSV file."""
+        csv_content = "input,target\ntest1,result1\ntest2,result2"
+
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
+            f.write(csv_content)
+
+        # Handle is closed here: Windows cannot reopen or unlink an open file.
+        try:
+            dataset = Dataset.from_file(f.name)
+
+            assert len(dataset.data) == 2
+            assert dataset.format == "csv"
+            assert dataset.has_expected is True
+            assert dataset.expected_field == "target"
+        finally:
+            os.unlink(f.name)
+
+    def test_dataset_from_file_not_found(self):
+        """Test that FileNotFoundError is raised for missing files."""
+        with pytest.raises(FileNotFoundError):
+            Dataset.from_file("/nonexistent/path/data.json")
+
+    def test_dataset_from_file_unsupported_format(self):
+        """Test that ValueError is raised for unsupported formats."""
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
+            f.write("test content")
+
+        # Handle is closed here: Windows cannot reopen or unlink an open file.
+        try:
+            with pytest.raises(ValueError) as exc:
+                Dataset.from_file(f.name)
+            assert "Unsupported file format" in str(exc.value)
+        finally:
+            os.unlink(f.name)
+
+    def test_dataset_from_url(self):
+        """Test creating a dataset from URL reference."""
+        dataset = Dataset.from_url(
+            "https://example.com/data.json",
+            name="remote-dataset",
+            format="json",
+        )
+
+        assert dataset.name == "remote-dataset"
+        assert dataset.source_url == "https://example.com/data.json"
+        assert dataset.data is None
+        assert dataset.format == "json"
+
+    def test_dataset_name_from_filename(self):
+        """Test that dataset name is inferred from filename."""
+        data = [{"input": "test"}]
+
+        with tempfile.NamedTemporaryFile(
+            mode="w", suffix=".json", prefix="my_dataset_", delete=False
+        ) as f:
+            json.dump(data, f)
+
+        # Handle is closed here: Windows cannot reopen or unlink an open file.
+        try:
+            dataset = Dataset.from_file(f.name)
+            # Name should be the stem of the filename
+            assert "my_dataset_" in dataset.name
+        finally:
+            os.unlink(f.name)
+
+
+class TestScorer:
+    """Tests for Scorer dataclass."""
+
+    def test_builtin_scorer_exact_match(self):
+        """Test creating an exact_match builtin scorer."""
+        scorer = Scorer.builtin("exact_match")
+
+        assert scorer.name == "exact_match"
+        assert scorer.type == "builtin"
+        assert scorer.config["builtin_name"] == "exact_match"
+        assert scorer.requires_expected is True
+
+    def test_builtin_scorer_no_expected(self):
+        """Test builtin scorer that doesn't require expected."""
+        scorer = Scorer.builtin("length")
+
+        assert scorer.name == "length"
+        assert scorer.requires_expected is False
+
+    def test_builtin_scorer_with_params(self):
+        """Test builtin scorer with parameters."""
+        scorer = Scorer.builtin("bleu", n_gram=4, smooth=True)
+
+        assert scorer.config["parameters"]["n_gram"] == 4
+        assert scorer.config["parameters"]["smooth"] is True
+
+    def test_llm_judge_scorer(self):
+        """Test creating an LLM-as-judge scorer."""
+        scorer = Scorer.llm_judge(
+            name="quality-judge",
+            model="gpt-4",
+            system_prompt="You are a quality judge.",
+            user_prompt="Rate this output: {output}",
+            output_parser="numeric",
+        )
+
+        assert scorer.name == "quality-judge"
+        assert scorer.type == "llm_judge"
+        assert scorer.config["model"] == "gpt-4"
+        assert scorer.config["system_prompt"] == "You are a quality judge."
+        assert scorer.config["output_parser"] == "numeric"
+
+    def test_llm_judge_scorer_defaults(self):
+        """Test LLM judge with default values."""
+        scorer = Scorer.llm_judge(
+            name="simple-judge",
+            user_prompt="Is this good? {output}",
+        )
+
+        assert scorer.config["model"] == "gpt-4o-mini"
+        assert scorer.config["system_prompt"] == ""
+        assert scorer.config["output_parser"] == "numeric"
+        assert scorer.requires_expected is False
+
+    def test_custom_scorer(self):
+        """Test creating a custom scorer."""
+
+        def my_scorer(input, output, expected):
+            return 1.0 if output == expected else 0.0
+
+        scorer = Scorer.custom("my-scorer", my_scorer, requires_expected=True)
+
+        assert scorer.name == "my-scorer"
+        assert scorer.type == "custom"
+        assert scorer.config["_local_fn"] == my_scorer
+        assert scorer.requires_expected is True
+
+
+class TestEvaluationTarget:
+    """Tests for EvaluationTarget dataclass."""
+
+    def test_target_from_run(self):
+        """Test creating target from a run ID."""
+        target = EvaluationTarget.from_run("run-abc123", temperature=0.7)
+
+        assert target.run_id == "run-abc123"
+        assert target.endpoint is None
+        assert target.config["temperature"] == 0.7
+
+    def test_target_from_endpoint(self):
+        """Test creating target from an endpoint URL."""
+        target = EvaluationTarget.from_endpoint(
+            "https://api.openai.com/v1/chat/completions",
+            model="gpt-4",
+            max_tokens=100,
+        )
+
+        assert target.endpoint == "https://api.openai.com/v1/chat/completions"
+        assert target.run_id is None
+        assert target.config["model"] == "gpt-4"
+        assert target.config["max_tokens"] == 100
+
+
+class TestEvaluationConfig:
+    """Tests for EvaluationConfig dataclass."""
+
+    def test_evaluation_config(self):
+        """Test creating an evaluation config."""
+        dataset = Dataset.from_list([{"input": "test"}], name="test-ds")
+        target = EvaluationTarget.from_endpoint("https://example.com/api")
+        scorers = [Scorer.builtin("exact_match")]
+
+        config = EvaluationConfig(
+            name="test-eval",
+            dataset=dataset,
+            target=target,
+            scorers=scorers,
+            description="A test evaluation",
+        )
+
+        assert config.name == "test-eval"
+        assert config.dataset == dataset
+        assert config.target == target
+        assert len(config.scorers) == 1
+        assert config.description == "A test evaluation"
+
+    def test_evaluation_config_with_ids(self):
+        """Test evaluation config with string IDs instead of objects."""
+        config = EvaluationConfig(
+            name="test-eval",
+            dataset="dataset-id-123",
+            target=EvaluationTarget.from_endpoint("https://example.com"),
+            scorers=["scorer-id-1", "scorer-id-2"],
+        )
+
+        assert config.dataset == "dataset-id-123"
+        assert config.scorers == ["scorer-id-1", "scorer-id-2"]
+
+
+class TestEvaluationResult:
+    """Tests for EvaluationResult dataclass."""
+
+    def test_evaluation_result(self):
+        """Test creating an evaluation result."""
+        result = EvaluationResult(
+            row_index=0,
+            input={"prompt": "Hello"},
+            model_output="Hi there!",
+            expected="Hello!",
+            scores={"exact_match": 0.0, "contains": 1.0},
+            scorer_outputs={"exact_match": {"matched": False}},
+            latency_ms=150.5,
+        )
+
+        assert result.row_index == 0
+        assert result.input == {"prompt": "Hello"}
+        assert result.model_output == "Hi there!"
+        assert result.scores["exact_match"] == 0.0
+        assert result.scores["contains"] == 1.0
+        assert result.latency_ms == 150.5
+        assert result.error is None
+
+    def test_evaluation_result_with_error(self):
+        """Test evaluation result with error."""
+        result = EvaluationResult(
+            row_index=5,
+            input={"prompt": "test"},
+            model_output=None,
+            expected="expected",
+            scores={},
+            scorer_outputs={},
+            error="API timeout",
+        )
+
+        assert result.error == "API timeout"
+        assert result.model_output is None
+
+
+class TestEvaluation:
+    """Tests for Evaluation dataclass."""
+
+    def test_evaluation_creation(self):
+        """Test creating an evaluation."""
+        evaluation = Evaluation(
+            id="eval-123",
+            name="test-evaluation",
+            status="pending",
+            dataset_id="ds-456",
+            scorer_ids=["scorer-1", "scorer-2"],
+            target={"endpoint": "https://example.com"},
+        )
+
+        assert evaluation.id == "eval-123"
+        assert evaluation.name == "test-evaluation"
+        assert evaluation.status == "pending"
+        assert evaluation.is_complete() is False
+        assert evaluation.is_running() is False
+
+    def test_evaluation_is_complete(self):
+        """Test is_complete for various statuses."""
+        for status in ["completed", "failed", "canceled"]:
+            evaluation = Evaluation(
+                id="eval-123",
+                name="test",
+                status=status,
+                dataset_id="ds-1",
+                scorer_ids=[],
+                target={},
+            )
+            assert evaluation.is_complete() is True
+
+        for status in ["pending", "running"]:
+            evaluation = Evaluation(
+                id="eval-123",
+                name="test",
+                status=status,
+                dataset_id="ds-1",
+                scorer_ids=[],
+                target={},
+            )
+            assert evaluation.is_complete() is False
+
+    def test_evaluation_is_running(self):
+        """Test is_running for various statuses."""
+        evaluation = Evaluation(
+            id="eval-123",
+            name="test",
+            status="running",
+            dataset_id="ds-1",
+            scorer_ids=[],
+            target={},
+        )
+        assert evaluation.is_running() is True
+
+        evaluation.status = "pending"
+        assert evaluation.is_running() is False
+
+    def test_evaluation_with_scores(self):
+        """Test evaluation with overall scores."""
+        evaluation = Evaluation(
+            id="eval-123",
+            name="test",
+            status="completed",
+            dataset_id="ds-1",
+            scorer_ids=["scorer-1"],
+            target={},
+            overall_scores={"exact_match": 0.85, "bleu": 0.72},
+            rows_processed=100,
+            rows_failed=5,
+        )
+
+        assert evaluation.overall_scores["exact_match"] == 0.85
+        assert evaluation.rows_processed == 100
+        assert evaluation.rows_failed == 5
+
+
+class TestEvaluationClient:
+    """Tests for EvaluationClient."""
+
+    def test_client_initialization(self):
+        """Test client initialization."""
+        mock_client = mock.MagicMock()
+        eval_client = EvaluationClient(mock_client, "my-team", "my-app")
+
+        assert eval_client.team_slug == "my-team"
+        assert eval_client.app_slug == "my-app"
+        assert eval_client._base_path == "/teams/my-team/apps/my-app"
+
+    def test_upload_dataset_inline(self):
+        """Test uploading inline dataset."""
+        mock_client = mock.MagicMock()
+        mock_client._request.return_value = {"id": "ds-new-123"}
+
+        eval_client = EvaluationClient(mock_client, "team", "app")
+        dataset = Dataset.from_list(
+            [{"input": "test", "expected": "result"}],
+            name="test-ds",
+            expected_field="expected",
+        )
+
+        result = eval_client.upload_dataset(dataset)
+
+        assert result == "ds-new-123"
+        assert dataset.id == "ds-new-123"
+        mock_client._request.assert_called_once()
+        call_args = mock_client._request.call_args
+        assert call_args[0][0] == "POST"
+        assert "/datasets" in call_args[0][1]
+        assert call_args[1]["data"]["name"] == "test-ds"
+        assert call_args[1]["data"]["source_type"] == "inline"
+
+    def test_upload_dataset_url(self):
+        """Test uploading URL-referenced dataset."""
+        mock_client = mock.MagicMock()
+        mock_client._request.return_value = {"id": "ds-url-456"}
+
+        eval_client = EvaluationClient(mock_client, "team", "app")
+        dataset = Dataset.from_url(
+            "https://example.com/data.json",
+            name="remote-ds",
+        )
+
+        result = eval_client.upload_dataset(dataset)
+
+        assert result == "ds-url-456"
+        call_args = mock_client._request.call_args
+        assert call_args[1]["data"]["source_type"] == "url"
+        assert call_args[1]["data"]["source_url"] == "https://example.com/data.json"
+
+    def test_create_scorer(self):
+        """Test creating a scorer on the server."""
+        mock_client = mock.MagicMock()
+        mock_client._request.return_value = {"id": "scorer-new-789"}
+
+        eval_client = EvaluationClient(mock_client, "team", "app")
+        scorer = Scorer.builtin("exact_match")
+
+        result = eval_client.create_scorer(scorer)
+
+        assert result == "scorer-new-789"
+        assert scorer.id == "scorer-new-789"
+
+    def test_create_scorer_custom_raises(self):
+        """Test that creating a custom scorer raises an error."""
+        mock_client = mock.MagicMock()
+        eval_client = EvaluationClient(mock_client, "team", "app")
+
+        scorer = Scorer.custom("my-scorer", lambda i, o, e: 1.0)
+
+        with pytest.raises(ValueError) as exc:
+            eval_client.create_scorer(scorer)
+        assert "Custom scorers run locally" in str(exc.value)
+
+    def test_create_evaluation(self):
+        """Test creating an evaluation."""
+        mock_client = mock.MagicMock()
+        mock_client._request.return_value = {
+            "id": "eval-new-001",
+            "name": "test-eval",
+            "status": "pending",
+            "dataset_id": "ds-123",
+            "scorer_ids": ["scorer-1"],
+            "target": {"endpoint": "https://example.com"},
+        }
+
+        eval_client = EvaluationClient(mock_client, "team", "app")
+
+        # Create with existing IDs
+        config = EvaluationConfig(
+            name="test-eval",
+            dataset="ds-123",
+            target=EvaluationTarget.from_endpoint("https://example.com"),
+            scorers=["scorer-1"],
+        )
+
+        result = eval_client.create_evaluation(config)
+
+        assert result.id == "eval-new-001"
+        assert result.status == "pending"
+
+    def test_get_evaluation(self):
+        """Test getting an evaluation by ID."""
+        mock_client = mock.MagicMock()
+        mock_client._request.return_value = {
+            "id": "eval-123",
+            "name": "test",
+            "status": "completed",
+            "dataset_id": "ds-1",
+            "scorer_ids": ["s-1"],
+            "target": {},
+            "overall_scores": {"accuracy": 0.95},
+            "rows_processed": 50,
+        }
+
+        eval_client = EvaluationClient(mock_client, "team", "app")
+        result = eval_client.get("eval-123")
+
+        assert result.id == "eval-123"
+        assert result.status == "completed"
+        assert result.overall_scores["accuracy"] == 0.95
+
+    def test_get_results(self):
+        """Test getting evaluation results."""
+        mock_client = mock.MagicMock()
+        mock_client._request.return_value = {
+            "results": [
+                {
+                    "row_index": 0,
+                    "input": {"text": "hello"},
+                    "model_output": "hi",
+                    "expected": "hello",
+                    "scores": {"exact_match": 0.0},
+                    "scorer_outputs": {},
+                    "latency_ms": 100.0,
+                },
+                {
+                    "row_index": 1,
+                    "input": {"text": "world"},
+                    "model_output": "world",
+                    "expected": "world",
+                    "scores": {"exact_match": 1.0},
+                    "scorer_outputs": {},
+                    "latency_ms": 95.0,
+                },
+            ]
+        }
+
+        eval_client = EvaluationClient(mock_client, "team", "app")
+        results = eval_client.get_results("eval-123")
+
+        assert len(results) == 2
+        assert results[0].row_index == 0
+        assert results[0].scores["exact_match"] == 0.0
+        assert results[1].scores["exact_match"] == 1.0
+
+    def test_cancel_evaluation(self):
+        """Test canceling an evaluation."""
+        mock_client = mock.MagicMock()
+        mock_client._request.return_value = {
+            "id": "eval-123",
+            "name": "test",
+            "status": "canceled",
+            "dataset_id": "ds-1",
+            "scorer_ids": [],
+            "target": {},
+        }
+
+        eval_client = EvaluationClient(mock_client, "team", "app")
+        result = eval_client.cancel("eval-123")
+
+        assert result.status == "canceled"
+        mock_client._request.assert_called_with(
+            "POST",
+            "/teams/team/apps/app/evaluations/eval-123/cancel",
+        )
+
+    def test_list_evaluations(self):
+        """Test listing evaluations."""
+        mock_client = mock.MagicMock()
+        mock_client._request.return_value = {
+            "evaluations": [
+                {
+                    "id": "eval-1",
+                    "name": "eval-one",
+                    "status": "completed",
+                    "dataset_id": "ds-1",
+                    "scorer_ids": [],
+                    "target": {},
+                },
+                {
+                    "id": "eval-2",
+                    "name": "eval-two",
+                    "status": "running",
+                    "dataset_id": "ds-2",
+                    "scorer_ids": [],
+                    "target": {},
+                },
+            ]
+        }
+
+        eval_client = EvaluationClient(mock_client, "team", "app")
+        results = eval_client.list_evaluations()
+
+        assert len(results) == 2
+        assert results[0].name == "eval-one"
+        assert results[1].status == "running"
+
+    def test_get_builtin_scorers(self):
+        """Test getting builtin scorers."""
+        mock_client = mock.MagicMock()
+        mock_client._request.return_value = {
+            "scorers": [
+                {"name": "exact_match", "description": "Exact string match"},
+                {"name": "bleu", "description": "BLEU score"},
+            ]
+        }
+
+        eval_client = EvaluationClient(mock_client, "team", "app")
+        result = eval_client.get_builtin_scorers()
+
+        assert len(result) == 2
+        assert result[0]["name"] == "exact_match"
diff --git a/sdk/python/tests/test_log_eval.py b/sdk/python/tests/test_log_eval.py
new file mode 100644
index 0000000..1a57cc8
--- /dev/null
+++ b/sdk/python/tests/test_log_eval.py
@@ -0,0 +1,276 @@
+"""Tests for the log_eval functionality in p95.Run."""
+
+import json
+import os
+import tempfile
+import time
+from unittest import mock
+
+import pytest
+
+
+class TestLogEvalLocal:
+    """Tests for log_eval in local mode."""
+
+    def test_log_eval_basic(self):
+        """Test basic log_eval call in local mode."""
+        from p95.run import Run
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with Run(project="test-project", mode="local", logdir=tmpdir) as run:
+                # Log some metrics to advance step
+                run.log_metrics({"loss": 0.5}, step=10)
+
+                # Log an eval
+                run.log_eval("This output looks great")
+
+            # Check that eval was logged
+            run_dir = os.path.join(tmpdir, "test-project", run.name)
+            meta_path = os.path.join(run_dir, "meta.json")
+
+            with open(meta_path) as f:
+                meta = json.load(f)
+
+            assert "eval_logs" in meta
+            assert len(meta["eval_logs"]) == 1
+            assert meta["eval_logs"][0]["message"] == "This output looks great"
+            assert meta["eval_logs"][0]["step"] == 11  # Step after log_metrics incremented it
+
+    def test_log_eval_with_rating(self):
+        """Test log_eval with rating."""
+        from p95.run import Run
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with Run(project="test-project", mode="local", logdir=tmpdir) as run:
+                run.log_eval("Bad response", rating="bad")
+
+            run_dir = os.path.join(tmpdir, "test-project", run.name)
+            meta_path = os.path.join(run_dir, "meta.json")
+
+            with open(meta_path) as f:
+                meta = json.load(f)
+
+            assert meta["eval_logs"][0]["rating"] == "bad"
+
+    def test_log_eval_with_metadata(self):
+        """Test log_eval with metadata."""
+        from p95.run import Run
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with Run(project="test-project", mode="local", logdir=tmpdir) as run:
+                run.log_eval(
+                    "Interesting output",
+                    metadata={"sample_id": "abc123", "category": "test"},
+                )
+
+            run_dir = os.path.join(tmpdir, "test-project", run.name)
+            meta_path = os.path.join(run_dir, "meta.json")
+
+            with open(meta_path) as f:
+                meta = json.load(f)
+
+            assert meta["eval_logs"][0]["metadata"]["sample_id"] == "abc123"
+            assert meta["eval_logs"][0]["metadata"]["category"] == "test"
+
+    def test_log_eval_multiple(self):
+        """Test logging multiple evals."""
+        from p95.run import Run
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with Run(project="test-project", mode="local", logdir=tmpdir) as run:
+                run.log_metrics({"loss": 0.5}, step=0)
+                run.log_eval("First eval")
+
+                run.log_metrics({"loss": 0.4}, step=1)
+                run.log_eval("Second eval")
+
+                run.log_metrics({"loss": 0.3}, step=2)
+                run.log_eval("Third eval", rating="good")
+
+            run_dir = os.path.join(tmpdir, "test-project", run.name)
+            meta_path = os.path.join(run_dir, "meta.json")
+
+            with open(meta_path) as f:
+                meta = json.load(f)
+
+            assert len(meta["eval_logs"]) == 3
+            assert meta["eval_logs"][0]["message"] == "First eval"
+            assert meta["eval_logs"][1]["message"] == "Second eval"
+            assert meta["eval_logs"][2]["message"] == "Third eval"
+            assert meta["eval_logs"][2]["rating"] == "good"
+
+    def test_log_eval_uses_current_step(self):
+        """Test that log_eval uses the current step value."""
+        from p95.run import Run
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with Run(project="test-project", mode="local", logdir=tmpdir) as run:
+                # Log at specific steps
+                run.log_metrics({"loss": 0.5}, step=100)
+                run.log_eval("At step 100-ish")
+
+                run.log_metrics({"loss": 0.3}, step=200)
+                run.log_eval("At step 200-ish")
+
+            run_dir = os.path.join(tmpdir, "test-project", run.name)
+            meta_path = os.path.join(run_dir, "meta.json")
+
+            with open(meta_path) as f:
+                meta = json.load(f)
+
+            # After log_metrics(step=100), internal step becomes 101
+            assert meta["eval_logs"][0]["step"] == 101
+            # After log_metrics(step=200), internal step becomes 201
+            assert meta["eval_logs"][1]["step"] == 201
+
+    def test_log_eval_has_timestamp(self):
+        """Test that log_eval includes a timestamp."""
+        from p95.run import Run
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            before = time.time()
+
+            with Run(project="test-project", mode="local", logdir=tmpdir) as run:
+                run.log_eval("Timed eval")
+
+            after = time.time()
+
+            run_dir = os.path.join(tmpdir, "test-project", run.name)
+            meta_path = os.path.join(run_dir, "meta.json")
+
+            with open(meta_path) as f:
+                meta = json.load(f)
+
+            ts = meta["eval_logs"][0]["timestamp"]
+            assert before <= ts <= after
+
+
+class TestLogEvalRemote:
+    """Tests for log_eval in remote mode."""
+
+    def test_log_eval_remote_basic(self):
+        """Test log_eval in remote mode calls the API."""
+        import threading
+        from p95.run import Run
+
+        # The Run is assembled with object.__new__, so Run.__init__ never runs
+        # and no client or batcher is ever constructed. Patching P95Client,
+        # MetricsBatcher or Run._init_remote_mode would therefore have no
+        # effect here; plain MagicMock stand-ins are all this test needs.
+        mock_client = mock.MagicMock()
+        mock_batcher = mock.MagicMock()
+
+        run = object.__new__(Run)
+        run._config = mock.MagicMock()
+        run._config.mode = "remote"
+        run._run_id = "run-123"
+        run._remote_client = mock_client
+        run._remote_batcher = mock_batcher
+        run._step = 0
+        run._closed = False
+        run._lock = threading.Lock()
+
+        run.log_eval("Test message", rating="good")
+
+        # Verify log_eval was called on the client exactly once, positionally
+        # as (run_id, eval_data).
+        mock_client.log_eval.assert_called_once()
+        run_id_arg, eval_data = mock_client.log_eval.call_args[0]
+
+        assert run_id_arg == "run-123"
+        assert eval_data["message"] == "Test message"
+        assert eval_data["rating"] == "good"
+        assert eval_data["step"] == 0  # run._step at the time of the call
+        assert "timestamp" in eval_data
+
+        # metadata was not supplied, so the key must be omitted entirely.
+        assert "metadata" not in eval_data
+
+    def test_log_eval_remote_with_metadata(self):
+        """Test log_eval in remote mode with metadata."""
+        import threading
+        from p95.run import Run
+
+        mock_client = mock.MagicMock()
+        run = object.__new__(Run)  # bypasses Run.__init__; attributes set by hand
+        run._config = mock.MagicMock()
+        run._config.mode = "remote"
+        run._run_id = "run-456"
+        run._remote_client = mock_client
+        run._step = 0
+        run._closed = False
+        run._lock = threading.Lock()
+
+        run.log_eval(
+            "Complex eval",
+            rating="neutral",
+            metadata={"key": "value", "num": 42},
+        )
+
+        eval_data = mock_client.log_eval.call_args[0][1]  # second positional arg
+        assert eval_data["metadata"]["key"] == "value"
+        assert eval_data["metadata"]["num"] == 42
+
+    def test_log_eval_remote_no_rating(self):
+        """Test log_eval in remote mode without rating."""
+        import threading
+        from p95.run import Run
+
+        mock_client = mock.MagicMock()
+        run = object.__new__(Run)  # bypasses Run.__init__; attributes set by hand
+        run._config = mock.MagicMock()
+        run._config.mode = "remote"
+        run._run_id = "run-789"
+        run._remote_client = mock_client
+        run._step = 5
+        run._closed = False
+        run._lock = threading.Lock()
+
+        run.log_eval("Simple message")
+
+        eval_data = mock_client.log_eval.call_args[0][1]  # second positional arg
+        assert "rating" not in eval_data
+        assert "metadata" not in eval_data
+        assert eval_data["step"] == 5
+
+
+class TestLogEvalThreadSafety:
+    """Tests for log_eval thread safety."""
+
+    def test_log_eval_thread_safe(self):
+        """Test that log_eval is thread-safe."""
+        import threading
+        from p95.run import Run
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with Run(project="test-project", mode="local", logdir=tmpdir) as run:
+                errors = []
+
+                def log_evals(n):
+                    try:
+                        for i in range(n):
+                            run.log_eval(f"Message from thread at {i}")
+                    except Exception as e:
+                        errors.append(e)
+
+                threads = [
+                    threading.Thread(target=log_evals, args=(10,))
+                    for _ in range(5)
+                ]
+
+                for t in threads:
+                    t.start()
+                for t in threads:
+                    t.join()
+
+                assert len(errors) == 0
+
+            # Verify all evals were logged
+            run_dir = os.path.join(tmpdir, "test-project", run.name)
+            meta_path = os.path.join(run_dir, "meta.json")
+
+            with open(meta_path) as f:
+                meta = json.load(f)
+
+            # 5 threads * 10 evals each = 50 total
+            assert len(meta["eval_logs"]) == 50
diff --git a/sdk/python/uv.lock b/sdk/python/uv.lock
index 511e1d0..70ca62c 100644
--- a/sdk/python/uv.lock
+++ b/sdk/python/uv.lock
@@ -569,7 +569,7 @@ wheels = [
 
 [[package]]
 name = "p95"
-version = "0.7.0"
+version = "0.8.0"
 source = { editable = "." }
 dependencies = [
     { name = "requests", version = "2.32.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" },

From d7005ad337b97f921df276e91ef184208fdea819 Mon Sep 17 00:00:00 2001
From: Brodey Newman <brodeynewman@gmail.com>
Date: Wed, 18 Mar 2026 23:23:04 -0400
Subject: [PATCH 3/3] chore: tests

---
 sdk/python/examples/train_with_evals.py | 151 ++++++++++++++++++++++++
 sdk/python/src/p95/evaluation.py        |   5 +-
 sdk/python/tests/test_log_eval.py       |   4 +-
 3 files changed, 156 insertions(+), 4 deletions(-)
 create mode 100644 sdk/python/examples/train_with_evals.py

diff --git a/sdk/python/examples/train_with_evals.py b/sdk/python/examples/train_with_evals.py
new file mode 100644
index 0000000..4d987dc
--- /dev/null
+++ b/sdk/python/examples/train_with_evals.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+"""Train a model with qualitative evaluation annotations.
+
+This example demonstrates using p95's log_eval() feature to add
+human-readable annotations during training. These annotations appear
+in the "Notes" tab of the run detail page in the UI.
+
+Usage:
+    # Local mode (default)
+    python examples/train_with_evals.py
+
+    # Remote mode
+    P95_URL=http://localhost:8080 P95_API_KEY=xxx python examples/train_with_evals.py
+"""
+
+import os
+import time
+import numpy as np
+import p95
+
+
+def generate_text_output(epoch: int) -> str:
+    """Return a canned sentence for ``epoch``, degraded for early epochs to mimic training."""
+    outputs = [
+        "The quick brown fox jumps over the lazy dog.",
+        "Machine learning is transforming how we build software.",
+        "The weather today is sunny with a chance of clouds.",
+        "Python is a versatile programming language.",
+        "Neural networks learn patterns from data.",
+        "The cat sat on the mat and looked at the window.",
+        "Artificial intelligence is advancing rapidly.",
+        "Data science combines statistics and programming.",
+        "Deep learning requires large amounts of data.",
+        "Transfer learning helps with limited datasets.",
+    ]
+    # Cycle through the canned sentences, one per epoch
+    base = outputs[epoch % len(outputs)]
+    if epoch > 20:
+        # Late training (epoch > 20): return the sentence unchanged
+        return base
+    elif epoch > 10:
+        # Mid training (epochs 11-20): swap "the" for "teh" and drop every period
+        return base.replace("the", "teh").replace(".", "")
+    else:
+        # Early training (epoch <= 10): shuffle the words and keep only the first half
+        words = base.split()
+        np.random.shuffle(words)  # in-place shuffle drawing from NumPy's global RNG
+        return " ".join(words[:len(words)//2])
+
+
+def evaluate_output(output: str, epoch: int) -> "tuple[str, str]":
+    """Rate a generated sample with simple string heuristics (stand-in for a human rater).
+
+    ``epoch`` is accepted but not consulted by the heuristics.
+    Returns a ``(message, rating)`` pair; rating is "bad", "neutral" or "good".
+    """
+    # Checks run in order; the first matching rule decides the rating
+    if len(output) < 20:
+        return "Output too short, model not generating enough content", "bad"
+
+    if "teh" in output or not output.endswith("."):
+        return "Minor quality issues detected - typos or missing punctuation", "neutral"
+
+    if len(output) > 40 and output[0].isupper():
+        return "Good output quality - coherent and well-formed", "good"
+
+    return "Acceptable output, room for improvement", "neutral"
+
+
+def main():
+    """Simulate training, logging metrics every epoch and eval annotations periodically."""
+    config = {
+        "epochs": int(os.environ.get("P95_CONFIG_EPOCHS", "30")),
+        "lr": float(os.environ.get("P95_CONFIG_LR", "0.001")),
+        "batch_size": int(os.environ.get("P95_CONFIG_BATCH_SIZE", "32")),
+        "model_type": "transformer",
+        "eval_frequency": 5,  # Evaluate every N epochs
+    }
+
+    project = os.environ.get("P95_PROJECT", "text-generation")
+
+    print("Training text generation model with qualitative evals")
+    print(f"Config: {config}")
+
+    with p95.Run(project=project, config=config) as run:
+        print(f"Run ID: {run.id}")
+        print(f"Mode: {run.mode}")
+        if run.mode == "local":
+            print(f"Log dir: {run.logdir}")
+            print("\nTo view in UI, run: pnf --logdir <logdir>")
+        else:
+            print("\nView in UI at: /<team>/<app>/runs/<run_id>")
+            print("Look for the 'Notes' tab to see evaluation annotations")
+
+        np.random.seed(42)
+        loss = float("nan")  # keeps the final summary valid when epochs == 0
+
+        for epoch in range(config["epochs"]):
+            # Simulate training metrics
+            base_loss = 2.0 * np.exp(-epoch / 10) + 0.1
+            loss = base_loss + np.random.normal(0, 0.05)
+            perplexity = np.exp(loss)
+
+            run.log_metrics({
+                "train/loss": loss,
+                "train/perplexity": perplexity,
+            }, step=epoch)
+
+            # Periodically evaluate and log qualitative feedback
+            if epoch % config["eval_frequency"] == 0:
+                # Generate a sample output and score it with the heuristic rater
+                output = generate_text_output(epoch)
+                message, rating = evaluate_output(output, epoch)
+
+                # Log the evaluation annotation
+                run.log_eval(
+                    message=f"Epoch {epoch}: {message}\nSample output: \"{output}\"",
+                    rating=rating,
+                    metadata={
+                        "epoch": epoch,
+                        "output_length": len(output),
+                        "sample_output": output,
+                    }
+                )
+
+                print(f"Epoch {epoch}: {rating.upper()} - {message}")
+
+            # Print progress
+            if (epoch + 1) % 10 == 0:
+                print(f"Epoch {epoch + 1}/{config['epochs']} - loss: {loss:.4f}")
+
+            time.sleep(0.1)  # Simulate training time
+
+        # Final evaluation
+        final_output = generate_text_output(config["epochs"])
+        run.log_eval(
+            message=f"Final model evaluation: Output quality is {'excellent' if len(final_output) > 40 else 'acceptable'}",
+            rating="good" if len(final_output) > 40 else "neutral",
+            metadata={
+                "final_output": final_output,
+                "total_epochs": config["epochs"],
+            }
+        )
+
+        print("\nTraining complete!")
+        print(f"Final loss: {loss:.4f}")
+        print("\nEvaluation annotations logged. View them in the 'Notes' tab.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sdk/python/src/p95/evaluation.py b/sdk/python/src/p95/evaluation.py
index 988dae0..8339c69 100644
--- a/sdk/python/src/p95/evaluation.py
+++ b/sdk/python/src/p95/evaluation.py
@@ -7,12 +7,15 @@
 import json
 import time
 from dataclasses import dataclass, field
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
 from pathlib import Path
 
 from p95.client import P95Client
 from p95.config import SDKConfig
 
+if TYPE_CHECKING:
+    import pandas
+
 
 @dataclass
 class Dataset:
diff --git a/sdk/python/tests/test_log_eval.py b/sdk/python/tests/test_log_eval.py
index 1a57cc8..1e445d1 100644
--- a/sdk/python/tests/test_log_eval.py
+++ b/sdk/python/tests/test_log_eval.py
@@ -6,8 +6,6 @@
 import time
 from unittest import mock
 
-import pytest
-
 
 class TestLogEvalLocal:
     """Tests for log_eval in local mode."""
@@ -163,7 +161,7 @@ def test_log_eval_remote_basic(self):
             mock_batcher_class.return_value = mock_batcher
 
             # Patch the imports within run module
-            with mock.patch.object(Run, "_init_remote_mode") as mock_init:
+            with mock.patch.object(Run, "_init_remote_mode"):
                 # Create run with mocked internals
                 run = object.__new__(Run)
                 run._config = mock.MagicMock()