Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 65 additions & 4 deletions eval_protocol/human_id/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,68 @@

from . import dictionary

__all__ = ["generate_id"]
# Diff residue: the line above is the pre-change export list; the line below is
# its replacement, which additionally exports num_combinations.
__all__ = ["generate_id", "num_combinations"]

# Random source backed by random.SystemRandom; generate_id uses it whenever the
# caller does not supply a seed.
system_random = random.SystemRandom()


def generate_id(separator="-", seed: int | float | str | bytes | bytearray | None = None, word_count=5) -> str:
# Diff residue: the single-line signature above is the pre-change form; the
# multi-line signature below replaces it and adds the optional `index` parameter.
def generate_id(
separator: str = "-",
seed: int | float | str | bytes | bytearray | None = None,
word_count: int = 5,
index: int | None = None,
) -> str:
"""
Generate a human readable ID

:param separator: The string to use to separate words
:param seed: The seed to use. The same seed will produce the same ID
:param seed: The seed to use. The same seed will produce the same ID or index-based mapping
:param index: Optional non-negative integer providing a 1:1 mapping to an ID.
When provided, the mapping is deterministic and bijective for
all integers in range [0, total_combinations).
:param word_count: The number of words to use. Minimum of 3.
:return: A human readable ID
"""
if word_count < 3:
raise ValueError("word_count cannot be lower than 3")

# If a specific index is provided, use mixed-radix encoding into a fixed
# sequence of parts to guarantee a bijection between integers and IDs.
# The sequence cycles as: verb, adjective, noun, verb, adjective, noun, ...
if index is not None:
# NOTE: bool is a subclass of int, so True/False pass this check as 1/0.
if not isinstance(index, int) or index < 0:
raise ValueError("index must be a non-negative integer if provided")

# Prepare category lists; if seed is provided, shuffle deterministically
base_categories = [dictionary.verbs, dictionary.adjectives, dictionary.nouns]
if seed is not None:
rnd = random.Random(seed)
# sample(cat, len(cat)) returns a shuffled copy of the whole category, so the
# shared dictionary sequences are never modified and their sizes are kept.
categories = [tuple(rnd.sample(cat, len(cat))) for cat in base_categories]
else:
categories = base_categories
# Build the category order for the desired word_count
ordered_categories = [categories[i % 3] for i in range(word_count)]

# Compute total number of combinations for this word_count
radices = [len(cat) for cat in ordered_categories]
# num_combinations multiplies the same per-position category sizes that
# radices holds, so `total` is the product of radices.
total = num_combinations(word_count)

if index >= total:
raise ValueError(f"index out of range for given word_count. Received {index}, max allowed is {total - 1}")

# Mixed-radix decomposition (least significant position is the last word)
digits: list[int] = []
remaining = index
for base in reversed(radices):
digits.append(remaining % base)
remaining //= base
# Digits were collected from the last word backwards; restore word order.
digits.reverse()

words = [ordered_categories[pos][digits[pos]] for pos in range(word_count)]
return separator.join(words)

# No index given: pick words at random, from the OS-backed source by default.
random_obj = system_random
# Diff residue: `if seed:` is the pre-change condition and `if seed is not None:`
# is its replacement; the replacement also honours falsy seeds such as 0 or "".
if seed:
if seed is not None:
random_obj = random.Random(seed)

# Maps each word category to how many words to draw from it (one each to start).
parts = {dictionary.verbs: 1, dictionary.adjectives: 1, dictionary.nouns: 1}
# Diff viewer fold: the lines between here and the sampling expression below are
# hidden in this view, so how the counts are adjusted for word_count is not shown.
Expand All @@ -33,3 +76,21 @@ def generate_id(separator="-", seed: int | float | str | bytes | bytearray | Non
parts = itertools.chain.from_iterable(random_obj.sample(part, count) for part, count in parts.items())

return separator.join(parts)


def num_combinations(word_count: int = 5) -> int:
    """
    Return the total number of unique IDs possible for the given word_count.

    Word positions cycle through the categories verb, adjective, noun, then
    repeat, so the total is the product of the category size at every position.
    This value can be used to mod an index when calling generate_id(index=...).

    :param word_count: The number of words in the ID. Minimum of 3.
    :return: The product of the per-position category sizes.
    :raises ValueError: If word_count is lower than 3.
    """
    # Function-scope import: the module's import header is left untouched.
    import math

    if word_count < 3:
        raise ValueError("word_count cannot be lower than 3")

    categories = (dictionary.verbs, dictionary.adjectives, dictionary.nouns)
    return math.prod(len(categories[position % 3]) for position in range(word_count))
18 changes: 17 additions & 1 deletion eval_protocol/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ class InputMetadata(BaseModel):

model_config = ConfigDict(extra="allow")

row_id: Optional[str] = Field(default_factory=generate_id, description="Unique string to ID the row")
row_id: Optional[str] = Field(None, description="Unique string to ID the row")
completion_params: CompletionParams = Field(
default_factory=dict, description="Completion endpoint parameters used"
)
Expand Down Expand Up @@ -421,6 +421,22 @@ def get_termination_reason(self) -> str:
return msg.control_plane_step["termination_reason"]
return "unknown"

def __hash__(self) -> int:
    """
    Return a hash of the row that is identical in every interpreter process.

    The built-in ``hash()`` of a ``str`` is salted per process (see
    ``PYTHONHASHSEED``), so hashing the JSON text directly gives a different
    value on each run. A SHA-256 digest of the stable JSON is used instead, so
    rows with the same stable JSON hash to the same integer everywhere.

    :return: A non-negative integer below 2**56.
    """
    # Function-scope import: the module's import header is left untouched.
    import hashlib

    json_str = self.stable_json(self)
    digest = hashlib.sha256(json_str.encode("utf-8")).digest()
    # Keep 7 bytes (56 bits): small enough that hash() on a 64-bit build returns
    # the value unchanged rather than reducing it to fit Py_ssize_t.
    return int.from_bytes(digest[:7], "big")

@classmethod
def stable_json(cls, row: "EvaluationRow") -> str:
    """
    Serialize a row to compact JSON for comparing or hashing rows.

    ``None`` values and fields left at their defaults are dropped, aliases are
    used for field names, and ``created_at`` and ``execution_metadata`` are
    excluded (presumably because they differ between otherwise equal rows).

    :param row: The row to serialize.
    :return: The JSON string produced by ``row.model_dump_json``.
    """
    return row.model_dump_json(
        exclude_none=True,
        exclude_defaults=True,
        by_alias=True,
        indent=None,
        # A set matches the documented type of ``exclude``.
        exclude={"created_at", "execution_metadata"},
    )


# Original dataclass-based models for backwards compatibility
# These are deprecated and will be removed in a future version
Expand Down
12 changes: 11 additions & 1 deletion eval_protocol/pytest/evaluation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

from eval_protocol.dataset_logger import default_logger
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
from eval_protocol.human_id import generate_id
from eval_protocol.human_id import generate_id, num_combinations
from eval_protocol.models import (
CompletionParams,
EvalMetadata,
Expand Down Expand Up @@ -294,6 +294,16 @@ def _log_eval_error(
else:
raise ValueError("No input dataset or input messages provided")

# Give every row that lacks a row_id a human-readable one derived from hash(row).
# The number of available IDs does not depend on the row, so compute it once.
total_ids = num_combinations()
for row in data:
    if row.input_metadata.row_id is None:
        # abs() makes negative hashes non-negative before the modulo, so the
        # index always falls in [0, total_ids) as generate_id(index=...) requires.
        index = abs(hash(row)) % total_ids
        row.input_metadata.row_id = generate_id(seed=0, index=index)

if "completion_params" not in kwargs or not kwargs["completion_params"]:
raise ValueError(
"No completion parameters provided. Please provide a completion parameters object."
Expand Down
97 changes: 97 additions & 0 deletions tests/pytest/test_pytest_stable_row_id.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
from typing import List

from eval_protocol.models import EvaluationRow, Message
from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row


async def test_evaluation_test_decorator_ids_single():
    """Collect row IDs over repeated decorated evaluations and check the distinct count."""
    from eval_protocol.pytest.evaluation_test import evaluation_test

    dataset_paths = [
        "tests/pytest/data/markdown_dataset.jsonl",
        "tests/pytest/data/markdown_dataset.jsonl",
    ]
    param_sets = [
        {"temperature": 0.0, "model": "dummy/local-model"},
        {"temperature": 1.0, "model": "dummy/local-model"},
    ]
    seen_row_ids = set()

    @evaluation_test(
        input_dataset=dataset_paths,
        completion_params=param_sets,
        dataset_adapter=markdown_dataset_to_evaluation_row,
        rollout_processor=NoOpRolloutProcessor(),
        mode="pointwise",
        combine_datasets=False,
        num_runs=5,
    )
    def eval_fn(row: EvaluationRow) -> EvaluationRow:
        seen_row_ids.add(row.input_metadata.row_id)
        return row

    # Pass 1 manually invokes every dataset/parameter combination; pass 2 repeats
    # them, so an ID that changed between invocations would enlarge the set.
    for _ in range(2):
        for ds_path in dataset_paths:
            for params in param_sets:
                await eval_fn(dataset_path=[ds_path], completion_params=params)

    # One distinct ID per row of the markdown dataset.
    assert len(seen_row_ids) == 19


async def test_evaluation_test_generated_row_ids_without_dataset_keys():
    """Check that auto-generated row IDs are strings and repeat across runs and invocations."""
    from eval_protocol.pytest.evaluation_test import evaluation_test

    def markdown_dataset_no_row_id_adapter(data: List[dict]) -> List[EvaluationRow]:
        # Builds rows without a row_id so that evaluation_test has to generate one.
        rows = []
        for row in data:
            prompt = Message(role="user", content=row["prompt"])
            rows.append(EvaluationRow(messages=[prompt], ground_truth=str(row["num_highlights"])))
        return rows

    dataset_paths = ["tests/pytest/data/markdown_dataset.jsonl", "tests/pytest/data/markdown_dataset.jsonl"]
    param_sets = [
        {"temperature": 0.0, "model": "dummy/local-model"},
        {"temperature": 1.0, "model": "dummy/local-model"},
    ]
    seen_row_ids = set()

    @evaluation_test(
        input_dataset=dataset_paths,
        completion_params=param_sets,
        dataset_adapter=markdown_dataset_no_row_id_adapter,
        rollout_processor=NoOpRolloutProcessor(),
        mode="pointwise",
        combine_datasets=False,
        num_runs=5,
    )
    def eval_fn(row: EvaluationRow) -> EvaluationRow:
        # By the time the row reaches the evaluator it must carry a string row_id.
        assert row.input_metadata is not None
        assert row.input_metadata.row_id is not None and isinstance(row.input_metadata.row_id, str)
        seen_row_ids.add(row.input_metadata.row_id)
        return row

    # Pass 1 covers each dataset/parameter pair (with multiple runs each);
    # pass 2 repeats them to confirm the IDs do not change between invocations.
    for _ in range(2):
        for ds_path in dataset_paths:
            for params in param_sets:
                await eval_fn(dataset_path=[ds_path], completion_params=params)

    # Stable IDs mean the distinct count equals the dataset size, not runs x rows.
    assert len(seen_row_ids) == 19
69 changes: 69 additions & 0 deletions tests/test_human_id.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import re
import pytest

from eval_protocol.human_id import generate_id, num_combinations


def test_generate_id_index_basic_3_words():
    """Check that three-word index IDs count up with the last word as least significant."""
    # index 0 maps to the first element of each category (verb, adjective, noun)
    assert generate_id(index=0, word_count=3) == "be-other-time"

    # incrementing index advances the least-significant position (noun)
    assert generate_id(index=1, word_count=3) == "be-other-year"

    # When the nouns wrap around, the adjective advances and the noun resets.
    # Scan forward for that carry instead of hard-coding len(nouns), which keeps
    # the test robust to dictionary size edits.
    target = next(
        (i for i in range(1, 2000) if generate_id(index=i, word_count=3).startswith("be-new-time")),
        None,
    )
    assert target is not None, "Expected to find carry into adjective within search bound"
    # startswith() above is only a prefix match; pin the exact ID here.
    assert generate_id(index=target, word_count=3) == "be-new-time"


def test_generate_id_index_word_count_cycle():
    """Check that five-word IDs repeat the category cycle: verb, adj, noun, verb, adj."""
    first_id = generate_id(index=0, word_count=5)
    assert first_id == "be-other-time-be-other"

    # Index 1 changes only the last word, which is the adjective at position 5.
    second_id = generate_id(index=1, word_count=5)
    assert second_id == "be-other-time-be-new"


def test_generate_id_index_out_of_range_and_negative():
    """Check that indices outside [0, num_combinations) raise ValueError."""
    upper_bound = num_combinations(word_count=3)
    assert upper_bound > 0

    # The largest valid index must be accepted without raising.
    generate_id(index=upper_bound - 1, word_count=3)

    # The first index past the end, and a negative index, must both be refused.
    for bad_index in (upper_bound, -1):
        with pytest.raises(ValueError):
            generate_id(index=bad_index, word_count=3)


def test_generate_id_seed_stability_and_compat():
    """Check that seeded IDs repeat and unseeded IDs are lowercase words joined by '-'."""
    # Two calls with the same seed and no index must agree.
    first, second = generate_id(seed=1234), generate_id(seed=1234)
    assert first == second

    # With all defaults the ID has at least three '-'-separated lowercase words.
    unseeded = generate_id()
    pattern = r"^[a-z]+(-[a-z]+){2,}$"
    assert re.match(pattern, unseeded)


def test_generate_id_index_ignores_seed():
    """
    Check that the seed selects the index-to-ID mapping deterministically.

    NOTE: despite the function name, the seed is not ignored when an index is
    given: different seeds must yield different IDs for the same index, and the
    same seed must reproduce the same ID.
    """
    index = 42
    with_seed_1 = generate_id(index=index, seed=1)
    with_seed_999 = generate_id(index=index, seed=999)
    with_seed_1_again = generate_id(index=index, seed=1)

    assert with_seed_1 != with_seed_999
    assert with_seed_1 == with_seed_1_again
assert x == z
17 changes: 17 additions & 0 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,23 @@ def test_evaluation_row_creation():
assert not row.is_trajectory_evaluation()


def test_stable_hash():
    """Test that two identically built rows share a stable JSON form and a hash."""

    def build_row() -> EvaluationRow:
        return EvaluationRow(
            messages=[
                Message(role="user", content="What is 2+2?"),
                Message(role="assistant", content="2+2 equals 4."),
            ],
            ground_truth="4",
        )

    row, row2 = build_row(), build_row()

    stable_json = EvaluationRow.stable_json(row)
    stable_json2 = EvaluationRow.stable_json(row2)
    assert stable_json == stable_json2
    assert "created_at" not in stable_json
    assert "execution_metadata" not in stable_json

    # The test is named for the hash, so exercise __hash__ itself as well:
    # rows with equal stable JSON must hash equally.
    assert hash(row) == hash(row2)


def test_evaluation_row_trajectory_evaluation():
"""Test EvaluationRow with trajectory evaluation."""
messages = [
Expand Down
Loading