diff --git a/eval_protocol/human_id/__init__.py b/eval_protocol/human_id/__init__.py index 8b5d447c..b5cf206c 100644 --- a/eval_protocol/human_id/__init__.py +++ b/eval_protocol/human_id/__init__.py @@ -4,25 +4,68 @@ from . import dictionary -__all__ = ["generate_id"] +__all__ = ["generate_id", "num_combinations"] system_random = random.SystemRandom() -def generate_id(separator="-", seed: int | float | str | bytes | bytearray | None = None, word_count=5) -> str: +def generate_id( + separator: str = "-", + seed: int | float | str | bytes | bytearray | None = None, + word_count: int = 5, + index: int | None = None, +) -> str: """ Generate a human readable ID :param separator: The string to use to separate words - :param seed: The seed to use. The same seed will produce the same ID + :param seed: The seed to use. The same seed will produce the same ID or index-based mapping + :param index: Optional non-negative integer providing a 1:1 mapping to an ID. + When provided, the mapping is deterministic and bijective for + all integers in range [0, total_combinations). :param word_count: The number of words to use. Minimum of 3. :return: A human readable ID """ if word_count < 3: raise ValueError("word_count cannot be lower than 3") + # If a specific index is provided, use mixed-radix encoding into a fixed + # sequence of parts to guarantee a bijection between integers and IDs. + # The sequence cycles as: verb, adjective, noun, verb, adjective, noun, ... 
+ if index is not None: + if not isinstance(index, int) or index < 0: + raise ValueError("index must be a non-negative integer if provided") + + # Prepare category lists; if seed is provided, shuffle deterministically + base_categories = [dictionary.verbs, dictionary.adjectives, dictionary.nouns] + if seed is not None: + rnd = random.Random(seed) + categories = [tuple(rnd.sample(cat, len(cat))) for cat in base_categories] + else: + categories = base_categories + # Build the category order for the desired word_count + ordered_categories = [categories[i % 3] for i in range(word_count)] + + # Compute total number of combinations for this word_count + radices = [len(cat) for cat in ordered_categories] + total = num_combinations(word_count) + + if index >= total: + raise ValueError(f"index out of range for given word_count. Received {index}, max allowed is {total - 1}") + + # Mixed-radix decomposition (least significant position is the last word) + digits: list[int] = [] + remaining = index + for base in reversed(radices): + digits.append(remaining % base) + remaining //= base + digits.reverse() + + words = [ordered_categories[pos][digits[pos]] for pos in range(word_count)] + return separator.join(words) + random_obj = system_random - if seed: + if seed is not None: random_obj = random.Random(seed) parts = {dictionary.verbs: 1, dictionary.adjectives: 1, dictionary.nouns: 1} @@ -33,3 +76,21 @@ def generate_id(separator="-", seed: int | float | str | bytes | bytearray | Non parts = itertools.chain.from_iterable(random_obj.sample(part, count) for part, count in parts.items()) return separator.join(parts) + + +def num_combinations(word_count: int = 5) -> int: + """ + Return the total number of unique IDs possible for the given word_count. + + The sequence of categories cycles as: verb, adjective, noun, then repeats. + This value can be used to mod an index when calling generate_id(index=...). 
+ """ + if word_count < 3: + raise ValueError("word_count cannot be lower than 3") + + categories = [dictionary.verbs, dictionary.adjectives, dictionary.nouns] + radices = [len(categories[i % 3]) for i in range(word_count)] + total = 1 + for r in radices: + total *= r + return total diff --git a/eval_protocol/models.py b/eval_protocol/models.py index 9d528289..d2deb9cd 100644 --- a/eval_protocol/models.py +++ b/eval_protocol/models.py @@ -209,7 +209,7 @@ class InputMetadata(BaseModel): model_config = ConfigDict(extra="allow") - row_id: Optional[str] = Field(default_factory=generate_id, description="Unique string to ID the row") + row_id: Optional[str] = Field(None, description="Unique string to ID the row") completion_params: CompletionParams = Field( default_factory=dict, description="Completion endpoint parameters used" ) @@ -421,6 +421,22 @@ def get_termination_reason(self) -> str: return msg.control_plane_step["termination_reason"] return "unknown" + def __hash__(self) -> int: + # Use a stable hash by sorting keys and ensuring compact output + json_str = self.stable_json(self) + return hash(json_str) + + @classmethod + def stable_json(cls, row: "EvaluationRow") -> int: + json_str = row.model_dump_json( + exclude_none=True, + exclude_defaults=True, + by_alias=True, + indent=None, + exclude=["created_at", "execution_metadata"], + ) + return json_str + # Original dataclass-based models for backwards compatibility # These are deprecated and will be removed in a future version diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index beac9acb..4a54fab9 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -15,7 +15,7 @@ from eval_protocol.dataset_logger import default_logger from eval_protocol.dataset_logger.dataset_logger import DatasetLogger -from eval_protocol.human_id import generate_id +from eval_protocol.human_id import generate_id, num_combinations from 
eval_protocol.models import ( CompletionParams, EvalMetadata, @@ -294,6 +294,16 @@ def _log_eval_error( else: raise ValueError("No input dataset or input messages provided") + for row in data: + # generate a stable row_id for each row + if row.input_metadata.row_id is None: + # Generate a stable, deterministic row_id using the row's hash and num_combinations + index = hash(row) + max_index = num_combinations() - 1 + # Ensure index is a non-negative integer within [0, max_index] + index = abs(index) % (max_index + 1) + row.input_metadata.row_id = generate_id(seed=0, index=index) + if "completion_params" not in kwargs or not kwargs["completion_params"]: raise ValueError( "No completion parameters provided. Please provide a completion parameters object." diff --git a/tests/pytest/test_pytest_stable_row_id.py b/tests/pytest/test_pytest_stable_row_id.py new file mode 100644 index 00000000..c2a5709a --- /dev/null +++ b/tests/pytest/test_pytest_stable_row_id.py @@ -0,0 +1,97 @@ +from typing import List + +from eval_protocol.models import EvaluationRow, Message +from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor +from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row + + +async def test_evaluation_test_decorator_ids_single(): + from eval_protocol.pytest.evaluation_test import evaluation_test + + row_ids = set() + + input_dataset = [ + "tests/pytest/data/markdown_dataset.jsonl", + "tests/pytest/data/markdown_dataset.jsonl", + ] + completion_params_list = [ + {"temperature": 0.0, "model": "dummy/local-model"}, + {"temperature": 1.0, "model": "dummy/local-model"}, + ] + + @evaluation_test( + input_dataset=input_dataset, + completion_params=completion_params_list, + dataset_adapter=markdown_dataset_to_evaluation_row, + rollout_processor=NoOpRolloutProcessor(), + mode="pointwise", + combine_datasets=False, + num_runs=5, + ) + def eval_fn(row: EvaluationRow) -> EvaluationRow: + 
row_ids.add(row.input_metadata.row_id) + return row + + # Manually invoke all parameter combinations within a single test + for ds_path in input_dataset: + for params in completion_params_list: + await eval_fn(dataset_path=[ds_path], completion_params=params) + + # Second invocation to ensure that IDs are stable across multiple invocations + for ds_path in input_dataset: + for params in completion_params_list: + await eval_fn(dataset_path=[ds_path], completion_params=params) + + # Assertions on IDs generated by the decorator logic + assert len(row_ids) == 19 # from the markdown dataset + + +async def test_evaluation_test_generated_row_ids_without_dataset_keys(): + from eval_protocol.pytest.evaluation_test import evaluation_test + + # Adapter that does NOT set row_id; lets evaluation_test generate IDs + def markdown_dataset_no_row_id_adapter(data: List[dict]) -> List[EvaluationRow]: + return [ + EvaluationRow( + messages=[Message(role="user", content=row["prompt"])], + ground_truth=str(row["num_highlights"]), + ) + for row in data + ] + + row_ids = set() + + input_dataset = ["tests/pytest/data/markdown_dataset.jsonl", "tests/pytest/data/markdown_dataset.jsonl"] + completion_params = [ + {"temperature": 0.0, "model": "dummy/local-model"}, + {"temperature": 1.0, "model": "dummy/local-model"}, + ] + + @evaluation_test( + input_dataset=input_dataset, + completion_params=completion_params, + dataset_adapter=markdown_dataset_no_row_id_adapter, + rollout_processor=NoOpRolloutProcessor(), + mode="pointwise", + combine_datasets=False, + num_runs=5, + ) + def eval_fn(row: EvaluationRow) -> EvaluationRow: + # row_id should be auto-generated by evaluation_test/InputMetadata + assert row.input_metadata is not None + assert row.input_metadata.row_id is not None and isinstance(row.input_metadata.row_id, str) + row_ids.add(row.input_metadata.row_id) + return row + + # Single invocation (one dataset, one param set) with multiple runs + for ds_path in input_dataset: + for params in 
completion_params: + await eval_fn(dataset_path=[ds_path], completion_params=params) + + # Second invocation to ensure that IDs are stable across multiple invocations + for ds_path in input_dataset: + for params in completion_params: + await eval_fn(dataset_path=[ds_path], completion_params=params) + + # Even with multiple runs, generated row_ids should be stable within the invocation + assert len(row_ids) == 19 # equals dataset size when IDs are generated once and preserved across runs diff --git a/tests/test_human_id.py b/tests/test_human_id.py new file mode 100644 index 00000000..8fcefee0 --- /dev/null +++ b/tests/test_human_id.py @@ -0,0 +1,69 @@ +import re +import pytest + +from eval_protocol.human_id import generate_id, num_combinations + + +def test_generate_id_index_basic_3_words(): + # index 0 maps to the first element of each category (verb, adjective, noun) + assert generate_id(index=0, word_count=3) == "be-other-time" + + # incrementing index advances the least-significant position (noun) + assert generate_id(index=1, word_count=3) == "be-other-year" + + # carry into the adjective when nouns wrap + # index == len(nouns) => adjective advances by 1, noun resets + # nouns length inferred by probing with large indices is brittle; instead, compute via reach + # We know index=0 gives be-other-time, and index that produces adjective=new, noun=time should be reachable. + # Derive by scanning forward until adjective changes to 'new'. This keeps test robust to dictionary size edits. 
+    base = generate_id(index=0, word_count=3)
+    # Find the first index where adjective becomes 'new' and noun resets to 'time'
+    target = None
+    for i in range(1, 2000):
+        cand = generate_id(index=i, word_count=3)
+        if cand.startswith("be-new-time"):
+            target = i
+            break
+    assert target is not None, "Expected to find carry into adjective within search bound"
+    assert generate_id(index=target, word_count=3) == "be-new-time"
+
+
+def test_generate_id_index_word_count_cycle():
+    # word_count cycles categories: verb, adj, noun, verb, adj, ...
+    assert generate_id(index=0, word_count=5) == "be-other-time-be-other"
+    # increment least-significant position (adj at position 5)
+    assert generate_id(index=1, word_count=5) == "be-other-time-be-new"
+
+
+def test_generate_id_index_out_of_range_and_negative():
+    # Use exported total combinations for clean boundary checks
+    total = num_combinations(word_count=3)
+    assert total > 0
+    # Last valid index
+    generate_id(index=total - 1, word_count=3)
+    # First invalid index
+    with pytest.raises(ValueError):
+        generate_id(index=total, word_count=3)
+
+    with pytest.raises(ValueError):
+        generate_id(index=-1, word_count=3)
+
+
+def test_generate_id_seed_stability_and_compat():
+    # Without index, same seed yields same id
+    a = generate_id(seed=1234)
+    b = generate_id(seed=1234)
+    assert a == b
+
+    # Without index, default produces separator '-' and at least 3 components
+    c = generate_id()
+    assert re.match(r"^[a-z]+(-[a-z]+){2,}$", c)
+
+
+def test_generate_id_index_respects_seed():
+    # With index provided, the seed deterministically reshuffles the mapping:
+    # different seeds yield different ids, while the same seed always yields
+    # the same id for a given index.
+    x = generate_id(index=42, seed=1)
+    y = generate_id(index=42, seed=999)
+    z = generate_id(index=42, seed=1)
+    assert x != y
+    assert x == z
diff --git a/tests/test_models.py b/tests/test_models.py
index 8220a746..817c4e7c 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -289,6 +289,23 @@ def test_evaluation_row_creation():
     assert not row.is_trajectory_evaluation()
 
+
+def
test_stable_hash(): + """Test the stable hash method.""" + row = EvaluationRow( + messages=[Message(role="user", content="What is 2+2?"), Message(role="assistant", content="2+2 equals 4.")], + ground_truth="4", + ) + row2 = EvaluationRow( + messages=[Message(role="user", content="What is 2+2?"), Message(role="assistant", content="2+2 equals 4.")], + ground_truth="4", + ) + stable_json = EvaluationRow.stable_json(row) + stable_json2 = EvaluationRow.stable_json(row2) + assert stable_json == stable_json2 + assert "created_at" not in stable_json + assert "execution_metadata" not in stable_json + + def test_evaluation_row_trajectory_evaluation(): """Test EvaluationRow with trajectory evaluation.""" messages = [