|
| 1 | +from typing import List |
| 2 | + |
| 3 | +from eval_protocol.models import EvaluationRow, Message |
| 4 | +from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor |
| 5 | +from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row |
| 6 | + |
| 7 | + |
async def test_evaluation_test_decorator_ids_single():
    """Row IDs assigned by the evaluation_test decorator are stable and unique.

    Drives every (dataset path, completion params) combination through the
    decorated eval function twice, then checks that the accumulated set of
    row IDs exactly matches the dataset size — i.e. IDs are deterministic
    across repeated invocations and distinct per row.
    """
    from eval_protocol.pytest.evaluation_test import evaluation_test

    seen_row_ids = set()

    dataset_paths = [
        "tests/pytest/data/markdown_dataset.jsonl",
        "tests/pytest/data/markdown_dataset.jsonl",
    ]
    param_grid = [
        {"temperature": 0.0, "model": "dummy/local-model"},
        {"temperature": 1.0, "model": "dummy/local-model"},
    ]

    @evaluation_test(
        input_dataset=dataset_paths,
        completion_params=param_grid,
        dataset_adapter=markdown_dataset_to_evaluation_row,
        rollout_processor=NoOpRolloutProcessor(),
        mode="pointwise",
        combine_datasets=False,
        num_runs=5,
    )
    def eval_fn(row: EvaluationRow) -> EvaluationRow:
        seen_row_ids.add(row.input_metadata.row_id)
        return row

    async def _invoke_all_combinations() -> None:
        # Manually exercise each dataset/params pairing within this one test.
        for path in dataset_paths:
            for params in param_grid:
                await eval_fn(dataset_path=[path], completion_params=params)

    # Run the full grid twice: IDs must be identical across invocations,
    # so the set must not grow on the second pass.
    await _invoke_all_combinations()
    await _invoke_all_combinations()

    # 19 rows in the markdown dataset -> exactly 19 distinct row IDs.
    assert len(seen_row_ids) == 19
| 47 | + |
| 48 | + |
async def test_evaluation_test_generated_row_ids_without_dataset_keys():
    """Auto-generated row IDs are stable when the adapter supplies none.

    Uses an adapter that deliberately omits row_id so that evaluation_test
    (via InputMetadata) must generate IDs itself, then verifies the generated
    IDs are deterministic across repeated invocations and unique per row.
    """
    from eval_protocol.pytest.evaluation_test import evaluation_test

    def markdown_dataset_no_row_id_adapter(data: List[dict]) -> List[EvaluationRow]:
        # Build rows WITHOUT setting row_id, leaving ID generation to the framework.
        rows = []
        for record in data:
            rows.append(
                EvaluationRow(
                    messages=[Message(role="user", content=record["prompt"])],
                    ground_truth=str(record["num_highlights"]),
                )
            )
        return rows

    seen_row_ids = set()

    dataset_paths = ["tests/pytest/data/markdown_dataset.jsonl", "tests/pytest/data/markdown_dataset.jsonl"]
    param_grid = [
        {"temperature": 0.0, "model": "dummy/local-model"},
        {"temperature": 1.0, "model": "dummy/local-model"},
    ]

    @evaluation_test(
        input_dataset=dataset_paths,
        completion_params=param_grid,
        dataset_adapter=markdown_dataset_no_row_id_adapter,
        rollout_processor=NoOpRolloutProcessor(),
        mode="pointwise",
        combine_datasets=False,
        num_runs=5,
    )
    def eval_fn(row: EvaluationRow) -> EvaluationRow:
        # The decorator must have filled in a string row_id for us.
        assert row.input_metadata is not None
        assert row.input_metadata.row_id is not None and isinstance(row.input_metadata.row_id, str)
        seen_row_ids.add(row.input_metadata.row_id)
        return row

    async def _invoke_all_combinations() -> None:
        # Exercise every dataset/params pairing (each with num_runs=5 runs).
        for path in dataset_paths:
            for params in param_grid:
                await eval_fn(dataset_path=[path], completion_params=params)

    # Two full passes: generated IDs must be identical the second time around.
    await _invoke_all_combinations()
    await _invoke_all_combinations()

    # Even with multiple runs and repeated invocations, the generated IDs
    # collapse to one per dataset row (19 rows in the markdown dataset).
    assert len(seen_row_ids) == 19