Fix tests: invoke evaluation_test-decorated functions synchronously; add is_in_event_loop helper

mayinghan · mayinghan · commit 0a78e79b258b · 2025-08-19T23:24:46.000-07:00
diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py
@@ -16,6 +16,13 @@
     RolloutProcessorConfig,
 )
 
+def is_in_event_loop() -> bool:
+    """Return True when called from code executing inside a running asyncio event loop, else False."""
+    try:
+        return asyncio.get_running_loop() is not None  # raises RuntimeError when no loop is running
+    except RuntimeError:
+        return False
+
 
 def execute_function(func: Callable, **kwargs) -> Any:
     """
diff --git a/tests/pytest/test_pytest_ensure_logging.py b/tests/pytest/test_pytest_ensure_logging.py
@@ -2,7 +2,7 @@
 from unittest.mock import Mock, patch
 
 
-async def test_ensure_logging(monkeypatch):
+def test_ensure_logging(monkeypatch):
     """
     Ensure that default SQLITE logger gets called by mocking the storage and checking that the storage is called.
     """
@@ -37,7 +37,7 @@ async def test_ensure_logging(monkeypatch):
         def eval_fn(row: EvaluationRow) -> EvaluationRow:
             return row
 
-        await eval_fn(
+        eval_fn(
             dataset_path=["tests/pytest/data/markdown_dataset.jsonl"],
             completion_params={"temperature": 0.0, "model": "dummy/local-model"},
         )
diff --git a/tests/pytest/test_pytest_ids.py b/tests/pytest/test_pytest_ids.py
@@ -19,7 +19,7 @@ def read(self):
         return list(self._rows.values())
 
 
-async def test_evaluation_test_decorator(monkeypatch):
+def test_evaluation_test_decorator(monkeypatch):
     from eval_protocol.pytest.evaluation_test import evaluation_test
 
     logger = InMemoryLogger()
@@ -45,13 +45,13 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
 
     # Manually invoke all parameter combinations within a single test
     for ds_path in dataset_paths:
-        await eval_fn(dataset_path=[ds_path], completion_params={"temperature": 0.0, "model": "dummy/local-model"})
+        eval_fn(dataset_path=[ds_path], completion_params={"temperature": 0.0, "model": "dummy/local-model"})
 
     # Assertions on IDs generated by the decorator logic
     assert len(logger.read()) == 38
 
 
-async def test_evaluation_test_decorator_ids_single(monkeypatch):
+def test_evaluation_test_decorator_ids_single(monkeypatch):
     in_memory_logger = InMemoryLogger()
     unique_run_ids = set()
     unique_experiment_ids = set()
@@ -97,7 +97,7 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
     # Manually invoke all parameter combinations within a single test
     for ds_path in dataset_paths:
         for params in completion_params_list:
-            await eval_fn(dataset_path=[ds_path], completion_params=params)
+            eval_fn(dataset_path=[ds_path], completion_params=params)
 
     # Assertions on IDs generated by the decorator logic
     assert len(unique_invocation_ids) == 1
diff --git a/tests/pytest/test_pytest_stable_row_id.py b/tests/pytest/test_pytest_stable_row_id.py
@@ -5,7 +5,7 @@
 from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row
 
 
-async def test_evaluation_test_decorator_ids_single():
+def test_evaluation_test_decorator_ids_single():
     from eval_protocol.pytest.evaluation_test import evaluation_test
 
     row_ids = set()
@@ -35,18 +35,18 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
     # Manually invoke all parameter combinations within a single test
     for ds_path in input_dataset:
         for params in completion_params_list:
-            await eval_fn(dataset_path=[ds_path], completion_params=params)
+            eval_fn(dataset_path=[ds_path], completion_params=params)
 
     # Second invocation to ensure that IDs are stable across multiple invocations
     for ds_path in input_dataset:
         for params in completion_params_list:
-            await eval_fn(dataset_path=[ds_path], completion_params=params)
+            eval_fn(dataset_path=[ds_path], completion_params=params)
 
     # Assertions on IDs generated by the decorator logic
     assert len(row_ids) == 19  # from the markdown dataset
 
 
-async def test_evaluation_test_generated_row_ids_without_dataset_keys():
+def test_evaluation_test_generated_row_ids_without_dataset_keys():
     from eval_protocol.pytest.evaluation_test import evaluation_test
 
     # Adapter that does NOT set row_id; lets evaluation_test generate IDs
@@ -86,12 +86,12 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
     # Single invocation (one dataset, one param set) with multiple runs
     for ds_path in input_dataset:
         for params in completion_params:
-            await eval_fn(dataset_path=[ds_path], completion_params=params)
+            eval_fn(dataset_path=[ds_path], completion_params=params)
 
     # Second invocation to ensure that IDs are stable across multiple invocations
     for ds_path in input_dataset:
         for params in completion_params:
-            await eval_fn(dataset_path=[ds_path], completion_params=params)
+            eval_fn(dataset_path=[ds_path], completion_params=params)
 
     # Even with multiple runs, generated row_ids should be stable within the invocation
     assert len(row_ids) == 19  # equals dataset size when IDs are generated once and preserved across runs