Skip to content

Commit 4f15a70

Browse files
author
Dylan Huang
authored
Aggregated metrics part 7 (cohort -> experiment) (#60)
* add --port arg to ep logs * Fix WebSocketManager to reset broadcast task after cancellation * simple tests work * TODO: TestLogsServer * TODO: TestLogsServerIntegration * TODO: test HTML injection - also test TestAsyncWebSocketOperations * add logs server tests * add port parameter tests * use gpt-oss-120b to avoid rate limits * point to port 8000 for dev * woops * fix "uvicorn eval_protocol.utils.logs_server:create_app --factory --reload" * use gpt-oss-120b since less rate limiting (#57) * Aggregated metrics part 7 (#58) * use gpt-oss-120b for less rate limits and faster tests * fix typeerror * Refactor LogsServer event handling and improve integration tests - Moved event_bus.start_listening() to the correct location in LogsServer to ensure it starts listening during the broadcast loop. - Updated integration tests to use multiprocessing for server startup and improved health check validation. - Enhanced test_create_app_factory to be asynchronous and added necessary imports for better clarity. * Enhance test_create_app_factory to verify LogsServer start_loops call - Updated the test_create_app_factory to mock and assert that the start_loops method of LogsServer is called during app creation. - Ensured the test remains asynchronous and maintains clarity in its assertions. * fix * use active logger * cohort -> experiment * vite build
1 parent 171b002 commit 4f15a70

File tree

9 files changed

+23
-23
lines changed

9 files changed

+23
-23
lines changed

eval_protocol/models.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -286,9 +286,9 @@ class EvaluationRow(BaseModel):
286286
description="The ID of the invocation that this row belongs to.",
287287
)
288288

289-
cohort_id: Optional[str] = Field(
289+
experiment_id: Optional[str] = Field(
290290
default_factory=generate_id,
291-
description="The ID of the cohort that this row belongs to.",
291+
description="The ID of the experiment that this row belongs to.",
292292
)
293293

294294
rollout_id: Optional[str] = Field(

eval_protocol/pytest/evaluation_test.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -73,14 +73,14 @@ def evaluation_test( # noqa: C901
7373
Here are some key concepts to understand the terminology in EP:
7474
7575
- "invocation" is a single execution of a test function. An invocation can
76-
generate 1 or more cohorts. Grouping by invocation might be useful to
76+
generate 1 or more experiments. Grouping by invocation might be useful to
7777
aggregate eval scores across multiple invocations when you want to aggregate
7878
scores across multiple datasets.
79-
- "cohort" is a group of runs for a combination of parameters. A single
80-
cohort will have multiple runs if num_runs > 1.
79+
- "experiment" is a group of runs for a combination of parameters. A single
80+
experiment will have multiple runs if num_runs > 1.
8181
1. If your evaluation_test has combinations of parameters, it will generate
82-
multiple cohorts per combination of parameters.
83-
2. A new execution of a test function will generate a new cohort.
82+
multiple experiments per combination of parameters.
83+
2. A new execution of a test function will generate a new experiment.
8484
- "run" is a group of rollouts. For multiple num_runs > 1, there will be
8585
multiple "run_id"s.
8686
- "rollout" is the execution/process that produces a "trajectory". You
@@ -98,7 +98,7 @@ def evaluation_test( # noqa: C901
9898
decorated test. It simply produces a score from 0 to 1 and attaches it
9999
to the row as the "evaluation_result" field.
100100
101-
"invocation", "cohort", "run", "rollout", and "row" each have a unique ID
101+
"invocation", "experiment", "run", "rollout", and "row" each have a unique ID
102102
which can be used to easily group and identify your dataset by.
103103
104104
Args:
@@ -302,7 +302,7 @@ def wrapper_body(**kwargs):
302302
eval_metadata = None
303303
all_results: List[List[EvaluationRow]] = [[] for _ in range(num_runs)]
304304

305-
cohort_id = generate_id()
305+
experiment_id = generate_id()
306306

307307
def _log_eval_error(
308308
status: Literal["finished", "error"], rows: Optional[List[EvaluationRow]] | None, passed: bool
@@ -383,7 +383,7 @@ def _log_eval_error(
383383
row.input_metadata.session_data["mode"] = mode
384384
# Initialize eval_metadata for each row
385385
row.eval_metadata = eval_metadata
386-
row.cohort_id = cohort_id
386+
row.experiment_id = experiment_id
387387
row.invocation_id = invocation_id
388388

389389
# has to be done in the pytest main process since it's

tests/pytest/test_pytest_ids.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
5454
def test_evaluation_test_decorator_ids_single(monkeypatch):
5555
in_memory_logger = InMemoryLogger()
5656
unique_run_ids = set()
57-
unique_cohort_ids = set()
57+
unique_experiment_ids = set()
5858
unique_rollout_ids = set()
5959
unique_invocation_ids = set()
6060
unique_row_ids = set()
@@ -77,7 +77,7 @@ def test_evaluation_test_decorator_ids_single(monkeypatch):
7777
)
7878
def eval_fn(row: EvaluationRow) -> EvaluationRow:
7979
unique_run_ids.add(row.run_id)
80-
unique_cohort_ids.add(row.cohort_id)
80+
unique_experiment_ids.add(row.experiment_id)
8181
unique_rollout_ids.add(row.rollout_id)
8282
unique_invocation_ids.add(row.invocation_id)
8383
unique_row_ids.add(row.input_metadata.row_id)
@@ -97,6 +97,6 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
9797
# Assertions on IDs generated by the decorator logic
9898
assert len(unique_invocation_ids) == 1
9999
assert len(unique_run_ids) == 20 # 4 combinations * 5 runs each
100-
assert len(unique_cohort_ids) == 2 * 2 # 2 datasets * 2 param sets
100+
assert len(unique_experiment_ids) == 2 * 2 # 2 datasets * 2 param sets
101101
assert len(unique_row_ids) == 19 # from the markdown dataset
102102
assert len(unique_rollout_ids) == 19 * 5 * 2 * 2 # rows * runs * datasets * params
Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vite-app/dist/assets/index-t_hsfGP1.js.map renamed to vite-app/dist/assets/index-Cvu-Dnw_.js.map

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vite-app/dist/index.html

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
66
<title>EP | Log Viewer</title>
77
<link rel="icon" href="/assets/favicon-BkAAWQga.png" />
8-
<script type="module" crossorigin src="/assets/index-t_hsfGP1.js"></script>
8+
<script type="module" crossorigin src="/assets/index-Cvu-Dnw_.js"></script>
99
<link rel="stylesheet" crossorigin href="/assets/index-CGYj40Gx.css">
1010
</head>
1111
<body>
1212
<div id="root"></div>
1313
</body>
14-
</html>
14+
</html>

vite-app/src/components/EvaluationRow.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ const IdSection = observer(({ data }: { data: EvaluationRowType }) => (
134134
title="IDs"
135135
data={{
136136
rollout_id: data.rollout_id,
137-
cohort_id: data.cohort_id,
137+
experiment_id: data.experiment_id,
138138
invocation_id: data.invocation_id,
139139
run_id: data.run_id,
140140
}}

vite-app/src/types/eval-protocol.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ export const EvaluationRowSchema = z.object({
100100
input_metadata: InputMetadataSchema.describe('Metadata related to the input (dataset info, model config, session data, etc.).'),
101101
rollout_status: RolloutStatusSchema.default({ status: 'finished' }).describe('The status of the rollout.'),
102102
invocation_id: z.string().optional().describe('The ID of the invocation that this row belongs to.'),
103-
cohort_id: z.string().optional().describe('The ID of the cohort that this row belongs to.'),
103+
experiment_id: z.string().optional().describe('The ID of the experiment that this row belongs to.'),
104104
rollout_id: z.string().optional().describe('The ID of the rollout that this row belongs to.'),
105105
run_id: z.string().optional().describe('The ID of the run that this row belongs to.'),
106106
ground_truth: z.string().optional().describe('Optional ground truth reference for this evaluation.'),

vite-app/src/util/pivot.test.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ describe('computePivot', () => {
189189

190190
const res = computePivot({
191191
data: rows,
192-
rowFields: ['$.eval_metadata.name', '$.cohort_id'],
192+
rowFields: ['$.eval_metadata.name', '$.experiment_id'],
193193
columnFields: ['$.input_metadata.completion_params.model'],
194194
valueField: '$.evaluation_result.score',
195195
aggregator: 'avg',

0 commit comments

Comments
 (0)