Skip to content

Commit 4f15a70

Browse files
author
Dylan Huang
authored
Aggregated metrics part 7 (cohort -> experiment) (#60)
* add --port arg to ep logs * Fix WebSocketManager to reset broadcast task after cancellation * simple tests work * TODO: TestLogsServer * TODO: TestLogsServerIntegration * TODO: test HTML injection - also test TestAsyncWebSocketOperations * add logs server tests * add port parameter tests * use gpt-oss-120b to avoid rate limits * point to port 8000 for dev * woops * fix "uvicorn eval_protocol.utils.logs_server:create_app --factory --reload" * use gpt-oss-120b since less rate limiting (#57) * Aggregated metrics part 7 (#58) * use gpt-oss-120b for less rate limits and faster tests * fix typeerror * Refactor LogsServer event handling and improve integration tests - Moved event_bus.start_listening() to the correct location in LogsServer to ensure it starts listening during the broadcast loop. - Updated integration tests to use multiprocessing for server startup and improved health check validation. - Enhanced test_create_app_factory to be asynchronous and added necessary imports for better clarity. * Enhance test_create_app_factory to verify LogsServer start_loops call - Updated the test_create_app_factory to mock and assert that the start_loops method of LogsServer is called during app creation. - Ensured the test remains asynchronous and maintains clarity in its assertions. * fix * use active logger * cohort -> experiment * vite build
1 parent 171b002 commit 4f15a70

File tree

9 files changed

+23
-23
lines changed

9 files changed

+23
-23
lines changed

eval_protocol/models.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -286,9 +286,9 @@ class EvaluationRow(BaseModel):
286286
description="The ID of the invocation that this row belongs to.",
287287
)
288288

289-
cohort_id: Optional[str] = Field(
289+
experiment_id: Optional[str] = Field(
290290
default_factory=generate_id,
291-
description="The ID of the cohort that this row belongs to.",
291+
description="The ID of the experiment that this row belongs to.",
292292
)
293293

294294
rollout_id: Optional[str] = Field(

eval_protocol/pytest/evaluation_test.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -73,14 +73,14 @@ def evaluation_test( # noqa: C901
7373
Here are some key concepts to understand the terminology in EP:
7474
7575
- "invocation" is a single execution of a test function. An invocation can
76-
generate 1 or more cohorts. Grouping by invocation might be useful to
76+
generate 1 or more experiments. Grouping by invocation might be useful to
7777
aggregate eval scores across multiple invocations when you want to aggregate
7878
scores across multiple datasets.
79-
- "cohort" is a group of runs for a combination of parameters. A single
80-
cohort will have multiple runs if num_runs > 1.
79+
- "experiment" is a group of runs for a combination of parameters. A single
80+
experiment will have multiple runs if num_runs > 1.
8181
1. If your evaluation_test has combinations of parameters, it will generate
82-
multiple cohorts per combination of parameters.
83-
2. A new execution of a test function will generate a new cohort.
82+
multiple experiments per combination of parameters.
83+
2. A new execution of a test function will generate a new experiment.
8484
- "run" is a group of rollouts. For multiple num_runs > 1, there will be
8585
multiple "run_id"s.
8686
- "rollout" is the execution/process that produces a "trajectory". You
@@ -98,7 +98,7 @@ def evaluation_test( # noqa: C901
9898
decorated test. It simply produces a score from 0 to 1 and attaches it
9999
to the row as the "evaluation_result" field.
100100
101-
"invocation", "cohort", "run", "rollout", and "row" each have a unique ID
101+
"invocation", "experiment", "run", "rollout", and "row" each have a unique ID
102102
which can be used to easily group and identify your dataset by.
103103
104104
Args:
@@ -302,7 +302,7 @@ def wrapper_body(**kwargs):
302302
eval_metadata = None
303303
all_results: List[List[EvaluationRow]] = [[] for _ in range(num_runs)]
304304

305-
cohort_id = generate_id()
305+
experiment_id = generate_id()
306306

307307
def _log_eval_error(
308308
status: Literal["finished", "error"], rows: Optional[List[EvaluationRow]] | None, passed: bool
@@ -383,7 +383,7 @@ def _log_eval_error(
383383
row.input_metadata.session_data["mode"] = mode
384384
# Initialize eval_metadata for each row
385385
row.eval_metadata = eval_metadata
386-
row.cohort_id = cohort_id
386+
row.experiment_id = experiment_id
387387
row.invocation_id = invocation_id
388388

389389
# has to be done in the pytest main process since it's

tests/pytest/test_pytest_ids.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
5454
def test_evaluation_test_decorator_ids_single(monkeypatch):
5555
in_memory_logger = InMemoryLogger()
5656
unique_run_ids = set()
57-
unique_cohort_ids = set()
57+
unique_experiment_ids = set()
5858
unique_rollout_ids = set()
5959
unique_invocation_ids = set()
6060
unique_row_ids = set()
@@ -77,7 +77,7 @@ def test_evaluation_test_decorator_ids_single(monkeypatch):
7777
)
7878
def eval_fn(row: EvaluationRow) -> EvaluationRow:
7979
unique_run_ids.add(row.run_id)
80-
unique_cohort_ids.add(row.cohort_id)
80+
unique_experiment_ids.add(row.experiment_id)
8181
unique_rollout_ids.add(row.rollout_id)
8282
unique_invocation_ids.add(row.invocation_id)
8383
unique_row_ids.add(row.input_metadata.row_id)
@@ -97,6 +97,6 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
9797
# Assertions on IDs generated by the decorator logic
9898
assert len(unique_invocation_ids) == 1
9999
assert len(unique_run_ids) == 20 # 4 combinations * 5 runs each
100-
assert len(unique_cohort_ids) == 2 * 2 # 2 datasets * 2 param sets
100+
assert len(unique_experiment_ids) == 2 * 2 # 2 datasets * 2 param sets
101101
assert len(unique_row_ids) == 19 # from the markdown dataset
102102
assert len(unique_rollout_ids) == 19 * 5 * 2 * 2 # rows * runs * datasets * params
Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vite-app/dist/assets/index-t_hsfGP1.js.map renamed to vite-app/dist/assets/index-Cvu-Dnw_.js.map

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vite-app/dist/index.html

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
66
<title>EP | Log Viewer</title>
77
<link rel="icon" href="/assets/favicon-BkAAWQga.png" />
8-
<script type="module" crossorigin src="/assets/index-t_hsfGP1.js"></script>
8+
<script type="module" crossorigin src="/assets/index-Cvu-Dnw_.js"></script>
99
<link rel="stylesheet" crossorigin href="/assets/index-CGYj40Gx.css">
1010
</head>
1111
<body>
1212
<div id="root"></div>
1313
</body>
14-
</html>
14+
</html>

vite-app/src/components/EvaluationRow.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ const IdSection = observer(({ data }: { data: EvaluationRowType }) => (
134134
title="IDs"
135135
data={{
136136
rollout_id: data.rollout_id,
137-
cohort_id: data.cohort_id,
137+
experiment_id: data.experiment_id,
138138
invocation_id: data.invocation_id,
139139
run_id: data.run_id,
140140
}}

vite-app/src/types/eval-protocol.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ export const EvaluationRowSchema = z.object({
100100
input_metadata: InputMetadataSchema.describe('Metadata related to the input (dataset info, model config, session data, etc.).'),
101101
rollout_status: RolloutStatusSchema.default({ status: 'finished' }).describe('The status of the rollout.'),
102102
invocation_id: z.string().optional().describe('The ID of the invocation that this row belongs to.'),
103-
cohort_id: z.string().optional().describe('The ID of the cohort that this row belongs to.'),
103+
experiment_id: z.string().optional().describe('The ID of the experiment that this row belongs to.'),
104104
rollout_id: z.string().optional().describe('The ID of the rollout that this row belongs to.'),
105105
run_id: z.string().optional().describe('The ID of the run that this row belongs to.'),
106106
ground_truth: z.string().optional().describe('Optional ground truth reference for this evaluation.'),

vite-app/src/util/pivot.test.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ describe('computePivot', () => {
189189

190190
const res = computePivot({
191191
data: rows,
192-
rowFields: ['$.eval_metadata.name', '$.cohort_id'],
192+
rowFields: ['$.eval_metadata.name', '$.experiment_id'],
193193
columnFields: ['$.input_metadata.completion_params.model'],
194194
valueField: '$.evaluation_result.score',
195195
aggregator: 'avg',

0 commit comments

Comments
 (0)