File tree Expand file tree Collapse file tree 1 file changed +0
-8
lines changed
Expand file tree Collapse file tree 1 file changed +0
-8
lines changed Original file line number Diff line number Diff line change 55and comparing the output against expected results in a pointwise manner.
66"""
77
8- import logging
9- import time
108from typing import Any , Dict , List
119
1210from eval_protocol .models import EvaluateResult , EvaluationRow , Message
1311from eval_protocol .pytest import default_single_turn_rollout_processor , evaluation_test
1412from eval_protocol .rewards .code_execution import execute_python_code , extract_code_blocks
1513
16- logger = logging .getLogger (__name__ )
17-
1814
1915def coding_dataset_to_evaluation_row (data : List [Dict [str , Any ]]) -> List [EvaluationRow ]:
2016 """
@@ -43,22 +39,18 @@ def test_coding_code_evaluation(row: EvaluationRow) -> EvaluationRow:
4339 """
4440 Evaluation function that tests code correctness by executing it locally.
4541
46-
4742 This function:
4843 1. Extracts Python code from the assistant's response
4944 2. Executes the code locally with timeout=10
5045 3. Compares the output to ground_truth
5146 4. Returns a score of 1.0 if output matches, 0.0 otherwise
5247
53-
5448 Args:
5549 row: EvaluationRow containing the conversation messages and expected_output in ground_truth
5650
57-
5851 Returns:
5952 EvaluationRow with the evaluation result
6053 """
61- logger .info (f"STARTING TO EVALUATE ROW: { row .input_metadata .row_id } at time { time .time ()} " )
6254 # Check if we have an assistant response
6355 if len (row .messages ) < 2 or row .messages [- 1 ].role != "assistant" :
6456 row .evaluation_result = EvaluateResult (score = 0.0 , reason = "No assistant response found" )
You can’t perform that action at this time.
0 commit comments