Skip to content

Commit a1d6a52

Browse files
committed
formatting
1 parent 44b1326 commit a1d6a52

File tree

1 file changed

+0
-8
lines changed

1 file changed

+0
-8
lines changed

tests/pytest/test_basic_coding.py

Lines changed: 0 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -5,16 +5,12 @@
55
and comparing the output against expected results in a pointwise manner.
66
"""
77

8-
import logging
9-
import time
108
from typing import Any, Dict, List
119

1210
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
1311
from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
1412
from eval_protocol.rewards.code_execution import execute_python_code, extract_code_blocks
1513

16-
logger = logging.getLogger(__name__)
17-
1814

1915
def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
2016
"""
@@ -43,22 +39,18 @@ def test_coding_code_evaluation(row: EvaluationRow) -> EvaluationRow:
4339
"""
4440
Evaluation function that tests code correctness by executing it locally.
4541
46-
4742
This function:
4843
1. Extracts Python code from the assistant's response
4944
2. Executes the code locally with timeout=10
5045
3. Compares the output to ground_truth
5146
4. Returns a score of 1.0 if output matches, 0.0 otherwise
5247
53-
5448
Args:
5549
row: EvaluationRow containing the conversation messages and expected_output in ground_truth
5650
57-
5851
Returns:
5952
EvaluationRow with the evaluation result
6053
"""
61-
logger.info(f"STARTING TO EVALUATE ROW: {row.input_metadata.row_id} at time {time.time()}")
6254
# Check if we have an assistant response
6355
if len(row.messages) < 2 or row.messages[-1].role != "assistant":
6456
row.evaluation_result = EvaluateResult(score=0.0, reason="No assistant response found")

0 commit comments

Comments
 (0)