99
from typing import Any, Dict, List

from eval_protocol.models import EvaluateResult, EvaluationRow, Message
from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
from eval_protocol.rewards.code_execution import execute_python_code, extract_code_blocks
1313
1414
def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """
    Convert a coding dataset into a list of EvaluationRow objects.

    Args:
        data: Dataset rows; each row is a dict expected to contain
            'prompt', 'input', and 'expected_output' keys.

    Returns:
        One EvaluationRow per dataset row, with a single user message that
        concatenates the prompt and the input, and ground_truth set to the
        row's expected output.
    """
    return [
        EvaluationRow(
            # The model sees the task prompt followed by the concrete input.
            messages=[Message(role="user", content=f"{row['prompt']} Input: {row['input']}")],
            ground_truth=row["expected_output"],
        )
        for row in data
    ]
@@ -30,63 +30,59 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
3030 dataset_adapter = coding_dataset_to_evaluation_row ,
3131 model = ["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct" ],
3232 rollout_input_params = [{"temperature" : 0.0 , "max_tokens" : 4096 }],
33- threshold_of_success = 0.8 ,
33+ threshold = 0.8 ,
3434 rollout_processor = default_single_turn_rollout_processor ,
3535 num_runs = 1 ,
3636 mode = "pointwise" ,
3737)
def test_coding_code_evaluation(row: EvaluationRow) -> EvaluationRow:
    """
    Evaluation function that tests code correctness by executing it locally.

    This function:
    1. Extracts Python code from the assistant's response
    2. Executes the code locally with timeout=10
    3. Compares the output to ground_truth
    4. Returns a score of 1.0 if output matches, 0.0 otherwise

    Args:
        row: EvaluationRow containing the conversation messages and expected_output in ground_truth

    Returns:
        EvaluationRow with the evaluation result
    """
    # Check if we have an assistant response
    if len(row.messages) < 2 or row.messages[-1].role != "assistant":
        row.evaluation_result = EvaluateResult(score=0.0, reason="No assistant response found")
        return row

    assistant_content = row.messages[-1].content or ""
    expected_output = (row.ground_truth or "").strip()

    # Extract Python code blocks
    code_blocks = extract_code_blocks(assistant_content, language="python")
    if not code_blocks:
        row.evaluation_result = EvaluateResult(score=0.0, reason="No Python code block found")
        return row

    # Only the first code block is evaluated; later blocks are ignored.
    code = code_blocks[0]["code"]

    # Execute the code locally
    execution_result = execute_python_code(code, timeout=10)

    if not execution_result.get("success", False):
        error_msg = execution_result.get("error", "Code execution failed")
        row.evaluation_result = EvaluateResult(score=0.0, reason=f"Execution error: {error_msg}")
        return row

    # Compare output with expected (both sides stripped of surrounding whitespace)
    actual_output = (execution_result.get("output", "") or "").strip()

    if actual_output == expected_output:
        row.evaluation_result = EvaluateResult(score=1.0, reason=f"✅ Output matches: '{actual_output}'")
    else:
        row.evaluation_result = EvaluateResult(
            score=0.0, reason=f"❌ Expected: '{expected_output}', Got: '{actual_output}'"
        )

    return row
0 commit comments