Skip to content

Commit a8f80fe

Browse files
committed
changing other files
1 parent 70c2cac commit a8f80fe

9 files changed

+47
-50
lines changed

tests/pytest/test_apps_coding.py

Lines changed: 6 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -18,10 +18,7 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
1818
Convert entries from APPS dataset to EvaluationRow objects.
1919
"""
2020
return [
21-
EvaluationRow(
22-
messages=[Message(role="user", content=row["question"])],
23-
ground_truth=row["input_output"]
24-
)
21+
EvaluationRow(messages=[Message(role="user", content=row["question"])], ground_truth=row["input_output"])
2522
for row in data
2623
]
2724

@@ -31,7 +28,7 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
3128
dataset_adapter=apps_dataset_to_evaluation_row,
3229
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
3330
rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
34-
threshold_of_success=0.33,
31+
threshold=0.33,
3532
rollout_processor=default_single_turn_rollout_processor,
3633
num_runs=1,
3734
mode="pointwise",
@@ -42,7 +39,7 @@ def test_apps_code_evaluation(row: EvaluationRow) -> EvaluationRow:
4239
4340
Args:
4441
row: EvaluationRow containing the conversation messages and ground_truth as JSON string
45-
42+
4643
Returns:
4744
EvaluationRow with the evaluation result
4845
"""
@@ -51,8 +48,8 @@ def test_apps_code_evaluation(row: EvaluationRow) -> EvaluationRow:
5148
messages=row.messages,
5249
ground_truth=row.ground_truth,
5350
)
54-
51+
5552
# Set the evaluation result on the row
5653
row.evaluation_result = result
57-
58-
return row
54+
55+
return row

tests/pytest/test_basic_coding.py

Lines changed: 17 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99

1010
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
1111
from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
12-
from eval_protocol.rewards.code_execution import extract_code_blocks, execute_python_code
12+
from eval_protocol.rewards.code_execution import execute_python_code, extract_code_blocks
1313

1414

1515
def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
@@ -18,8 +18,8 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
1818
"""
1919
return [
2020
EvaluationRow(
21-
messages=[Message(role="user", content=f"{row['prompt']} Input: {row['input']}")],
22-
ground_truth=row["expected_output"]
21+
messages=[Message(role="user", content=f"{row['prompt']} Input: {row['input']}")],
22+
ground_truth=row["expected_output"],
2323
)
2424
for row in data
2525
]
@@ -30,63 +30,59 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
3030
dataset_adapter=coding_dataset_to_evaluation_row,
3131
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
3232
rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
33-
threshold_of_success=0.8,
33+
threshold=0.8,
3434
rollout_processor=default_single_turn_rollout_processor,
3535
num_runs=1,
3636
mode="pointwise",
3737
)
3838
def test_coding_code_evaluation(row: EvaluationRow) -> EvaluationRow:
3939
"""
4040
Evaluation function that tests code correctness by executing it locally.
41-
41+
4242
This function:
4343
1. Extracts Python code from the assistant's response
4444
2. Executes the code locally with timeout=10
4545
3. Compares the output to ground_truth
4646
4. Returns a score of 1.0 if output matches, 0.0 otherwise
47-
47+
4848
Args:
4949
row: EvaluationRow containing the conversation messages and expected_output in ground_truth
50-
50+
5151
Returns:
5252
EvaluationRow with the evaluation result
5353
"""
5454
# Check if we have an assistant response
5555
if len(row.messages) < 2 or row.messages[-1].role != "assistant":
5656
row.evaluation_result = EvaluateResult(score=0.0, reason="No assistant response found")
5757
return row
58-
58+
5959
assistant_content = row.messages[-1].content or ""
6060
expected_output = (row.ground_truth or "").strip()
61-
61+
6262
# Extract Python code blocks
6363
code_blocks = extract_code_blocks(assistant_content, language="python")
6464
if not code_blocks:
6565
row.evaluation_result = EvaluateResult(score=0.0, reason="No Python code block found")
6666
return row
67-
67+
6868
code = code_blocks[0]["code"]
69-
69+
7070
# Execute the code locally
7171
execution_result = execute_python_code(code, timeout=10)
72-
72+
7373
if not execution_result.get("success", False):
7474
error_msg = execution_result.get("error", "Code execution failed")
7575
row.evaluation_result = EvaluateResult(score=0.0, reason=f"Execution error: {error_msg}")
7676
return row
77-
77+
7878
# Compare output with expected
7979
actual_output = (execution_result.get("output", "") or "").strip()
80-
80+
8181
if actual_output == expected_output:
82-
row.evaluation_result = EvaluateResult(
83-
score=1.0,
84-
reason=f"✅ Output matches: '{actual_output}'"
85-
)
82+
row.evaluation_result = EvaluateResult(score=1.0, reason=f"✅ Output matches: '{actual_output}'")
8683
else:
8784
row.evaluation_result = EvaluateResult(
88-
score=0.0,
89-
reason=f"❌ Expected: '{expected_output}', Got: '{actual_output}'"
85+
score=0.0, reason=f"❌ Expected: '{expected_output}', Got: '{actual_output}'"
9086
)
91-
87+
9288
return row

tests/pytest/test_frozen_lake.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77

88
from typing import Any, Dict, List
99

10-
from eval_protocol.models import EvaluateResult, EvaluationRow, Message, InputMetadata, CompletionParams, MetricResult
10+
from eval_protocol.models import CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, Message, MetricResult
1111
from eval_protocol.pytest import evaluation_test
1212
from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor
1313

@@ -41,7 +41,7 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation
4141
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
4242
rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
4343
rollout_processor=default_mcp_gym_rollout_processor,
44-
threshold_of_success=0.66,
44+
threshold=0.66,
4545
num_runs=1,
4646
max_concurrent_rollouts=3,
4747
mode="pointwise",

tests/pytest/test_hallucination.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation
3535
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
3636
rollout_input_params=[{"temperature": 0.0, "max_tokens": 512}],
3737
rollout_processor=default_single_turn_rollout_processor,
38-
threshold_of_success=0.33,
38+
threshold=0.33,
3939
num_runs=1,
4040
mode="pointwise",
4141
)

tests/pytest/test_lunar_lander.py

Lines changed: 17 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77

88
from typing import Any, Dict, List
99

10-
from eval_protocol.models import EvaluateResult, EvaluationRow, Message, InputMetadata, CompletionParams
10+
from eval_protocol.models import CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, Message
1111
from eval_protocol.pytest import evaluation_test
1212
from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor
1313

@@ -17,7 +17,7 @@ def lunar_lander_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
1717
Convert entries from lunar lander dataset to EvaluationRow objects.
1818
"""
1919
rows = []
20-
20+
2121
for row in data:
2222
eval_row = EvaluationRow(
2323
messages=[Message(role="system", content=row["system_prompt"])],
@@ -26,12 +26,12 @@ def lunar_lander_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
2626
dataset_info={
2727
"environment_context": row["environment_context"],
2828
"user_prompt_template": row["user_prompt_template"],
29-
}
30-
)
29+
},
30+
),
3131
)
32-
32+
3333
rows.append(eval_row)
34-
34+
3535
return rows
3636

3737

@@ -41,7 +41,7 @@ def lunar_lander_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
4141
model=["gpt-4.1"],
4242
rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
4343
rollout_processor=default_mcp_gym_rollout_processor,
44-
threshold_of_success=0.0,
44+
threshold=0.0,
4545
num_runs=1,
4646
mode="pointwise",
4747
max_concurrent_rollouts=3,
@@ -51,24 +51,28 @@ def lunar_lander_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
5151
def test_lunar_lander_evaluation(row: EvaluationRow) -> EvaluationRow:
5252
"""
5353
Test lunar lander evaluation using the pytest framework.
54-
54+
5555
This test evaluates how well the model can control the lunar lander to achieve
5656
a successful landing by checking the final reward and termination status.
57-
57+
5858
Args:
5959
row: EvaluationRow object from lunar lander dataset
60-
60+
6161
Returns:
6262
EvaluationRow object with evaluation results
6363
"""
6464
score = row.get_total_reward()
6565

6666
evaluation_score = 1.0 if score >= 200 else 0.0
67-
reason = f"✅ Successful landing with reward {score:.2f}" if score >= 200 else f"❌ Failed landing with reward {score:.2f}"
67+
reason = (
68+
f"✅ Successful landing with reward {score:.2f}"
69+
if score >= 200
70+
else f"❌ Failed landing with reward {score:.2f}"
71+
)
6872

6973
row.evaluation_result = EvaluateResult(
7074
score=evaluation_score,
7175
reason=reason,
7276
)
73-
74-
return row
77+
78+
return row

tests/pytest/test_markdown_highlighting.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@ def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
2626
dataset_adapter=markdown_dataset_to_evaluation_row,
2727
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
2828
rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
29-
threshold_of_success=0.5,
29+
threshold=0.5,
3030
rollout_processor=default_single_turn_rollout_processor,
3131
num_runs=1,
3232
mode="pointwise",

tests/pytest/test_pytest_math_example.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
1212
rollout_input_params=[{"temperature": 0.0}],
1313
max_dataset_rows=5,
14-
threshold_of_success=0.0,
14+
threshold=0.0,
1515
rollout_processor=default_single_turn_rollout_processor,
1616
mode="pointwise",
1717
evaluation_test_kwargs=[

tests/pytest/test_pytest_math_format_length.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
1515
rollout_input_params=[{"temperature": 0.0}],
1616
max_dataset_rows=5,
17-
threshold_of_success=0.0,
17+
threshold=0.0,
1818
rollout_processor=default_single_turn_rollout_processor,
1919
mode="pointwise",
2020
evaluation_test_kwargs=[

tests/pytest/test_pytest_word_count_example.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
1212
rollout_input_params=[{"temperature": 0.0}],
1313
max_dataset_rows=5,
14-
threshold_of_success=0.3, # Reasonable threshold for word count evaluation
14+
threshold=0.3, # Reasonable threshold for word count evaluation
1515
rollout_processor=default_single_turn_rollout_processor,
1616
mode="pointwise", # Use pointwise mode for elegant row-by-row evaluation
1717
)

0 commit comments

Comments
 (0)