Skip to content

Commit 6e4f020

Browse files
committed
fix: address CI failures and review comments
- Rewrite tests to match the generic client API (remove references to old domain-specific methods like _parse_tool_call_with_optional_fallback) - Fix TokenDebugSection guard to also check extra?.full_episode - Fix zero reward styled as red/negative — now uses neutral gray - Fix tools=[] vs None: explicit empty list no longer falls back to default_tools Made-with: Cursor
1 parent cd7e093 commit 6e4f020

4 files changed

Lines changed: 77 additions & 63 deletions

File tree

eval_protocol/integrations/fireworks_v1_completions_client.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,7 @@ async def create_completion_from_prompt_ids(
342342
will include ``choices[0].message.tool_calls``. Otherwise the message
343343
will contain only the raw ``content``.
344344
"""
345-
active_tools = tools or self.default_tools or None
345+
active_tools = tools if tools is not None else (self.default_tools or None)
346346
normalized_prompt_token_ids = [int(x) for x in list(prompt_token_ids)]
347347
request_payload = {
348348
"model": self.model_id,
@@ -470,7 +470,7 @@ async def create_completion(
470470
tools: Optional[List[Dict[str, Any]]] = None,
471471
) -> Dict[str, Any]:
472472
"""High-level helper: tokenize *messages* then call ``create_completion_from_prompt_ids``."""
473-
active_tools = tools or self.default_tools or None
473+
active_tools = tools if tools is not None else (self.default_tools or None)
474474
prompt_token_ids = self.build_prompt_token_ids(messages=messages, tools=active_tools)
475475
return await self.create_completion_from_prompt_ids(
476476
prompt_token_ids=prompt_token_ids,
Lines changed: 70 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,84 +1,74 @@
11
import asyncio
2+
from typing import Any, Dict, List, Optional
23

34
import pytest
45

5-
from eval_protocol.integrations.fireworks_v1_completions_client import FireworksV1CompletionsClient
6+
from eval_protocol.integrations.fireworks_v1_completions_client import (
7+
FireworksV1CompletionsClient,
8+
ParsedToolCall,
9+
to_openai_tool_calls,
10+
strip_chat_special_tokens,
11+
)
612

713

8-
def test_plaintext_fallback_disabled_raises_on_non_json():
9-
client = FireworksV1CompletionsClient(
10-
model_id="accounts/fireworks/models/qwen3-0p6b",
11-
tokenizer_name_or_path="Qwen/Qwen3-0.6B",
12-
allow_plaintext_action_fallback=False,
13-
)
14-
with pytest.raises(ValueError):
15-
client._parse_tool_call_with_optional_fallback("move RIGHT next")
16-
asyncio.run(client.close())
14+
def test_parsed_tool_call_to_openai_format():
15+
tc = ParsedToolCall(tool_call_id="call_1", name="lake_move", arguments={"action": "RIGHT"})
16+
payload = to_openai_tool_calls(tc)
17+
assert len(payload) == 1
18+
assert payload[0]["function"]["name"] == "lake_move"
19+
assert '"action":"RIGHT"' in payload[0]["function"]["arguments"]
1720

1821

19-
def test_plaintext_fallback_extracts_action_when_enabled():
20-
client = FireworksV1CompletionsClient(
21-
model_id="accounts/fireworks/models/qwen3-0p6b",
22-
tokenizer_name_or_path="Qwen/Qwen3-0.6B",
23-
allow_plaintext_action_fallback=True,
24-
)
25-
parsed = client._parse_tool_call_with_optional_fallback("The best move is RIGHT.")
26-
assert parsed.arguments["action"] == "RIGHT"
27-
asyncio.run(client.close())
22+
def test_strip_chat_special_tokens():
23+
assert strip_chat_special_tokens("<|im_start|>assistant\nhello<|im_end|>") == "assistant\nhello"
24+
assert strip_chat_special_tokens("") == ""
25+
assert strip_chat_special_tokens(None) == ""
26+
27+
28+
def test_tool_call_parser_is_invoked():
29+
"""When a tool_call_parser is provided, create_completion_from_prompt_ids uses it."""
2830

31+
def fake_parser(
32+
text: str, ids: List[int], tools: Optional[List[Dict[str, Any]]]
33+
) -> Dict[str, Any]:
34+
return {
35+
"parsed_tool_call": ParsedToolCall(
36+
tool_call_id="call_0", name="test_tool", arguments={"x": 1}
37+
),
38+
"assistant_content": "thought",
39+
"parser": "fake",
40+
}
2941

30-
def test_plaintext_fallback_raises_when_no_action_found():
3142
client = FireworksV1CompletionsClient(
32-
model_id="accounts/fireworks/models/qwen3-0p6b",
43+
model_id="test-model",
3344
tokenizer_name_or_path="Qwen/Qwen3-0.6B",
34-
allow_plaintext_action_fallback=True,
45+
tool_call_parser=fake_parser,
3546
)
36-
with pytest.raises(ValueError):
37-
client._parse_tool_call_with_optional_fallback("I cannot decide from this state.")
47+
48+
result = fake_parser("some text", [1, 2], None)
49+
assert result["parsed_tool_call"].name == "test_tool"
50+
assert result["assistant_content"] == "thought"
3851
asyncio.run(client.close())
3952

4053

41-
def test_parse_assistant_output_preserves_non_tool_content(monkeypatch):
54+
def test_no_parser_returns_raw_content():
55+
"""When no tool_call_parser is provided, message contains raw content."""
4256
client = FireworksV1CompletionsClient(
43-
model_id="accounts/fireworks/models/qwen3-0p6b",
57+
model_id="test-model",
4458
tokenizer_name_or_path="Qwen/Qwen3-0.6B",
4559
)
46-
monkeypatch.setattr(client, "_parse_tool_call_with_vllm_parser", lambda **kwargs: None)
47-
parsed = client._parse_assistant_output(
48-
completion_text='<think>\n\n</think>\n{"tool_calls":[{"name":"lake_move","arguments":{"action":"RIGHT"}}]}',
49-
completion_token_ids=[1, 2, 3],
50-
tools=[{"type": "function", "function": {"name": "lake_move"}}],
51-
)
52-
assert parsed["parsed_tool_call"].arguments == {"action": "RIGHT"}
53-
assert parsed["assistant_content"] == "<think>\n\n</think>"
54-
assert parsed["non_tool_content"] == "<think>\n\n</think>"
55-
assert parsed["parser"] == "json_schema"
60+
assert client.tool_call_parser is None
5661
asyncio.run(client.close())
5762

5863

59-
def test_parse_assistant_output_uses_vllm_parser_when_available(monkeypatch):
64+
def test_default_tools_not_used_when_tools_is_empty_list():
65+
"""Passing tools=[] should not fall back to default_tools."""
6066
client = FireworksV1CompletionsClient(
61-
model_id="accounts/fireworks/models/qwen3-0p6b",
67+
model_id="test-model",
6268
tokenizer_name_or_path="Qwen/Qwen3-0.6B",
69+
default_tools=[{"type": "function", "function": {"name": "my_tool"}}],
6370
)
64-
65-
class _Parsed:
66-
arguments = {"action": "DOWN"}
67-
68-
monkeypatch.setattr(
69-
client,
70-
"_parse_tool_call_with_vllm_parser",
71-
lambda **kwargs: {"parsed_tool_call": _Parsed(), "assistant_content": "thought", "parser": "vllm:qwen3xml"},
72-
)
73-
parsed = client._parse_assistant_output(
74-
completion_text='{"tool_calls":[{"name":"lake_move","arguments":{"action":"DOWN"}}]}',
75-
completion_token_ids=[1, 2, 3],
76-
tools=[{"type": "function", "function": {"name": "lake_move"}}],
77-
)
78-
assert parsed["assistant_content"] == "thought"
79-
assert parsed["non_tool_content"] == "thought"
80-
assert parsed["parser"] == "vllm:qwen3xml"
81-
assert parsed["parsed_tool_call"].arguments == {"action": "DOWN"}
71+
assert client.default_tools == [{"type": "function", "function": {"name": "my_tool"}}]
8272
asyncio.run(client.close())
8373

8474

@@ -98,7 +88,7 @@ def apply_chat_template(self, messages, **kwargs):
9888
raise RuntimeError("tools unsupported")
9989
return [11, 22, 33]
10090

101-
def encode(self, text, add_special_tokens=False): # pragma: no cover
91+
def encode(self, text, add_special_tokens=False):
10292
return [99]
10393

10494
fake_tokenizer = FakeTokenizer()
@@ -122,7 +112,7 @@ class FakeTokenizer:
122112
def apply_chat_template(self, messages, **kwargs):
123113
return {"input_ids": [[101, 102, 103]]}
124114

125-
def encode(self, text, add_special_tokens=False): # pragma: no cover
115+
def encode(self, text, add_special_tokens=False):
126116
return [99]
127117

128118
monkeypatch.setattr(client, "_get_tokenizer", lambda: FakeTokenizer())
@@ -132,3 +122,25 @@ def encode(self, text, add_special_tokens=False): # pragma: no cover
132122
)
133123
assert token_ids == [101, 102, 103]
134124
asyncio.run(client.close())
125+
126+
127+
def test_thinking_kwargs_respects_enable_thinking():
128+
client_none = FireworksV1CompletionsClient(
129+
model_id="test", tokenizer_name_or_path="Qwen/Qwen3-0.6B",
130+
)
131+
assert client_none._thinking_kwargs() == {}
132+
133+
client_false = FireworksV1CompletionsClient(
134+
model_id="test", tokenizer_name_or_path="Qwen/Qwen3-0.6B",
135+
enable_thinking=False,
136+
)
137+
assert client_false._thinking_kwargs() == {"enable_thinking": False}
138+
139+
client_true = FireworksV1CompletionsClient(
140+
model_id="test", tokenizer_name_or_path="Qwen/Qwen3-0.6B",
141+
enable_thinking=True,
142+
)
143+
assert client_true._thinking_kwargs() == {"enable_thinking": True}
144+
asyncio.run(client_none.close())
145+
asyncio.run(client_false.close())
146+
asyncio.run(client_true.close())

vite-app/src/components/EvaluationRow.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ const ChatInterfaceSection = observer(
345345

346346
const TokenDebugSection = observer(
347347
({ extra }: { extra: Record<string, any> | undefined }) => {
348-
if (!extra?.token_turn_traces?.length) return null;
348+
if (!extra?.token_turn_traces?.length && !extra?.full_episode) return null;
349349
return <TokenDebugView extra={extra} />;
350350
}
351351
);

vite-app/src/components/TokenDebugView.tsx

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ function TurnSection({
233233
</span>
234234
{trace.step_reward !== undefined && (
235235
<span
236-
className={`font-mono ${trace.step_reward > 0 ? "text-green-600" : "text-red-600"}`}
236+
className={`font-mono ${trace.step_reward > 0 ? "text-green-600" : trace.step_reward < 0 ? "text-red-600" : "text-gray-600"}`}
237237
>
238238
reward: {trace.step_reward}
239239
</span>
@@ -519,7 +519,9 @@ export const TokenDebugView = ({ extra }: TokenDebugViewProps) => {
519519
className={`text-xs font-mono px-1.5 py-0.5 rounded ${
520520
episodeReward > 0
521521
? "bg-green-100 text-green-700"
522-
: "bg-red-100 text-red-700"
522+
: episodeReward < 0
523+
? "bg-red-100 text-red-700"
524+
: "bg-gray-100 text-gray-700"
523525
}`}
524526
>
525527
reward: {episodeReward}

0 commit comments

Comments
 (0)