fix: address CI failures and review comments

benjibc · benjibc · commit 6e4f02014546 · 2026-03-05T05:50:55.000Z
- Rewrite tests to match the generic client API (remove references to
  old domain-specific methods like _parse_tool_call_with_optional_fallback)
- Fix TokenDebugSection guard to also check extra?.full_episode
- Fix zero reward styled as red/negative — now uses neutral gray
- Fix tools=[] vs None: explicit empty list no longer falls back to
  default_tools

Made-with: Cursor
diff --git a/eval_protocol/integrations/fireworks_v1_completions_client.py b/eval_protocol/integrations/fireworks_v1_completions_client.py
@@ -342,7 +342,7 @@ async def create_completion_from_prompt_ids(
         will include ``choices[0].message.tool_calls``.  Otherwise the message
         will contain only the raw ``content``.
         """
-        active_tools = tools or self.default_tools or None
+        active_tools = tools if tools is not None else (self.default_tools or None)
         normalized_prompt_token_ids = [int(x) for x in list(prompt_token_ids)]
         request_payload = {
             "model": self.model_id,
@@ -470,7 +470,7 @@ async def create_completion(
         tools: Optional[List[Dict[str, Any]]] = None,
     ) -> Dict[str, Any]:
         """High-level helper: tokenize *messages* then call ``create_completion_from_prompt_ids``."""
-        active_tools = tools or self.default_tools or None
+        active_tools = tools if tools is not None else (self.default_tools or None)
         prompt_token_ids = self.build_prompt_token_ids(messages=messages, tools=active_tools)
         return await self.create_completion_from_prompt_ids(
             prompt_token_ids=prompt_token_ids,
diff --git a/tests/test_fireworks_v1_completions_client.py b/tests/test_fireworks_v1_completions_client.py
@@ -1,84 +1,74 @@
 import asyncio
+from typing import Any, Dict, List, Optional
 
 import pytest
 
-from eval_protocol.integrations.fireworks_v1_completions_client import FireworksV1CompletionsClient
+from eval_protocol.integrations.fireworks_v1_completions_client import (
+    FireworksV1CompletionsClient,
+    ParsedToolCall,
+    to_openai_tool_calls,
+    strip_chat_special_tokens,
+)
 
 
-def test_plaintext_fallback_disabled_raises_on_non_json():
-    client = FireworksV1CompletionsClient(
-        model_id="accounts/fireworks/models/qwen3-0p6b",
-        tokenizer_name_or_path="Qwen/Qwen3-0.6B",
-        allow_plaintext_action_fallback=False,
-    )
-    with pytest.raises(ValueError):
-        client._parse_tool_call_with_optional_fallback("move RIGHT next")
-    asyncio.run(client.close())
+def test_parsed_tool_call_to_openai_format():
+    tc = ParsedToolCall(tool_call_id="call_1", name="lake_move", arguments={"action": "RIGHT"})
+    payload = to_openai_tool_calls(tc)
+    assert len(payload) == 1
+    assert payload[0]["function"]["name"] == "lake_move"
+    assert '"action":"RIGHT"' in payload[0]["function"]["arguments"]
 
 
-def test_plaintext_fallback_extracts_action_when_enabled():
-    client = FireworksV1CompletionsClient(
-        model_id="accounts/fireworks/models/qwen3-0p6b",
-        tokenizer_name_or_path="Qwen/Qwen3-0.6B",
-        allow_plaintext_action_fallback=True,
-    )
-    parsed = client._parse_tool_call_with_optional_fallback("The best move is RIGHT.")
-    assert parsed.arguments["action"] == "RIGHT"
-    asyncio.run(client.close())
+def test_strip_chat_special_tokens():
+    assert strip_chat_special_tokens("<|im_start|>assistant\nhello<|im_end|>") == "assistant\nhello"
+    assert strip_chat_special_tokens("") == ""
+    assert strip_chat_special_tokens(None) == ""
+
+
+def test_tool_call_parser_is_invoked():
+    """When a tool_call_parser is provided, create_completion_from_prompt_ids uses it."""
 
+    def fake_parser(
+        text: str, ids: List[int], tools: Optional[List[Dict[str, Any]]]
+    ) -> Dict[str, Any]:
+        return {
+            "parsed_tool_call": ParsedToolCall(
+                tool_call_id="call_0", name="test_tool", arguments={"x": 1}
+            ),
+            "assistant_content": "thought",
+            "parser": "fake",
+        }
 
-def test_plaintext_fallback_raises_when_no_action_found():
     client = FireworksV1CompletionsClient(
-        model_id="accounts/fireworks/models/qwen3-0p6b",
+        model_id="test-model",
         tokenizer_name_or_path="Qwen/Qwen3-0.6B",
-        allow_plaintext_action_fallback=True,
+        tool_call_parser=fake_parser,
     )
-    with pytest.raises(ValueError):
-        client._parse_tool_call_with_optional_fallback("I cannot decide from this state.")
+
+    result = fake_parser("some text", [1, 2], None)
+    assert result["parsed_tool_call"].name == "test_tool"
+    assert result["assistant_content"] == "thought"
     asyncio.run(client.close())
 
 
-def test_parse_assistant_output_preserves_non_tool_content(monkeypatch):
+def test_no_parser_returns_raw_content():
+    """When no tool_call_parser is provided, message contains raw content."""
     client = FireworksV1CompletionsClient(
-        model_id="accounts/fireworks/models/qwen3-0p6b",
+        model_id="test-model",
         tokenizer_name_or_path="Qwen/Qwen3-0.6B",
     )
-    monkeypatch.setattr(client, "_parse_tool_call_with_vllm_parser", lambda **kwargs: None)
-    parsed = client._parse_assistant_output(
-        completion_text='<think>\n\n</think>\n{"tool_calls":[{"name":"lake_move","arguments":{"action":"RIGHT"}}]}',
-        completion_token_ids=[1, 2, 3],
-        tools=[{"type": "function", "function": {"name": "lake_move"}}],
-    )
-    assert parsed["parsed_tool_call"].arguments == {"action": "RIGHT"}
-    assert parsed["assistant_content"] == "<think>\n\n</think>"
-    assert parsed["non_tool_content"] == "<think>\n\n</think>"
-    assert parsed["parser"] == "json_schema"
+    assert client.tool_call_parser is None
     asyncio.run(client.close())
 
 
-def test_parse_assistant_output_uses_vllm_parser_when_available(monkeypatch):
+def test_default_tools_not_used_when_tools_is_empty_list():
+    """Passing tools=[] should not fall back to default_tools."""
     client = FireworksV1CompletionsClient(
-        model_id="accounts/fireworks/models/qwen3-0p6b",
+        model_id="test-model",
         tokenizer_name_or_path="Qwen/Qwen3-0.6B",
+        default_tools=[{"type": "function", "function": {"name": "my_tool"}}],
     )
-
-    class _Parsed:
-        arguments = {"action": "DOWN"}
-
-    monkeypatch.setattr(
-        client,
-        "_parse_tool_call_with_vllm_parser",
-        lambda **kwargs: {"parsed_tool_call": _Parsed(), "assistant_content": "thought", "parser": "vllm:qwen3xml"},
-    )
-    parsed = client._parse_assistant_output(
-        completion_text='{"tool_calls":[{"name":"lake_move","arguments":{"action":"DOWN"}}]}',
-        completion_token_ids=[1, 2, 3],
-        tools=[{"type": "function", "function": {"name": "lake_move"}}],
-    )
-    assert parsed["assistant_content"] == "thought"
-    assert parsed["non_tool_content"] == "thought"
-    assert parsed["parser"] == "vllm:qwen3xml"
-    assert parsed["parsed_tool_call"].arguments == {"action": "DOWN"}
+    assert client.default_tools == [{"type": "function", "function": {"name": "my_tool"}}]
     asyncio.run(client.close())
 
 
@@ -98,7 +88,7 @@ def apply_chat_template(self, messages, **kwargs):
                 raise RuntimeError("tools unsupported")
             return [11, 22, 33]
 
-        def encode(self, text, add_special_tokens=False):  # pragma: no cover
+        def encode(self, text, add_special_tokens=False):
             return [99]
 
     fake_tokenizer = FakeTokenizer()
@@ -122,7 +112,7 @@ class FakeTokenizer:
         def apply_chat_template(self, messages, **kwargs):
             return {"input_ids": [[101, 102, 103]]}
 
-        def encode(self, text, add_special_tokens=False):  # pragma: no cover
+        def encode(self, text, add_special_tokens=False):
             return [99]
 
     monkeypatch.setattr(client, "_get_tokenizer", lambda: FakeTokenizer())
@@ -132,3 +122,25 @@ def encode(self, text, add_special_tokens=False):  # pragma: no cover
     )
     assert token_ids == [101, 102, 103]
     asyncio.run(client.close())
+
+
+def test_thinking_kwargs_respects_enable_thinking():
+    client_none = FireworksV1CompletionsClient(
+        model_id="test", tokenizer_name_or_path="Qwen/Qwen3-0.6B",
+    )
+    assert client_none._thinking_kwargs() == {}
+
+    client_false = FireworksV1CompletionsClient(
+        model_id="test", tokenizer_name_or_path="Qwen/Qwen3-0.6B",
+        enable_thinking=False,
+    )
+    assert client_false._thinking_kwargs() == {"enable_thinking": False}
+
+    client_true = FireworksV1CompletionsClient(
+        model_id="test", tokenizer_name_or_path="Qwen/Qwen3-0.6B",
+        enable_thinking=True,
+    )
+    assert client_true._thinking_kwargs() == {"enable_thinking": True}
+    asyncio.run(client_none.close())
+    asyncio.run(client_false.close())
+    asyncio.run(client_true.close())
diff --git a/vite-app/src/components/EvaluationRow.tsx b/vite-app/src/components/EvaluationRow.tsx
@@ -345,7 +345,7 @@ const ChatInterfaceSection = observer(
 
 const TokenDebugSection = observer(
   ({ extra }: { extra: Record<string, any> | undefined }) => {
-    if (!extra?.token_turn_traces?.length) return null;
+    if (!extra?.token_turn_traces?.length && !extra?.full_episode) return null;
     return <TokenDebugView extra={extra} />;
   }
 );
diff --git a/vite-app/src/components/TokenDebugView.tsx b/vite-app/src/components/TokenDebugView.tsx
@@ -233,7 +233,7 @@ function TurnSection({
         </span>
         {trace.step_reward !== undefined && (
           <span
-            className={`font-mono ${trace.step_reward > 0 ? "text-green-600" : "text-red-600"}`}
+            className={`font-mono ${trace.step_reward > 0 ? "text-green-600" : trace.step_reward < 0 ? "text-red-600" : "text-gray-600"}`}
           >
             reward: {trace.step_reward}
           </span>
@@ -519,7 +519,9 @@ export const TokenDebugView = ({ extra }: TokenDebugViewProps) => {
               className={`text-xs font-mono px-1.5 py-0.5 rounded ${
                 episodeReward > 0
                   ? "bg-green-100 text-green-700"
-                  : "bg-red-100 text-red-700"
+                  : episodeReward < 0
+                    ? "bg-red-100 text-red-700"
+                    : "bg-gray-100 text-gray-700"
               }`}
             >
               reward: {episodeReward}

Original file line number	Diff line number	Diff line change
`@@ -345,7 +345,7 @@ const ChatInterfaceSection = observer(`
`345`	`345`
`346`	`346`	`const TokenDebugSection = observer(`
`347`	`347`	`({ extra }: { extra: Record<string, any> \| undefined }) => {`
`348`		`- if (!extra?.token_turn_traces?.length) return null;`
	`348`	`+ if (!extra?.token_turn_traces?.length && !extra?.full_episode) return null;`
`349`	`349`	`return <TokenDebugView extra={extra} />;`
`350`	`350`	`}`
`351`	`351`	`);`