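Fix more errors: guard non-callable metric.measure, use the correct streamablehttp_client keyword, and tighten answer typing in rewards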

Benny Chen · Benny Chen · commit 63d0fb548ab9 · 2025-09-01T19:13:23.000+08:00
diff --git a/eval_protocol/integrations/deepeval.py b/eval_protocol/integrations/deepeval.py
@@ -96,7 +96,11 @@ def _build_case_kwargs() -> Dict[str, Any]:
             case_kwargs = _build_case_kwargs()
             test_case = LLMTestCase(**case_kwargs)
 
-        metric.measure(test_case, **kwargs)
+        # Guard against metric.measure being None or non-callable
+        measure_fn = getattr(metric, "measure", None)
+        if not callable(measure_fn):
+            raise TypeError("Provided metric does not have a callable 'measure' method")
+        measure_fn(test_case, **kwargs)
         score = float(metric.score or 0.0)
         reason = getattr(metric, "reason", None)
         name = _metric_name(metric)
diff --git a/eval_protocol/mcp_agent/orchestration/local_docker_client.py b/eval_protocol/mcp_agent/orchestration/local_docker_client.py
@@ -57,6 +57,7 @@ async def startup(self) -> None:
         except docker.errors.DockerException as e:
             logger.warning(f"docker.from_env() failed: {e}. Trying explicit base_url.")
             try:
+                # Fallback when docker.from_env() fails: connect explicitly via DockerClient's 'base_url' parameter.
                 self.docker_client = docker.DockerClient(base_url="unix://var/run/docker.sock")
                 if not self.docker_client.ping():  # type: ignore
                     raise ConnectionError("Failed to connect to Docker daemon with explicit base_url.")
@@ -649,7 +650,7 @@ async def list_tools_on_instance(self, instance: ManagedInstanceInfo) -> types.L
                 )
             target_base_url = instance.mcp_endpoint_url.rstrip("/")
             try:
-                async with streamablehttp_client(base_url=target_base_url) as (
+                async with streamablehttp_client(url=target_base_url) as (
                     read_s,
                     write_s,
                     _,  # get_session_id_func usually not needed for a single call
diff --git a/eval_protocol/mcp_servers/tau2/tau2_mcp.py b/eval_protocol/mcp_servers/tau2/tau2_mcp.py
@@ -43,6 +43,7 @@ def __init__(self, seed: Optional[int] = None, **kwargs):
 
         self.adapter = EnvironmentAdapter(env_class=AirlineEnvironment, default_config=default_config)
 
+        # The environment name is passed as a literal str, so it is never None.
         super().__init__("airline", self.adapter, seed, **kwargs)
 
     def _register_tools(self):
diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py
@@ -309,6 +309,11 @@ def pytest_sessionfinish(session, exitstatus):
                     print(f"❌ Experiment {link['experiment_id']}: {link['job_link']}", file=sys.__stderr__)
 
             print("=" * 80, file=sys.__stderr__)
-            sys.__stderr__.flush()
+            err_stream = getattr(sys, "__stderr__", None)
+            if err_stream is not None:
+                try:
+                    err_stream.flush()  # type: ignore[attr-defined]
+                except Exception:
+                    pass
     except Exception:
         pass
diff --git a/eval_protocol/rewards/json_schema.py b/eval_protocol/rewards/json_schema.py
@@ -342,8 +342,13 @@ def json_schema_reward_with_llm_judge(
         if messages:
             conversation_parts = []
             for msg in messages[:-1]:
-                role = msg.get("role", "")
-                content_part = msg.get("content", "")
+                if isinstance(msg, dict):
+                    role = msg.get("role", "")
+                    content_part = msg.get("content", "")
+                else:
+                    # Fallback for Message objects
+                    role = getattr(msg, "role", "")
+                    content_part = getattr(msg, "content", "")
                 if role and content_part:
                     conversation_parts.append(f"{role}: {content_part}")
             if conversation_parts:
diff --git a/eval_protocol/rewards/math.py b/eval_protocol/rewards/math.py
@@ -8,11 +8,16 @@
 
 import math
 import re
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union, cast
 
 from ..models import EvaluateResult, Message, MetricResult
 from ..typed_interface import reward_function
 
+# Types used throughout this module to clearly express allowed answer values.
+# Include both float and int since extraction may yield either at analysis time.
+Numeric = Union[int, float]
+AnswerValue = Union[Numeric, str]
+
 _ALGEBRAIC_VARS_SET: Set[str] = {
     "x",
     "y",
@@ -78,9 +83,9 @@ def _is_coefficient(
     return False
 
 
-def _extract_html_tag_answers(text: str) -> List[Tuple[str, Union[float, str]]]:
+def _extract_html_tag_answers(text: str) -> List[Tuple[str, AnswerValue]]:
     """Extracts answers from <answer> or <ans> HTML-like tags."""
-    html_tag_answers: List[Tuple[str, Union[float, str]]] = []
+    html_tag_answers: List[Tuple[str, AnswerValue]] = []
     tag_re = re.compile(
         r"<(?P<tag>answer|ans)\b[^>]*>(?P<inner>.*?)</(?P=tag)>",
         re.IGNORECASE | re.DOTALL,
@@ -126,12 +131,12 @@ def _extract_html_tag_answers(text: str) -> List[Tuple[str, Union[float, str]]]:
 
 def _extract_boxed_latex_answers(
     text: str,
-) -> Tuple[List[Tuple[str, Union[float, str]]], bool]:
+) -> Tuple[List[Tuple[str, AnswerValue]], bool]:
     """
     Extracts answers from \\boxed{} LaTeX expressions.
     Returns a tuple: (list of answers, boolean indicating if any boxed expr was found).
     """
-    boxed_answers: List[Tuple[str, Union[float, str]]] = []
+    boxed_answers: List[Tuple[str, AnswerValue]] = []
     found_any_boxed_expr = False
     for m_boxed in re.finditer(r"\\boxed\s*\{\s*((?:[^{}]|\{[^{}]*\})*?)\s*\}", text):
         found_any_boxed_expr = True
@@ -192,7 +197,7 @@ def _extract_boxed_latex_answers(
     return boxed_answers, found_any_boxed_expr
 
 
-def extract_numbers(text: str) -> List[Tuple[str, Union[float, str]]]:
+def extract_numbers(text: str) -> List[Tuple[str, AnswerValue]]:
     """
     Extracts mathematical answers from text based on a hierarchical priority:
     1. HTML <answer>/<ans> tags
@@ -228,7 +233,7 @@ def extract_numbers(text: str) -> List[Tuple[str, Union[float, str]]]:
     return []
 
 
-def _extract_gsm8k_answers(text: str) -> List[Tuple[str, Union[float, str]]]:
+def _extract_gsm8k_answers(text: str) -> List[Tuple[str, AnswerValue]]:
     """Extracts answers from GSM8K-style final answer markers (#### ...)."""
     final_marker_answers: List[Tuple[str, Union[float, str]]] = []
     GSM8K_NUM_CONTENT_PATTERN = r"-?\d{1,3}(?:,\d{3})*(?:\.\d+)?|-?\d+(?:\.\d+)?"
@@ -243,7 +248,7 @@ def _extract_gsm8k_answers(text: str) -> List[Tuple[str, Union[float, str]]]:
     return final_marker_answers
 
 
-def _extract_general_numeric_answers(text: str) -> List[Tuple[str, Union[float, str]]]:
+def _extract_general_numeric_answers(text: str) -> List[Tuple[str, AnswerValue]]:
     """Extracts general numeric or LaTeX-formatted numbers as a fallback."""
     potential_general_matches: List[Dict[str, Any]] = []
 
@@ -399,7 +404,7 @@ def _extract_general_numeric_answers(text: str) -> List[Tuple[str, Union[float,
             pass
 
     potential_general_matches.sort(key=lambda x: (x["span"][0], -(x["span"][1] - x["span"][0]), x["type_priority"]))
-    filtered_general_answers: List[Tuple[str, Union[float, str]]] = []
+    filtered_general_answers: List[Tuple[str, AnswerValue]] = []
     last_covered_end = -1
     for item in potential_general_matches:
         start, end = item["span"]
@@ -461,7 +466,7 @@ def _has_unit_text(full_extracted_text: str, numeric_value: float) -> bool:
 
 def _check_unboxed_or_strictness(
     model_response_content: str,
-    gen_answers_extracted: List[Tuple[str, Union[float, str]]],
+    gen_answers_extracted: Sequence[Tuple[str, AnswerValue]],
     metrics: Dict[str, MetricResult],
 ) -> Optional[EvaluateResult]:
     """Checks for 'unboxed or' strictness violation."""
@@ -487,8 +492,8 @@ def _check_unboxed_or_strictness(
 
 
 def _check_ambiguity_strictness(
-    orig_answers_extracted: List[Tuple[str, Union[float, str]]],
-    gen_answers_extracted: List[Tuple[str, Union[float, str]]],
+    orig_answers_extracted: Sequence[Tuple[str, AnswerValue]],
+    gen_answers_extracted: Sequence[Tuple[str, AnswerValue]],
     metrics: Dict[str, MetricResult],
 ) -> Optional[EvaluateResult]:
     """Checks for ambiguity strictness violation."""
@@ -503,8 +508,8 @@ def _check_ambiguity_strictness(
 
 
 def _check_conflicting_answers_strictness(
-    orig_answers_extracted: List[Tuple[str, Union[float, str]]],
-    gen_answers_extracted: List[Tuple[str, Union[float, str]]],
+    orig_answers_extracted: Sequence[Tuple[str, AnswerValue]],
+    gen_answers_extracted: Sequence[Tuple[str, AnswerValue]],
     best_match_score: float,
     match_found_flag: bool,
     is_single_orig_boxed_truth: bool,
@@ -603,7 +608,7 @@ def math_reward(
 
     gen_answers_extracted_initial = extract_numbers(model_response_content)
     orig_answers_extracted = extract_numbers(ground_truth)
-    gen_answers_extracted = list(gen_answers_extracted_initial)
+    gen_answers_extracted: List[Tuple[str, AnswerValue]] = list(gen_answers_extracted_initial)
     metrics: Dict[str, MetricResult] = {}
 
     def format_extracted(items: List[Tuple[str, Union[float, str]]]) -> str:
@@ -654,7 +659,7 @@ def format_extracted(items: List[Tuple[str, Union[float, str]]]) -> str:
                     abs_tol=absolute_tolerance,
                 ):
                     has_matching_gen_boxed_answer = True
-                    gen_answers_extracted = [(gen_text, gen_val)]
+                    gen_answers_extracted = [(gen_text, cast(AnswerValue, gen_val))]
                     metrics["demo_leniency_info"] = MetricResult(
                         score=1.0,
                         is_score_valid=True,