From fa3c1ea4ee9e8bb5c64220f0e7e071549a2d3a89 Mon Sep 17 00:00:00 2001
From: Barbara Korycki <barbara@mlcommons.org>
Date: Wed, 25 Feb 2026 11:59:52 -0800
Subject: [PATCH 1/9] reasoningSUT type + matching

---
 src/modelgauge/general.py                     |  8 +--
 src/modelgauge/reasoning_handlers.py          | 45 ++++++++++++++--
 .../test_reasoning_handlers.py                | 53 +++++++++++++++++--
 3 files changed, 92 insertions(+), 14 deletions(-)
diff --git a/src/modelgauge/general.py b/src/modelgauge/general.py
index 5400d28b4..866196e11 100644
--- a/src/modelgauge/general.py
+++ b/src/modelgauge/general.py
@@ -5,13 +5,13 @@
 import shlex
 import subprocess
 import time
-from typing import List, Optional, Set, Type, TypeVar
+from typing import List, Optional, TypeVar
 
 from airrlogger.log_config import get_logger
 from tqdm import tqdm
 
 # Type vars helpful in defining templates.
-_InT = TypeVar("_InT")
+_InT = TypeVar("_InT", bound=type)
 
 logger = get_logger(__name__)
 
@@ -20,8 +20,8 @@ def current_timestamp_millis() -> int:
     return time.time_ns() // 1_000_000
 
 
-def get_concrete_subclasses(cls: Type[_InT]) -> Set[Type[_InT]]:
-    result = set()
+def get_concrete_subclasses(cls: _InT) -> set[_InT]:
+    result: set[_InT] = set()
     for subclass in cls.__subclasses__():
         if not inspect.isabstract(subclass):
             result.add(subclass)
diff --git a/src/modelgauge/reasoning_handlers.py b/src/modelgauge/reasoning_handlers.py
index 95ec0e416..8b4c053ee 100644
--- a/src/modelgauge/reasoning_handlers.py
+++ b/src/modelgauge/reasoning_handlers.py
@@ -1,8 +1,10 @@
+from abc import abstractmethod
 from typing import Any
 
 from airrlogger.log_config import get_logger
 from pydantic import BaseModel
 
+from modelgauge.general import get_concrete_subclasses
 from modelgauge.model_options import ModelOptions
 from modelgauge.prompt import TextPrompt
 from modelgauge.sut import PromptResponseSUT, SUTResponse
@@ -17,19 +19,52 @@ class ReasoningRequest(BaseModel):
     max_total_tokens: int | None = None  # Total number of tokens allowed (thinking + content).
 
 
-class ThinkingMixin(PromptResponseSUT):
+class ReasoningSUT(PromptResponseSUT):
+    @staticmethod
+    def find_match(sut: PromptResponseSUT) -> type["ReasoningSUT"] | None:
+        reasoning_suts = get_concrete_subclasses(ReasoningSUT)
+        for rs in reasoning_suts:
+            if rs.sut_matches(sut):
+                return rs
+        return None
+
+    @classmethod
+    def sut_matches(cls, sut) -> bool:
+        request = sut.translate_text_prompt(
+            TextPrompt(text="If I have 2 apples and give 1 to my friend, how many apples do I have left?"),
+            options=ModelOptions(max_tokens=1000),
+        )
+        raw_response = sut.evaluate(request)
+        response = sut.translate_response(request, raw_response)
+        return cls.response_contains_reasoning(response)
+
+    @classmethod
+    @abstractmethod
+    def response_contains_reasoning(cls, response: SUTResponse) -> bool:
+        pass
+
+
+class ThinkingMixin(ReasoningSUT):
     """
     A mixin for SUTs that parses out thinking text from the output.
 
     The output is expected to be in the form: {reasoning text}</think>{content text}.
     If max_total_output_tokens is set in ModelOptions, that value will be used in the model call and the content text will be truncated to max_tokens.
     Otherwise, max_tokens is used in the model call and everything after </think> is returned as content.
+
+    Reasoning should be enabled by the model by default. This mixin does not request reasoning be enabled (yet).
     """
 
+    OPEN_TAG = "<think>"  # Optional.
+    CLOSE_TAG = "</think>"  # Tag that separates reasoning from content.
+
     def __init__(self, uid, *args, **kwargs):
         super().__init__(uid, *args, **kwargs)
         self.tokenizer = GeneralTokenizer()
-        self.separator = "</think>"  # Tag that separates reasoning from content.
+
+    @classmethod
+    def response_contains_reasoning(cls, response: SUTResponse) -> bool:
+        return cls.OPEN_TAG in response.text or cls.CLOSE_TAG in response.text
 
     def translate_text_prompt(self, prompt: TextPrompt, options: ModelOptions) -> ReasoningRequest:
         max_total_tokens = options.max_total_output_tokens
@@ -52,13 +87,13 @@ def evaluate(self, request: ReasoningRequest) -> Any:
     def translate_response(self, request: ReasoningRequest, response: Any) -> SUTResponse:
         text = super().translate_response(request.request, response).text  # type: ignore
 
-        think_close = text.find(self.separator)
+        think_close = text.find(self.CLOSE_TAG)
         if think_close == -1:
             # no closing tag: everything is thinking text
             return SUTResponse(text="")
 
-        reasoning = text[: think_close + len(self.separator)].strip()
-        content = text[think_close + len(self.separator) :].strip()
+        reasoning = text[: think_close + len(self.CLOSE_TAG)].strip()
+        content = text[think_close + len(self.CLOSE_TAG) :].strip()
         self.warn_edge_cases(content, reasoning, request)
 
         # Truncate content
diff --git a/tests/modelgauge_tests/test_reasoning_handlers.py b/tests/modelgauge_tests/test_reasoning_handlers.py
index e2d219965..0dd0b7538 100644
--- a/tests/modelgauge_tests/test_reasoning_handlers.py
+++ b/tests/modelgauge_tests/test_reasoning_handlers.py
@@ -4,7 +4,7 @@
 
 from modelgauge.model_options import ModelOptions
 from modelgauge.prompt import TextPrompt
-from modelgauge.reasoning_handlers import ReasoningRequest, ThinkingMixin
+from modelgauge.reasoning_handlers import ReasoningRequest, ReasoningSUT, ThinkingMixin
 
 from modelgauge.sut import SUTResponse, PromptResponseSUT
 from modelgauge.sut_capabilities import AcceptsTextPrompt
@@ -34,14 +34,57 @@ def translate_response(self, request: FakeSUTRequest, response: FakeSUTResponse)
         return SUTResponse(text=response.text)
 
 
+class TestReasoningSUT:
+
+    class CountMixin(ReasoningSUT, FakeBaseSUT):
+        # Inherit from FakeBaseSUT so that this is a concrete class.
+        @classmethod
+        def response_contains_reasoning(cls, response: SUTResponse) -> bool:
+            return "123" in response.text
+
+    def test_find_thinking_mixin(self):
+        class CountSUT(FakeBaseSUT):
+            def evaluate(self, request: FakeSUTRequest) -> FakeSUTResponse:
+                return FakeSUTResponse(text="123")
+
+        sut = CountSUT("sut")
+        reasoning_cls = ReasoningSUT.find_match(sut)
+        assert reasoning_cls == self.CountMixin
+
+    def test_find_no_match(self):
+        class NoReasoningSUT(FakeBaseSUT):
+            def evaluate(self, request: FakeSUTRequest) -> FakeSUTResponse:
+                return FakeSUTResponse(text="text only")
+
+        sut = NoReasoningSUT("sut")
+        reasoning_cls = ReasoningSUT.find_match(sut)
+        assert reasoning_cls is None
+
+
 class TestThinkMixin:
+    @modelgauge_sut(capabilities=[AcceptsTextPrompt])
+    class ThinkSut(ThinkingMixin, FakeBaseSUT):
+        pass
+
     @pytest.fixture
     def sut(self):
-        @modelgauge_sut(capabilities=[AcceptsTextPrompt])
-        class ThinkSut(ThinkingMixin, FakeBaseSUT):
-            pass
+        return self.ThinkSut("sut-uid")
+
+    def test_response_contains_reasoning(self):
+        response = SUTResponse(text="reasoning</think>output")
+        assert self.ThinkSut.response_contains_reasoning(response) is True
+
+        response = SUTResponse(text="<think>reasoning</think>output")
+        assert self.ThinkSut.response_contains_reasoning(response) is True
+
+        response = SUTResponse(text="<think> only thinking")
+        assert self.ThinkSut.response_contains_reasoning(response) is True
+
+        response = SUTResponse(text="content")
+        assert self.ThinkSut.response_contains_reasoning(response) is False
 
-        return ThinkSut("sut-uid")
+        response = SUTResponse(text="")
+        assert self.ThinkSut.response_contains_reasoning(response) is False
 
     def test_translate_text_prompt_sets_max_tokens(self, sut):
         prompt = TextPrompt(text="some-text")

From 6b7100c8cd7385e63e8b7877faf2cfa697ceefdb Mon Sep 17 00:00:00 2001
From: Barbara Korycki <barbara@mlcommons.org>
Date: Wed, 25 Feb 2026 12:07:56 -0800
Subject: [PATCH 2/9] patch for better tests

---
 src/modelgauge/reasoning_handlers.py              |  6 +++++-
 tests/modelgauge_tests/test_reasoning_handlers.py | 11 +++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/modelgauge/reasoning_handlers.py b/src/modelgauge/reasoning_handlers.py
index 8b4c053ee..016e382a7 100644
--- a/src/modelgauge/reasoning_handlers.py
+++ b/src/modelgauge/reasoning_handlers.py
@@ -20,9 +20,13 @@ class ReasoningRequest(BaseModel):
 
 
 class ReasoningSUT(PromptResponseSUT):
+    @staticmethod
+    def _get_concrete_reasoning_suts() -> set[type["ReasoningSUT"]]:
+        return get_concrete_subclasses(ReasoningSUT)
+
     @staticmethod
     def find_match(sut: PromptResponseSUT) -> type["ReasoningSUT"] | None:
-        reasoning_suts = get_concrete_subclasses(ReasoningSUT)
+        reasoning_suts = ReasoningSUT._get_concrete_reasoning_suts()
         for rs in reasoning_suts:
             if rs.sut_matches(sut):
                 return rs
diff --git a/tests/modelgauge_tests/test_reasoning_handlers.py b/tests/modelgauge_tests/test_reasoning_handlers.py
index 0dd0b7538..358572f90 100644
--- a/tests/modelgauge_tests/test_reasoning_handlers.py
+++ b/tests/modelgauge_tests/test_reasoning_handlers.py
@@ -1,4 +1,5 @@
 import pytest
+from unittest.mock import patch
 
 from pydantic import BaseModel
 
@@ -42,6 +43,16 @@ class CountMixin(ReasoningSUT, FakeBaseSUT):
         def response_contains_reasoning(cls, response: SUTResponse) -> bool:
             return "123" in response.text
 
+    @pytest.fixture(autouse=True)
+    def _patch_reasoning_suts(self):
+        # Only consider the CountMixin for matching.
+        with patch.object(
+            ReasoningSUT,
+            "_get_concrete_reasoning_suts",
+            return_value={self.CountMixin},
+        ):
+            yield
+
     def test_find_thinking_mixin(self):
         class CountSUT(FakeBaseSUT):
             def evaluate(self, request: FakeSUTRequest) -> FakeSUTResponse:

From 369144802dbf8c17a52e5f9a88f4a9a0562ad513 Mon Sep 17 00:00:00 2001
From: Barbara Korycki <barbara@mlcommons.org>
Date: Wed, 25 Feb 2026 12:42:54 -0800
Subject: [PATCH 3/9] Store reasoning in response

---
 src/modelgauge/reasoning_handlers.py              | 13 +++++++++++--
 src/modelgauge/sut.py                             |  1 +
 tests/modelgauge_tests/test_reasoning_handlers.py | 12 +++++++++---
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/src/modelgauge/reasoning_handlers.py b/src/modelgauge/reasoning_handlers.py
index 016e382a7..5fdf0bb47 100644
--- a/src/modelgauge/reasoning_handlers.py
+++ b/src/modelgauge/reasoning_handlers.py
@@ -94,16 +94,18 @@ def translate_response(self, request: ReasoningRequest, response: Any) -> SUTRes
         think_close = text.find(self.CLOSE_TAG)
         if think_close == -1:
             # no closing tag: everything is thinking text
-            return SUTResponse(text="")
+            return SUTResponse(text="", reasoning=self.trim_tokens(text))
 
         reasoning = text[: think_close + len(self.CLOSE_TAG)].strip()
         content = text[think_close + len(self.CLOSE_TAG) :].strip()
         self.warn_edge_cases(content, reasoning, request)
 
+        reasoning = self.trim_tokens(reasoning)
+
         # Truncate content
         if request.max_content_tokens is not None:
             content = self.tokenizer.truncate(content, request.max_content_tokens)
-        return SUTResponse(text=content)
+        return SUTResponse(text=content, reasoning=reasoning)
 
     def warn_edge_cases(self, content, reasoning, request):
         if request.max_total_tokens is None:
@@ -116,3 +118,10 @@ def warn_edge_cases(self, content, reasoning, request):
             logger.warning(
                 f"SUT {self.uid} reasoning likely ate into the token budget of the actual output. Consider increasing max_total_output_tokens."
             )
+
+    def trim_tokens(self, text: str) -> str:
+        if text.startswith(self.OPEN_TAG):
+            text = text[len(self.OPEN_TAG) :]
+        if text.endswith(self.CLOSE_TAG):
+            text = text[: -len(self.CLOSE_TAG)]
+        return text
diff --git a/src/modelgauge/sut.py b/src/modelgauge/sut.py
index 06865a425..205576995 100644
--- a/src/modelgauge/sut.py
+++ b/src/modelgauge/sut.py
@@ -18,6 +18,7 @@ class SUTResponse(BaseModel):
     """The data that came out of the SUT."""
 
     text: str
+    reasoning: Optional[str] = None
     top_logprobs: Optional[Sequence[TopTokens]] = None
     """For each position, list the probabilities for each of the most likely tokens.
 
diff --git a/tests/modelgauge_tests/test_reasoning_handlers.py b/tests/modelgauge_tests/test_reasoning_handlers.py
index 358572f90..ab1b12131 100644
--- a/tests/modelgauge_tests/test_reasoning_handlers.py
+++ b/tests/modelgauge_tests/test_reasoning_handlers.py
@@ -121,10 +121,15 @@ def test_translate_text_prompt_sets_max_tokens(self, sut):
         assert request.max_content_tokens == None
 
     @pytest.mark.parametrize(
-        "full_text, content_text",
-        [("<think>hmm</think>\n Output", "Output"), ("hmm</think>\n Output", "Output"), ("<think>hmmm", "")],
+        "full_text, content_text, reason_text",
+        [
+            ("<think>hmm</think>\n Output", "Output", "hmm"),
+            ("hmm</think>\n Output", "Output", "hmm"),
+            ("<think>hmmm", "", "hmmm"),
+            ("<think>", "", ""),
+        ],
     )
-    def test_translate_response_no_truncation(self, full_text, content_text, sut):
+    def test_translate_response_no_truncation(self, full_text, content_text, reason_text, sut):
         request = ReasoningRequest(
             request=FakeSUTRequest(text="", max_tokens=100), max_content_tokens=100, max_total_tokens=100
         )
@@ -138,6 +143,7 @@ def test_translate_response_no_truncation(self, full_text, content_text, sut):
 
         result = sut.translate_response(request, response)
         assert result.text == content_text
+        assert result.reasoning == reason_text
 
     @pytest.mark.parametrize(
         "full_text, content_text",

From c15c8112bf2991a664fbe5358b63634ba31373b7 Mon Sep 17 00:00:00 2001
From: Barbara Korycki <barbara@mlcommons.org>
Date: Wed, 25 Feb 2026 12:54:42 -0800
Subject: [PATCH 4/9] remove pre-registered reasoning suts

---
 .../suts/huggingface_chat_completion.py       | 41 +------------------
 src/modelgauge/suts/together_client.py        |  8 ----
 2 files changed, 1 insertion(+), 48 deletions(-)

diff --git a/src/modelgauge/suts/huggingface_chat_completion.py b/src/modelgauge/suts/huggingface_chat_completion.py
index 2f3eb0f58..08037414e 100644
--- a/src/modelgauge/suts/huggingface_chat_completion.py
+++ b/src/modelgauge/suts/huggingface_chat_completion.py
@@ -11,7 +11,6 @@
 from modelgauge.auth.huggingface_inference_token import HuggingFaceInferenceToken
 from modelgauge.model_options import ModelOptions, TokenProbability, TopTokens
 from modelgauge.prompt import TextPrompt, ChatPrompt
-from modelgauge.reasoning_handlers import ThinkingMixin
 from modelgauge.retry_decorator import retry
 from modelgauge.secret_values import InjectSecret
 from modelgauge.sut import PromptResponseSUT, SUTResponse
@@ -186,14 +185,6 @@ def translate_chat_prompt(self, prompt: ChatPrompt, options: ModelOptions) -> Hu
         )
 
 
-@modelgauge_sut(capabilities=[AcceptsTextPrompt, AcceptsChatPrompt])
-class HuggingFaceChatCompletionDedicatedThinkingSUT(ThinkingMixin, HuggingFaceChatCompletionDedicatedSUT):
-    """
-    A SUT that excludes the reasoning from model output.
-    Reasoning must be seperated from normal output with a </think> tag (like nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16)
-    """
-
-
 @modelgauge_sut(capabilities=[AcceptsChatPrompt, AcceptsTextPrompt, ProducesPerTokenLogProbabilities])
 class HuggingFaceChatCompletionServerlessSUT(BaseHuggingFaceChatCompletionSUT):
     """A SUT hosted by an inference provider on huggingface."""
@@ -231,14 +222,6 @@ def translate_chat_prompt(self, prompt: ChatPrompt, options: ModelOptions) -> Hu
         )
 
 
-@modelgauge_sut(capabilities=[AcceptsTextPrompt, AcceptsChatPrompt])
-class HuggingFaceChatCompletionServerlessThinkingSUT(ThinkingMixin, HuggingFaceChatCompletionServerlessSUT):
-    """
-    A SUT that excludes the reasoning from model output.
-    Reasoning must be seperated from normal output with a </think> tag (like nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16)
-    """
-
-
 HF_SECRET = InjectSecret(HuggingFaceInferenceToken)
 
 SUTS.register(
@@ -276,29 +259,7 @@ class HuggingFaceChatCompletionServerlessThinkingSUT(ThinkingMixin, HuggingFaceC
         None,
         HF_SECRET,
     )
-# Special thinking dedicated SUTs
-SUTS.register(
-    HuggingFaceChatCompletionDedicatedThinkingSUT,
-    "nvidia-nemotron-3-nano-30b-a-thinking-excluded-hf",
-    "nvidia-nemotron-3-nano-30b-a-mia",
-    "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
-    HF_SECRET,
-)
-SUTS.register(
-    HuggingFaceChatCompletionDedicatedThinkingSUT,
-    "PrimeIntellect-INTELLECT-3-thinking-excluded-hf",
-    "intellect-3-uqs",
-    "PrimeIntellect/INTELLECT-3",
-    HF_SECRET,
-)
-# Special thinking serverless SUTs
-SUTS.register(
-    HuggingFaceChatCompletionServerlessThinkingSUT,
-    "moonshotai/Kimi-K2.5-together-thinking-excluded-hf",
-    "moonshotai/Kimi-K2.5",
-    "together",
-    HF_SECRET,
-)
+
 # Register serverless SUTs.
 SUTS.register(
     HuggingFaceChatCompletionServerlessSUT,
diff --git a/src/modelgauge/suts/together_client.py b/src/modelgauge/suts/together_client.py
index c1103f3e8..01eee3af5 100644
--- a/src/modelgauge/suts/together_client.py
+++ b/src/modelgauge/suts/together_client.py
@@ -11,7 +11,6 @@
 from modelgauge.model_options import ModelOptions, TokenProbability, TopTokens
 from modelgauge.prompt import ChatPrompt, ChatRole, TextPrompt
 from modelgauge.prompt_formatting import format_chat
-from modelgauge.reasoning_handlers import ThinkingMixin
 from modelgauge.secret_values import InjectSecret
 from modelgauge.sut import PromptResponseSUT, SUTResponse
 from modelgauge.sut_capabilities import AcceptsChatPrompt, AcceptsTextPrompt, ProducesPerTokenLogProbabilities
@@ -271,11 +270,6 @@ def translate_response(self, request: TogetherChatRequest, response: TogetherCha
         return SUTResponse(text=text, top_logprobs=logprobs)
 
 
-@modelgauge_sut(capabilities=[AcceptsTextPrompt, AcceptsChatPrompt])
-class TogetherThinkingSUT(ThinkingMixin, TogetherChatSUT):
-    """SUT that preforms reasoning like deepseek-r1"""
-
-
 @modelgauge_sut(
     capabilities=[
         AcceptsTextPrompt,
@@ -382,5 +376,3 @@ def evaluate(self, request: TogetherChatRequest) -> TogetherChatResponse:
 }
 for uid, model_name in DEDICATED_CHAT_MODELS.items():
     SUTS.register(TogetherDedicatedChatSUT, uid, model_name, InjectSecret(TogetherApiKey))
-
-SUTS.register(TogetherThinkingSUT, "deepseek-R1-thinking", "deepseek-ai/DeepSeek-R1", InjectSecret(TogetherApiKey))

From ccf9741b3f4b344b80144f8b2a14978fa0615de9 Mon Sep 17 00:00:00 2001
From: Barbara Korycki <barbara@mlcommons.org>
Date: Wed, 25 Feb 2026 15:59:12 -0800
Subject: [PATCH 5/9] fix test

---
 tests/modelgauge_tests/test_records.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/modelgauge_tests/test_records.py b/tests/modelgauge_tests/test_records.py
index 057696753..961366360 100644
--- a/tests/modelgauge_tests/test_records.py
+++ b/tests/modelgauge_tests/test_records.py
@@ -136,6 +136,7 @@ def test_serialize_test_record():
         },
         "sut_response": {
           "text": "sut-completion",
+          "reasoning": null,
           "top_logprobs": null
         },
         "annotations": {

From 7e066fa79c2b150f4225f0c937c99118d6df9909 Mon Sep 17 00:00:00 2001
From: Barbara Korycki <barbara@mlcommons.org>
Date: Thu, 26 Feb 2026 13:54:30 -0800
Subject: [PATCH 6/9] don't run unit tests on real suts

---
 tests/modelbench_tests/test_run.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tests/modelbench_tests/test_run.py b/tests/modelbench_tests/test_run.py
index b842d55f4..4901cb42a 100644
--- a/tests/modelbench_tests/test_run.py
+++ b/tests/modelbench_tests/test_run.py
@@ -301,6 +301,7 @@ def runner(self, run_dir):
         def invoke(command, args=None, **kwargs):
             args = list(args or [])
             full_args = ["--run-path", run_dir] + args
+            print(command, full_args, kwargs)
             return runner.invoke(command, full_args, **kwargs)
 
         return invoke
@@ -316,7 +317,7 @@ def invoke(command, args=None, **kwargs):
         ],
         # TODO add more locales as we add support for them
     )
-    @pytest.mark.parametrize("sut_uid", ["fake-sut", "google/gemma-3-27b-it:scaleway:hfrelay"])
+    @pytest.mark.parametrize("sut_uid", ["fake-sut"])
     def test_benchmark_basic_run_produces_json(
         self,
         monkeypatch,
@@ -396,7 +397,7 @@ def test_benchmark_basic_run_produces_json(
         ],
         # TODO add more locales as we add support for them
     )
-    @pytest.mark.parametrize("sut_uid", ["fake-sut", "google/gemma-3-27b-it:scaleway:hfrelay;mt=500;t=0.3"])
+    @pytest.mark.parametrize("sut_uid", ["fake-sut"])
     def test_benchmark_multiple_suts_produces_json(
         self, mock_run_benchmarks, runner, version, locale, prompt_set, sut_uid, run_dir, monkeypatch
     ):
@@ -546,7 +547,7 @@ def test_calls_score_benchmark_with_correct_v1_locale(self, runner, mock_run_ben
     #
     #     benchmark_arg = mock_score_benchmarks.call_args.args[0][0]
     #     assert isinstance(benchmark_arg, GeneralPurposeAiChatBenchmark)
-    @pytest.mark.parametrize("sut_uid", ["fake-sut", "google/gemma-3-27b-it:scaleway:hfrelay"])
+    @pytest.mark.parametrize("sut_uid", ["fake-sut"])
     def test_v1_en_us_demo_is_default(self, runner, mock_run_benchmarks, sut_uid):
         _ = runner(cli, ["benchmark", "general", "--sut", sut_uid])
 
@@ -555,14 +556,14 @@ def test_v1_en_us_demo_is_default(self, runner, mock_run_benchmarks, sut_uid):
         assert benchmark_arg.locale == EN_US
         assert benchmark_arg.prompt_set == "demo"
 
-    @pytest.mark.parametrize("sut_uid", ["fake-sut", "google/gemma-3-27b-it:scaleway:hfrelay"])
+    @pytest.mark.parametrize("sut_uid", ["fake-sut"])
     def test_nonexistent_benchmark_prompt_sets_can_not_be_called(self, runner, sut_uid):
         result = runner(cli, ["benchmark", "general", "--prompt-set", "fake", "--sut", sut_uid])
         assert result.exit_code == 2
         assert "Invalid value for '--prompt-set'" in result.output
 
     @pytest.mark.parametrize("prompt_set", GENERAL_PROMPT_SETS.keys())
-    @pytest.mark.parametrize("sut_uid", ["fake-sut", "google/gemma-3-27b-it:scaleway:hfrelay"])
+    @pytest.mark.parametrize("sut_uid", ["fake-sut"])
     def test_calls_score_benchmark_with_correct_prompt_set(self, runner, mock_run_benchmarks, prompt_set, sut_uid):
         _ = runner(cli, ["benchmark", "general", "--prompt-set", prompt_set, "--sut", sut_uid])
 

From b50286d0998f1b04627c4ee33e898e432ddc2475 Mon Sep 17 00:00:00 2001
From: Barbara Korycki <barbara@mlcommons.org>
Date: Fri, 27 Feb 2026 14:41:33 -0800
Subject: [PATCH 7/9] handle multiple think blocks

---
 src/modelgauge/reasoning_handlers.py              | 2 +-
 tests/modelgauge_tests/test_reasoning_handlers.py | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/modelgauge/reasoning_handlers.py b/src/modelgauge/reasoning_handlers.py
index 5fdf0bb47..f6606ac9c 100644
--- a/src/modelgauge/reasoning_handlers.py
+++ b/src/modelgauge/reasoning_handlers.py
@@ -91,7 +91,7 @@ def evaluate(self, request: ReasoningRequest) -> Any:
     def translate_response(self, request: ReasoningRequest, response: Any) -> SUTResponse:
         text = super().translate_response(request.request, response).text  # type: ignore
 
-        think_close = text.find(self.CLOSE_TAG)
+        think_close = text.rfind(self.CLOSE_TAG)
         if think_close == -1:
             # no closing tag: everything is thinking text
             return SUTResponse(text="", reasoning=self.trim_tokens(text))
diff --git a/tests/modelgauge_tests/test_reasoning_handlers.py b/tests/modelgauge_tests/test_reasoning_handlers.py
index ab1b12131..95bb42ea1 100644
--- a/tests/modelgauge_tests/test_reasoning_handlers.py
+++ b/tests/modelgauge_tests/test_reasoning_handlers.py
@@ -124,6 +124,12 @@ def test_translate_text_prompt_sets_max_tokens(self, sut):
         "full_text, content_text, reason_text",
         [
             ("<think>hmm</think>\n Output", "Output", "hmm"),
+            (
+                "<think>hmm <think> nested think> </think></think>\n Output",
+                "Output",
+                "hmm <think> nested think> </think>",
+            ),
+            ("hmm</think><think>more think</think> Output", "Output", "hmm</think><think>more think"),
             ("hmm</think>\n Output", "Output", "hmm"),
             ("<think>hmmm", "", "hmmm"),
             ("<think>", "", ""),

From 416b9b240299ce15d6f000f0faf5466ed3230c9c Mon Sep 17 00:00:00 2001
From: Barbara Korycki <barbara@mlcommons.org>
Date: Fri, 27 Feb 2026 14:43:34 -0800
Subject: [PATCH 8/9] document sut call

---
 src/modelgauge/reasoning_handlers.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/modelgauge/reasoning_handlers.py b/src/modelgauge/reasoning_handlers.py
index f6606ac9c..c7c2fe69d 100644
--- a/src/modelgauge/reasoning_handlers.py
+++ b/src/modelgauge/reasoning_handlers.py
@@ -34,6 +34,7 @@ def find_match(sut: PromptResponseSUT) -> type["ReasoningSUT"] | None:
 
     @classmethod
     def sut_matches(cls, sut) -> bool:
+        """Finds a matching reasoning handler for the given SUT. Calling this method will result in 1 SUT call."""
         request = sut.translate_text_prompt(
             TextPrompt(text="If I have 2 apples and give 1 to my friend, how many apples do I have left?"),
             options=ModelOptions(max_tokens=1000),

From dbf09cb9a7270ef98c17128f5271920e9da19af9 Mon Sep 17 00:00:00 2001
From: Barbara Korycki <barbara@mlcommons.org>
Date: Mon, 2 Mar 2026 17:03:06 -0800
Subject: [PATCH 9/9] sut has a reasoning handler

---
 src/modelgauge/reasoning_handlers.py | 28 +++++++++++++++-------------
 src/modelgauge/sut.py                | 27 ++++++++++++++++++++++++---
 2 files changed, 39 insertions(+), 16 deletions(-)

diff --git a/src/modelgauge/reasoning_handlers.py b/src/modelgauge/reasoning_handlers.py
index c7c2fe69d..3d2b8ee4c 100644
--- a/src/modelgauge/reasoning_handlers.py
+++ b/src/modelgauge/reasoning_handlers.py
@@ -1,4 +1,4 @@
-from abc import abstractmethod
+from abc import ABC, abstractmethod
 from typing import Any
 
 from airrlogger.log_config import get_logger
@@ -19,14 +19,14 @@ class ReasoningRequest(BaseModel):
     max_total_tokens: int | None = None  # Total number of tokens allowed (thinking + content).
 
 
-class ReasoningSUT(PromptResponseSUT):
+class ReasoningHandler(ABC):
     @staticmethod
-    def _get_concrete_reasoning_suts() -> set[type["ReasoningSUT"]]:
-        return get_concrete_subclasses(ReasoningSUT)
+    def _get_concrete_reasoning_suts() -> set[type["ReasoningHandler"]]:
+        return get_concrete_subclasses(ReasoningHandler)
 
     @staticmethod
-    def find_match(sut: PromptResponseSUT) -> type["ReasoningSUT"] | None:
-        reasoning_suts = ReasoningSUT._get_concrete_reasoning_suts()
+    def find_match(sut: PromptResponseSUT) -> type["ReasoningHandler"] | None:
+        reasoning_suts = ReasoningHandler._get_concrete_reasoning_suts()
         for rs in reasoning_suts:
             if rs.sut_matches(sut):
                 return rs
@@ -49,7 +49,7 @@ def response_contains_reasoning(cls, response: SUTResponse) -> bool:
         pass
 
 
-class ThinkingMixin(ReasoningSUT):
+class ThinkingMixin(ReasoningHandler):
     """
     A mixin for SUTs that parses out thinking text from the output.
 
@@ -71,7 +71,9 @@ def __init__(self, uid, *args, **kwargs):
     def response_contains_reasoning(cls, response: SUTResponse) -> bool:
         return cls.OPEN_TAG in response.text or cls.CLOSE_TAG in response.text
 
-    def translate_text_prompt(self, prompt: TextPrompt, options: ModelOptions) -> ReasoningRequest:
+    def translate_text_prompt(
+        self, sut: PromptResponseSUT, prompt: TextPrompt, options: ModelOptions
+    ) -> ReasoningRequest:
         max_total_tokens = options.max_total_output_tokens
         if max_total_tokens is None:
             max_total_tokens = options.max_tokens
@@ -79,18 +81,18 @@ def translate_text_prompt(self, prompt: TextPrompt, options: ModelOptions) -> Re
 
         # Replace max_tokens in raw request with the max total tokens.
         options.max_tokens = max_total_tokens
-        request = super().translate_text_prompt(prompt, options)
+        request = sut.translate_text_prompt(prompt, options)
         return ReasoningRequest(
             request=request,
             max_content_tokens=max_content_tokens,
             max_total_tokens=max_total_tokens,
         )
 
-    def evaluate(self, request: ReasoningRequest) -> Any:
-        return super().evaluate(request.request)  # type: ignore
+    def evaluate(self, sut: PromptResponseSUT, request: ReasoningRequest) -> Any:
+        return sut._evaluate(request.request)  # type: ignore
 
-    def translate_response(self, request: ReasoningRequest, response: Any) -> SUTResponse:
-        text = super().translate_response(request.request, response).text  # type: ignore
+    def translate_response(self, sut: PromptResponseSUT, request: ReasoningRequest, response: Any) -> SUTResponse:
+        text = sut._translate_response(request.request, response).text  # type: ignore
 
         think_close = text.rfind(self.CLOSE_TAG)
         if think_close == -1:
diff --git a/src/modelgauge/sut.py b/src/modelgauge/sut.py
index 205576995..750c0fc0f 100644
--- a/src/modelgauge/sut.py
+++ b/src/modelgauge/sut.py
@@ -59,14 +59,23 @@ class PromptResponseSUT(SUT, Readyable):
     Abstract base class that provides an interface to any SUT that is designed for handling a single-turn.
     """
 
+    def __init__(self, uid: str):
+        super().__init__(uid)
+        self.reasoning_handler: Optional[Type[ReasoningHandler]] = ReasoningHandler.sut_matches(self)
+
     def run_readiness_check(self) -> ReadyResponse:
         raw_request = self.translate_text_prompt(_READINESS_CHECK_TEXT_PROMPT, options=_READINESS_CHECK_SUT_OPTIONS)
         raw_response = self.evaluate(raw_request)
         response = self.translate_response(raw_request, raw_response)
         return ReadyResponse(is_ready=response.text is not None, response=response)
 
-    @not_implemented
     def translate_text_prompt(self, prompt: TextPrompt, options: ModelOptions):
+        if self.reasoning_handler is not None:
+            return self.reasoning_handler.translate_text_prompt(self, prompt, options)
+        return self._translate_text_prompt(prompt, options)
+
+    @not_implemented
+    def _translate_text_prompt(self, prompt: TextPrompt, options: ModelOptions):
         """Convert the prompt + SUT options into the SUT's native representation.
 
         This method must be implemented if the SUT accepts text prompts.
@@ -81,12 +90,24 @@ def translate_chat_prompt(self, prompt: ChatPrompt, options: ModelOptions):
         """
         raise NotImplementedError(f"SUT {self.__class__.__name__} does not implement translate_chat_prompt.")
 
-    @abstractmethod
     def evaluate(self, request):
         """Evaluate this SUT on the native request."""
-        pass
+        if self.reasoning_handler is not None:
+            return self.reasoning_handler.evaluate(self, request)
+        return self._evaluate(request)
 
     @abstractmethod
+    def _evaluate(self, request):
+        """Evaluate this SUT on the native request."""
+        pass
+
     def translate_response(self, request, response) -> SUTResponse:
+        """Convert the native response into a form all Tests can process."""
+        if self.reasoning_handler is not None:
+            return self.reasoning_handler._translate_response(self, request, response)
+        return self._translate_response(request, response)
+
+    @abstractmethod
+    def _translate_response(self, request, response) -> SUTResponse:
         """Convert the native response into a form all Tests can process."""
         pass