From 102a4fc0fed573545f2ca7932ddce39262fffc51 Mon Sep 17 00:00:00 2001 From: Eliott Kalfon Date: Mon, 11 May 2026 22:46:40 -0400 Subject: [PATCH 1/3] initial commit --- src/tabpfn_client/client.py | 97 ++++++++++++++++++++++++++++--------- tests/unit/test_client.py | 92 +++++++++++++++++++++++++++++++++++ 2 files changed, 167 insertions(+), 22 deletions(-) diff --git a/src/tabpfn_client/client.py b/src/tabpfn_client/client.py index 2d1df51..96d573d 100644 --- a/src/tabpfn_client/client.py +++ b/src/tabpfn_client/client.py @@ -152,6 +152,38 @@ def _serialize_to_parquet(df: pd.DataFrame) -> tuple[bytes, str]: return parquet_bytes, crc32c_b64 +def _thinking_aware_dedup_hash( + content_hash: str, + *, + thinking_effort: str | None, + thinking_timeout_s: float | None, + thinking_metric: str | None, +) -> str: + """Return a dedup hash that partitions by thinking config. + + When thinking is disabled (all params None) the content hash is returned + unchanged so non-thinking calls keep the existing dedup semantics. When + thinking is enabled, the thinking config is folded into the hash so that + `(dataset, thinking_config)` becomes the cache unit — same config hits + the existing fit, different config misses it. + """ + if ( + thinking_effort is None + and thinking_timeout_s is None + and thinking_metric is None + ): + return content_hash + discriminator = json.dumps( + { + "thinking_effort": thinking_effort, + "thinking_timeout_s": thinking_timeout_s, + "thinking_metric": thinking_metric, + }, + sort_keys=True, + ) + return _get_crc32c_hash(f"{content_hash}|{discriminator}".encode("utf-8")) + + class NeedsRefittingError(Exception): """ Exception raised when the server is not able to predict given the current state. @@ -315,12 +347,49 @@ def fit( f"the server limit of {limits.dataset_max_size_bytes} bytes." ) + # Resolve thinking config up-front: it feeds both the dedup hash below + # and the FitRequest fields further down. Thinking is enabled when + # either `thinking_mode=True` is set or `thinking_effort` is set; when + # only `thinking_mode=True` is supplied, the effective effort is + # "medium" (matches the server's FitRequest default). + thinking_enabled = bool(tabpfn_config) and ( + bool(tabpfn_config.get("thinking_mode")) + or tabpfn_config.get("thinking_effort") is not None + ) + if thinking_enabled and tabpfn_config: + thinking_effort = tabpfn_config.get("thinking_effort") or "medium" + thinking_timeout_s = tabpfn_config.get("thinking_timeout_s") + thinking_metric = tabpfn_config.get("thinking_metric") + else: + thinking_effort = None + thinking_timeout_s = None + thinking_metric = None + x_bytes, x_crc32c_hash = _serialize_to_parquet(df_X) y_bytes, y_crc32c_hash = _serialize_to_parquet(df_y) if dedup_datasets_enabled(): - x_dedup_hash = x_crc32c_hash - y_dedup_hash = y_crc32c_hash + # When thinking is enabled, mix the thinking config into the dedup + # hash so the server treats (dataset, thinking_config) as the + # cache unit. Thinking is deterministic, so a second call with the + # same dataset *and* same thinking config should hit the existing + # fit; but a follow-up call with a different `thinking_effort` (or + # timeout / metric) must miss it instead of silently reusing the + # earlier fit. The hash is opaque to the server's dedup layer, so + # discriminating it by thinking config is enough to partition the + # cache keys without any server change. + x_dedup_hash = _thinking_aware_dedup_hash( + x_crc32c_hash, + thinking_effort=thinking_effort, + thinking_timeout_s=thinking_timeout_s, + thinking_metric=thinking_metric, + ) + y_dedup_hash = _thinking_aware_dedup_hash( + y_crc32c_hash, + thinking_effort=thinking_effort, + thinking_timeout_s=thinking_timeout_s, + thinking_metric=thinking_metric, + ) else: x_dedup_hash = None y_dedup_hash = None @@ -382,14 +451,6 @@ def fit( raise tabpfn_systems = ["preprocessing", "text"] - # Thinking is enabled when either flag is set: explicit `thinking_mode=True`, - # or any non-None `thinking_effort`. Setting `thinking_effort` alone is - # enough — the server-side validator on FitRequest also normalises this, - # but doing it here means the request body itself is consistent. - thinking_enabled = bool(tabpfn_config) and ( - bool(tabpfn_config.get("thinking_mode")) - or tabpfn_config.get("thinking_effort") is not None - ) if tabpfn_config: if tabpfn_config.get("paper_version") is True: tabpfn_systems = [] @@ -399,18 +460,10 @@ def fit( tabpfn_systems = ["preprocessing", "text", "thinking"] # The client-side `thinking_*` knobs forward 1:1 to the server's - # top-level FitRequest fields. When the user enabled thinking via - # `thinking_mode=True` without picking a level, default to "medium". - # The user-facing kwarg is `thinking_metric`; on the wire it is sent - # as `thinking_effort_metric` (matching the server's FitRequest schema). - if thinking_enabled and tabpfn_config: - thinking_effort = tabpfn_config.get("thinking_effort") or "medium" - thinking_timeout_s = tabpfn_config.get("thinking_timeout_s") - thinking_metric = tabpfn_config.get("thinking_metric") - else: - thinking_effort = None - thinking_timeout_s = None - thinking_metric = None + # top-level FitRequest fields. The user-facing kwarg is + # `thinking_metric`; on the wire it is sent as `thinking_effort_metric` + # (matching the server's FitRequest schema). The effective values were + # resolved above so they could feed into the dedup hash. # Strip client-only keys that the server does not expect (mirrors # the predict path's filter below). diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 2c9ee98..196d548 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -11,6 +11,7 @@ GetModelLimitsResponse, NeedsRefittingError, ServiceClient, + _thinking_aware_dedup_hash, ) from tests.mock_tabpfn_server import with_mock_server @@ -530,3 +531,94 @@ def test_predict_converts_none_in_dict_prediction_to_nan(self, mock_server): np.array([[1.0, np.nan], [np.nan, 4.0]]), equal_nan=True, ) + + +class TestThinkingAwareDedupHash(unittest.TestCase): + """Pins the cache-partitioning rules for thinking-mode fits. + + Thinking mode is deterministic: same dataset + same thinking config must + collide on the server's dedup/fit cache (cache hit -> identical result), + but a *different* thinking config (e.g. effort medium -> high) must miss + so the fit actually runs at the requested effort. + """ + + CONTENT = "content-hash-abc" + + def test_no_thinking_returns_content_hash_unchanged(self): + # Preserves existing dedup behavior for non-thinking fits. + self.assertEqual( + _thinking_aware_dedup_hash( + self.CONTENT, + thinking_effort=None, + thinking_timeout_s=None, + thinking_metric=None, + ), + self.CONTENT, + ) + + def test_same_thinking_config_is_stable(self): + # Two calls with identical (dataset, thinking config) must hash to the + # same value so the server's cache hits. + h1 = _thinking_aware_dedup_hash( + self.CONTENT, + thinking_effort="medium", + thinking_timeout_s=60.0, + thinking_metric="rmse", + ) + h2 = _thinking_aware_dedup_hash( + self.CONTENT, + thinking_effort="medium", + thinking_timeout_s=60.0, + thinking_metric="rmse", + ) + self.assertEqual(h1, h2) + + def test_effort_change_partitions_cache(self): + # The bug fix: medium -> high on the same dataset must NOT collide. + medium = _thinking_aware_dedup_hash( + self.CONTENT, + thinking_effort="medium", + thinking_timeout_s=None, + thinking_metric=None, + ) + high = _thinking_aware_dedup_hash( + self.CONTENT, + thinking_effort="high", + thinking_timeout_s=None, + thinking_metric=None, + ) + self.assertNotEqual(medium, high) + + def test_timeout_and_metric_also_partition(self): + base = _thinking_aware_dedup_hash( + self.CONTENT, + thinking_effort="medium", + thinking_timeout_s=None, + thinking_metric=None, + ) + different_timeout = _thinking_aware_dedup_hash( + self.CONTENT, + thinking_effort="medium", + thinking_timeout_s=120.0, + thinking_metric=None, + ) + different_metric = _thinking_aware_dedup_hash( + self.CONTENT, + thinking_effort="medium", + thinking_timeout_s=None, + thinking_metric="rmse", + ) + self.assertNotEqual(base, different_timeout) + self.assertNotEqual(base, different_metric) + self.assertNotEqual(different_timeout, different_metric) + + def test_thinking_hash_differs_from_content_hash(self): + # Enabling thinking must change the hash, otherwise a prior + # non-thinking fit on the same dataset would be served. + with_thinking = _thinking_aware_dedup_hash( + self.CONTENT, + thinking_effort="medium", + thinking_timeout_s=None, + thinking_metric=None, + ) + self.assertNotEqual(with_thinking, self.CONTENT) From fc62144ca083d6a420ea582166d9064240681055 Mon Sep 17 00:00:00 2001 From: Eliott Kalfon Date: Mon, 11 May 2026 22:55:41 -0400 Subject: [PATCH 2/3] addressed comments --- src/tabpfn_client/client.py | 9 ++++++--- tests/unit/test_client.py | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/src/tabpfn_client/client.py b/src/tabpfn_client/client.py index 96d573d..c2d29b1 100644 --- a/src/tabpfn_client/client.py +++ b/src/tabpfn_client/client.py @@ -176,8 +176,11 @@ def _thinking_aware_dedup_hash( discriminator = json.dumps( { "thinking_effort": thinking_effort, - "thinking_timeout_s": thinking_timeout_s, - "thinking_metric": thinking_metric, + "thinking_timeout_s": ( + # normalise to float for cache stability (0 vs 0.0) + float(thinking_timeout_s) if thinking_timeout_s is not None else None + ), + "thinking_effort_metric": thinking_metric, }, sort_keys=True, ) @@ -356,7 +359,7 @@ def fit( bool(tabpfn_config.get("thinking_mode")) or tabpfn_config.get("thinking_effort") is not None ) - if thinking_enabled and tabpfn_config: + if thinking_enabled: thinking_effort = tabpfn_config.get("thinking_effort") or "medium" thinking_timeout_s = tabpfn_config.get("thinking_timeout_s") thinking_metric = tabpfn_config.get("thinking_metric") diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 196d548..1dc0709 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -612,6 +612,23 @@ def test_timeout_and_metric_also_partition(self): self.assertNotEqual(base, different_metric) self.assertNotEqual(different_timeout, different_metric) + def test_int_and_float_timeout_hash_identically(self): + # `json.dumps(60)` and `json.dumps(60.0)` differ; the helper normalizes + # so callers don't suffer spurious cache misses on equivalent values. + h_int = _thinking_aware_dedup_hash( + self.CONTENT, + thinking_effort="medium", + thinking_timeout_s=60, + thinking_metric=None, + ) + h_float = _thinking_aware_dedup_hash( + self.CONTENT, + thinking_effort="medium", + thinking_timeout_s=60.0, + thinking_metric=None, + ) + self.assertEqual(h_int, h_float) + def test_thinking_hash_differs_from_content_hash(self): # Enabling thinking must change the hash, otherwise a prior # non-thinking fit on the same dataset would be served. From 2401f2b8cb5da70927cc1fb0df28a7674d1f9ce5 Mon Sep 17 00:00:00 2001 From: Eliott Kalfon Date: Mon, 11 May 2026 23:12:48 -0400 Subject: [PATCH 3/3] reduced verbosity --- src/tabpfn_client/client.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/src/tabpfn_client/client.py b/src/tabpfn_client/client.py index c2d29b1..4e0fc32 100644 --- a/src/tabpfn_client/client.py +++ b/src/tabpfn_client/client.py @@ -161,7 +161,7 @@ def _thinking_aware_dedup_hash( ) -> str: """Return a dedup hash that partitions by thinking config. - When thinking is disabled (all params None) the content hash is returned + When thinking is disabled the content hash is returned unchanged so non-thinking calls keep the existing dedup semantics. When thinking is enabled, the thinking config is folded into the hash so that `(dataset, thinking_config)` becomes the cache unit — same config hits @@ -350,11 +350,6 @@ def fit( f"the server limit of {limits.dataset_max_size_bytes} bytes." ) - # Resolve thinking config up-front: it feeds both the dedup hash below - # and the FitRequest fields further down. Thinking is enabled when - # either `thinking_mode=True` is set or `thinking_effort` is set; when - # only `thinking_mode=True` is supplied, the effective effort is - # "medium" (matches the server's FitRequest default). thinking_enabled = bool(tabpfn_config) and ( bool(tabpfn_config.get("thinking_mode")) or tabpfn_config.get("thinking_effort") is not None @@ -372,15 +367,6 @@ def fit( y_bytes, y_crc32c_hash = _serialize_to_parquet(df_y) if dedup_datasets_enabled(): - # When thinking is enabled, mix the thinking config into the dedup - # hash so the server treats (dataset, thinking_config) as the - # cache unit. Thinking is deterministic, so a second call with the - # same dataset *and* same thinking config should hit the existing - # fit; but a follow-up call with a different `thinking_effort` (or - # timeout / metric) must miss it instead of silently reusing the - # earlier fit. The hash is opaque to the server's dedup layer, so - # discriminating it by thinking config is enough to partition the - # cache keys without any server change. x_dedup_hash = _thinking_aware_dedup_hash( x_crc32c_hash, thinking_effort=thinking_effort,