From e828b14f0bab9c7437d920f978c3acdf3fcebe1c Mon Sep 17 00:00:00 2001
From: lujiazho <lujiazho@usc.edu>
Date: Wed, 6 May 2026 18:17:27 -0700
Subject: [PATCH 1/9] Support differentiable_input on TabPFNRegressor

Mirrors the classifier-side prompt-tuning path so gradients can flow from
a downstream loss back through TabPFNRegressor to upstream torch modules
feeding X (and y, when it carries grads). Previously, TabPFNRegressor.fit
raised ValueError("Differentiable input is not supported for regressors
yet.") and there was no fit_with_differentiable_input.

What this changes:
- _initialize_for_differentiable_input(X, y, rng): minimal preprocessing
  that uses PreprocessorConfig("none", differentiable=True), z-normalises
  y as a torch op (preserves grads), and rebuilds raw_space_bardist_ in
  the caller's target scale. Polynomial features are forced to "no" since
  the polynomial step relies on sklearn StandardScaler on numpy.
- fit_with_differentiable_input(X, y): mirrors the classifier method;
  builds an InferenceEngineCachePreprocessing with inference_mode=False.
- _iter_forward_executor: gates use_inference_mode on differentiable_input
  so a user calling forward(X, use_inference_mode=True) after
  fit_with_differentiable_input still gets gradients (parallel to the
  classifier's existing actual_inference_mode gate).
- fit() still rejects differentiable_input=True, but now raises a clearer
  ValueError pointing users to the new method; running fit() here would
  otherwise silently convert torch tensors to numpy.

Tests:
- end-to-end gradient-flow test (CPU + CUDA): a loss computed from
  forward output produces a finite, non-zero gradient on an upstream
  nn.Linear's weight.
- guard tests for fit() with differentiable_input=True and for
  categorical features under the differentiable path.
---
 src/tabpfn/regressor.py           | 170 +++++++++++++++++++++++++++++-
 tests/test_regressor_interface.py |  93 ++++++++++++++++
 2 files changed, 258 insertions(+), 5 deletions(-)

diff --git a/src/tabpfn/regressor.py b/src/tabpfn/regressor.py
index ae5daddf3..8efe45de1 100644
--- a/src/tabpfn/regressor.py
+++ b/src/tabpfn/regressor.py
@@ -54,7 +54,11 @@
     ModelVersion,
 )
 from tabpfn.errors import TabPFNValidationError, handle_oom_errors
-from tabpfn.inference import InferenceEngine, InferenceEngineBatchedNoPreprocessing
+from tabpfn.inference import (
+    InferenceEngine,
+    InferenceEngineBatchedNoPreprocessing,
+    InferenceEngineCachePreprocessing,
+)
 from tabpfn.model_loading import (
     ModelSource,
     load_fitted_tabpfn_model,
@@ -65,12 +69,13 @@
 from tabpfn.preprocessing import (
     EnsembleConfig,
     FeatureSubsamplingMethod,
+    PreprocessorConfig,
     RegressorEnsembleConfig,
     clean_data,
     generate_regression_ensemble_configs,
 )
 from tabpfn.preprocessing.clean import fix_dtypes, process_text_na_dataframe
-from tabpfn.preprocessing.datamodel import FeatureModality, FeatureSchema
+from tabpfn.preprocessing.datamodel import Feature, FeatureModality, FeatureSchema
 from tabpfn.preprocessing.ensemble import (
     TabPFNEnsemblePreprocessor,
     scale_n_estimators_for_feature_coverage,
@@ -83,12 +88,14 @@
     DevicesSpecification,
     convert_batch_of_cat_ix_to_schema,
     infer_random_state,
+    remove_non_differentiable_preprocessing_from_models,
     transform_borders_one,
     translate_probs_across_borders,
 )
 from tabpfn.validation import (
     ensure_compatible_fit_inputs,
     ensure_compatible_predict_input_sklearn,
+    validate_dataset_size,
 )
 
 if TYPE_CHECKING:
@@ -640,6 +647,86 @@ def _initialize_model_variables(self) -> int:
         """
         return initialize_model_variables_helper(self, self.estimator_type)
 
+    def _initialize_for_differentiable_input(
+        self,
+        X: torch.Tensor,
+        y: torch.Tensor,
+        rng: np.random.Generator,
+    ) -> tuple[list[RegressorEnsembleConfig], torch.Tensor, torch.Tensor]:
+        """Initialize the model for differentiable input.
+
+        Mirrors the classifier-side helper so that gradients can flow from a
+        loss back to upstream torch modules feeding ``X`` (and optionally
+        ``y``). Skips the standard numpy preprocessing path and uses a
+        differentiable identity preprocessor.
+
+        Returns the ensemble configs together with ``X`` and the
+        z-normalised ``y``. The standardisation parameters are stored on
+        ``self`` so ``raw_space_bardist_`` reflects the caller's target
+        scale.
+        """
+        validate_dataset_size(
+            X=X,
+            y=y,
+            max_num_samples=self.inference_config_.MAX_NUMBER_OF_SAMPLES,
+            max_num_features=self.inference_config_.MAX_NUMBER_OF_FEATURES,
+            devices=self.devices_,
+            ignore_pretraining_limits=self.ignore_pretraining_limits,
+        )
+
+        # Minimal preprocessing for prompt tuning: no categorical features,
+        # all-numerical schema, identity preprocessor that preserves grads.
+        if (
+            self.categorical_features_indices is not None
+            and len(self.categorical_features_indices) > 0
+        ):
+            raise ValueError(
+                "Categorical features are not supported for differentiable input."
+            )
+        n_features = X.shape[1]
+        features = [Feature(name=None, modality=FeatureModality.NUMERICAL)] * n_features
+        self.inferred_feature_schema_ = FeatureSchema(features=features)
+        self.n_features_in_ = n_features
+        self.n_train_samples_ = int(X.shape[0])
+
+        # z-normalise y as a torch op so that gradients flow if y has them.
+        y_float = y.float() if isinstance(y, torch.Tensor) else torch.as_tensor(
+            y, dtype=torch.float32
+        )
+        y_mean = y_float.mean()
+        y_std = y_float.std() + 1e-20
+        self.y_train_mean_ = y_mean.detach().item()
+        self.y_train_std_ = y_std.detach().item()
+        y_normalized = (y_float - y_mean) / y_std
+
+        # raw_space_bardist_ is a constant lookup in caller's target scale; we
+        # detach so the buffer does not accidentally hold onto y's grad graph.
+        borders = self.znorm_space_bardist_.borders.detach()
+        self.raw_space_bardist_ = FullSupportBarDistribution(
+            borders * self.y_train_std_ + self.y_train_mean_,
+        ).float()
+
+        preprocessor_configs = [PreprocessorConfig("none", differentiable=True)]
+        # Polynomial features go through sklearn StandardScaler on numpy and
+        # are not differentiable; force "no" regardless of the runtime default
+        # (the regressor config defaults to a non-zero value).
+        ensemble_configs = generate_regression_ensemble_configs(
+            num_estimators=self.n_estimators,
+            add_fingerprint_feature=self.inference_config_.FINGERPRINT_FEATURE,
+            feature_shift_decoder=self.inference_config_.FEATURE_SHIFT_METHOD,
+            polynomial_features="no",
+            preprocessor_configs=preprocessor_configs,
+            target_transforms=[None],
+            random_state=rng,
+            num_models=len(self.models_),
+            outlier_removal_std=self.inference_config_.get_resolved_outlier_removal_std(
+                estimator_type=self.estimator_type
+            ),
+        )
+        assert len(ensemble_configs) == self.n_estimators
+
+        return ensemble_configs, X, y_normalized
+
     def _initialize_dataset_preprocessing(
         self,
         X: XType,
@@ -793,6 +880,75 @@ def fit_from_preprocessed(
 
         return self
 
+    @track_model_call(model_method="fit", param_names=["X", "y"])
+    def fit_with_differentiable_input(
+        self, X: torch.Tensor, y: torch.Tensor
+    ) -> Self:
+        """Fit the model with differentiable input.
+
+        Mirror of ``TabPFNClassifier.fit_with_differentiable_input``. Lets
+        gradients flow from a downstream loss back through ``X`` (and ``y``,
+        if it carries grads) into upstream torch modules. Use this instead
+        of ``fit`` when ``differentiable_input=True``.
+
+        Args:
+            X: The input data as a torch tensor.
+            y: The target variable as a torch tensor.
+
+        Returns:
+            self
+        """
+        if self.fit_mode != "fit_preprocessors":
+            logging.warning(
+                "The model was not in 'fit_preprocessors' mode. "
+                "Automatically switching to 'fit_preprocessors' mode for differentiable"
+                " input."
+            )
+            self.fit_mode = "fit_preprocessors"
+
+        static_seed, rng = infer_random_state(self.random_state)
+
+        is_first_fit_call = not hasattr(self, "models_")
+        if is_first_fit_call:
+            byte_size = self._initialize_model_variables()
+            ensemble_configs, X, y = self._initialize_for_differentiable_input(
+                X=X, y=y, rng=rng
+            )
+            self.ensemble_configs_ = ensemble_configs  # Store for prompt tuning reuse
+            remove_non_differentiable_preprocessing_from_models(models=self.models_)
+        else:
+            _, _, byte_size = determine_precision(
+                self.inference_precision, self.devices_
+            )
+            ensemble_configs = self.ensemble_configs_  # Reuse from first fit
+
+        self.ensemble_preprocessor_ = TabPFNEnsemblePreprocessor(
+            configs=ensemble_configs,
+            n_samples=X.shape[0],
+            feature_schema=self.inferred_feature_schema_,
+            random_state=static_seed,
+            n_preprocessing_jobs=self.n_preprocessing_jobs,
+            feature_subsampling_method=FeatureSubsamplingMethod(
+                self.inference_config_.FEATURE_SUBSAMPLING_METHOD
+            ),
+            constant_feature_count=self.inference_config_.FEATURE_SUBSAMPLING_CONSTANT_FEATURE_COUNT,
+            subsample_samples=self.inference_config_.SUBSAMPLE_SAMPLES,
+        )
+
+        self.executor_ = InferenceEngineCachePreprocessing(
+            X_train=X,
+            y_train=y,
+            models=self.models_,
+            ensemble_preprocessor=self.ensemble_preprocessor_,
+            devices=self.devices_,
+            dtype_byte_size=byte_size,
+            force_inference_dtype=self.forced_inference_dtype_,
+            save_peak_mem=self.memory_saving_mode,
+            inference_mode=False,
+        )
+
+        return self
+
     @config_context(transform_output="default")  # type: ignore
     @track_model_call(model_method="fit", param_names=["X", "y"])
     def fit(self, X: XType, y: YType) -> Self:
@@ -807,7 +963,8 @@ def fit(self, X: XType, y: YType) -> Self:
         """
         if self.differentiable_input:
             raise ValueError(
-                "Differentiable input is not supported for regressors yet."
+                "differentiable_input=True requires fit_with_differentiable_input "
+                "with torch tensor X and y, not fit()."
             )
 
         if self.fit_mode == "batched":
@@ -1122,8 +1279,11 @@ def _iter_forward_executor(
         check_is_fitted(self)
         # Ensure torch.inference_mode is OFF to allow gradients
         if self.fit_mode in ["fit_preprocessors", "batched"]:
-            # only these two modes support this option
-            self.executor_.use_torch_inference_mode(use_inference=use_inference_mode)
+            # only these two modes support this option.
+            # Don't enable inference mode when differentiable_input=True (prompt
+            # tuning) to allow gradients to flow through.
+            actual_inference_mode = use_inference_mode and not self.differentiable_input
+            self.executor_.use_torch_inference_mode(use_inference=actual_inference_mode)
         std_borders = self.znorm_space_bardist_.borders.cpu().numpy()
         for output, config in self.executor_.iter_outputs(
             X, autocast=self.use_autocast_, task_type="regression"
diff --git a/tests/test_regressor_interface.py b/tests/test_regressor_interface.py
index 0e3c046a0..e9c94c401 100644
--- a/tests/test_regressor_interface.py
+++ b/tests/test_regressor_interface.py
@@ -976,3 +976,96 @@ def test__create_default_for_version__passes_through_overrides() -> None:
 
     assert estimator.n_estimators == 16
     assert estimator.softmax_temperature == 0.9
+
+
+# ---------------------------------------------------------------------------
+# differentiable_input
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("device", devices)
+def test__fit_with_differentiable_input__grad_flows_to_upstream_module(
+    device: str,
+) -> None:
+    """End-to-end: a loss computed from forward(use_inference_mode=True) after
+    fit_with_differentiable_input must produce a non-zero, finite gradient on
+    an upstream torch module's weights.
+    """
+    torch.manual_seed(0)
+    D, N_train, N_test = 8, 30, 10
+    linear = nn.Linear(D, D).to(device)
+
+    X_train = linear(torch.randn(N_train, D, device=device))
+    X_test = linear(torch.randn(N_test, D, device=device))
+    y_train = torch.randn(N_train, device=device)
+    y_test = torch.randn(N_test, device=device)
+
+    reg = TabPFNRegressor(
+        n_estimators=1,
+        ignore_pretraining_limits=True,
+        device=device,
+        differentiable_input=True,
+    )
+    reg.fit_with_differentiable_input(X_train, y_train)
+
+    averaged_logits, _outputs, borders = reg.forward(
+        X_test, use_inference_mode=True
+    )
+
+    # averaged_logits is [N_borders, N_samples] after the transpose in
+    # forward(); reduce to a scalar per sample via softmax over bin centers.
+    per_sample_logits = averaged_logits.transpose(0, 1)  # [N_test, N_borders]
+    border_t = torch.as_tensor(
+        borders[0],
+        device=per_sample_logits.device,
+        dtype=per_sample_logits.dtype,
+    )
+    n_logits = per_sample_logits.shape[-1]
+    if border_t.numel() == n_logits + 1:
+        bin_centers = (border_t[:-1] + border_t[1:]) / 2.0
+    else:
+        bin_centers = border_t
+    probs = torch.softmax(per_sample_logits.float(), dim=-1)
+    pred_z = (probs * bin_centers).sum(dim=-1)
+    pred = pred_z * float(reg.y_train_std_) + float(reg.y_train_mean_)
+
+    loss = torch.nn.functional.mse_loss(pred.float(), y_test.float())
+    assert loss.requires_grad
+    loss.backward()
+
+    grad = linear.weight.grad
+    assert grad is not None, "gradient did not reach upstream nn.Linear"
+    assert torch.isfinite(grad).all(), "gradient contained NaN/Inf"
+    assert grad.norm().item() > 0, "gradient norm is zero — graph was detached"
+
+
+def test__fit__differentiable_input_true__raises_helpful_error() -> None:
+    """Calling .fit() (instead of fit_with_differentiable_input) when
+    differentiable_input=True must raise a clear error pointing users to the
+    correct API rather than silently running a non-differentiable path.
+    """
+    reg = TabPFNRegressor(
+        n_estimators=1,
+        ignore_pretraining_limits=True,
+        device="cpu",
+        differentiable_input=True,
+    )
+    X = np.random.default_rng(0).standard_normal((20, 4)).astype(np.float32)
+    y = np.random.default_rng(0).standard_normal(20).astype(np.float32)
+    with pytest.raises(ValueError, match="fit_with_differentiable_input"):
+        reg.fit(X, y)
+
+
+def test__fit_with_differentiable_input__categorical_features_rejected() -> None:
+    """The differentiable path does not support categorical features."""
+    reg = TabPFNRegressor(
+        n_estimators=1,
+        ignore_pretraining_limits=True,
+        device="cpu",
+        differentiable_input=True,
+        categorical_features_indices=[0],
+    )
+    X = torch.randn(20, 4)
+    y = torch.randn(20)
+    with pytest.raises(ValueError, match="Categorical features"):
+        reg.fit_with_differentiable_input(X, y)

From f8f97e261fc56ec25056b8ed51ec831b509a7f38 Mon Sep 17 00:00:00 2001
From: lujiazho <lujiazho@usc.edu>
Date: Wed, 6 May 2026 19:25:22 -0700
Subject: [PATCH 2/9] Add changelog entry for PR #923

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 changelog/923.fixed.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changelog/923.fixed.md

diff --git a/changelog/923.fixed.md b/changelog/923.fixed.md
new file mode 100644
index 000000000..09d861805
--- /dev/null
+++ b/changelog/923.fixed.md
@@ -0,0 +1 @@
+Add `TabPFNRegressor.fit_with_differentiable_input(X, y)` so gradients can flow from a downstream loss back through the regressor into upstream torch modules feeding `X` (and `y`, when it carries grads). Mirrors the existing classifier-side path — previously `TabPFNRegressor.fit` raised `ValueError("Differentiable input is not supported for regressors yet.")` and there was no differentiable counterpart.

From cb6c047a52e99e7f279995860f9a35239ce66ba6 Mon Sep 17 00:00:00 2001
From: lujiazho <lujiazho@usc.edu>
Date: Wed, 6 May 2026 19:39:12 -0700
Subject: [PATCH 3/9] Refresh target stats on every
 fit_with_differentiable_input call
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address gemini-code-assist review on PR #923: the second fit
call previously skipped re-normalising y, leaving y_train_mean_,
y_train_std_, raw_space_bardist_ stuck on the first fit's stats —
silently mis-scaling predictions when the new target distribution
differed.

Split _initialize_for_differentiable_input into:
  - _initialize_for_differentiable_input: first-call-only setup
    (categorical check, feature schema, ensemble configs). Cached
    in self.ensemble_configs_.
  - _refresh_targets_for_differentiable_input: per-call setup
    (validate_dataset_size, z-normalise y, rebuild raw_space_bardist_,
    update n_train_samples_). Runs on every fit.

fit_with_differentiable_input's else branch now calls the per-call
helper so subsequent fits track the current target distribution
while still reusing the loaded model and ensemble configs.

Add test__fit_with_differentiable_input__second_call_refreshes_target_stats
that fits twice with very different y distributions and checks
y_train_mean_, y_train_std_, and raw_space_bardist_.borders all move.
---
 src/tabpfn/regressor.py           | 82 ++++++++++++++++++-------------
 tests/test_regressor_interface.py | 33 +++++++++++++
 2 files changed, 82 insertions(+), 33 deletions(-)

diff --git a/src/tabpfn/regressor.py b/src/tabpfn/regressor.py
index 8efe45de1..5facebe14 100644
--- a/src/tabpfn/regressor.py
+++ b/src/tabpfn/regressor.py
@@ -647,33 +647,61 @@ def _initialize_model_variables(self) -> int:
         """
         return initialize_model_variables_helper(self, self.estimator_type)
 
+    def _refresh_targets_for_differentiable_input(
+        self, X: torch.Tensor, y: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Per-fit-call data-dependent setup for the differentiable path.
+
+        Validates input shape, z-normalises ``y`` as a torch op (preserves
+        grads), updates the standardisation stats, and rebuilds
+        ``raw_space_bardist_`` in the caller's current target scale. Run on
+        every ``fit_with_differentiable_input`` call so the regressor's
+        target stats always match the data being fit; the model load and
+        ensemble configs are cached in ``_initialize_for_differentiable_input``
+        and run only on the first call.
+        """
+        validate_dataset_size(
+            X=X,
+            y=y,
+            max_num_samples=self.inference_config_.MAX_NUMBER_OF_SAMPLES,
+            max_num_features=self.inference_config_.MAX_NUMBER_OF_FEATURES,
+            devices=self.devices_,
+            ignore_pretraining_limits=self.ignore_pretraining_limits,
+        )
+        self.n_train_samples_ = int(X.shape[0])
+
+        y_float = y.float() if isinstance(y, torch.Tensor) else torch.as_tensor(
+            y, dtype=torch.float32
+        )
+        y_mean = y_float.mean()
+        y_std = y_float.std() + 1e-20
+        self.y_train_mean_ = y_mean.detach().item()
+        self.y_train_std_ = y_std.detach().item()
+        y_normalized = (y_float - y_mean) / y_std
+
+        # raw_space_bardist_ is a constant lookup in the caller's target
+        # scale; detach so the buffer does not hold onto y's grad graph.
+        borders = self.znorm_space_bardist_.borders.detach()
+        self.raw_space_bardist_ = FullSupportBarDistribution(
+            borders * self.y_train_std_ + self.y_train_mean_,
+        ).float()
+        return X, y_normalized
+
     def _initialize_for_differentiable_input(
         self,
         X: torch.Tensor,
         y: torch.Tensor,
         rng: np.random.Generator,
     ) -> tuple[list[RegressorEnsembleConfig], torch.Tensor, torch.Tensor]:
-        """Initialize the model for differentiable input.
+        """First-call setup for the differentiable path.
 
         Mirrors the classifier-side helper so that gradients can flow from a
         loss back to upstream torch modules feeding ``X`` (and optionally
         ``y``). Skips the standard numpy preprocessing path and uses a
-        differentiable identity preprocessor.
-
-        Returns the ensemble configs together with ``X`` and the
-        z-normalised ``y``. The standardisation parameters are stored on
-        ``self`` so ``raw_space_bardist_`` reflects the caller's target
-        scale.
+        differentiable identity preprocessor. Subsequent calls reuse the
+        feature schema and ensemble configs but re-run target normalization
+        via ``_refresh_targets_for_differentiable_input``.
         """
-        validate_dataset_size(
-            X=X,
-            y=y,
-            max_num_samples=self.inference_config_.MAX_NUMBER_OF_SAMPLES,
-            max_num_features=self.inference_config_.MAX_NUMBER_OF_FEATURES,
-            devices=self.devices_,
-            ignore_pretraining_limits=self.ignore_pretraining_limits,
-        )
-
         # Minimal preprocessing for prompt tuning: no categorical features,
         # all-numerical schema, identity preprocessor that preserves grads.
         if (
@@ -687,24 +715,8 @@ def _initialize_for_differentiable_input(
         features = [Feature(name=None, modality=FeatureModality.NUMERICAL)] * n_features
         self.inferred_feature_schema_ = FeatureSchema(features=features)
         self.n_features_in_ = n_features
-        self.n_train_samples_ = int(X.shape[0])
 
-        # z-normalise y as a torch op so that gradients flow if y has them.
-        y_float = y.float() if isinstance(y, torch.Tensor) else torch.as_tensor(
-            y, dtype=torch.float32
-        )
-        y_mean = y_float.mean()
-        y_std = y_float.std() + 1e-20
-        self.y_train_mean_ = y_mean.detach().item()
-        self.y_train_std_ = y_std.detach().item()
-        y_normalized = (y_float - y_mean) / y_std
-
-        # raw_space_bardist_ is a constant lookup in caller's target scale; we
-        # detach so the buffer does not accidentally hold onto y's grad graph.
-        borders = self.znorm_space_bardist_.borders.detach()
-        self.raw_space_bardist_ = FullSupportBarDistribution(
-            borders * self.y_train_std_ + self.y_train_mean_,
-        ).float()
+        X, y_normalized = self._refresh_targets_for_differentiable_input(X, y)
 
         preprocessor_configs = [PreprocessorConfig("none", differentiable=True)]
         # Polynomial features go through sklearn StandardScaler on numpy and
@@ -921,6 +933,10 @@ def fit_with_differentiable_input(
                 self.inference_precision, self.devices_
             )
             ensemble_configs = self.ensemble_configs_  # Reuse from first fit
+            # Re-validate and re-normalise y for the new fit data so that
+            # raw_space_bardist_ and y_train_mean_/std_ track the current
+            # targets. The model load and ensemble configs stay cached.
+            X, y = self._refresh_targets_for_differentiable_input(X, y)
 
         self.ensemble_preprocessor_ = TabPFNEnsemblePreprocessor(
             configs=ensemble_configs,
diff --git a/tests/test_regressor_interface.py b/tests/test_regressor_interface.py
index e9c94c401..e2e8f16e8 100644
--- a/tests/test_regressor_interface.py
+++ b/tests/test_regressor_interface.py
@@ -1069,3 +1069,36 @@ def test__fit_with_differentiable_input__categorical_features_rejected() -> None
     y = torch.randn(20)
     with pytest.raises(ValueError, match="Categorical features"):
         reg.fit_with_differentiable_input(X, y)
+
+
+def test__fit_with_differentiable_input__second_call_refreshes_target_stats() -> None:
+    """A second call with different y must update y_train_mean_/std_ and the
+    raw_space_bardist_; only the model load and ensemble configs are cached."""
+    torch.manual_seed(0)
+    reg = TabPFNRegressor(
+        n_estimators=1,
+        ignore_pretraining_limits=True,
+        device="cpu",
+        differentiable_input=True,
+    )
+    X1 = torch.randn(20, 4)
+    y1 = torch.randn(20) * 10.0 + 100.0  # mean ~100, std ~10
+    reg.fit_with_differentiable_input(X1, y1)
+    mean1, std1 = reg.y_train_mean_, reg.y_train_std_
+    bardist_borders1 = reg.raw_space_bardist_.borders.clone()
+
+    X2 = torch.randn(20, 4)
+    y2 = torch.randn(20) * 0.5 - 5.0  # mean ~-5, std ~0.5
+    reg.fit_with_differentiable_input(X2, y2)
+    mean2, std2 = reg.y_train_mean_, reg.y_train_std_
+
+    assert abs(mean2 - mean1) > 1.0, (
+        f"y_train_mean_ should reflect new y; got {mean1} -> {mean2}"
+    )
+    assert abs(std2 - std1) > 1.0, (
+        f"y_train_std_ should reflect new y; got {std1} -> {std2}"
+    )
+    # raw_space_bardist_ borders are derived from y stats; they must move.
+    assert not torch.allclose(reg.raw_space_bardist_.borders, bardist_borders1), (
+        "raw_space_bardist_ must be rebuilt to the new target scale"
+    )

From e9b23f22e80a8eb1f7bfd460d7b1d222f7169f80 Mon Sep 17 00:00:00 2001
From: lujiazho <lujiazho@usc.edu>
Date: Wed, 6 May 2026 20:05:49 -0700
Subject: [PATCH 4/9] Address gemini and Copilot review on PR #923

Fixes the medium-severity comments raised on the differentiable_input
regressor path:

1. Feature instances per column: replace
   `[Feature(...)] * n_features` with a list comprehension so each
   column has its own dataclass and a later in-place update on one
   column does not leak across all columns.

2. y stats numerical robustness: switch `y_float.std()` (PyTorch's
   default `correction=1`, which differs from `np.std` and returns
   NaN for N=1) to `clamp(y_float.std(correction=0), min=1e-20)`.
   This matches the standard `fit()` path's `np.std` semantics and
   stays finite for single-sample input.

3. Constant-target guard: a constant y collapses the bardist borders
   to a single point and trips
   `FullSupportBarDistribution`'s strictly-increasing assertion.
   `fit()` short-circuits this with `is_constant_target_`; the
   differentiable path has no analogue, so reject up front with a
   clear ValueError pointing users at `fit()`.

4. Sequential preprocessing for diff input: force
   `n_preprocessing_jobs=1` inside `fit_with_differentiable_input`.
   When X carries an autograd graph, joblib's process-boundary
   pickling breaks the graph; sequential execution preserves it.

The detach-then-`.item()` of `y_train_mean_/std_` is intentional and
not changed: `raw_space_bardist_` is a frozen lookup buffer that
should not hold a y-grad graph; users wanting fully differentiable
target scaling should z-normalise y externally so mean/std become
constants here. Documented inline.

New tests:
- feature_schema_columns_are_independent: catches the alias bug.
- std_matches_population_definition: locks in `np.std` semantics.
- constant_target_rejected: locks in the explicit guard.
- single_sample_y_does_not_nan: confirms N=1 hits the guard cleanly
  rather than producing NaN deep in the bardist.

All 9 differentiable_input tests pass on CPU and CUDA.
---
 src/tabpfn/regressor.py           | 32 ++++++++++++--
 tests/test_regressor_interface.py | 73 +++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+), 3 deletions(-)

diff --git a/src/tabpfn/regressor.py b/src/tabpfn/regressor.py
index 5facebe14..f30cfbadd 100644
--- a/src/tabpfn/regressor.py
+++ b/src/tabpfn/regressor.py
@@ -674,7 +674,24 @@ def _refresh_targets_for_differentiable_input(
             y, dtype=torch.float32
         )
         y_mean = y_float.mean()
-        y_std = y_float.std() + 1e-20
+        # Match the standard fit path's np.std (population std, ddof=0).
+        # torch.std defaults to correction=1 (sample std), which differs from
+        # numpy and returns NaN for N=1; clamp keeps the divisor non-zero.
+        y_std = torch.clamp(y_float.std(correction=0), min=1e-20)
+        # Constant targets would collapse the bardist borders to a single
+        # point; the differentiable path has no analogue of fit()'s
+        # is_constant_target_ short-circuit, so reject up front.
+        if y_std.detach().item() <= 1e-12:
+            raise ValueError(
+                "Constant or near-constant target (std≈0) is not supported "
+                "by fit_with_differentiable_input; there is no signal to "
+                "predict differentiably. Use fit() for constant-target data."
+            )
+        # Detach when storing as Python floats — raw_space_bardist_ is a
+        # frozen lookup table and must not hold a y-grad graph. Users who
+        # need fully differentiable target scaling should z-normalise y
+        # themselves before calling fit_with_differentiable_input so the
+        # mean/std are constants here.
         self.y_train_mean_ = y_mean.detach().item()
         self.y_train_std_ = y_std.detach().item()
         y_normalized = (y_float - y_mean) / y_std
@@ -712,7 +729,13 @@ def _initialize_for_differentiable_input(
                 "Categorical features are not supported for differentiable input."
             )
         n_features = X.shape[1]
-        features = [Feature(name=None, modality=FeatureModality.NUMERICAL)] * n_features
+        # One Feature instance per column — list multiplication would share
+        # the same dataclass and any later in-place update would leak across
+        # columns.
+        features = [
+            Feature(name=None, modality=FeatureModality.NUMERICAL)
+            for _ in range(n_features)
+        ]
         self.inferred_feature_schema_ = FeatureSchema(features=features)
         self.n_features_in_ = n_features
 
@@ -938,12 +961,15 @@ def fit_with_differentiable_input(
             # targets. The model load and ensemble configs stay cached.
             X, y = self._refresh_targets_for_differentiable_input(X, y)
 
+        # Force sequential preprocessing: with differentiable input, X carries
+        # an autograd graph that does not survive joblib's process-boundary
+        # pickling. Sequential execution preserves the graph in-process.
         self.ensemble_preprocessor_ = TabPFNEnsemblePreprocessor(
             configs=ensemble_configs,
             n_samples=X.shape[0],
             feature_schema=self.inferred_feature_schema_,
             random_state=static_seed,
-            n_preprocessing_jobs=self.n_preprocessing_jobs,
+            n_preprocessing_jobs=1,
             feature_subsampling_method=FeatureSubsamplingMethod(
                 self.inference_config_.FEATURE_SUBSAMPLING_METHOD
             ),
diff --git a/tests/test_regressor_interface.py b/tests/test_regressor_interface.py
index e2e8f16e8..ea874d409 100644
--- a/tests/test_regressor_interface.py
+++ b/tests/test_regressor_interface.py
@@ -1071,6 +1071,79 @@ def test__fit_with_differentiable_input__categorical_features_rejected() -> None
         reg.fit_with_differentiable_input(X, y)
 
 
+def test__fit_with_differentiable_input__constant_target_rejected() -> None:
+    """A constant-target y has no signal to predict differentiably and would
+    collapse the bardist borders; reject with a clear error."""
+    reg = TabPFNRegressor(
+        n_estimators=1,
+        ignore_pretraining_limits=True,
+        device="cpu",
+        differentiable_input=True,
+    )
+    X = torch.randn(5, 4)
+    y = torch.full((5,), 3.14)
+    with pytest.raises(ValueError, match="Constant or near-constant target"):
+        reg.fit_with_differentiable_input(X, y)
+
+
+def test__fit_with_differentiable_input__single_sample_y_does_not_nan() -> None:
+    """torch.std defaults to sample std (correction=1) which returns NaN for
+    N=1. Our path uses correction=0 (population std) so std is well defined
+    even for a single sample (it just collapses to 0, which then trips the
+    constant-target guard — what we want). Verify the failure mode is the
+    explicit ValueError, not a downstream NaN."""
+    reg = TabPFNRegressor(
+        n_estimators=1,
+        ignore_pretraining_limits=True,
+        device="cpu",
+        differentiable_input=True,
+    )
+    X = torch.randn(1, 4)
+    y = torch.tensor([2.0])
+    with pytest.raises(ValueError, match="Constant or near-constant target"):
+        reg.fit_with_differentiable_input(X, y)
+
+
+def test__fit_with_differentiable_input__std_matches_population_definition() -> None:
+    """The differentiable path's y_train_std_ should match np.std (population
+    std, ddof=0), not torch's default sample std (correction=1), so it lines
+    up with the standard fit() path."""
+    reg = TabPFNRegressor(
+        n_estimators=1,
+        ignore_pretraining_limits=True,
+        device="cpu",
+        differentiable_input=True,
+    )
+    X = torch.randn(20, 4)
+    y_np = np.random.default_rng(0).standard_normal(20).astype(np.float32)
+    y = torch.from_numpy(y_np)
+    reg.fit_with_differentiable_input(X, y)
+    expected = float(np.std(y_np))  # ddof=0
+    assert abs(reg.y_train_std_ - expected) < 1e-5, (
+        f"y_train_std_ should equal np.std(y) (population std); "
+        f"got {reg.y_train_std_}, expected {expected}"
+    )
+
+
+def test__fit_with_differentiable_input__feature_schema_columns_are_independent() -> None:
+    """Each column's Feature must be a distinct instance — list multiplication
+    `[Feature(...)] * n` would alias all columns to one mutable dataclass."""
+    reg = TabPFNRegressor(
+        n_estimators=1,
+        ignore_pretraining_limits=True,
+        device="cpu",
+        differentiable_input=True,
+    )
+    X = torch.randn(10, 4)
+    y = torch.randn(10)
+    reg.fit_with_differentiable_input(X, y)
+    feats = reg.inferred_feature_schema_.features
+    assert len(feats) == 4
+    # Distinct instances, not aliases.
+    ids = {id(f) for f in feats}
+    assert len(ids) == 4, "feature columns share the same Feature instance"
+
+
 def test__fit_with_differentiable_input__second_call_refreshes_target_stats() -> None:
     """A second call with different y must update y_train_mean_/std_ and the
     raw_space_bardist_; only the model load and ensemble configs are cached."""

From bebc23249db642ec42c0220e56b289440e2c2023 Mon Sep 17 00:00:00 2001
From: lujiazho <lujiazho@usc.edu>
Date: Fri, 8 May 2026 08:24:51 -0700
Subject: [PATCH 5/9] Fix ruff D209 and E501 in differentiable_input tests

---
 tests/test_regressor_interface.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/tests/test_regressor_interface.py b/tests/test_regressor_interface.py
index ea874d409..613d6837f 100644
--- a/tests/test_regressor_interface.py
+++ b/tests/test_regressor_interface.py
@@ -1073,7 +1073,8 @@ def test__fit_with_differentiable_input__categorical_features_rejected() -> None
 
 def test__fit_with_differentiable_input__constant_target_rejected() -> None:
     """A constant-target y has no signal to predict differentiably and would
-    collapse the bardist borders; reject with a clear error."""
+    collapse the bardist borders; reject with a clear error.
+    """
     reg = TabPFNRegressor(
         n_estimators=1,
         ignore_pretraining_limits=True,
@@ -1091,7 +1092,8 @@ def test__fit_with_differentiable_input__single_sample_y_does_not_nan() -> None:
     N=1. Our path uses correction=0 (population std) so std is well defined
     even for a single sample (it just collapses to 0, which then trips the
     constant-target guard — what we want). Verify the failure mode is the
-    explicit ValueError, not a downstream NaN."""
+    explicit ValueError, not a downstream NaN.
+    """
     reg = TabPFNRegressor(
         n_estimators=1,
         ignore_pretraining_limits=True,
@@ -1107,7 +1109,8 @@ def test__fit_with_differentiable_input__single_sample_y_does_not_nan() -> None:
 def test__fit_with_differentiable_input__std_matches_population_definition() -> None:
     """The differentiable path's y_train_std_ should match np.std (population
     std, ddof=0), not torch's default sample std (correction=1), so it lines
-    up with the standard fit() path."""
+    up with the standard fit() path.
+    """
     reg = TabPFNRegressor(
         n_estimators=1,
         ignore_pretraining_limits=True,
@@ -1125,9 +1128,10 @@ def test__fit_with_differentiable_input__std_matches_population_definition() ->
     )
 
 
-def test__fit_with_differentiable_input__feature_schema_columns_are_independent() -> None:
+def test__fit_with_differentiable_input__feature_schema_cols_independent() -> None:
     """Each column's Feature must be a distinct instance — list multiplication
-    `[Feature(...)] * n` would alias all columns to one mutable dataclass."""
+    `[Feature(...)] * n` would alias all columns to one mutable dataclass.
+    """
     reg = TabPFNRegressor(
         n_estimators=1,
         ignore_pretraining_limits=True,
@@ -1146,7 +1150,8 @@ def test__fit_with_differentiable_input__feature_schema_columns_are_independent(
 
 def test__fit_with_differentiable_input__second_call_refreshes_target_stats() -> None:
     """A second call with different y must update y_train_mean_/std_ and the
-    raw_space_bardist_; only the model load and ensemble configs are cached."""
+    raw_space_bardist_; only the model load and ensemble configs are cached.
+    """
     torch.manual_seed(0)
     reg = TabPFNRegressor(
         n_estimators=1,

From c25071aafe0095332b6f8ff98207d55df1bb40d3 Mon Sep 17 00:00:00 2001
From: lujiazho <lujiazho@usc.edu>
Date: Sat, 9 May 2026 14:04:04 -0700
Subject: [PATCH 6/9] Apply ruff format to regressor and test files

---
 src/tabpfn/regressor.py           | 10 +++++-----
 tests/test_regressor_interface.py |  4 +---
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/tabpfn/regressor.py b/src/tabpfn/regressor.py
index f30cfbadd..24f8d6360 100644
--- a/src/tabpfn/regressor.py
+++ b/src/tabpfn/regressor.py
@@ -670,8 +670,10 @@ def _refresh_targets_for_differentiable_input(
         )
         self.n_train_samples_ = int(X.shape[0])
 
-        y_float = y.float() if isinstance(y, torch.Tensor) else torch.as_tensor(
-            y, dtype=torch.float32
+        y_float = (
+            y.float()
+            if isinstance(y, torch.Tensor)
+            else torch.as_tensor(y, dtype=torch.float32)
         )
         y_mean = y_float.mean()
         # Match the standard fit path's np.std (population std, ddof=0).
@@ -916,9 +918,7 @@ def fit_from_preprocessed(
         return self
 
     @track_model_call(model_method="fit", param_names=["X", "y"])
-    def fit_with_differentiable_input(
-        self, X: torch.Tensor, y: torch.Tensor
-    ) -> Self:
+    def fit_with_differentiable_input(self, X: torch.Tensor, y: torch.Tensor) -> Self:
         """Fit the model with differentiable input.
 
         Mirror of ``TabPFNClassifier.fit_with_differentiable_input``. Lets
diff --git a/tests/test_regressor_interface.py b/tests/test_regressor_interface.py
index 613d6837f..d66435d9b 100644
--- a/tests/test_regressor_interface.py
+++ b/tests/test_regressor_interface.py
@@ -1008,9 +1008,7 @@ def test__fit_with_differentiable_input__grad_flows_to_upstream_module(
     )
     reg.fit_with_differentiable_input(X_train, y_train)
 
-    averaged_logits, _outputs, borders = reg.forward(
-        X_test, use_inference_mode=True
-    )
+    averaged_logits, _outputs, borders = reg.forward(X_test, use_inference_mode=True)
 
     # averaged_logits is [N_borders, N_samples] after the transpose in
     # forward(); reduce to a scalar per sample via softmax over bin centers.

From 644aac833680f5680b4402570621c13bf73e850e Mon Sep 17 00:00:00 2001
From: lujiazho <lujiazho@usc.edu>
Date: Tue, 12 May 2026 11:11:33 -0700
Subject: [PATCH 7/9] Fix missing n_estimators_ in differentiable_input fit
 path

The differentiable-input fit path on TabPFNRegressor never set
self.n_estimators_, so forward() / predict() crashed on tqdm(total=...)
with AttributeError. Two call sites were missing the assignment:

1. _initialize_for_differentiable_input now sets n_estimators_ via
   scale_n_estimators_for_feature_coverage, mirroring classifier.py:650.
2. fit_with_differentiable_input's else branch (subsequent fits) now
   re-asserts n_estimators_ from cached ensemble configs, mirroring
   classifier.py:948.

The stale assert len(...) == self.n_estimators (missing underscore) is
fixed at the same time.
---
 src/tabpfn/regressor.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/tabpfn/regressor.py b/src/tabpfn/regressor.py
index 24f8d6360..8bb0d864a 100644
--- a/src/tabpfn/regressor.py
+++ b/src/tabpfn/regressor.py
@@ -744,11 +744,19 @@ def _initialize_for_differentiable_input(
         X, y_normalized = self._refresh_targets_for_differentiable_input(X, y)
 
         preprocessor_configs = [PreprocessorConfig("none", differentiable=True)]
+        # n_estimators_ mirrors classifier.py:650 — must be set here so
+        # downstream predict()/forward() (which use self.n_estimators_) work
+        # after the differentiable fit path.
+        self.n_estimators_ = scale_n_estimators_for_feature_coverage(
+            n_estimators=self.n_estimators,
+            n_total_features=n_features,
+            preprocessor_configs=preprocessor_configs,
+        )
         # Polynomial features go through sklearn StandardScaler on numpy and
         # are not differentiable; force "no" regardless of the runtime default
         # (the regressor config defaults to a non-zero value).
         ensemble_configs = generate_regression_ensemble_configs(
-            num_estimators=self.n_estimators,
+            num_estimators=self.n_estimators_,
             add_fingerprint_feature=self.inference_config_.FINGERPRINT_FEATURE,
             feature_shift_decoder=self.inference_config_.FEATURE_SHIFT_METHOD,
             polynomial_features="no",
@@ -760,7 +768,7 @@ def _initialize_for_differentiable_input(
                 estimator_type=self.estimator_type
             ),
         )
-        assert len(ensemble_configs) == self.n_estimators
+        assert len(ensemble_configs) == self.n_estimators_
 
         return ensemble_configs, X, y_normalized
 
@@ -956,6 +964,9 @@ def fit_with_differentiable_input(self, X: torch.Tensor, y: torch.Tensor) -> Sel
                 self.inference_precision, self.devices_
             )
             ensemble_configs = self.ensemble_configs_  # Reuse from first fit
+            # Mirror classifier.py:948 — re-assert n_estimators_ from cached
+            # configs so it survives across calls (and after pickling).
+            self.n_estimators_ = len(ensemble_configs)
             # Re-validate and re-normalise y for the new fit data so that
             # raw_space_bardist_ and y_train_mean_/std_ track the current
             # targets. The model load and ensemble configs stay cached.

From 01af696f591ff3c2c0672067db18b1ee9acd10fb Mon Sep 17 00:00:00 2001
From: lujiazho <lujiazho@usc.edu>
Date: Tue, 12 May 2026 11:12:24 -0700
Subject: [PATCH 8/9] Reduce duplication in differentiable_input path on
 TabPFNRegressor
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

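Per klemens-floege review on PR #923. No intended behaviour change — same
differentiable-input semantics, just less code duplication. Note that the
differentiable path now builds its executor via create_inference_engine()
(honouring self.fit_mode) instead of constructing
InferenceEngineCachePreprocessing directly.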

- Share the categorical-features guard. New
  reject_categoricals_for_differentiable_input() in base.py replaces the
  identical inline checks in TabPFNClassifier and TabPFNRegressor.
- Extract _rebuild_raw_space_bardist() on TabPFNRegressor. The same
  three-line construction (borders * std + mean as a
  FullSupportBarDistribution) appears in the standard fit path and the
  differentiable path; the helper detaches borders unconditionally so the
  buffer never holds a y autograd graph (no-op for the standard path).
- Extract _build_ensemble_preprocessor_and_executor() on TabPFNRegressor.
  The two paths' executor-build blocks now share one method; the only
  per-path arguments are n_preprocessing_jobs (1 in the differentiable
  path, so X is never pickled across a joblib process boundary and its
  autograd graph stays intact) and inference_mode (False under
  differentiable input).
- Inline _refresh_targets_for_differentiable_input back into
  fit_with_differentiable_input. Lifecycle is clearer with the y-target
  validation, normalisation, and bardist rebuild laid out linearly after
  the first-call / cached-state branch.
- Consolidate three bad-input ValueError tests into one
  pytest.parametrize covering categorical_features, constant_target, and
  single_sample cases.
---
 src/tabpfn/base.py                |  18 ++
 src/tabpfn/classifier.py          |  11 +-
 src/tabpfn/regressor.py           | 264 +++++++++++++++---------------
 tests/test_regressor_interface.py |  84 +++++-----
 4 files changed, 194 insertions(+), 183 deletions(-)

diff --git a/src/tabpfn/base.py b/src/tabpfn/base.py
index 01cd6ef32..fc34f1cf7 100644
--- a/src/tabpfn/base.py
+++ b/src/tabpfn/base.py
@@ -370,6 +370,24 @@ def create_inference_engine(  # noqa: PLR0913
     raise ValueError(f"Invalid fit_mode: {fit_mode}")
 
 
+def reject_categoricals_for_differentiable_input(
+    categorical_features_indices: Sequence[int] | None,
+) -> None:
+    """Reject categorical features in the differentiable-input fit path.
+
+    The differentiable path uses an identity preprocessor (no
+    ordinal-encoding step), so categorical columns have no valid handling
+    and would corrupt the prompt-tuning signal.
+    """
+    if (
+        categorical_features_indices is not None
+        and len(categorical_features_indices) > 0
+    ):
+        raise ValueError(
+            "Categorical features are not supported for differentiable input."
+        )
+
+
 def initialize_model_variables_helper(
     calling_instance: TabPFNRegressor | TabPFNClassifier,
     model_type: Literal["regressor", "classifier"],
diff --git a/src/tabpfn/classifier.py b/src/tabpfn/classifier.py
index 21b21d33d..70504b206 100644
--- a/src/tabpfn/classifier.py
+++ b/src/tabpfn/classifier.py
@@ -41,6 +41,7 @@
     get_embeddings,
     initialize_model_variables_helper,
     initialize_telemetry,
+    reject_categoricals_for_differentiable_input,
 )
 from tabpfn.constants import (
     PROBABILITY_EPSILON_ROUND_ZERO,
@@ -635,13 +636,9 @@ def _initialize_for_differentiable_input(
         )
 
         # Minimal preprocessing for prompt tuning
-        if (
-            self.categorical_features_indices is not None
-            and len(self.categorical_features_indices) > 0
-        ):
-            raise ValueError(
-                "Categorical features are not supported for differentiable input."
-            )
+        reject_categoricals_for_differentiable_input(
+            self.categorical_features_indices
+        )
         n_features = X.shape[1]
         features = [Feature(name=None, modality=FeatureModality.NUMERICAL)] * n_features
         self.inferred_feature_schema_ = FeatureSchema(features=features)
diff --git a/src/tabpfn/regressor.py b/src/tabpfn/regressor.py
index 8bb0d864a..e9fb39732 100644
--- a/src/tabpfn/regressor.py
+++ b/src/tabpfn/regressor.py
@@ -48,6 +48,7 @@
     get_embeddings,
     initialize_model_variables_helper,
     initialize_telemetry,
+    reject_categoricals_for_differentiable_input,
 )
 from tabpfn.constants import (
     REGRESSION_CONSTANT_TARGET_BORDER_EPSILON,
@@ -57,7 +58,6 @@
 from tabpfn.inference import (
     InferenceEngine,
     InferenceEngineBatchedNoPreprocessing,
-    InferenceEngineCachePreprocessing,
 )
 from tabpfn.model_loading import (
     ModelSource,
@@ -647,89 +647,92 @@ def _initialize_model_variables(self) -> int:
         """
         return initialize_model_variables_helper(self, self.estimator_type)
 
-    def _refresh_targets_for_differentiable_input(
-        self, X: torch.Tensor, y: torch.Tensor
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        """Per-fit-call data-dependent setup for the differentiable path.
-
-        Validates input shape, z-normalises ``y`` as a torch op (preserves
-        grads), updates the standardisation stats, and rebuilds
-        ``raw_space_bardist_`` in the caller's current target scale. Run on
-        every ``fit_with_differentiable_input`` call so the regressor's
-        target stats always match the data being fit; the model load and
-        ensemble configs are cached in ``_initialize_for_differentiable_input``
-        and run only on the first call.
-        """
-        validate_dataset_size(
-            X=X,
-            y=y,
-            max_num_samples=self.inference_config_.MAX_NUMBER_OF_SAMPLES,
-            max_num_features=self.inference_config_.MAX_NUMBER_OF_FEATURES,
-            devices=self.devices_,
-            ignore_pretraining_limits=self.ignore_pretraining_limits,
-        )
-        self.n_train_samples_ = int(X.shape[0])
-
-        y_float = (
-            y.float()
-            if isinstance(y, torch.Tensor)
-            else torch.as_tensor(y, dtype=torch.float32)
-        )
-        y_mean = y_float.mean()
-        # Match the standard fit path's np.std (population std, ddof=0).
-        # torch.std defaults to correction=1 (sample std), which differs from
-        # numpy and returns NaN for N=1; clamp keeps the divisor non-zero.
-        y_std = torch.clamp(y_float.std(correction=0), min=1e-20)
-        # Constant targets would collapse the bardist borders to a single
-        # point; the differentiable path has no analogue of fit()'s
-        # is_constant_target_ short-circuit, so reject up front.
-        if y_std.detach().item() <= 1e-12:
-            raise ValueError(
-                "Constant or near-constant target (std≈0) is not supported "
-                "by fit_with_differentiable_input; there is no signal to "
-                "predict differentiably. Use fit() for constant-target data."
-            )
-        # Detach when storing as Python floats — raw_space_bardist_ is a
-        # frozen lookup table and must not hold a y-grad graph. Users who
-        # need fully differentiable target scaling should z-normalise y
-        # themselves before calling fit_with_differentiable_input so the
-        # mean/std are constants here.
-        self.y_train_mean_ = y_mean.detach().item()
-        self.y_train_std_ = y_std.detach().item()
-        y_normalized = (y_float - y_mean) / y_std
+    def _rebuild_raw_space_bardist(self) -> None:
+        """Rebuild ``raw_space_bardist_`` from current ``y_train_mean_``/std_.
 
-        # raw_space_bardist_ is a constant lookup in the caller's target
-        # scale; detach so the buffer does not hold onto y's grad graph.
+        Detaches the znorm-space borders so the rebuilt buffer never holds a
+        y autograd graph — required for the differentiable-input path and a
+        no-op for the standard path. Both ``y_train_mean_`` and
+        ``y_train_std_`` must already be set as Python floats.
+        """
         borders = self.znorm_space_bardist_.borders.detach()
         self.raw_space_bardist_ = FullSupportBarDistribution(
             borders * self.y_train_std_ + self.y_train_mean_,
         ).float()
-        return X, y_normalized
+
+    def _build_ensemble_preprocessor_and_executor(
+        self,
+        *,
+        X: Any,
+        y: Any,
+        ensemble_configs: list[RegressorEnsembleConfig],
+        static_seed: int,
+        byte_size: int,
+        n_preprocessing_jobs: int,
+        inference_mode: bool,
+    ) -> None:
+        """Build ``self.ensemble_preprocessor_`` and ``self.executor_``.
+
+        Shared between the standard fit path and the differentiable-input
+        path. The two paths differ only in ``n_preprocessing_jobs``
+        (forced to 1 in the differentiable path so the autograd graph on
+        ``X`` survives joblib's process-boundary pickling) and
+        ``inference_mode`` (False under differentiable input so backprop
+        works through the executor).
+        """
+        self.ensemble_preprocessor_ = TabPFNEnsemblePreprocessor(
+            configs=ensemble_configs,
+            n_samples=X.shape[0],
+            feature_schema=self.inferred_feature_schema_,
+            # Use static_seed so we're independent of any random generation
+            # inside the initialize functions above.
+            random_state=static_seed,
+            n_preprocessing_jobs=n_preprocessing_jobs,
+            keep_fitted_cache=(self.fit_mode == "fit_with_cache"),
+            enable_gpu_preprocessing=self.inference_config_.ENABLE_GPU_PREPROCESSING,
+            feature_subsampling_method=FeatureSubsamplingMethod(
+                self.inference_config_.FEATURE_SUBSAMPLING_METHOD
+            ),
+            constant_feature_count=self.inference_config_.FEATURE_SUBSAMPLING_CONSTANT_FEATURE_COUNT,
+            subsample_samples=self.inference_config_.SUBSAMPLE_SAMPLES,
+            importance_top_k_count=self.inference_config_.FEATURE_SUBSAMPLING_IMPORTANCE_TOP_K_COUNT,
+            X_train=X,
+            y_train=y,
+            task_type=self.estimator_type,
+        )
+        self.executor_ = create_inference_engine(
+            fit_mode=self.fit_mode,
+            X_train=X,
+            y_train=y,
+            ensemble_preprocessor=self.ensemble_preprocessor_,
+            models=self.models_,
+            devices_=self.devices_,
+            byte_size=byte_size,
+            forced_inference_dtype_=self.forced_inference_dtype_,
+            memory_saving_mode=self.memory_saving_mode,
+            use_autocast_=self.use_autocast_,
+            inference_mode=inference_mode,
+        )
 
     def _initialize_for_differentiable_input(
         self,
         X: torch.Tensor,
-        y: torch.Tensor,
         rng: np.random.Generator,
-    ) -> tuple[list[RegressorEnsembleConfig], torch.Tensor, torch.Tensor]:
+    ) -> tuple[list[RegressorEnsembleConfig], torch.Tensor]:
         """First-call setup for the differentiable path.
 
         Mirrors the classifier-side helper so that gradients can flow from a
         loss back to upstream torch modules feeding ``X`` (and optionally
         ``y``). Skips the standard numpy preprocessing path and uses a
-        differentiable identity preprocessor. Subsequent calls reuse the
-        feature schema and ensemble configs but re-run target normalization
-        via ``_refresh_targets_for_differentiable_input``.
+        differentiable identity preprocessor. y-target normalization happens
+        every call inside ``fit_with_differentiable_input``; this helper is
+        only for the cached feature-schema and ensemble-config setup.
         """
         # Minimal preprocessing for prompt tuning: no categorical features,
         # all-numerical schema, identity preprocessor that preserves grads.
-        if (
-            self.categorical_features_indices is not None
-            and len(self.categorical_features_indices) > 0
-        ):
-            raise ValueError(
-                "Categorical features are not supported for differentiable input."
-            )
+        reject_categoricals_for_differentiable_input(
+            self.categorical_features_indices
+        )
         n_features = X.shape[1]
         # One Feature instance per column — list multiplication would share
         # the same dataclass and any later in-place update would leak across
@@ -741,12 +744,7 @@ def _initialize_for_differentiable_input(
         self.inferred_feature_schema_ = FeatureSchema(features=features)
         self.n_features_in_ = n_features
 
-        X, y_normalized = self._refresh_targets_for_differentiable_input(X, y)
-
         preprocessor_configs = [PreprocessorConfig("none", differentiable=True)]
-        # n_estimators_ mirrors classifier.py:650 — must be set here so
-        # downstream predict()/forward() (which use self.n_estimators_) work
-        # after the differentiable fit path.
         self.n_estimators_ = scale_n_estimators_for_feature_coverage(
             n_estimators=self.n_estimators,
             n_total_features=n_features,
@@ -770,7 +768,7 @@ def _initialize_for_differentiable_input(
         )
         assert len(ensemble_configs) == self.n_estimators_
 
-        return ensemble_configs, X, y_normalized
+        return ensemble_configs, X
 
     def _initialize_dataset_preprocessing(
         self,
@@ -954,8 +952,8 @@ def fit_with_differentiable_input(self, X: torch.Tensor, y: torch.Tensor) -> Sel
         is_first_fit_call = not hasattr(self, "models_")
         if is_first_fit_call:
             byte_size = self._initialize_model_variables()
-            ensemble_configs, X, y = self._initialize_for_differentiable_input(
-                X=X, y=y, rng=rng
+            ensemble_configs, X = self._initialize_for_differentiable_input(
+                X=X, rng=rng
             )
             self.ensemble_configs_ = ensemble_configs  # Store for prompt tuning reuse
             remove_non_differentiable_preprocessing_from_models(models=self.models_)
@@ -964,39 +962,59 @@ def fit_with_differentiable_input(self, X: torch.Tensor, y: torch.Tensor) -> Sel
                 self.inference_precision, self.devices_
             )
             ensemble_configs = self.ensemble_configs_  # Reuse from first fit
-            # Mirror classifier.py:948 — re-assert n_estimators_ from cached
-            # configs so it survives across calls (and after pickling).
+            # Mirror classifier.py: re-assert n_estimators_ from cached
+            # configs so a subsequent call after pickling restores it.
             self.n_estimators_ = len(ensemble_configs)
-            # Re-validate and re-normalise y for the new fit data so that
-            # raw_space_bardist_ and y_train_mean_/std_ track the current
-            # targets. The model load and ensemble configs stay cached.
-            X, y = self._refresh_targets_for_differentiable_input(X, y)
 
-        # Force sequential preprocessing: with differentiable input, X carries
+        # Refresh target stats and rebuild the raw-space bardist on every
+        # call so they track the current fit data; cached state is only the
+        # model load, feature schema, and ensemble configs above.
+        validate_dataset_size(
+            X=X,
+            y=y,
+            max_num_samples=self.inference_config_.MAX_NUMBER_OF_SAMPLES,
+            max_num_features=self.inference_config_.MAX_NUMBER_OF_FEATURES,
+            devices=self.devices_,
+            ignore_pretraining_limits=self.ignore_pretraining_limits,
+        )
+        self.n_train_samples_ = int(X.shape[0])
+
+        y_float = (
+            y.float()
+            if isinstance(y, torch.Tensor)
+            else torch.as_tensor(y, dtype=torch.float32)
+        )
+        y_mean = y_float.mean()
+        # Match the standard fit's np.std (population std, ddof=0). torch.std
+        # defaults to correction=1 and returns NaN for N=1; clamp keeps the
+        # divisor non-zero. The constant-target guard below catches the
+        # remaining bardist-collapse case.
+        y_std = torch.clamp(y_float.std(correction=0), min=1e-20)
+        if y_std.detach().item() <= 1e-12:
+            raise ValueError(
+                "Constant or near-constant target (std≈0) is not supported "
+                "by fit_with_differentiable_input; there is no signal to "
+                "predict differentiably. Use fit() for constant-target data."
+            )
+        # Detach when storing as Python floats — raw_space_bardist_ is a
+        # frozen lookup and must not hold a y autograd graph. Users who need
+        # fully differentiable target scaling should z-normalise y themselves
+        # before calling so the mean/std are constants here.
+        self.y_train_mean_ = y_mean.detach().item()
+        self.y_train_std_ = y_std.detach().item()
+        y = (y_float - y_mean) / y_std
+        self._rebuild_raw_space_bardist()
+
+        # Force sequential preprocessing: with differentiable input X carries
         # an autograd graph that does not survive joblib's process-boundary
         # pickling. Sequential execution preserves the graph in-process.
-        self.ensemble_preprocessor_ = TabPFNEnsemblePreprocessor(
-            configs=ensemble_configs,
-            n_samples=X.shape[0],
-            feature_schema=self.inferred_feature_schema_,
-            random_state=static_seed,
+        self._build_ensemble_preprocessor_and_executor(
+            X=X,
+            y=y,
+            ensemble_configs=ensemble_configs,
+            static_seed=static_seed,
+            byte_size=byte_size,
             n_preprocessing_jobs=1,
-            feature_subsampling_method=FeatureSubsamplingMethod(
-                self.inference_config_.FEATURE_SUBSAMPLING_METHOD
-            ),
-            constant_feature_count=self.inference_config_.FEATURE_SUBSAMPLING_CONSTANT_FEATURE_COUNT,
-            subsample_samples=self.inference_config_.SUBSAMPLE_SAMPLES,
-        )
-
-        self.executor_ = InferenceEngineCachePreprocessing(
-            X_train=X,
-            y_train=y,
-            models=self.models_,
-            ensemble_preprocessor=self.ensemble_preprocessor_,
-            devices=self.devices_,
-            dtype_byte_size=byte_size,
-            force_inference_dtype=self.forced_inference_dtype_,
-            save_peak_mem=self.memory_saving_mode,
             inference_mode=False,
         )
 
@@ -1069,43 +1087,17 @@ def fit(self, X: XType, y: YType) -> Self:
         self.y_train_std_ = std.item() + 1e-20
         self.y_train_mean_ = mean.item()
         y = (y - self.y_train_mean_) / self.y_train_std_
-        self.raw_space_bardist_ = FullSupportBarDistribution(
-            self.znorm_space_bardist_.borders * self.y_train_std_ + self.y_train_mean_,
-        ).float()
+        self._rebuild_raw_space_bardist()
 
-        ensemble_preprocessor = TabPFNEnsemblePreprocessor(
-            configs=ensemble_configs,
-            n_samples=X.shape[0],
-            feature_schema=self.inferred_feature_schema_,
-            # Note: we use the static_seed so we're independent of the random generation
-            # inside the initialize function above
-            random_state=static_seed,
-            n_preprocessing_jobs=self.n_preprocessing_jobs,
-            keep_fitted_cache=(self.fit_mode == "fit_with_cache"),
-            enable_gpu_preprocessing=self.inference_config_.ENABLE_GPU_PREPROCESSING,
-            feature_subsampling_method=FeatureSubsamplingMethod(
-                self.inference_config_.FEATURE_SUBSAMPLING_METHOD
-            ),
-            constant_feature_count=self.inference_config_.FEATURE_SUBSAMPLING_CONSTANT_FEATURE_COUNT,
-            subsample_samples=self.inference_config_.SUBSAMPLE_SAMPLES,
-            importance_top_k_count=self.inference_config_.FEATURE_SUBSAMPLING_IMPORTANCE_TOP_K_COUNT,
-            X_train=X,
-            y_train=y,
-            task_type=self.estimator_type,
-        )
-
-        self.executor_ = create_inference_engine(
-            fit_mode=self.fit_mode,
-            X_train=X,
-            y_train=y,
-            ensemble_preprocessor=ensemble_preprocessor,
-            models=self.models_,
-            devices_=self.devices_,
+        self._build_ensemble_preprocessor_and_executor(
+            X=X,
+            y=y,
+            ensemble_configs=ensemble_configs,
+            static_seed=static_seed,
             byte_size=byte_size,
-            forced_inference_dtype_=self.forced_inference_dtype_,
-            memory_saving_mode=self.memory_saving_mode,
-            use_autocast_=self.use_autocast_,
+            n_preprocessing_jobs=self.n_preprocessing_jobs,
             # TODO: Standard fit usually uses inference_mode=True, before it was enabled
+            inference_mode=True,
         )
 
         return self
diff --git a/tests/test_regressor_interface.py b/tests/test_regressor_interface.py
index d66435d9b..54bd44450 100644
--- a/tests/test_regressor_interface.py
+++ b/tests/test_regressor_interface.py
@@ -1054,53 +1054,57 @@ def test__fit__differentiable_input_true__raises_helpful_error() -> None:
         reg.fit(X, y)
 
 
-def test__fit_with_differentiable_input__categorical_features_rejected() -> None:
-    """The differentiable path does not support categorical features."""
-    reg = TabPFNRegressor(
-        n_estimators=1,
-        ignore_pretraining_limits=True,
-        device="cpu",
-        differentiable_input=True,
-        categorical_features_indices=[0],
-    )
-    X = torch.randn(20, 4)
-    y = torch.randn(20)
-    with pytest.raises(ValueError, match="Categorical features"):
-        reg.fit_with_differentiable_input(X, y)
-
-
-def test__fit_with_differentiable_input__constant_target_rejected() -> None:
-    """A constant-target y has no signal to predict differentiably and would
-    collapse the bardist borders; reject with a clear error.
-    """
-    reg = TabPFNRegressor(
-        n_estimators=1,
-        ignore_pretraining_limits=True,
-        device="cpu",
-        differentiable_input=True,
-    )
-    X = torch.randn(5, 4)
-    y = torch.full((5,), 3.14)
-    with pytest.raises(ValueError, match="Constant or near-constant target"):
-        reg.fit_with_differentiable_input(X, y)
-
-
-def test__fit_with_differentiable_input__single_sample_y_does_not_nan() -> None:
-    """torch.std defaults to sample std (correction=1) which returns NaN for
-    N=1. Our path uses correction=0 (population std) so std is well defined
-    even for a single sample (it just collapses to 0, which then trips the
-    constant-target guard — what we want). Verify the failure mode is the
-    explicit ValueError, not a downstream NaN.
+@pytest.mark.parametrize(
+    ("case_id", "extra_kwargs", "X", "y", "match"),
+    [
+        # The differentiable path uses an identity preprocessor and has no
+        # ordinal-encoding step, so categorical columns have no valid handling.
+        (
+            "categorical_features",
+            {"categorical_features_indices": [0]},
+            torch.randn(20, 4),
+            torch.randn(20),
+            "Categorical features",
+        ),
+        # Constant target collapses the bardist borders to a single point.
+        (
+            "constant_target",
+            {},
+            torch.randn(5, 4),
+            torch.full((5,), 3.14),
+            "Constant or near-constant target",
+        ),
+        # torch.std defaults to correction=1 and returns NaN for N=1; our path
+        # uses correction=0 so std collapses to 0 and trips the constant-target
+        # guard instead of a downstream NaN.
+        (
+            "single_sample",
+            {},
+            torch.randn(1, 4),
+            torch.tensor([2.0]),
+            "Constant or near-constant target",
+        ),
+    ],
+)
+def test__fit_with_differentiable_input__bad_input_raises_value_error(
+    case_id: str,
+    extra_kwargs: dict[str, object],
+    X: torch.Tensor,
+    y: torch.Tensor,
+    match: str,
+) -> None:
+    """Bad inputs to the differentiable fit path must raise ValueError with a
+    clear message rather than producing NaNs or crashing downstream.
     """
+    del case_id  # Only used for parametrize ids.
     reg = TabPFNRegressor(
         n_estimators=1,
         ignore_pretraining_limits=True,
         device="cpu",
         differentiable_input=True,
+        **extra_kwargs,  # type: ignore[arg-type]
     )
-    X = torch.randn(1, 4)
-    y = torch.tensor([2.0])
-    with pytest.raises(ValueError, match="Constant or near-constant target"):
+    with pytest.raises(ValueError, match=match):
         reg.fit_with_differentiable_input(X, y)
 
 

From 6988b86f3d2044e032567b6a8f6d5a64e8b78a26 Mon Sep 17 00:00:00 2001
From: lujiazho <lujiazho@usc.edu>
Date: Fri, 15 May 2026 11:40:47 -0700
Subject: [PATCH 9/9] Apply ruff format to classifier and regressor

---
 src/tabpfn/classifier.py | 4 +---
 src/tabpfn/regressor.py  | 4 +---
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/tabpfn/classifier.py b/src/tabpfn/classifier.py
index 70504b206..f3dc426ad 100644
--- a/src/tabpfn/classifier.py
+++ b/src/tabpfn/classifier.py
@@ -636,9 +636,7 @@ def _initialize_for_differentiable_input(
         )
 
         # Minimal preprocessing for prompt tuning
-        reject_categoricals_for_differentiable_input(
-            self.categorical_features_indices
-        )
+        reject_categoricals_for_differentiable_input(self.categorical_features_indices)
         n_features = X.shape[1]
         features = [Feature(name=None, modality=FeatureModality.NUMERICAL)] * n_features
         self.inferred_feature_schema_ = FeatureSchema(features=features)
diff --git a/src/tabpfn/regressor.py b/src/tabpfn/regressor.py
index e9fb39732..bd3fed727 100644
--- a/src/tabpfn/regressor.py
+++ b/src/tabpfn/regressor.py
@@ -730,9 +730,7 @@ def _initialize_for_differentiable_input(
         """
         # Minimal preprocessing for prompt tuning: no categorical features,
         # all-numerical schema, identity preprocessor that preserves grads.
-        reject_categoricals_for_differentiable_input(
-            self.categorical_features_indices
-        )
+        reject_categoricals_for_differentiable_input(self.categorical_features_indices)
         n_features = X.shape[1]
         # One Feature instance per column — list multiplication would share
         # the same dataclass and any later in-place update would leak across