From e828b14f0bab9c7437d920f978c3acdf3fcebe1c Mon Sep 17 00:00:00 2001 From: lujiazho Date: Wed, 6 May 2026 18:17:27 -0700 Subject: [PATCH 1/9] Support differentiable_input on TabPFNRegressor Mirrors the classifier-side prompt-tuning path so gradients can flow from a downstream loss back through TabPFNRegressor to upstream torch modules feeding X (and y, when it carries grads). Previously, TabPFNRegressor.fit raised ValueError("Differentiable input is not supported for regressors yet.") and there was no fit_with_differentiable_input. What this changes: - _initialize_for_differentiable_input(X, y, rng): minimal preprocessing that uses PreprocessorConfig("none", differentiable=True), z-normalises y as a torch op (preserves grads), and rebuilds raw_space_bardist_ in the caller's target scale. Polynomial features are forced to "no" since the polynomial step relies on sklearn StandardScaler on numpy. - fit_with_differentiable_input(X, y): mirrors the classifier method; builds an InferenceEngineCachePreprocessing with inference_mode=False. - _iter_forward_executor: gates use_inference_mode on differentiable_input so a user calling forward(X, use_inference_mode=True) after fit_with_differentiable_input still gets gradients (parallel to the classifier's existing actual_inference_mode gate). - fit() now raises a clearer ValueError pointing users to the new method when differentiable_input=True, instead of silently converting torch tensors to numpy. Tests: - end-to-end gradient-flow test (CPU + CUDA): a loss computed from forward output produces a finite, non-zero gradient on an upstream nn.Linear's weight. - guard tests for fit() with differentiable_input=True and for categorical features under the differentiable path. 
--- src/tabpfn/regressor.py | 170 +++++++++++++++++++++++++++++- tests/test_regressor_interface.py | 93 ++++++++++++++++ 2 files changed, 258 insertions(+), 5 deletions(-) diff --git a/src/tabpfn/regressor.py b/src/tabpfn/regressor.py index ae5daddf3..8efe45de1 100644 --- a/src/tabpfn/regressor.py +++ b/src/tabpfn/regressor.py @@ -54,7 +54,11 @@ ModelVersion, ) from tabpfn.errors import TabPFNValidationError, handle_oom_errors -from tabpfn.inference import InferenceEngine, InferenceEngineBatchedNoPreprocessing +from tabpfn.inference import ( + InferenceEngine, + InferenceEngineBatchedNoPreprocessing, + InferenceEngineCachePreprocessing, +) from tabpfn.model_loading import ( ModelSource, load_fitted_tabpfn_model, @@ -65,12 +69,13 @@ from tabpfn.preprocessing import ( EnsembleConfig, FeatureSubsamplingMethod, + PreprocessorConfig, RegressorEnsembleConfig, clean_data, generate_regression_ensemble_configs, ) from tabpfn.preprocessing.clean import fix_dtypes, process_text_na_dataframe -from tabpfn.preprocessing.datamodel import FeatureModality, FeatureSchema +from tabpfn.preprocessing.datamodel import Feature, FeatureModality, FeatureSchema from tabpfn.preprocessing.ensemble import ( TabPFNEnsemblePreprocessor, scale_n_estimators_for_feature_coverage, @@ -83,12 +88,14 @@ DevicesSpecification, convert_batch_of_cat_ix_to_schema, infer_random_state, + remove_non_differentiable_preprocessing_from_models, transform_borders_one, translate_probs_across_borders, ) from tabpfn.validation import ( ensure_compatible_fit_inputs, ensure_compatible_predict_input_sklearn, + validate_dataset_size, ) if TYPE_CHECKING: @@ -640,6 +647,86 @@ def _initialize_model_variables(self) -> int: """ return initialize_model_variables_helper(self, self.estimator_type) + def _initialize_for_differentiable_input( + self, + X: torch.Tensor, + y: torch.Tensor, + rng: np.random.Generator, + ) -> tuple[list[RegressorEnsembleConfig], torch.Tensor, torch.Tensor]: + """Initialize the model for differentiable 
input. + + Mirrors the classifier-side helper so that gradients can flow from a + loss back to upstream torch modules feeding ``X`` (and optionally + ``y``). Skips the standard numpy preprocessing path and uses a + differentiable identity preprocessor. + + Returns the ensemble configs together with ``X`` and the + z-normalised ``y``. The standardisation parameters are stored on + ``self`` so ``raw_space_bardist_`` reflects the caller's target + scale. + """ + validate_dataset_size( + X=X, + y=y, + max_num_samples=self.inference_config_.MAX_NUMBER_OF_SAMPLES, + max_num_features=self.inference_config_.MAX_NUMBER_OF_FEATURES, + devices=self.devices_, + ignore_pretraining_limits=self.ignore_pretraining_limits, + ) + + # Minimal preprocessing for prompt tuning: no categorical features, + # all-numerical schema, identity preprocessor that preserves grads. + if ( + self.categorical_features_indices is not None + and len(self.categorical_features_indices) > 0 + ): + raise ValueError( + "Categorical features are not supported for differentiable input." + ) + n_features = X.shape[1] + features = [Feature(name=None, modality=FeatureModality.NUMERICAL)] * n_features + self.inferred_feature_schema_ = FeatureSchema(features=features) + self.n_features_in_ = n_features + self.n_train_samples_ = int(X.shape[0]) + + # z-normalise y as a torch op so that gradients flow if y has them. + y_float = y.float() if isinstance(y, torch.Tensor) else torch.as_tensor( + y, dtype=torch.float32 + ) + y_mean = y_float.mean() + y_std = y_float.std() + 1e-20 + self.y_train_mean_ = y_mean.detach().item() + self.y_train_std_ = y_std.detach().item() + y_normalized = (y_float - y_mean) / y_std + + # raw_space_bardist_ is a constant lookup in caller's target scale; we + # detach so the buffer does not accidentally hold onto y's grad graph. 
+ borders = self.znorm_space_bardist_.borders.detach() + self.raw_space_bardist_ = FullSupportBarDistribution( + borders * self.y_train_std_ + self.y_train_mean_, + ).float() + + preprocessor_configs = [PreprocessorConfig("none", differentiable=True)] + # Polynomial features go through sklearn StandardScaler on numpy and + # are not differentiable; force "no" regardless of the runtime default + # (the regressor config defaults to a non-zero value). + ensemble_configs = generate_regression_ensemble_configs( + num_estimators=self.n_estimators, + add_fingerprint_feature=self.inference_config_.FINGERPRINT_FEATURE, + feature_shift_decoder=self.inference_config_.FEATURE_SHIFT_METHOD, + polynomial_features="no", + preprocessor_configs=preprocessor_configs, + target_transforms=[None], + random_state=rng, + num_models=len(self.models_), + outlier_removal_std=self.inference_config_.get_resolved_outlier_removal_std( + estimator_type=self.estimator_type + ), + ) + assert len(ensemble_configs) == self.n_estimators + + return ensemble_configs, X, y_normalized + def _initialize_dataset_preprocessing( self, X: XType, @@ -793,6 +880,75 @@ def fit_from_preprocessed( return self + @track_model_call(model_method="fit", param_names=["X", "y"]) + def fit_with_differentiable_input( + self, X: torch.Tensor, y: torch.Tensor + ) -> Self: + """Fit the model with differentiable input. + + Mirror of ``TabPFNClassifier.fit_with_differentiable_input``. Lets + gradients flow from a downstream loss back through ``X`` (and ``y``, + if it carries grads) into upstream torch modules. Use this instead + of ``fit`` when ``differentiable_input=True``. + + Args: + X: The input data as a torch tensor. + y: The target variable as a torch tensor. + + Returns: + self + """ + if self.fit_mode != "fit_preprocessors": + logging.warning( + "The model was not in 'fit_preprocessors' mode. " + "Automatically switching to 'fit_preprocessors' mode for differentiable" + " input." 
+ ) + self.fit_mode = "fit_preprocessors" + + static_seed, rng = infer_random_state(self.random_state) + + is_first_fit_call = not hasattr(self, "models_") + if is_first_fit_call: + byte_size = self._initialize_model_variables() + ensemble_configs, X, y = self._initialize_for_differentiable_input( + X=X, y=y, rng=rng + ) + self.ensemble_configs_ = ensemble_configs # Store for prompt tuning reuse + remove_non_differentiable_preprocessing_from_models(models=self.models_) + else: + _, _, byte_size = determine_precision( + self.inference_precision, self.devices_ + ) + ensemble_configs = self.ensemble_configs_ # Reuse from first fit + + self.ensemble_preprocessor_ = TabPFNEnsemblePreprocessor( + configs=ensemble_configs, + n_samples=X.shape[0], + feature_schema=self.inferred_feature_schema_, + random_state=static_seed, + n_preprocessing_jobs=self.n_preprocessing_jobs, + feature_subsampling_method=FeatureSubsamplingMethod( + self.inference_config_.FEATURE_SUBSAMPLING_METHOD + ), + constant_feature_count=self.inference_config_.FEATURE_SUBSAMPLING_CONSTANT_FEATURE_COUNT, + subsample_samples=self.inference_config_.SUBSAMPLE_SAMPLES, + ) + + self.executor_ = InferenceEngineCachePreprocessing( + X_train=X, + y_train=y, + models=self.models_, + ensemble_preprocessor=self.ensemble_preprocessor_, + devices=self.devices_, + dtype_byte_size=byte_size, + force_inference_dtype=self.forced_inference_dtype_, + save_peak_mem=self.memory_saving_mode, + inference_mode=False, + ) + + return self + @config_context(transform_output="default") # type: ignore @track_model_call(model_method="fit", param_names=["X", "y"]) def fit(self, X: XType, y: YType) -> Self: @@ -807,7 +963,8 @@ def fit(self, X: XType, y: YType) -> Self: """ if self.differentiable_input: raise ValueError( - "Differentiable input is not supported for regressors yet." + "differentiable_input=True requires fit_with_differentiable_input " + "with torch tensor X and y, not fit()." 
) if self.fit_mode == "batched": @@ -1122,8 +1279,11 @@ def _iter_forward_executor( check_is_fitted(self) # Ensure torch.inference_mode is OFF to allow gradients if self.fit_mode in ["fit_preprocessors", "batched"]: - # only these two modes support this option - self.executor_.use_torch_inference_mode(use_inference=use_inference_mode) + # only these two modes support this option. + # Don't enable inference mode when differentiable_input=True (prompt + # tuning) to allow gradients to flow through. + actual_inference_mode = use_inference_mode and not self.differentiable_input + self.executor_.use_torch_inference_mode(use_inference=actual_inference_mode) std_borders = self.znorm_space_bardist_.borders.cpu().numpy() for output, config in self.executor_.iter_outputs( X, autocast=self.use_autocast_, task_type="regression" diff --git a/tests/test_regressor_interface.py b/tests/test_regressor_interface.py index 0e3c046a0..e9c94c401 100644 --- a/tests/test_regressor_interface.py +++ b/tests/test_regressor_interface.py @@ -976,3 +976,96 @@ def test__create_default_for_version__passes_through_overrides() -> None: assert estimator.n_estimators == 16 assert estimator.softmax_temperature == 0.9 + + +# --------------------------------------------------------------------------- +# differentiable_input +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("device", devices) +def test__fit_with_differentiable_input__grad_flows_to_upstream_module( + device: str, +) -> None: + """End-to-end: a loss computed from forward(use_inference_mode=True) after + fit_with_differentiable_input must produce a non-zero, finite gradient on + an upstream torch module's weights. 
+ """ + torch.manual_seed(0) + D, N_train, N_test = 8, 30, 10 + linear = nn.Linear(D, D).to(device) + + X_train = linear(torch.randn(N_train, D, device=device)) + X_test = linear(torch.randn(N_test, D, device=device)) + y_train = torch.randn(N_train, device=device) + y_test = torch.randn(N_test, device=device) + + reg = TabPFNRegressor( + n_estimators=1, + ignore_pretraining_limits=True, + device=device, + differentiable_input=True, + ) + reg.fit_with_differentiable_input(X_train, y_train) + + averaged_logits, _outputs, borders = reg.forward( + X_test, use_inference_mode=True + ) + + # averaged_logits is [N_borders, N_samples] after the transpose in + # forward(); reduce to a scalar per sample via softmax over bin centers. + per_sample_logits = averaged_logits.transpose(0, 1) # [N_test, N_borders] + border_t = torch.as_tensor( + borders[0], + device=per_sample_logits.device, + dtype=per_sample_logits.dtype, + ) + n_logits = per_sample_logits.shape[-1] + if border_t.numel() == n_logits + 1: + bin_centers = (border_t[:-1] + border_t[1:]) / 2.0 + else: + bin_centers = border_t + probs = torch.softmax(per_sample_logits.float(), dim=-1) + pred_z = (probs * bin_centers).sum(dim=-1) + pred = pred_z * float(reg.y_train_std_) + float(reg.y_train_mean_) + + loss = torch.nn.functional.mse_loss(pred.float(), y_test.float()) + assert loss.requires_grad + loss.backward() + + grad = linear.weight.grad + assert grad is not None, "gradient did not reach upstream nn.Linear" + assert torch.isfinite(grad).all(), "gradient contained NaN/Inf" + assert grad.norm().item() > 0, "gradient norm is zero — graph was detached" + + +def test__fit__differentiable_input_true__raises_helpful_error() -> None: + """Calling .fit() (instead of fit_with_differentiable_input) when + differentiable_input=True must raise a clear error pointing users to the + correct API rather than silently running a non-differentiable path. 
+ """ + reg = TabPFNRegressor( + n_estimators=1, + ignore_pretraining_limits=True, + device="cpu", + differentiable_input=True, + ) + X = np.random.default_rng(0).standard_normal((20, 4)).astype(np.float32) + y = np.random.default_rng(0).standard_normal(20).astype(np.float32) + with pytest.raises(ValueError, match="fit_with_differentiable_input"): + reg.fit(X, y) + + +def test__fit_with_differentiable_input__categorical_features_rejected() -> None: + """The differentiable path does not support categorical features.""" + reg = TabPFNRegressor( + n_estimators=1, + ignore_pretraining_limits=True, + device="cpu", + differentiable_input=True, + categorical_features_indices=[0], + ) + X = torch.randn(20, 4) + y = torch.randn(20) + with pytest.raises(ValueError, match="Categorical features"): + reg.fit_with_differentiable_input(X, y) From f8f97e261fc56ec25056b8ed51ec831b509a7f38 Mon Sep 17 00:00:00 2001 From: lujiazho Date: Wed, 6 May 2026 19:25:22 -0700 Subject: [PATCH 2/9] Add changelog entry for PR #923 Co-Authored-By: Claude Opus 4.7 (1M context) --- changelog/923.fixed.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog/923.fixed.md diff --git a/changelog/923.fixed.md b/changelog/923.fixed.md new file mode 100644 index 000000000..09d861805 --- /dev/null +++ b/changelog/923.fixed.md @@ -0,0 +1 @@ +Add `TabPFNRegressor.fit_with_differentiable_input(X, y)` so gradients can flow from a downstream loss back through the regressor into upstream torch modules feeding `X` (and `y`, when it carries grads). Mirrors the existing classifier-side path — previously `TabPFNRegressor.fit` raised `ValueError("Differentiable input is not supported for regressors yet.")` and there was no differentiable counterpart. 
From cb6c047a52e99e7f279995860f9a35239ce66ba6 Mon Sep 17 00:00:00 2001 From: lujiazho Date: Wed, 6 May 2026 19:39:12 -0700 Subject: [PATCH 3/9] Refresh target stats on every fit_with_differentiable_input call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address gemini-code-assist review on PR #923: the second fit call previously skipped re-normalising y, leaving y_train_mean_, y_train_std_, and raw_space_bardist_ stuck on the first fit's stats — silently mis-scaling predictions when the new target distribution differed. Split _initialize_for_differentiable_input into: - _initialize_for_differentiable_input: first-call-only setup (categorical check, feature schema, ensemble configs). Cached in self.ensemble_configs_. - _refresh_targets_for_differentiable_input: per-call setup (validate_dataset_size, z-normalise y, rebuild raw_space_bardist_, update n_train_samples_). Runs on every fit. fit_with_differentiable_input's else branch now calls the per-call helper so subsequent fits track the current target distribution while still reusing the loaded model and ensemble configs. Add test__fit_with_differentiable_input__second_call_refreshes_target_stats that fits twice with very different y distributions and checks y_train_mean_, y_train_std_, and raw_space_bardist_.borders all move. 
--- src/tabpfn/regressor.py | 82 ++++++++++++++++++------------- tests/test_regressor_interface.py | 33 +++++++++++++ 2 files changed, 82 insertions(+), 33 deletions(-) diff --git a/src/tabpfn/regressor.py b/src/tabpfn/regressor.py index 8efe45de1..5facebe14 100644 --- a/src/tabpfn/regressor.py +++ b/src/tabpfn/regressor.py @@ -647,33 +647,61 @@ def _initialize_model_variables(self) -> int: """ return initialize_model_variables_helper(self, self.estimator_type) + def _refresh_targets_for_differentiable_input( + self, X: torch.Tensor, y: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor]: + """Per-fit-call data-dependent setup for the differentiable path. + + Validates input shape, z-normalises ``y`` as a torch op (preserves + grads), updates the standardisation stats, and rebuilds + ``raw_space_bardist_`` in the caller's current target scale. Run on + every ``fit_with_differentiable_input`` call so the regressor's + target stats always match the data being fit; the model load and + ensemble configs are cached in ``_initialize_for_differentiable_input`` + and run only on the first call. + """ + validate_dataset_size( + X=X, + y=y, + max_num_samples=self.inference_config_.MAX_NUMBER_OF_SAMPLES, + max_num_features=self.inference_config_.MAX_NUMBER_OF_FEATURES, + devices=self.devices_, + ignore_pretraining_limits=self.ignore_pretraining_limits, + ) + self.n_train_samples_ = int(X.shape[0]) + + y_float = y.float() if isinstance(y, torch.Tensor) else torch.as_tensor( + y, dtype=torch.float32 + ) + y_mean = y_float.mean() + y_std = y_float.std() + 1e-20 + self.y_train_mean_ = y_mean.detach().item() + self.y_train_std_ = y_std.detach().item() + y_normalized = (y_float - y_mean) / y_std + + # raw_space_bardist_ is a constant lookup in the caller's target + # scale; detach so the buffer does not hold onto y's grad graph. 
+ borders = self.znorm_space_bardist_.borders.detach() + self.raw_space_bardist_ = FullSupportBarDistribution( + borders * self.y_train_std_ + self.y_train_mean_, + ).float() + return X, y_normalized + def _initialize_for_differentiable_input( self, X: torch.Tensor, y: torch.Tensor, rng: np.random.Generator, ) -> tuple[list[RegressorEnsembleConfig], torch.Tensor, torch.Tensor]: - """Initialize the model for differentiable input. + """First-call setup for the differentiable path. Mirrors the classifier-side helper so that gradients can flow from a loss back to upstream torch modules feeding ``X`` (and optionally ``y``). Skips the standard numpy preprocessing path and uses a - differentiable identity preprocessor. - - Returns the ensemble configs together with ``X`` and the - z-normalised ``y``. The standardisation parameters are stored on - ``self`` so ``raw_space_bardist_`` reflects the caller's target - scale. + differentiable identity preprocessor. Subsequent calls reuse the + feature schema and ensemble configs but re-run target normalization + via ``_refresh_targets_for_differentiable_input``. """ - validate_dataset_size( - X=X, - y=y, - max_num_samples=self.inference_config_.MAX_NUMBER_OF_SAMPLES, - max_num_features=self.inference_config_.MAX_NUMBER_OF_FEATURES, - devices=self.devices_, - ignore_pretraining_limits=self.ignore_pretraining_limits, - ) - # Minimal preprocessing for prompt tuning: no categorical features, # all-numerical schema, identity preprocessor that preserves grads. if ( @@ -687,24 +715,8 @@ def _initialize_for_differentiable_input( features = [Feature(name=None, modality=FeatureModality.NUMERICAL)] * n_features self.inferred_feature_schema_ = FeatureSchema(features=features) self.n_features_in_ = n_features - self.n_train_samples_ = int(X.shape[0]) - # z-normalise y as a torch op so that gradients flow if y has them. 
- y_float = y.float() if isinstance(y, torch.Tensor) else torch.as_tensor( - y, dtype=torch.float32 - ) - y_mean = y_float.mean() - y_std = y_float.std() + 1e-20 - self.y_train_mean_ = y_mean.detach().item() - self.y_train_std_ = y_std.detach().item() - y_normalized = (y_float - y_mean) / y_std - - # raw_space_bardist_ is a constant lookup in caller's target scale; we - # detach so the buffer does not accidentally hold onto y's grad graph. - borders = self.znorm_space_bardist_.borders.detach() - self.raw_space_bardist_ = FullSupportBarDistribution( - borders * self.y_train_std_ + self.y_train_mean_, - ).float() + X, y_normalized = self._refresh_targets_for_differentiable_input(X, y) preprocessor_configs = [PreprocessorConfig("none", differentiable=True)] # Polynomial features go through sklearn StandardScaler on numpy and @@ -921,6 +933,10 @@ def fit_with_differentiable_input( self.inference_precision, self.devices_ ) ensemble_configs = self.ensemble_configs_ # Reuse from first fit + # Re-validate and re-normalise y for the new fit data so that + # raw_space_bardist_ and y_train_mean_/std_ track the current + # targets. The model load and ensemble configs stay cached. 
+ X, y = self._refresh_targets_for_differentiable_input(X, y) self.ensemble_preprocessor_ = TabPFNEnsemblePreprocessor( configs=ensemble_configs, diff --git a/tests/test_regressor_interface.py b/tests/test_regressor_interface.py index e9c94c401..e2e8f16e8 100644 --- a/tests/test_regressor_interface.py +++ b/tests/test_regressor_interface.py @@ -1069,3 +1069,36 @@ def test__fit_with_differentiable_input__categorical_features_rejected() -> None y = torch.randn(20) with pytest.raises(ValueError, match="Categorical features"): reg.fit_with_differentiable_input(X, y) + + +def test__fit_with_differentiable_input__second_call_refreshes_target_stats() -> None: + """A second call with different y must update y_train_mean_/std_ and the + raw_space_bardist_; only the model load and ensemble configs are cached.""" + torch.manual_seed(0) + reg = TabPFNRegressor( + n_estimators=1, + ignore_pretraining_limits=True, + device="cpu", + differentiable_input=True, + ) + X1 = torch.randn(20, 4) + y1 = torch.randn(20) * 10.0 + 100.0 # mean ~100, std ~10 + reg.fit_with_differentiable_input(X1, y1) + mean1, std1 = reg.y_train_mean_, reg.y_train_std_ + bardist_borders1 = reg.raw_space_bardist_.borders.clone() + + X2 = torch.randn(20, 4) + y2 = torch.randn(20) * 0.5 - 5.0 # mean ~-5, std ~0.5 + reg.fit_with_differentiable_input(X2, y2) + mean2, std2 = reg.y_train_mean_, reg.y_train_std_ + + assert abs(mean2 - mean1) > 1.0, ( + f"y_train_mean_ should reflect new y; got {mean1} -> {mean2}" + ) + assert abs(std2 - std1) > 1.0, ( + f"y_train_std_ should reflect new y; got {std1} -> {std2}" + ) + # raw_space_bardist_ borders are derived from y stats; they must move. 
+ assert not torch.allclose(reg.raw_space_bardist_.borders, bardist_borders1), ( + "raw_space_bardist_ must be rebuilt to the new target scale" + ) From e9b23f22e80a8eb1f7bfd460d7b1d222f7169f80 Mon Sep 17 00:00:00 2001 From: lujiazho Date: Wed, 6 May 2026 20:05:49 -0700 Subject: [PATCH 4/9] Address gemini and Copilot review on PR #923 Fixes the medium-severity comments raised on the differentiable_input regressor path: 1. Feature instances per column: replace `[Feature(...)] * n_features` with a list comprehension so each column has its own dataclass and a later in-place update on one column does not leak across all columns. 2. y stats numerical robustness: switch `y_float.std()` (PyTorch's default `correction=1`, which differs from `np.std` and returns NaN for N=1) to `clamp(y_float.std(correction=0), min=1e-20)`. This matches the standard `fit()` path's `np.std` semantics and stays finite for single-sample input. 3. Constant-target guard: a constant y collapses the bardist borders to a single point and trips `FullSupportBarDistribution`'s strictly-increasing assertion. `fit()` short-circuits this with `is_constant_target_`; the differentiable path has no analogue, so reject up front with a clear ValueError pointing users at `fit()`. 4. Sequential preprocessing for diff input: force `n_preprocessing_jobs=1` inside `fit_with_differentiable_input`. When X carries an autograd graph, joblib's process-boundary pickling breaks the graph; sequential execution preserves it. The detach-then-`.item()` of `y_train_mean_/std_` is intentional and not changed: `raw_space_bardist_` is a frozen lookup buffer that should not hold a y-grad graph; users wanting fully differentiable target scaling should z-normalise y externally so mean/std become constants here. Documented inline. New tests: - feature_schema_columns_are_independent: catches the alias bug. - std_matches_population_definition: locks in `np.std` semantics. - constant_target_rejected: locks in the explicit guard. 
- single_sample_y_does_not_nan: confirms N=1 hits the guard cleanly rather than producing NaN deep in the bardist. All 9 differentiable_input tests pass on CPU and CUDA. --- src/tabpfn/regressor.py | 32 ++++++++++++-- tests/test_regressor_interface.py | 73 +++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 3 deletions(-) diff --git a/src/tabpfn/regressor.py b/src/tabpfn/regressor.py index 5facebe14..f30cfbadd 100644 --- a/src/tabpfn/regressor.py +++ b/src/tabpfn/regressor.py @@ -674,7 +674,24 @@ def _refresh_targets_for_differentiable_input( y, dtype=torch.float32 ) y_mean = y_float.mean() - y_std = y_float.std() + 1e-20 + # Match the standard fit path's np.std (population std, ddof=0). + # torch.std defaults to correction=1 (sample std), which differs from + # numpy and returns NaN for N=1; clamp keeps the divisor non-zero. + y_std = torch.clamp(y_float.std(correction=0), min=1e-20) + # Constant targets would collapse the bardist borders to a single + # point; the differentiable path has no analogue of fit()'s + # is_constant_target_ short-circuit, so reject up front. + if y_std.detach().item() <= 1e-12: + raise ValueError( + "Constant or near-constant target (std≈0) is not supported " + "by fit_with_differentiable_input; there is no signal to " + "predict differentiably. Use fit() for constant-target data." + ) + # Detach when storing as Python floats — raw_space_bardist_ is a + # frozen lookup table and must not hold a y-grad graph. Users who + # need fully differentiable target scaling should z-normalise y + # themselves before calling fit_with_differentiable_input so the + # mean/std are constants here. self.y_train_mean_ = y_mean.detach().item() self.y_train_std_ = y_std.detach().item() y_normalized = (y_float - y_mean) / y_std @@ -712,7 +729,13 @@ def _initialize_for_differentiable_input( "Categorical features are not supported for differentiable input." 
) n_features = X.shape[1] - features = [Feature(name=None, modality=FeatureModality.NUMERICAL)] * n_features + # One Feature instance per column — list multiplication would share + # the same dataclass and any later in-place update would leak across + # columns. + features = [ + Feature(name=None, modality=FeatureModality.NUMERICAL) + for _ in range(n_features) + ] self.inferred_feature_schema_ = FeatureSchema(features=features) self.n_features_in_ = n_features @@ -938,12 +961,15 @@ def fit_with_differentiable_input( # targets. The model load and ensemble configs stay cached. X, y = self._refresh_targets_for_differentiable_input(X, y) + # Force sequential preprocessing: with differentiable input, X carries + # an autograd graph that does not survive joblib's process-boundary + # pickling. Sequential execution preserves the graph in-process. self.ensemble_preprocessor_ = TabPFNEnsemblePreprocessor( configs=ensemble_configs, n_samples=X.shape[0], feature_schema=self.inferred_feature_schema_, random_state=static_seed, - n_preprocessing_jobs=self.n_preprocessing_jobs, + n_preprocessing_jobs=1, feature_subsampling_method=FeatureSubsamplingMethod( self.inference_config_.FEATURE_SUBSAMPLING_METHOD ), diff --git a/tests/test_regressor_interface.py b/tests/test_regressor_interface.py index e2e8f16e8..ea874d409 100644 --- a/tests/test_regressor_interface.py +++ b/tests/test_regressor_interface.py @@ -1071,6 +1071,79 @@ def test__fit_with_differentiable_input__categorical_features_rejected() -> None reg.fit_with_differentiable_input(X, y) +def test__fit_with_differentiable_input__constant_target_rejected() -> None: + """A constant-target y has no signal to predict differentiably and would + collapse the bardist borders; reject with a clear error.""" + reg = TabPFNRegressor( + n_estimators=1, + ignore_pretraining_limits=True, + device="cpu", + differentiable_input=True, + ) + X = torch.randn(5, 4) + y = torch.full((5,), 3.14) + with pytest.raises(ValueError, match="Constant or 
near-constant target"): + reg.fit_with_differentiable_input(X, y) + + +def test__fit_with_differentiable_input__single_sample_y_does_not_nan() -> None: + """torch.std defaults to sample std (correction=1) which returns NaN for + N=1. Our path uses correction=0 (population std) so std is well defined + even for a single sample (it just collapses to 0, which then trips the + constant-target guard — what we want). Verify the failure mode is the + explicit ValueError, not a downstream NaN.""" + reg = TabPFNRegressor( + n_estimators=1, + ignore_pretraining_limits=True, + device="cpu", + differentiable_input=True, + ) + X = torch.randn(1, 4) + y = torch.tensor([2.0]) + with pytest.raises(ValueError, match="Constant or near-constant target"): + reg.fit_with_differentiable_input(X, y) + + +def test__fit_with_differentiable_input__std_matches_population_definition() -> None: + """The differentiable path's y_train_std_ should match np.std (population + std, ddof=0), not torch's default sample std (correction=1), so it lines + up with the standard fit() path.""" + reg = TabPFNRegressor( + n_estimators=1, + ignore_pretraining_limits=True, + device="cpu", + differentiable_input=True, + ) + X = torch.randn(20, 4) + y_np = np.random.default_rng(0).standard_normal(20).astype(np.float32) + y = torch.from_numpy(y_np) + reg.fit_with_differentiable_input(X, y) + expected = float(np.std(y_np)) # ddof=0 + assert abs(reg.y_train_std_ - expected) < 1e-5, ( + f"y_train_std_ should equal np.std(y) (population std); " + f"got {reg.y_train_std_}, expected {expected}" + ) + + +def test__fit_with_differentiable_input__feature_schema_columns_are_independent() -> None: + """Each column's Feature must be a distinct instance — list multiplication + `[Feature(...)] * n` would alias all columns to one mutable dataclass.""" + reg = TabPFNRegressor( + n_estimators=1, + ignore_pretraining_limits=True, + device="cpu", + differentiable_input=True, + ) + X = torch.randn(10, 4) + y = torch.randn(10) + 
reg.fit_with_differentiable_input(X, y) + feats = reg.inferred_feature_schema_.features + assert len(feats) == 4 + # Distinct instances, not aliases. + ids = {id(f) for f in feats} + assert len(ids) == 4, "feature columns share the same Feature instance" + + def test__fit_with_differentiable_input__second_call_refreshes_target_stats() -> None: """A second call with different y must update y_train_mean_/std_ and the raw_space_bardist_; only the model load and ensemble configs are cached.""" From bebc23249db642ec42c0220e56b289440e2c2023 Mon Sep 17 00:00:00 2001 From: lujiazho Date: Fri, 8 May 2026 08:24:51 -0700 Subject: [PATCH 5/9] Fix ruff D209 and E501 in differentiable_input tests --- tests/test_regressor_interface.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/test_regressor_interface.py b/tests/test_regressor_interface.py index ea874d409..613d6837f 100644 --- a/tests/test_regressor_interface.py +++ b/tests/test_regressor_interface.py @@ -1073,7 +1073,8 @@ def test__fit_with_differentiable_input__categorical_features_rejected() -> None def test__fit_with_differentiable_input__constant_target_rejected() -> None: """A constant-target y has no signal to predict differentiably and would - collapse the bardist borders; reject with a clear error.""" + collapse the bardist borders; reject with a clear error. + """ reg = TabPFNRegressor( n_estimators=1, ignore_pretraining_limits=True, @@ -1091,7 +1092,8 @@ def test__fit_with_differentiable_input__single_sample_y_does_not_nan() -> None: N=1. Our path uses correction=0 (population std) so std is well defined even for a single sample (it just collapses to 0, which then trips the constant-target guard — what we want). Verify the failure mode is the - explicit ValueError, not a downstream NaN.""" + explicit ValueError, not a downstream NaN. 
+ """ reg = TabPFNRegressor( n_estimators=1, ignore_pretraining_limits=True, @@ -1107,7 +1109,8 @@ def test__fit_with_differentiable_input__single_sample_y_does_not_nan() -> None: def test__fit_with_differentiable_input__std_matches_population_definition() -> None: """The differentiable path's y_train_std_ should match np.std (population std, ddof=0), not torch's default sample std (correction=1), so it lines - up with the standard fit() path.""" + up with the standard fit() path. + """ reg = TabPFNRegressor( n_estimators=1, ignore_pretraining_limits=True, @@ -1125,9 +1128,10 @@ def test__fit_with_differentiable_input__std_matches_population_definition() -> ) -def test__fit_with_differentiable_input__feature_schema_columns_are_independent() -> None: +def test__fit_with_differentiable_input__feature_schema_cols_independent() -> None: """Each column's Feature must be a distinct instance — list multiplication - `[Feature(...)] * n` would alias all columns to one mutable dataclass.""" + `[Feature(...)] * n` would alias all columns to one mutable dataclass. + """ reg = TabPFNRegressor( n_estimators=1, ignore_pretraining_limits=True, @@ -1146,7 +1150,8 @@ def test__fit_with_differentiable_input__feature_schema_columns_are_independent( def test__fit_with_differentiable_input__second_call_refreshes_target_stats() -> None: """A second call with different y must update y_train_mean_/std_ and the - raw_space_bardist_; only the model load and ensemble configs are cached.""" + raw_space_bardist_; only the model load and ensemble configs are cached. 
+ """ torch.manual_seed(0) reg = TabPFNRegressor( n_estimators=1, From c25071aafe0095332b6f8ff98207d55df1bb40d3 Mon Sep 17 00:00:00 2001 From: lujiazho Date: Sat, 9 May 2026 14:04:04 -0700 Subject: [PATCH 6/9] Apply ruff format to regressor and test files --- src/tabpfn/regressor.py | 10 +++++----- tests/test_regressor_interface.py | 4 +--- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/tabpfn/regressor.py b/src/tabpfn/regressor.py index f30cfbadd..24f8d6360 100644 --- a/src/tabpfn/regressor.py +++ b/src/tabpfn/regressor.py @@ -670,8 +670,10 @@ def _refresh_targets_for_differentiable_input( ) self.n_train_samples_ = int(X.shape[0]) - y_float = y.float() if isinstance(y, torch.Tensor) else torch.as_tensor( - y, dtype=torch.float32 + y_float = ( + y.float() + if isinstance(y, torch.Tensor) + else torch.as_tensor(y, dtype=torch.float32) ) y_mean = y_float.mean() # Match the standard fit path's np.std (population std, ddof=0). @@ -916,9 +918,7 @@ def fit_from_preprocessed( return self @track_model_call(model_method="fit", param_names=["X", "y"]) - def fit_with_differentiable_input( - self, X: torch.Tensor, y: torch.Tensor - ) -> Self: + def fit_with_differentiable_input(self, X: torch.Tensor, y: torch.Tensor) -> Self: """Fit the model with differentiable input. Mirror of ``TabPFNClassifier.fit_with_differentiable_input``. 
Lets diff --git a/tests/test_regressor_interface.py b/tests/test_regressor_interface.py index 613d6837f..d66435d9b 100644 --- a/tests/test_regressor_interface.py +++ b/tests/test_regressor_interface.py @@ -1008,9 +1008,7 @@ def test__fit_with_differentiable_input__grad_flows_to_upstream_module( ) reg.fit_with_differentiable_input(X_train, y_train) - averaged_logits, _outputs, borders = reg.forward( - X_test, use_inference_mode=True - ) + averaged_logits, _outputs, borders = reg.forward(X_test, use_inference_mode=True) # averaged_logits is [N_borders, N_samples] after the transpose in # forward(); reduce to a scalar per sample via softmax over bin centers. From 644aac833680f5680b4402570621c13bf73e850e Mon Sep 17 00:00:00 2001 From: lujiazho Date: Tue, 12 May 2026 11:11:33 -0700 Subject: [PATCH 7/9] Fix missing n_estimators_ in differentiable_input fit path The differentiable-input fit path on TabPFNRegressor never set self.n_estimators_, so forward() / predict() crashed on tqdm(total=...) with AttributeError. Two call sites were missing the assignment: 1. _initialize_for_differentiable_input now sets n_estimators_ via scale_n_estimators_for_feature_coverage, mirroring classifier.py:650. 2. fit_with_differentiable_input's else branch (subsequent fits) now re-asserts n_estimators_ from cached ensemble configs, mirroring classifier.py:948. The stale assert len(...) == self.n_estimators (missing underscore) is fixed at the same time. 
--- src/tabpfn/regressor.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/tabpfn/regressor.py b/src/tabpfn/regressor.py index 24f8d6360..8bb0d864a 100644 --- a/src/tabpfn/regressor.py +++ b/src/tabpfn/regressor.py @@ -744,11 +744,19 @@ def _initialize_for_differentiable_input( X, y_normalized = self._refresh_targets_for_differentiable_input(X, y) preprocessor_configs = [PreprocessorConfig("none", differentiable=True)] + # n_estimators_ mirrors classifier.py:650 — must be set here so + # downstream predict()/forward() (which use self.n_estimators_) work + # after the differentiable fit path. + self.n_estimators_ = scale_n_estimators_for_feature_coverage( + n_estimators=self.n_estimators, + n_total_features=n_features, + preprocessor_configs=preprocessor_configs, + ) # Polynomial features go through sklearn StandardScaler on numpy and # are not differentiable; force "no" regardless of the runtime default # (the regressor config defaults to a non-zero value). ensemble_configs = generate_regression_ensemble_configs( - num_estimators=self.n_estimators, + num_estimators=self.n_estimators_, add_fingerprint_feature=self.inference_config_.FINGERPRINT_FEATURE, feature_shift_decoder=self.inference_config_.FEATURE_SHIFT_METHOD, polynomial_features="no", @@ -760,7 +768,7 @@ def _initialize_for_differentiable_input( estimator_type=self.estimator_type ), ) - assert len(ensemble_configs) == self.n_estimators + assert len(ensemble_configs) == self.n_estimators_ return ensemble_configs, X, y_normalized @@ -956,6 +964,9 @@ def fit_with_differentiable_input(self, X: torch.Tensor, y: torch.Tensor) -> Sel self.inference_precision, self.devices_ ) ensemble_configs = self.ensemble_configs_ # Reuse from first fit + # Mirror classifier.py:948 — re-assert n_estimators_ from cached + # configs so it survives across calls (and after pickling). 
+ self.n_estimators_ = len(ensemble_configs) # Re-validate and re-normalise y for the new fit data so that # raw_space_bardist_ and y_train_mean_/std_ track the current # targets. The model load and ensemble configs stay cached. From 01af696f591ff3c2c0672067db18b1ee9acd10fb Mon Sep 17 00:00:00 2001 From: lujiazho Date: Tue, 12 May 2026 11:12:24 -0700 Subject: [PATCH 8/9] Reduce duplication in differentiable_input path on TabPFNRegressor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per klemens-floege review on PR #923. No behaviour change — same differentiable-input semantics, just less code duplication. - Share the categorical-features guard. New reject_categoricals_for_differentiable_input() in base.py replaces the identical inline checks in TabPFNClassifier and TabPFNRegressor. - Extract _rebuild_raw_space_bardist() on TabPFNRegressor. The same three-line construction (borders * std + mean as a FullSupportBarDistribution) appears in the standard fit path and the differentiable path; the helper detaches borders unconditionally so the buffer never holds a y autograd graph (no-op for the standard path). - Extract _build_ensemble_preprocessor_and_executor() on TabPFNRegressor. The two paths' executor-build blocks now share one method; deltas are only n_preprocessing_jobs (1 in the differentiable path so the autograd graph survives joblib's process-boundary pickling) and inference_mode (False under differentiable input). - Inline _refresh_targets_for_differentiable_input back into fit_with_differentiable_input. Lifecycle is clearer with the y-target validation, normalisation, and bardist rebuild laid out linearly after the first-call / cached-state branch. - Consolidate three bad-input ValueError tests into one pytest.parametrize covering categorical_features, constant_target, and single_sample cases. 
--- src/tabpfn/base.py | 18 ++ src/tabpfn/classifier.py | 11 +- src/tabpfn/regressor.py | 264 +++++++++++++++--------------- tests/test_regressor_interface.py | 84 +++++----- 4 files changed, 194 insertions(+), 183 deletions(-) diff --git a/src/tabpfn/base.py b/src/tabpfn/base.py index 01cd6ef32..fc34f1cf7 100644 --- a/src/tabpfn/base.py +++ b/src/tabpfn/base.py @@ -370,6 +370,24 @@ def create_inference_engine( # noqa: PLR0913 raise ValueError(f"Invalid fit_mode: {fit_mode}") +def reject_categoricals_for_differentiable_input( + categorical_features_indices: Sequence[int] | None, +) -> None: + """Reject categorical features in the differentiable-input fit path. + + The differentiable path uses an identity preprocessor (no + ordinal-encoding step), so categorical columns have no valid handling + and would corrupt the prompt-tuning signal. + """ + if ( + categorical_features_indices is not None + and len(categorical_features_indices) > 0 + ): + raise ValueError( + "Categorical features are not supported for differentiable input." + ) + + def initialize_model_variables_helper( calling_instance: TabPFNRegressor | TabPFNClassifier, model_type: Literal["regressor", "classifier"], diff --git a/src/tabpfn/classifier.py b/src/tabpfn/classifier.py index 21b21d33d..70504b206 100644 --- a/src/tabpfn/classifier.py +++ b/src/tabpfn/classifier.py @@ -41,6 +41,7 @@ get_embeddings, initialize_model_variables_helper, initialize_telemetry, + reject_categoricals_for_differentiable_input, ) from tabpfn.constants import ( PROBABILITY_EPSILON_ROUND_ZERO, @@ -635,13 +636,9 @@ def _initialize_for_differentiable_input( ) # Minimal preprocessing for prompt tuning - if ( - self.categorical_features_indices is not None - and len(self.categorical_features_indices) > 0 - ): - raise ValueError( - "Categorical features are not supported for differentiable input." 
- ) + reject_categoricals_for_differentiable_input( + self.categorical_features_indices + ) n_features = X.shape[1] features = [Feature(name=None, modality=FeatureModality.NUMERICAL)] * n_features self.inferred_feature_schema_ = FeatureSchema(features=features) diff --git a/src/tabpfn/regressor.py b/src/tabpfn/regressor.py index 8bb0d864a..e9fb39732 100644 --- a/src/tabpfn/regressor.py +++ b/src/tabpfn/regressor.py @@ -48,6 +48,7 @@ get_embeddings, initialize_model_variables_helper, initialize_telemetry, + reject_categoricals_for_differentiable_input, ) from tabpfn.constants import ( REGRESSION_CONSTANT_TARGET_BORDER_EPSILON, @@ -57,7 +58,6 @@ from tabpfn.inference import ( InferenceEngine, InferenceEngineBatchedNoPreprocessing, - InferenceEngineCachePreprocessing, ) from tabpfn.model_loading import ( ModelSource, @@ -647,89 +647,92 @@ def _initialize_model_variables(self) -> int: """ return initialize_model_variables_helper(self, self.estimator_type) - def _refresh_targets_for_differentiable_input( - self, X: torch.Tensor, y: torch.Tensor - ) -> tuple[torch.Tensor, torch.Tensor]: - """Per-fit-call data-dependent setup for the differentiable path. - - Validates input shape, z-normalises ``y`` as a torch op (preserves - grads), updates the standardisation stats, and rebuilds - ``raw_space_bardist_`` in the caller's current target scale. Run on - every ``fit_with_differentiable_input`` call so the regressor's - target stats always match the data being fit; the model load and - ensemble configs are cached in ``_initialize_for_differentiable_input`` - and run only on the first call. 
- """ - validate_dataset_size( - X=X, - y=y, - max_num_samples=self.inference_config_.MAX_NUMBER_OF_SAMPLES, - max_num_features=self.inference_config_.MAX_NUMBER_OF_FEATURES, - devices=self.devices_, - ignore_pretraining_limits=self.ignore_pretraining_limits, - ) - self.n_train_samples_ = int(X.shape[0]) - - y_float = ( - y.float() - if isinstance(y, torch.Tensor) - else torch.as_tensor(y, dtype=torch.float32) - ) - y_mean = y_float.mean() - # Match the standard fit path's np.std (population std, ddof=0). - # torch.std defaults to correction=1 (sample std), which differs from - # numpy and returns NaN for N=1; clamp keeps the divisor non-zero. - y_std = torch.clamp(y_float.std(correction=0), min=1e-20) - # Constant targets would collapse the bardist borders to a single - # point; the differentiable path has no analogue of fit()'s - # is_constant_target_ short-circuit, so reject up front. - if y_std.detach().item() <= 1e-12: - raise ValueError( - "Constant or near-constant target (std≈0) is not supported " - "by fit_with_differentiable_input; there is no signal to " - "predict differentiably. Use fit() for constant-target data." - ) - # Detach when storing as Python floats — raw_space_bardist_ is a - # frozen lookup table and must not hold a y-grad graph. Users who - # need fully differentiable target scaling should z-normalise y - # themselves before calling fit_with_differentiable_input so the - # mean/std are constants here. - self.y_train_mean_ = y_mean.detach().item() - self.y_train_std_ = y_std.detach().item() - y_normalized = (y_float - y_mean) / y_std + def _rebuild_raw_space_bardist(self) -> None: + """Rebuild ``raw_space_bardist_`` from current ``y_train_mean_``/std_. - # raw_space_bardist_ is a constant lookup in the caller's target - # scale; detach so the buffer does not hold onto y's grad graph. 
+ Detaches the znorm-space borders so the rebuilt buffer never holds a + y autograd graph — required for the differentiable-input path and a + no-op for the standard path. Both ``y_train_mean_`` and + ``y_train_std_`` must already be set as Python floats. + """ borders = self.znorm_space_bardist_.borders.detach() self.raw_space_bardist_ = FullSupportBarDistribution( borders * self.y_train_std_ + self.y_train_mean_, ).float() - return X, y_normalized + + def _build_ensemble_preprocessor_and_executor( + self, + *, + X: Any, + y: Any, + ensemble_configs: list[RegressorEnsembleConfig], + static_seed: int, + byte_size: int, + n_preprocessing_jobs: int, + inference_mode: bool, + ) -> None: + """Build ``self.ensemble_preprocessor_`` and ``self.executor_``. + + Shared between the standard fit path and the differentiable-input + path. The two paths differ only in ``n_preprocessing_jobs`` + (forced to 1 in the differentiable path so the autograd graph on + ``X`` survives joblib's process-boundary pickling) and + ``inference_mode`` (False under differentiable input so backprop + works through the executor). + """ + self.ensemble_preprocessor_ = TabPFNEnsemblePreprocessor( + configs=ensemble_configs, + n_samples=X.shape[0], + feature_schema=self.inferred_feature_schema_, + # Use static_seed so we're independent of any random generation + # inside the initialize functions above. 
+ random_state=static_seed, + n_preprocessing_jobs=n_preprocessing_jobs, + keep_fitted_cache=(self.fit_mode == "fit_with_cache"), + enable_gpu_preprocessing=self.inference_config_.ENABLE_GPU_PREPROCESSING, + feature_subsampling_method=FeatureSubsamplingMethod( + self.inference_config_.FEATURE_SUBSAMPLING_METHOD + ), + constant_feature_count=self.inference_config_.FEATURE_SUBSAMPLING_CONSTANT_FEATURE_COUNT, + subsample_samples=self.inference_config_.SUBSAMPLE_SAMPLES, + importance_top_k_count=self.inference_config_.FEATURE_SUBSAMPLING_IMPORTANCE_TOP_K_COUNT, + X_train=X, + y_train=y, + task_type=self.estimator_type, + ) + self.executor_ = create_inference_engine( + fit_mode=self.fit_mode, + X_train=X, + y_train=y, + ensemble_preprocessor=self.ensemble_preprocessor_, + models=self.models_, + devices_=self.devices_, + byte_size=byte_size, + forced_inference_dtype_=self.forced_inference_dtype_, + memory_saving_mode=self.memory_saving_mode, + use_autocast_=self.use_autocast_, + inference_mode=inference_mode, + ) def _initialize_for_differentiable_input( self, X: torch.Tensor, - y: torch.Tensor, rng: np.random.Generator, - ) -> tuple[list[RegressorEnsembleConfig], torch.Tensor, torch.Tensor]: + ) -> tuple[list[RegressorEnsembleConfig], torch.Tensor]: """First-call setup for the differentiable path. Mirrors the classifier-side helper so that gradients can flow from a loss back to upstream torch modules feeding ``X`` (and optionally ``y``). Skips the standard numpy preprocessing path and uses a - differentiable identity preprocessor. Subsequent calls reuse the - feature schema and ensemble configs but re-run target normalization - via ``_refresh_targets_for_differentiable_input``. + differentiable identity preprocessor. y-target normalization happens + every call inside ``fit_with_differentiable_input``; this helper is + only for the cached feature-schema and ensemble-config setup. 
""" # Minimal preprocessing for prompt tuning: no categorical features, # all-numerical schema, identity preprocessor that preserves grads. - if ( - self.categorical_features_indices is not None - and len(self.categorical_features_indices) > 0 - ): - raise ValueError( - "Categorical features are not supported for differentiable input." - ) + reject_categoricals_for_differentiable_input( + self.categorical_features_indices + ) n_features = X.shape[1] # One Feature instance per column — list multiplication would share # the same dataclass and any later in-place update would leak across @@ -741,12 +744,7 @@ def _initialize_for_differentiable_input( self.inferred_feature_schema_ = FeatureSchema(features=features) self.n_features_in_ = n_features - X, y_normalized = self._refresh_targets_for_differentiable_input(X, y) - preprocessor_configs = [PreprocessorConfig("none", differentiable=True)] - # n_estimators_ mirrors classifier.py:650 — must be set here so - # downstream predict()/forward() (which use self.n_estimators_) work - # after the differentiable fit path. 
self.n_estimators_ = scale_n_estimators_for_feature_coverage( n_estimators=self.n_estimators, n_total_features=n_features, @@ -770,7 +768,7 @@ def _initialize_for_differentiable_input( ) assert len(ensemble_configs) == self.n_estimators_ - return ensemble_configs, X, y_normalized + return ensemble_configs, X def _initialize_dataset_preprocessing( self, @@ -954,8 +952,8 @@ def fit_with_differentiable_input(self, X: torch.Tensor, y: torch.Tensor) -> Sel is_first_fit_call = not hasattr(self, "models_") if is_first_fit_call: byte_size = self._initialize_model_variables() - ensemble_configs, X, y = self._initialize_for_differentiable_input( - X=X, y=y, rng=rng + ensemble_configs, X = self._initialize_for_differentiable_input( + X=X, rng=rng ) self.ensemble_configs_ = ensemble_configs # Store for prompt tuning reuse remove_non_differentiable_preprocessing_from_models(models=self.models_) @@ -964,39 +962,59 @@ def fit_with_differentiable_input(self, X: torch.Tensor, y: torch.Tensor) -> Sel self.inference_precision, self.devices_ ) ensemble_configs = self.ensemble_configs_ # Reuse from first fit - # Mirror classifier.py:948 — re-assert n_estimators_ from cached - # configs so it survives across calls (and after pickling). + # Mirror classifier.py: re-assert n_estimators_ from cached + # configs so a subsequent call after pickling restores it. self.n_estimators_ = len(ensemble_configs) - # Re-validate and re-normalise y for the new fit data so that - # raw_space_bardist_ and y_train_mean_/std_ track the current - # targets. The model load and ensemble configs stay cached. - X, y = self._refresh_targets_for_differentiable_input(X, y) - # Force sequential preprocessing: with differentiable input, X carries + # Refresh target stats and rebuild the raw-space bardist on every + # call so they track the current fit data; cached state is only the + # model load, feature schema, and ensemble configs above. 
+ validate_dataset_size( + X=X, + y=y, + max_num_samples=self.inference_config_.MAX_NUMBER_OF_SAMPLES, + max_num_features=self.inference_config_.MAX_NUMBER_OF_FEATURES, + devices=self.devices_, + ignore_pretraining_limits=self.ignore_pretraining_limits, + ) + self.n_train_samples_ = int(X.shape[0]) + + y_float = ( + y.float() + if isinstance(y, torch.Tensor) + else torch.as_tensor(y, dtype=torch.float32) + ) + y_mean = y_float.mean() + # Match the standard fit's np.std (population std, ddof=0). torch.std + # defaults to correction=1 and returns NaN for N=1; clamp keeps the + # divisor non-zero. The constant-target guard below catches the + # remaining bardist-collapse case. + y_std = torch.clamp(y_float.std(correction=0), min=1e-20) + if y_std.detach().item() <= 1e-12: + raise ValueError( + "Constant or near-constant target (std≈0) is not supported " + "by fit_with_differentiable_input; there is no signal to " + "predict differentiably. Use fit() for constant-target data." + ) + # Detach when storing as Python floats — raw_space_bardist_ is a + # frozen lookup and must not hold a y autograd graph. Users who need + # fully differentiable target scaling should z-normalise y themselves + # before calling so the mean/std are constants here. + self.y_train_mean_ = y_mean.detach().item() + self.y_train_std_ = y_std.detach().item() + y = (y_float - y_mean) / y_std + self._rebuild_raw_space_bardist() + + # Force sequential preprocessing: with differentiable input X carries # an autograd graph that does not survive joblib's process-boundary # pickling. Sequential execution preserves the graph in-process. 
- self.ensemble_preprocessor_ = TabPFNEnsemblePreprocessor( - configs=ensemble_configs, - n_samples=X.shape[0], - feature_schema=self.inferred_feature_schema_, - random_state=static_seed, + self._build_ensemble_preprocessor_and_executor( + X=X, + y=y, + ensemble_configs=ensemble_configs, + static_seed=static_seed, + byte_size=byte_size, n_preprocessing_jobs=1, - feature_subsampling_method=FeatureSubsamplingMethod( - self.inference_config_.FEATURE_SUBSAMPLING_METHOD - ), - constant_feature_count=self.inference_config_.FEATURE_SUBSAMPLING_CONSTANT_FEATURE_COUNT, - subsample_samples=self.inference_config_.SUBSAMPLE_SAMPLES, - ) - - self.executor_ = InferenceEngineCachePreprocessing( - X_train=X, - y_train=y, - models=self.models_, - ensemble_preprocessor=self.ensemble_preprocessor_, - devices=self.devices_, - dtype_byte_size=byte_size, - force_inference_dtype=self.forced_inference_dtype_, - save_peak_mem=self.memory_saving_mode, inference_mode=False, ) @@ -1069,43 +1087,17 @@ def fit(self, X: XType, y: YType) -> Self: self.y_train_std_ = std.item() + 1e-20 self.y_train_mean_ = mean.item() y = (y - self.y_train_mean_) / self.y_train_std_ - self.raw_space_bardist_ = FullSupportBarDistribution( - self.znorm_space_bardist_.borders * self.y_train_std_ + self.y_train_mean_, - ).float() + self._rebuild_raw_space_bardist() - ensemble_preprocessor = TabPFNEnsemblePreprocessor( - configs=ensemble_configs, - n_samples=X.shape[0], - feature_schema=self.inferred_feature_schema_, - # Note: we use the static_seed so we're independent of the random generation - # inside the initialize function above - random_state=static_seed, - n_preprocessing_jobs=self.n_preprocessing_jobs, - keep_fitted_cache=(self.fit_mode == "fit_with_cache"), - enable_gpu_preprocessing=self.inference_config_.ENABLE_GPU_PREPROCESSING, - feature_subsampling_method=FeatureSubsamplingMethod( - self.inference_config_.FEATURE_SUBSAMPLING_METHOD - ), - 
constant_feature_count=self.inference_config_.FEATURE_SUBSAMPLING_CONSTANT_FEATURE_COUNT, - subsample_samples=self.inference_config_.SUBSAMPLE_SAMPLES, - importance_top_k_count=self.inference_config_.FEATURE_SUBSAMPLING_IMPORTANCE_TOP_K_COUNT, - X_train=X, - y_train=y, - task_type=self.estimator_type, - ) - - self.executor_ = create_inference_engine( - fit_mode=self.fit_mode, - X_train=X, - y_train=y, - ensemble_preprocessor=ensemble_preprocessor, - models=self.models_, - devices_=self.devices_, + self._build_ensemble_preprocessor_and_executor( + X=X, + y=y, + ensemble_configs=ensemble_configs, + static_seed=static_seed, byte_size=byte_size, - forced_inference_dtype_=self.forced_inference_dtype_, - memory_saving_mode=self.memory_saving_mode, - use_autocast_=self.use_autocast_, + n_preprocessing_jobs=self.n_preprocessing_jobs, # TODO: Standard fit usually uses inference_mode=True, before it was enabled + inference_mode=True, ) return self diff --git a/tests/test_regressor_interface.py b/tests/test_regressor_interface.py index d66435d9b..54bd44450 100644 --- a/tests/test_regressor_interface.py +++ b/tests/test_regressor_interface.py @@ -1054,53 +1054,57 @@ def test__fit__differentiable_input_true__raises_helpful_error() -> None: reg.fit(X, y) -def test__fit_with_differentiable_input__categorical_features_rejected() -> None: - """The differentiable path does not support categorical features.""" - reg = TabPFNRegressor( - n_estimators=1, - ignore_pretraining_limits=True, - device="cpu", - differentiable_input=True, - categorical_features_indices=[0], - ) - X = torch.randn(20, 4) - y = torch.randn(20) - with pytest.raises(ValueError, match="Categorical features"): - reg.fit_with_differentiable_input(X, y) - - -def test__fit_with_differentiable_input__constant_target_rejected() -> None: - """A constant-target y has no signal to predict differentiably and would - collapse the bardist borders; reject with a clear error. 
- """ - reg = TabPFNRegressor( - n_estimators=1, - ignore_pretraining_limits=True, - device="cpu", - differentiable_input=True, - ) - X = torch.randn(5, 4) - y = torch.full((5,), 3.14) - with pytest.raises(ValueError, match="Constant or near-constant target"): - reg.fit_with_differentiable_input(X, y) - - -def test__fit_with_differentiable_input__single_sample_y_does_not_nan() -> None: - """torch.std defaults to sample std (correction=1) which returns NaN for - N=1. Our path uses correction=0 (population std) so std is well defined - even for a single sample (it just collapses to 0, which then trips the - constant-target guard — what we want). Verify the failure mode is the - explicit ValueError, not a downstream NaN. +@pytest.mark.parametrize( + ("case_id", "extra_kwargs", "X", "y", "match"), + [ + # The differentiable path uses an identity preprocessor and has no + # ordinal-encoding step, so categorical columns have no valid handling. + ( + "categorical_features", + {"categorical_features_indices": [0]}, + torch.randn(20, 4), + torch.randn(20), + "Categorical features", + ), + # Constant target collapses the bardist borders to a single point. + ( + "constant_target", + {}, + torch.randn(5, 4), + torch.full((5,), 3.14), + "Constant or near-constant target", + ), + # torch.std defaults to correction=1 and returns NaN for N=1; our path + # uses correction=0 so std collapses to 0 and trips the constant-target + # guard instead of a downstream NaN. + ( + "single_sample", + {}, + torch.randn(1, 4), + torch.tensor([2.0]), + "Constant or near-constant target", + ), + ], +) +def test__fit_with_differentiable_input__bad_input_raises_value_error( + case_id: str, + extra_kwargs: dict[str, object], + X: torch.Tensor, + y: torch.Tensor, + match: str, +) -> None: + """Bad inputs to the differentiable fit path must raise ValueError with a + clear message rather than producing NaNs or crashing downstream. """ + del case_id # Only used for parametrize ids. 
reg = TabPFNRegressor( n_estimators=1, ignore_pretraining_limits=True, device="cpu", differentiable_input=True, + **extra_kwargs, # type: ignore[arg-type] ) - X = torch.randn(1, 4) - y = torch.tensor([2.0]) - with pytest.raises(ValueError, match="Constant or near-constant target"): + with pytest.raises(ValueError, match=match): reg.fit_with_differentiable_input(X, y) From 6988b86f3d2044e032567b6a8f6d5a64e8b78a26 Mon Sep 17 00:00:00 2001 From: lujiazho Date: Fri, 15 May 2026 11:40:47 -0700 Subject: [PATCH 9/9] Apply ruff format to classifier and regressor --- src/tabpfn/classifier.py | 4 +--- src/tabpfn/regressor.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/tabpfn/classifier.py b/src/tabpfn/classifier.py index 70504b206..f3dc426ad 100644 --- a/src/tabpfn/classifier.py +++ b/src/tabpfn/classifier.py @@ -636,9 +636,7 @@ def _initialize_for_differentiable_input( ) # Minimal preprocessing for prompt tuning - reject_categoricals_for_differentiable_input( - self.categorical_features_indices - ) + reject_categoricals_for_differentiable_input(self.categorical_features_indices) n_features = X.shape[1] features = [Feature(name=None, modality=FeatureModality.NUMERICAL)] * n_features self.inferred_feature_schema_ = FeatureSchema(features=features) diff --git a/src/tabpfn/regressor.py b/src/tabpfn/regressor.py index e9fb39732..bd3fed727 100644 --- a/src/tabpfn/regressor.py +++ b/src/tabpfn/regressor.py @@ -730,9 +730,7 @@ def _initialize_for_differentiable_input( """ # Minimal preprocessing for prompt tuning: no categorical features, # all-numerical schema, identity preprocessor that preserves grads. - reject_categoricals_for_differentiable_input( - self.categorical_features_indices - ) + reject_categoricals_for_differentiable_input(self.categorical_features_indices) n_features = X.shape[1] # One Feature instance per column — list multiplication would share # the same dataclass and any later in-place update would leak across