From 98bc868b9ae1b70f6e6eef522166a082a3d51c30 Mon Sep 17 00:00:00 2001 From: Fdvanleeuwen <94541170+Fdvanleeuwen@users.noreply.github.com> Date: Wed, 31 Dec 2025 11:32:30 +0100 Subject: [PATCH 01/18] Add conformal prediction for missing data extension Introduces the CP_missing_data extension for TabPFNRegressor, providing conformal prediction intervals in the presence of missing data. Includes implementation, example usage, and tests for calibration and prediction with missing data patterns. --- .../CP_missing_data_example.py | 49 ++++ .../CP_missing_data/CP_missing_data.py | 238 ++++++++++++++++++ .../CP_missing_data/__init__.py | 11 + tests/test_CP_missing_data.py | 98 ++++++++ 4 files changed, 396 insertions(+) create mode 100644 examples/CP_missing_data/CP_missing_data_example.py create mode 100644 src/tabpfn_extensions/CP_missing_data/CP_missing_data.py create mode 100644 src/tabpfn_extensions/CP_missing_data/__init__.py create mode 100644 tests/test_CP_missing_data.py diff --git a/examples/CP_missing_data/CP_missing_data_example.py b/examples/CP_missing_data/CP_missing_data_example.py new file mode 100644 index 00000000..5453f54a --- /dev/null +++ b/examples/CP_missing_data/CP_missing_data_example.py @@ -0,0 +1,49 @@ +"""Provides a detailed example of obtaining conformalised prediction intervals when there is missing data. + +This script demonstrates the complete workflow for obtaining conformal prediction intervals +for the TabPFNRegressor when these are missing values in the dataset. The process is shown +in two steps. Using the training data to train the model and obtain correction terms for +each mask, and appying the corrcetion terms with the trained model to a new dataset. + +Note: This algorithms works well then the missing pattern is small. +""" + +import numpy as np +import pandas as pd +import warnings + +import tabpfn +from tabpfn import TabPFNRegressor + +from sklearn.model_selection import train_test_split +from tabpfn_extensions.CP_missing_data import CP_MDA_TabPFNRegressor, CP_MDA_TabPFNRegressor_newdata + +# generate some data +np.random.seed(42) # For reproducibility +X = np.random.rand(100, 5) +Y = np.random.rand(100) + +# add missing values in X under MCAR +X[np.random.randint(0, 100, 10), np.random.randint(0, 5, 10)] = np.nan + +# Check how many unique patterns there are +unique_patterns = pd.DataFrame(X).isnull().astype(int).drop_duplicates() +print(f"Number of unique missing data patterns: {len(unique_patterns)}") +print("\nUnique patterns:") +print(unique_patterns) + +# Use TabPFN+CP-MDA +model = CP_MDA_TabPFNRegressor(X, Y, quantiles=[0.05, 0.5, 0.95], val_size=0.5, seed = 123) +calibration_results, model_fit = model.fit() +print(calibration_results) + +# Apply the model to new cases +cp_apply = CP_MDA_TabPFNRegressor_newdata(model_fit, X_new = X, quantiles=[0.05, 0.5, 0.95], calibration_results=calibration_results) +CP_results = cp_apply.fit() + +print("\nConformal prediction results:") +print(f"Lower bound (corrected): {CP_results[0][:5]}") # Show first 5 +print(f"Predictions: {CP_results[1][:5]}") +print(f"Upper bound (corrected): {CP_results[2][:5]}") +print(f"Lower bound (uncorrected): {CP_results[3][:5]}") +print(f"Upper bound (uncorrected): {CP_results[4][:5]}") \ No newline at end of file diff --git a/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py b/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py new file mode 100644 index 00000000..bb7b187d --- /dev/null +++ b/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py @@ -0,0 +1,238 @@ +"""Conformal prediction for TabPFN with missing data patterns. + +This module provides conformal prediction intervals that are calibrated +for different missing data patterns in the input features. +""" + +from __future__ import annotations + +import warnings +import numpy as np +import pandas as pd +from sklearn.model_selection import train_test_split +from tabpfn import TabPFNRegressor + + +class CP_MDA_TabPFNRegressor: + """ + Compute the correction terms for missing data masks using conformal prediction. + + Parameters: + X_train : matrix-like of shape (n_samples, n_predictors) + + Y_train : array-like of continious outcome with shape (n_samples,) + + quantiles : array with three arumgnent denoting the qualitens of intrest. + The default is [0.05, 0.5, 0.95], where the first indicates the lowerbound, + the second the median, and the third the upperbound. + + val_size : float between 0 and 1, indicating the size of the validation set + as a fraction of the training data. + + + Returns: + mask_unique: DataFrame with the correction terms for each mask. + + model: Fitted TabPFNRegressor model. + + """ + + def __init__(self, X_train, Y_train, quantiles, val_size, seed): + self.X = pd.DataFrame(X_train) + self.Y = Y_train + self.quantiles = quantiles + self.val_size = val_size + self.alpha = quantiles[0] * 2 + self.seed = seed + + def calc_correction_term(self, predictions, y_val, alpha): + """Calculate the correction term for conformal prediction.""" + # obtain the lowerbound, median, and upperbound + lb, pred, ub = predictions + # calculate difference between bounds and observed values + error_lb = (lb - y_val) + error_ub = (y_val - ub) + s = np.maximum(error_lb, error_ub) + # obtain the right quantile + + Q_use = (1 - alpha) / (1 + 1/len(s)) + correction_term = np.quantile(s, Q_use) + return correction_term + + def split_data(self): + """Split data into training and validation sets.""" + # create df with missing data indicator + missing_bool_df = self.X.isnull().astype(int) + self.X_train, self.X_val, Y_train_arr, Y_val_arr, self.Mask_train, self.Mask_val = train_test_split( + self.X, self.Y, missing_bool_df, test_size=self.val_size, random_state = self.seed + ) + + # Convert Y arrays back to pandas Series to maintain .iloc functionality + self.Y_train = pd.Series(Y_train_arr, index=self.X_train.index) + self.Y_val = pd.Series(Y_val_arr, index=self.X_val.index) + + def run_TABPFN(self): + """Fit the TabPFN model.""" + # fit model + m_fit = TabPFNRegressor() + m_fit.fit(self.X_train, self.Y_train) + self.model = m_fit + + def mask_preprocess(self): + """Preprocess masks and identify nested relationships.""" + # drop duplicates masks + mask_unique = self.Mask_val.drop_duplicates().copy() + # add mask id + mask_unique["mask_id"] = range(1, len(mask_unique) + 1) + # Get mask columns (all columns except mask_id) + mask_cols = [col for col in mask_unique.columns if col != 'mask_id'] + + # Check nesting for all pairs of masks + results = [] + for i, row_a in mask_unique.iterrows(): + mask_a = row_a[mask_cols].values + mask_a_id = row_a['mask_id'] + nested_masks = [] + + for j, row_b in mask_unique.iterrows(): + if i == j: # Skip comparing mask with itself + continue + mask_b = row_b[mask_cols].values + mask_b_id = row_b['mask_id'] + + if ((mask_b == 1) & (mask_a == 0)).sum() == 0: + nested_masks.append(mask_b_id) + + results.append({ + 'mask_id': mask_a_id, + 'nested_masks': nested_masks + }) + + self.mask_unique = mask_unique + self.mask_nested = pd.DataFrame(results) + + def create_calibration_sets(self): + """Create calibration sets for each mask pattern.""" + # obtain list of columns + mask_cols = list(self.Mask_val.columns.values) + + # Using merge to add the id of the mask + df_with_ids = self.Mask_val.merge( + self.mask_unique, + on=mask_cols, + how='left' + ) + + for i in self.mask_unique["mask_id"]: + # select the nested masks + nested_masks = self.mask_nested[self.mask_nested["mask_id"] == i]["nested_masks"].values[0] + # add the mask itself + nested_masks_with_self = nested_masks + [i] # Create new list instead of append + + # obtain indexes for the rows + indexes = df_with_ids[df_with_ids["mask_id"].isin(nested_masks_with_self)].index + + # select the validation data based on the indices + X_val_nested = self.X_val.iloc[indexes] + Y_val_nested = self.Y_val.iloc[indexes] + + # obtain predictions + predictions = self.model.predict( + X_val_nested, + output_type="quantiles", + quantiles=self.quantiles + ) + + # calculate correction term + correction_term = self.calc_correction_term(predictions, Y_val_nested, self.alpha) + + # save the correction term to the mask_unique dataframe + self.mask_unique.loc[self.mask_unique["mask_id"] == i, "correction_term"] = correction_term + self.mask_unique.loc[self.mask_unique["mask_id"] == i, "val_size"] = X_val_nested.shape[0] + + + return self.mask_unique, self.model + + def fit(self): + """Convenience method to run the entire pipeline""" + self.split_data() + self.run_TABPFN() + self.mask_preprocess() + mask_unique, model = self.create_calibration_sets() + + return mask_unique, model + +class CP_MDA_TabPFNRegressor_newdata: + """ + Compute the correction terms for missing data masks using conformal prediction. + + Parameters: + + TabPFN: Fitted TabPFNRegressor model. + + X_train : matrix-like of shape (n_samples, n_predictors) + + quantiles : array with three arumgnent denoting the qualitens of intrest used + in fitting the model. The default is [0.05, 0.5, 0.95]. + + calibration_results : matrix with the correction terms for each mask. + + + Returns: + CP_results: DataFrame with shape (n_samples, 5). Included are the corrected lower bound, + prediction, corrected upper bound, non-corrected lower bound, and non-corrected upper bound. + + """ + + def __init__(self,TabPFN, X_new, quantiles, calibration_results): + self.TabPFN = TabPFN + self.X = pd.DataFrame(X_new) + self.quantiles = quantiles + self.calibration_results = calibration_results + + def obtain_preds(self): + """Obtain predictions from fitted model.""" + preds_test = self.TabPFN.predict( + self.X, + output_type="quantiles", + quantiles=self.quantiles + ) + self.preds_test = preds_test + + def match_mask(self): + """Add correction terms to the new masks from the test set.""" + mask_test = self.X.isnull().astype(int) + mask_cols = list(mask_test.columns.values) + + mask_test_cor = mask_test.merge( + self.calibration_results, + on=mask_cols, + how='left' + ) + + # check if there are masks in the test set that are not in the calibration set + new_masks = mask_test_cor[mask_test_cor["correction_term"].isnull()][mask_cols] + + if new_masks.shape[0] > 0: + warnings.warn( + "The following masks are not in the calibration set:\n" + f"{new_masks.to_string()}\n" + "The baseline quantile estimates will be returned for those cases." + ) + + self.mask_test_cor = mask_test_cor + + def perf_correction(self): + """Add correction terms to the new masks from the test set.""" + preds_test = self.preds_test.copy() + lb_corr = preds_test[0] - self.mask_test_cor["correction_term"].values + ub_corr = preds_test[2] + self.mask_test_cor["correction_term"].values + + return lb_corr, preds_test[1], ub_corr, preds_test[0], preds_test[2] + + def fit(self): + """Convenience method to run the entire pipeline""" + self.obtain_preds() + self.match_mask() + CP_results = self.perf_correction() + return CP_results diff --git a/src/tabpfn_extensions/CP_missing_data/__init__.py b/src/tabpfn_extensions/CP_missing_data/__init__.py new file mode 100644 index 00000000..594e90ba --- /dev/null +++ b/src/tabpfn_extensions/CP_missing_data/__init__.py @@ -0,0 +1,11 @@ +"""Conformal prediction for missing data module for tabpfn_extensions package.""" + +from .CP_missing_data import ( + CP_MDA_TabPFNRegressor, + CP_MDA_TabPFNRegressor_newdata, +) + +__all__ = [ + "CP_MDA_TabPFNRegressor", + "CP_MDA_TabPFNRegressor_newdata", +] diff --git a/tests/test_CP_missing_data.py b/tests/test_CP_missing_data.py new file mode 100644 index 00000000..1c8e2458 --- /dev/null +++ b/tests/test_CP_missing_data.py @@ -0,0 +1,98 @@ +"""Tests for the CP_missing_data extension. + +This file tests the CP_MDA_TabPFNRegressor and CP_MDA_TabPFNRegressor_newdata functions, +which attempts to obtain correct uncertainity estimates in case if missing data. +""" + +from __future__ import annotations + +import numpy as np +import pandas as pd +import pytest +from numpy.testing import assert_array_equal + +try: + from tabpfn_extensions.CP_missing_data import ( + CP_MDA_TabPFNRegressor, + CP_MDA_TabPFNRegressor_newdata, + ) +except ImportError: + pytest.skip("Required libraries (tabpfn) not installed", allow_module_level=True) + +# -------- Fixtures -------- + +@pytest.fixture +def X_train(): + return np.array([ + [0.1, np.nan], [0.3, 0.4], [np.nan, 0.6], [0.7, 0.8], + [0.2, np.nan], [0.2, np.nan], [0.9, 0.4], [np.nan, 0.4], + [0.3, 0.2], [np.nan, 0.9], [0.8, np.nan], [0.1, 0.2], + [np.nan, 0.5], [0.3, 0.7], [0.7, np.nan], [0.7, np.nan], + [0.3, 0.4], [np.nan, 0.2], [0.9, 0.7], [np.nan, 0.3], + [0.3, 0.7], [0.4, 0.8], [0.5, 0.4], [0.7, 0.2], [0.8, 0.3], + ]) + + +@pytest.fixture +def Y_train(): + return np.array([1,3,1,2,3,4,5,6,1,2,3,4,5,6,7,2,3,5,6,8,4,2,1,2,3]) + + +@pytest.fixture +def X_new(): + return np.array([ + [0.1, 0.1], + [0.3, np.nan], + [np.nan, 0.6], + ]) + + +@pytest.fixture +def seed(): + return 123 + + +# -- Test -- + +def test_model_CP(X_train, Y_train, seed): + """Tests if the calibration corrections are of the correct shape and type.""" + model = CP_MDA_TabPFNRegressor(X_train, Y_train, quantiles = [0.05, 0.5, 0.95], val_size = 0.5, seed = seed) + calibration_results, model_fit = model.fit() + + # not the best check since we do not control which cases are in the valset + missing_df = pd.DataFrame(X_train).isnull().astype(int).drop_duplicates() + + # check type, size of the calibration results + assert calibration_results.shape[0] == missing_df.shape[0] + assert calibration_results.shape[1] == 5 + assert isinstance(calibration_results, pd.DataFrame) + + +def test_reproducibility(X_train, Y_train, seed): + """Tests that random_state ensures deterministic correction terms.""" + + model_1 = CP_MDA_TabPFNRegressor(X_train, Y_train, quantiles=[0.05, 0.5, 0.95], val_size = 0.5, seed = seed) + calibration_results_1, model_fit_1 = model_1.fit() + + # Second model with the same seed + model_2 = CP_MDA_TabPFNRegressor(X_train, Y_train, quantiles=[0.05, 0.5, 0.95] , val_size = 0.5, seed = seed) + calibration_results_2, model_fit_2 = model_2.fit() + + # Assert that the outputs are identical + assert_array_equal(calibration_results_1, calibration_results_2) + + +def test_predict(X_train, Y_train, seed, X_new): + """Tests if the predictions have the correct shape and type.""" + + # fit model + model = CP_MDA_TabPFNRegressor(X_train, Y_train, quantiles=[0.05, 0.5, 0.95], val_size = 0.5, seed = seed) + calibration_results, model_fit = model.fit() + + # Apply the model to new cases + cp_apply = CP_MDA_TabPFNRegressor_newdata(model_fit, X_new = X_new, quantiles=[0.05, 0.5, 0.95], calibration_results=calibration_results) + CP_results = cp_apply.fit() + + assert CP_results[1].size== X_new.shape[0] + assert isinstance(CP_results[1], np.ndarray) + assert len(CP_results)== 5 \ No newline at end of file From 840ca599dc7c5710f7cc96f80cac1bd68cac6b6e Mon Sep 17 00:00:00 2001 From: Fdvanleeuwen <94541170+Fdvanleeuwen@users.noreply.github.com> Date: Wed, 31 Dec 2025 11:46:35 +0100 Subject: [PATCH 02/18] Import TabPFN in a flexible way --- examples/CP_missing_data/CP_missing_data_example.py | 8 ++++++-- src/tabpfn_extensions/CP_missing_data/CP_missing_data.py | 8 +++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/examples/CP_missing_data/CP_missing_data_example.py b/examples/CP_missing_data/CP_missing_data_example.py index 5453f54a..0babc385 100644 --- a/examples/CP_missing_data/CP_missing_data_example.py +++ b/examples/CP_missing_data/CP_missing_data_example.py @@ -12,8 +12,12 @@ import pandas as pd import warnings -import tabpfn -from tabpfn import TabPFNRegressor +try: + # Try standard TabPFN package first + from tabpfn import TabPFNRegressor +except ImportError: + # Fall back to TabPFN client + from tabpfn_client import TabPFNRegressor from sklearn.model_selection import train_test_split from tabpfn_extensions.CP_missing_data import CP_MDA_TabPFNRegressor, CP_MDA_TabPFNRegressor_newdata diff --git a/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py b/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py index bb7b187d..f4cb6317 100644 --- a/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py +++ b/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py @@ -10,7 +10,13 @@ import numpy as np import pandas as pd from sklearn.model_selection import train_test_split -from tabpfn import TabPFNRegressor + +try: + # Try standard TabPFN package first + from tabpfn import TabPFNRegressor +except ImportError: + # Fall back to TabPFN client + from tabpfn_client import TabPFNRegressor class CP_MDA_TabPFNRegressor: From baf8528ac5c72196a5b32aef5b5d30698d9aace4 Mon Sep 17 00:00:00 2001 From: Fdvanleeuwen <94541170+Fdvanleeuwen@users.noreply.github.com> Date: Wed, 31 Dec 2025 11:55:56 +0100 Subject: [PATCH 03/18] Update example Less features to reduce the number of masks for the example --- examples/CP_missing_data/CP_missing_data_example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/CP_missing_data/CP_missing_data_example.py b/examples/CP_missing_data/CP_missing_data_example.py index 0babc385..c7561043 100644 --- a/examples/CP_missing_data/CP_missing_data_example.py +++ b/examples/CP_missing_data/CP_missing_data_example.py @@ -24,11 +24,11 @@ # generate some data np.random.seed(42) # For reproducibility -X = np.random.rand(100, 5) +X = np.random.rand(100, 2) Y = np.random.rand(100) # add missing values in X under MCAR -X[np.random.randint(0, 100, 10), np.random.randint(0, 5, 10)] = np.nan +X[np.random.randint(0, 100, 40), np.random.randint(0, 2, 40)] = np.nan # Check how many unique patterns there are unique_patterns = pd.DataFrame(X).isnull().astype(int).drop_duplicates() From de6575f8a93f42edb4424f59c6ba31ce344a972f Mon Sep 17 00:00:00 2001 From: Fdvanleeuwen <94541170+Fdvanleeuwen@users.noreply.github.com> Date: Wed, 31 Dec 2025 12:19:00 +0100 Subject: [PATCH 04/18] update the correct scoring rule --- src/tabpfn_extensions/CP_missing_data/CP_missing_data.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py b/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py index f4cb6317..efa5d686 100644 --- a/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py +++ b/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py @@ -59,9 +59,9 @@ def calc_correction_term(self, predictions, y_val, alpha): error_lb = (lb - y_val) error_ub = (y_val - ub) s = np.maximum(error_lb, error_ub) - # obtain the right quantile - Q_use = (1 - alpha) / (1 + 1/len(s)) + # obtain the emperical quantile + Q_use = (1 - alpha) * (1 + 1/len(s)) correction_term = np.quantile(s, Q_use) return correction_term @@ -132,6 +132,7 @@ def create_calibration_sets(self): for i in self.mask_unique["mask_id"]: # select the nested masks nested_masks = self.mask_nested[self.mask_nested["mask_id"] == i]["nested_masks"].values[0] + # add the mask itself nested_masks_with_self = nested_masks + [i] # Create new list instead of append From 7ba1129d71f7fade3fccae16d8d20256aaa575f9 Mon Sep 17 00:00:00 2001 From: Florian D van Leeuwen <94541170+Fdvanleeuwen@users.noreply.github.com> Date: Fri, 2 Jan 2026 12:23:35 +0100 Subject: [PATCH 05/18] Update src/tabpfn_extensions/CP_missing_data/CP_missing_data.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- src/tabpfn_extensions/CP_missing_data/CP_missing_data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py b/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py index efa5d686..65b836a1 100644 --- a/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py +++ b/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py @@ -26,11 +26,11 @@ class CP_MDA_TabPFNRegressor: Parameters: X_train : matrix-like of shape (n_samples, n_predictors) - Y_train : array-like of continious outcome with shape (n_samples,) + Y_train : array-like of continuous outcome with shape (n_samples,) - quantiles : array with three arumgnent denoting the qualitens of intrest. - The default is [0.05, 0.5, 0.95], where the first indicates the lowerbound, - the second the median, and the third the upperbound. + quantiles : array with three arguments denoting the quantiles of interest. + The default is [0.05, 0.5, 0.95], where the first indicates the lower bound, + the second the median, and the third the upper bound. val_size : float between 0 and 1, indicating the size of the validation set as a fraction of the training data. From 2fdf17bb66004aab8387c7c8d2b0620b73a23da8 Mon Sep 17 00:00:00 2001 From: Florian D van Leeuwen <94541170+Fdvanleeuwen@users.noreply.github.com> Date: Fri, 2 Jan 2026 12:24:01 +0100 Subject: [PATCH 06/18] Update src/tabpfn_extensions/CP_missing_data/__init__.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- src/tabpfn_extensions/CP_missing_data/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tabpfn_extensions/CP_missing_data/__init__.py b/src/tabpfn_extensions/CP_missing_data/__init__.py index 594e90ba..4c76af49 100644 --- a/src/tabpfn_extensions/CP_missing_data/__init__.py +++ b/src/tabpfn_extensions/CP_missing_data/__init__.py @@ -1,7 +1,7 @@ """Conformal prediction for missing data module for tabpfn_extensions package.""" from .CP_missing_data import ( - CP_MDA_TabPFNRegressor, + CP_MDA_TabPFNRegressor, CP_MDA_TabPFNRegressor_newdata, ) From 1f2ae5578815b6e2dedcf08fc3070ca65aef7d60 Mon Sep 17 00:00:00 2001 From: Florian D van Leeuwen <94541170+Fdvanleeuwen@users.noreply.github.com> Date: Fri, 2 Jan 2026 12:24:19 +0100 Subject: [PATCH 07/18] Update src/tabpfn_extensions/CP_missing_data/CP_missing_data.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- src/tabpfn_extensions/CP_missing_data/CP_missing_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py b/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py index 65b836a1..6a474497 100644 --- a/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py +++ b/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py @@ -177,9 +177,9 @@ class CP_MDA_TabPFNRegressor_newdata: TabPFN: Fitted TabPFNRegressor model. - X_train : matrix-like of shape (n_samples, n_predictors) + X_new : matrix-like of shape (n_samples, n_predictors) - quantiles : array with three arumgnent denoting the qualitens of intrest used + quantiles : array with three arguments denoting the quantiles of interest used in fitting the model. The default is [0.05, 0.5, 0.95]. calibration_results : matrix with the correction terms for each mask. From 662a17d37b5c23fba535e44707afcb28b856a2f5 Mon Sep 17 00:00:00 2001 From: Florian D van Leeuwen <94541170+Fdvanleeuwen@users.noreply.github.com> Date: Fri, 2 Jan 2026 12:24:39 +0100 Subject: [PATCH 08/18] Update examples/CP_missing_data/CP_missing_data_example.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- examples/CP_missing_data/CP_missing_data_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/CP_missing_data/CP_missing_data_example.py b/examples/CP_missing_data/CP_missing_data_example.py index c7561043..0735eb7d 100644 --- a/examples/CP_missing_data/CP_missing_data_example.py +++ b/examples/CP_missing_data/CP_missing_data_example.py @@ -5,7 +5,7 @@ in two steps. Using the training data to train the model and obtain correction terms for each mask, and appying the corrcetion terms with the trained model to a new dataset. -Note: This algorithms works well then the missing pattern is small. +Note: This algorithm works well when the missing pattern is small. """ import numpy as np From 4c0d9c63c358e80c3b2fea5dd160a0e7aeb9bc37 Mon Sep 17 00:00:00 2001 From: Florian D van Leeuwen <94541170+Fdvanleeuwen@users.noreply.github.com> Date: Fri, 2 Jan 2026 12:25:09 +0100 Subject: [PATCH 09/18] Update src/tabpfn_extensions/CP_missing_data/CP_missing_data.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- src/tabpfn_extensions/CP_missing_data/CP_missing_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py b/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py index 6a474497..fe9844ea 100644 --- a/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py +++ b/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py @@ -229,8 +229,8 @@ def match_mask(self): self.mask_test_cor = mask_test_cor - def perf_correction(self): - """Add correction terms to the new masks from the test set.""" + def perform_correction(self): + """Apply correction terms to the prediction intervals.""" preds_test = self.preds_test.copy() lb_corr = preds_test[0] - self.mask_test_cor["correction_term"].values ub_corr = preds_test[2] + self.mask_test_cor["correction_term"].values From 73ef78172d448697afb8db68e11edc2bf8ece5ab Mon Sep 17 00:00:00 2001 From: Fdvanleeuwen <94541170+Fdvanleeuwen@users.noreply.github.com> Date: Fri, 2 Jan 2026 12:30:12 +0100 Subject: [PATCH 10/18] Update based on gemini-code-assist --- .../CP_missing_data_example.py | 1 - .../CP_missing_data/CP_missing_data.py | 46 +++++++++---------- 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/examples/CP_missing_data/CP_missing_data_example.py b/examples/CP_missing_data/CP_missing_data_example.py index 0735eb7d..d042d67c 100644 --- a/examples/CP_missing_data/CP_missing_data_example.py +++ b/examples/CP_missing_data/CP_missing_data_example.py @@ -19,7 +19,6 @@ # Fall back to TabPFN client from tabpfn_client import TabPFNRegressor -from sklearn.model_selection import train_test_split from tabpfn_extensions.CP_missing_data import CP_MDA_TabPFNRegressor, CP_MDA_TabPFNRegressor_newdata # generate some data diff --git a/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py b/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py index fe9844ea..fc586fdf 100644 --- a/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py +++ b/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py @@ -207,35 +207,35 @@ def obtain_preds(self): self.preds_test = preds_test def match_mask(self): - """Add correction terms to the new masks from the test set.""" - mask_test = self.X.isnull().astype(int) - mask_cols = list(mask_test.columns.values) - - mask_test_cor = mask_test.merge( - self.calibration_results, - on=mask_cols, - how='left' - ) + """Add correction terms to the new masks from the test set.""" + mask_test = self.X.isnull().astype(int) + mask_cols = list(mask_test.columns.values) + + mask_test_cor = mask_test.merge( + self.calibration_results, + on=mask_cols, + how='left' + ) - # check if there are masks in the test set that are not in the calibration set - new_masks = mask_test_cor[mask_test_cor["correction_term"].isnull()][mask_cols] + # check if there are masks in the test set that are not in the calibration set + new_masks = mask_test_cor[mask_test_cor["correction_term"].isnull()][mask_cols] - if new_masks.shape[0] > 0: - warnings.warn( - "The following masks are not in the calibration set:\n" - f"{new_masks.to_string()}\n" - "The baseline quantile estimates will be returned for those cases." - ) + if new_masks.shape[0] > 0: + warnings.warn( + "The following masks are not in the calibration set:\n" + f"{new_masks.to_string()}\n" + "The baseline quantile estimates will be returned for those cases." + ) - self.mask_test_cor = mask_test_cor + self.mask_test_cor = mask_test_cor def perform_correction(self): - """Apply correction terms to the prediction intervals.""" - preds_test = self.preds_test.copy() - lb_corr = preds_test[0] - self.mask_test_cor["correction_term"].values - ub_corr = preds_test[2] + self.mask_test_cor["correction_term"].values + """Apply correction terms to the prediction intervals.""" + preds_test = self.preds_test.copy() + lb_corr = preds_test[0] - self.mask_test_cor["correction_term"].values + ub_corr = preds_test[2] + self.mask_test_cor["correction_term"].values - return lb_corr, preds_test[1], ub_corr, preds_test[0], preds_test[2] + return lb_corr, preds_test[1], ub_corr, preds_test[0], preds_test[2] def fit(self): """Convenience method to run the entire pipeline""" From 57bcccc245edefe4e9f6d73e03b7e0f799ccd6a9 Mon Sep 17 00:00:00 2001 From: Florian D van Leeuwen <94541170+Fdvanleeuwen@users.noreply.github.com> Date: Fri, 2 Jan 2026 12:30:59 +0100 Subject: [PATCH 11/18] Update tests/test_CP_missing_data.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- tests/test_CP_missing_data.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_CP_missing_data.py b/tests/test_CP_missing_data.py index 1c8e2458..7b9b4a4b 100644 --- a/tests/test_CP_missing_data.py +++ b/tests/test_CP_missing_data.py @@ -59,8 +59,10 @@ def test_model_CP(X_train, Y_train, seed): model = CP_MDA_TabPFNRegressor(X_train, Y_train, quantiles = [0.05, 0.5, 0.95], val_size = 0.5, seed = seed) calibration_results, model_fit = model.fit() - # not the best check since we do not control which cases are in the valset - missing_df = pd.DataFrame(X_train).isnull().astype(int).drop_duplicates() + # Replicate the split to get the validation set and find its unique masks. + from sklearn.model_selection import train_test_split + _, X_val, _, _ = train_test_split(X_train, Y_train, test_size=0.5, random_state=seed) + missing_df = pd.DataFrame(X_val).isnull().astype(int).drop_duplicates() # check type, size of the calibration results assert calibration_results.shape[0] == missing_df.shape[0] From 512cf47d46b4bca3838e5246162d0612f45eaa7c Mon Sep 17 00:00:00 2001 From: Fdvanleeuwen <94541170+Fdvanleeuwen@users.noreply.github.com> Date: Sat, 3 Jan 2026 11:06:16 +0100 Subject: [PATCH 12/18] Tidy up the code --- .../CP_missing_data/CP_missing_data.py | 16 ++++++++++++---- tests/test_CP_missing_data.py | 2 +- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py b/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py index fc586fdf..9f735b0c 100644 --- a/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py +++ b/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py @@ -62,6 +62,13 @@ def calc_correction_term(self, predictions, y_val, alpha): # obtain the emperical quantile Q_use = (1 - alpha) * (1 + 1/len(s)) + + # Check is Q_use if not larger then 1 + if Q_use > 1: + Q_use = 1 + warnings.warn( + "Some masks have very small calibration sets") + correction_term = np.quantile(s, Q_use) return correction_term @@ -123,7 +130,8 @@ def create_calibration_sets(self): mask_cols = list(self.Mask_val.columns.values) # Using merge to add the id of the mask - df_with_ids = self.Mask_val.merge( + # use original index values + df_with_ids = self.Mask_val.reset_index().merge( self.mask_unique, on=mask_cols, how='left' @@ -137,11 +145,11 @@ def create_calibration_sets(self): nested_masks_with_self = nested_masks + [i] # Create new list instead of append # obtain indexes for the rows - indexes = df_with_ids[df_with_ids["mask_id"].isin(nested_masks_with_self)].index + indexes = df_with_ids[df_with_ids["mask_id"].isin(nested_masks_with_self)]["index"] # select the validation data based on the indices - X_val_nested = self.X_val.iloc[indexes] - Y_val_nested = self.Y_val.iloc[indexes] + X_val_nested = self.X_val.loc[indexes] + Y_val_nested = self.Y_val.loc[indexes] # obtain predictions predictions = self.model.predict( diff --git a/tests/test_CP_missing_data.py b/tests/test_CP_missing_data.py index 7b9b4a4b..a24bbb5e 100644 --- a/tests/test_CP_missing_data.py +++ b/tests/test_CP_missing_data.py @@ -10,6 +10,7 @@ import pandas as pd import pytest from numpy.testing import assert_array_equal +from sklearn.model_selection import train_test_split try: from tabpfn_extensions.CP_missing_data import ( @@ -60,7 +61,6 @@ def test_model_CP(X_train, Y_train, seed): calibration_results, model_fit = model.fit() # Replicate the split to get the validation set and find its unique masks. - from sklearn.model_selection import train_test_split _, X_val, _, _ = train_test_split(X_train, Y_train, test_size=0.5, random_state=seed) missing_df = pd.DataFrame(X_val).isnull().astype(int).drop_duplicates() From 07ee7e93f705aa9f8cf1747ecc2fd05df7df1ecc Mon Sep 17 00:00:00 2001 From: Fdvanleeuwen <94541170+Fdvanleeuwen@users.noreply.github.com> Date: Mon, 5 Jan 2026 13:52:19 +0100 Subject: [PATCH 13/18] Update CP_missing_data.py Make seed optional and update name change of internal function in pipeline. --- src/tabpfn_extensions/CP_missing_data/CP_missing_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py b/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py index 9f735b0c..50ff3808 100644 --- a/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py +++ b/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py @@ -43,7 +43,7 @@ class CP_MDA_TabPFNRegressor: """ - def __init__(self, X_train, Y_train, quantiles, val_size, seed): + def __init__(self, X_train, Y_train, quantiles, val_size, seed=None): self.X = pd.DataFrame(X_train) self.Y = Y_train self.quantiles = quantiles @@ -249,5 +249,5 @@ def fit(self): """Convenience method to run the entire pipeline""" self.obtain_preds() self.match_mask() - CP_results = self.perf_correction() + CP_results = self.perform_correction() return CP_results From 22b16853b066973bda7267d8d75dbad98b68d6ef Mon Sep 17 00:00:00 2001 From: Fdvanleeuwen <94541170+Fdvanleeuwen@users.noreply.github.com> Date: Mon, 5 Jan 2026 18:05:10 +0100 Subject: [PATCH 14/18] Add the masking of nested columns --- src/tabpfn_extensions/CP_missing_data/CP_missing_data.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py b/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py index 50ff3808..2d32013f 100644 --- a/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py +++ b/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py @@ -151,6 +151,14 @@ def create_calibration_sets(self): X_val_nested = self.X_val.loc[indexes] Y_val_nested = self.Y_val.loc[indexes] + # SET ENTIRE COLUMNS TO NaN WHERE THE MASK HAS MISSING VALUES + current_mask = self.mask_unique[self.mask_unique["mask_id"] == i][mask_cols].iloc[0] + + # For each column where the mask indicates missing (value = 1), set entire column to NaN + for col_idx, col_name in enumerate(mask_cols): + if current_mask.iloc[col_idx] == 1: + X_val_nested.loc[:, col_name] = np.nan + # obtain predictions predictions = self.model.predict( X_val_nested, From c9aa878a8698d3387101adc7a6a8a5ebdc10a424 Mon Sep 17 00:00:00 2001 From: Fdvanleeuwen <94541170+Fdvanleeuwen@users.noreply.github.com> Date: Sat, 10 Jan 2026 19:51:18 +0100 Subject: [PATCH 15/18] Changes based on review --- .../CP_missing_data_example.py | 17 +- .../CP_missing_data/CP_missing_data.py | 246 +++++++++++------- .../CP_missing_data/__init__.py | 12 +- tests/test_CP_missing_data.py | 28 +- 4 files changed, 177 insertions(+), 126 deletions(-) diff --git a/examples/CP_missing_data/CP_missing_data_example.py b/examples/CP_missing_data/CP_missing_data_example.py index d042d67c..407c3c9d 100644 --- a/examples/CP_missing_data/CP_missing_data_example.py +++ b/examples/CP_missing_data/CP_missing_data_example.py @@ -12,14 +12,9 @@ import pandas as pd import warnings -try: - # Try standard TabPFN package first - from tabpfn import TabPFNRegressor -except ImportError: - # Fall back to TabPFN client - from tabpfn_client import TabPFNRegressor +from tabpfn_extensions.utils import TabPFNClassifier, TabPFNRegressor -from tabpfn_extensions.CP_missing_data import CP_MDA_TabPFNRegressor, CP_MDA_TabPFNRegressor_newdata +from tabpfn_extensions.cp_missing_data import CPMDATabPFNRegressor, CPMDATabPFNRegressorNewData # generate some data np.random.seed(42) # For reproducibility @@ -36,13 +31,13 @@ print(unique_patterns) # Use TabPFN+CP-MDA -model = CP_MDA_TabPFNRegressor(X, Y, quantiles=[0.05, 0.5, 0.95], val_size=0.5, seed = 123) -calibration_results, model_fit = model.fit() +model = CPMDATabPFNRegressor(quantiles=[0.05, 0.5, 0.95], val_size=0.5, seed = 123) +calibration_results, model_fit = model.fit(X, Y) print(calibration_results) # Apply the model to new cases -cp_apply = CP_MDA_TabPFNRegressor_newdata(model_fit, X_new = X, quantiles=[0.05, 0.5, 0.95], calibration_results=calibration_results) -CP_results = cp_apply.fit() +cp_apply = CPMDATabPFNRegressorNewData(model_fit, quantiles=[0.05, 0.5, 0.95], calibration_results=calibration_results) +CP_results = cp_apply.predict(X) print("\nConformal prediction results:") print(f"Lower bound (corrected): {CP_results[0][:5]}") # Show first 5 diff --git a/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py b/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py index 2d32013f..25a80a44 100644 --- a/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py +++ b/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py @@ -7,26 +7,19 @@ from __future__ import annotations import warnings +from typing import Optional + import numpy as np import pandas as pd +from numpy.typing import ArrayLike from sklearn.model_selection import train_test_split -try: - # Try standard TabPFN package first - from tabpfn import TabPFNRegressor -except ImportError: - # Fall back to TabPFN client - from tabpfn_client import TabPFNRegressor - +from tabpfn_extensions.utils import TabPFNRegressor -class CP_MDA_TabPFNRegressor: - """ - Compute the correction terms for missing data masks using conformal prediction. +class CPMDATabPFNRegressor: + """Compute the correction terms for missing data masks using conformal prediction. Parameters: - X_train : matrix-like of shape (n_samples, n_predictors) - - Y_train : array-like of continuous outcome with shape (n_samples,) quantiles : array with three arguments denoting the quantiles of interest. The default is [0.05, 0.5, 0.95], where the first indicates the lower bound, @@ -43,15 +36,23 @@ class CP_MDA_TabPFNRegressor: """ - def __init__(self, X_train, Y_train, quantiles, val_size, seed=None): - self.X = pd.DataFrame(X_train) - self.Y = Y_train + def __init__( + self, + quantiles: list[float], + val_size: float, + seed: Optional[int] = None + ) -> None: self.quantiles = quantiles self.val_size = val_size self.alpha = quantiles[0] * 2 self.seed = seed - def calc_correction_term(self, predictions, y_val, alpha): + def calc_correction_term( + self, + predictions: tuple[np.ndarray, np.ndarray, np.ndarray], + y_val: pd.Series, + alpha: float + ) -> float: """Calculate the correction term for conformal prediction.""" # obtain the lowerbound, median, and upperbound lb, pred, ub = predictions @@ -67,134 +68,168 @@ def calc_correction_term(self, predictions, y_val, alpha): if Q_use > 1: Q_use = 1 warnings.warn( - "Some masks have very small calibration sets") + "Some masks have very small calibration sets", stacklevel=2) correction_term = np.quantile(s, Q_use) return correction_term - def split_data(self): + def split_data(self, + x: pd.DataFrame, + y: np.ndarray + ) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.DataFrame, pd.DataFrame]: """Split data into training and validation sets.""" # create df with missing data indicator - missing_bool_df = self.X.isnull().astype(int) - self.X_train, self.X_val, Y_train_arr, Y_val_arr, self.Mask_train, self.Mask_val = train_test_split( - self.X, self.Y, missing_bool_df, test_size=self.val_size, random_state = self.seed + missing_bool_df = x.isna().astype(int) + x_train, x_val, y_train_arr, y_val_arr, mask_train, mask_val = train_test_split( + x, y, missing_bool_df, test_size=self.val_size, random_state = self.seed ) - # Convert Y arrays back to pandas Series to maintain .iloc functionality - self.Y_train = pd.Series(Y_train_arr, index=self.X_train.index) - self.Y_val = pd.Series(Y_val_arr, index=self.X_val.index) + # Convert y arrays back to pandas Series to maintain .iloc functionality + y_train = pd.Series(y_train_arr, index=x_train.index) + y_val = pd.Series(y_val_arr, index=x_val.index) + + return x_train, x_val, y_train, y_val, mask_train, mask_val - def run_TABPFN(self): + def run_TABPFN(self, + x_train: pd.DataFrame, + y_train: pd.Series + ) -> TabPFNRegressor: """Fit the TabPFN model.""" # fit model - m_fit = TabPFNRegressor() - m_fit.fit(self.X_train, self.Y_train) - self.model = m_fit - - def mask_preprocess(self): + model = TabPFNRegressor() + model.fit(x_train, y_train) + return(model) + + def mask_preprocess( + self, + mask_val: pd.DataFrame + ) -> tuple[pd.DataFrame, pd.DataFrame]: """Preprocess masks and identify nested relationships.""" # drop duplicates masks - mask_unique = self.Mask_val.drop_duplicates().copy() + mask_unique = mask_val.drop_duplicates().copy() # add mask id mask_unique["mask_id"] = range(1, len(mask_unique) + 1) # Get mask columns (all columns except mask_id) - mask_cols = [col for col in mask_unique.columns if col != 'mask_id'] + mask_cols = [col for col in mask_unique.columns if col != "mask_id"] # Check nesting for all pairs of masks results = [] for i, row_a in mask_unique.iterrows(): mask_a = row_a[mask_cols].values - mask_a_id = row_a['mask_id'] + mask_a_id = row_a["mask_id"] nested_masks = [] for j, row_b in mask_unique.iterrows(): if i == j: # Skip comparing mask with itself continue mask_b = row_b[mask_cols].values - mask_b_id = row_b['mask_id'] + mask_b_id = row_b["mask_id"] if ((mask_b == 1) & (mask_a == 0)).sum() == 0: nested_masks.append(mask_b_id) results.append({ - 'mask_id': mask_a_id, - 'nested_masks': nested_masks + "mask_id": mask_a_id, + "nested_masks": nested_masks }) - self.mask_unique = mask_unique - self.mask_nested = pd.DataFrame(results) - - def create_calibration_sets(self): + mask_nested = pd.DataFrame(results) + return mask_unique, mask_nested + + def create_calibration_sets( + self, + x_val: pd.DataFrame, + y_val: pd.Series, + mask_val: pd.DataFrame, + mask_unique: pd.DataFrame, + mask_nested: pd.DataFrame, + model: TabPFNRegressor + ) -> tuple[pd.DataFrame, TabPFNRegressor]: """Create calibration sets for each mask pattern.""" # obtain list of columns - mask_cols = list(self.Mask_val.columns.values) + mask_cols = list(mask_val.columns.values) # Using merge to add the id of the mask # use original index values - df_with_ids = self.Mask_val.reset_index().merge( - self.mask_unique, + df_with_ids = mask_val.reset_index().merge( + mask_unique, on=mask_cols, - how='left' + how="left" ) - for i in self.mask_unique["mask_id"]: + for i in mask_unique["mask_id"]: # select the nested masks - nested_masks = self.mask_nested[self.mask_nested["mask_id"] == i]["nested_masks"].values[0] + nested_masks = mask_nested[mask_nested["mask_id"] == i]["nested_masks"].values[0] # add the mask itself - nested_masks_with_self = nested_masks + [i] # Create new list instead of append + nested_masks_with_self = [*nested_masks, i] # obtain indexes for the rows indexes = df_with_ids[df_with_ids["mask_id"].isin(nested_masks_with_self)]["index"] # select the validation data based on the indices - X_val_nested = self.X_val.loc[indexes] - Y_val_nested = self.Y_val.loc[indexes] + x_val_nested = x_val.loc[indexes] + y_val_nested = y_val.loc[indexes] # SET ENTIRE COLUMNS TO NaN WHERE THE MASK HAS MISSING VALUES - current_mask = self.mask_unique[self.mask_unique["mask_id"] == i][mask_cols].iloc[0] + current_mask = mask_unique[mask_unique["mask_id"] == i][mask_cols].iloc[0] # For each column where the mask indicates missing (value = 1), set entire column to NaN for col_idx, col_name in enumerate(mask_cols): if current_mask.iloc[col_idx] == 1: - X_val_nested.loc[:, col_name] = np.nan + x_val_nested.loc[:, col_name] = np.nan # obtain predictions - predictions = self.model.predict( - X_val_nested, + predictions = model.predict( + x_val_nested, output_type="quantiles", quantiles=self.quantiles ) # calculate correction term - correction_term = self.calc_correction_term(predictions, Y_val_nested, self.alpha) + correction_term = self.calc_correction_term(predictions, y_val_nested, self.alpha) # save the correction term to the mask_unique dataframe - self.mask_unique.loc[self.mask_unique["mask_id"] == i, "correction_term"] = correction_term - self.mask_unique.loc[self.mask_unique["mask_id"] == i, "val_size"] = X_val_nested.shape[0] + mask_unique.loc[mask_unique["mask_id"] == i, "correction_term"] = correction_term + mask_unique.loc[mask_unique["mask_id"] == i, "val_size"] = x_val_nested.shape[0] + + + return mask_unique, model + + def fit( + self, + x_train: ArrayLike, + y_train: ArrayLike + ) -> tuple[pd.DataFrame, TabPFNRegressor]: + """Convenience method to run the entire pipeline + Parameters: + x_train : matrix-like of shape (n_samples, n_predictors) - return self.mask_unique, self.model + y_train : array-like of continuous outcome with shape (n_samples,) + """ - def fit(self): - """Convenience method to run the entire pipeline""" - self.split_data() - self.run_TABPFN() - self.mask_preprocess() - mask_unique, model = self.create_calibration_sets() + # Store and parse the data + + x = pd.DataFrame(x_train) + y = y_train + + # Run trough all the functions + x_train, x_val, y_train, y_val, mask_train, mask_val = self.split_data(x, y) + model = self.run_TABPFN(x_train, y_train) + mask_unique, mask_nested = self.mask_preprocess(mask_val) + mask_unique, model = self.create_calibration_sets( + x_val, y_val, mask_val, mask_unique, mask_nested, model) return mask_unique, model -class CP_MDA_TabPFNRegressor_newdata: - """ - Compute the correction terms for missing data masks using conformal prediction. +class CPMDATabPFNRegressorNewData: + """Compute the correction terms for missing data masks using conformal prediction. Parameters: TabPFN: Fitted TabPFNRegressor model. - X_new : matrix-like of shape (n_samples, n_predictors) - quantiles : array with three arguments denoting the quantiles of interest used in fitting the model. The default is [0.05, 0.5, 0.95]. @@ -207,55 +242,76 @@ class CP_MDA_TabPFNRegressor_newdata: """ - def __init__(self,TabPFN, X_new, quantiles, calibration_results): - self.TabPFN = TabPFN - self.X = pd.DataFrame(X_new) + def __init__( + self, + tabpfn: TabPFNRegressor, + quantiles: list[float], + calibration_results: pd.DataFrame + ) -> None: + self.tabpfn = tabpfn self.quantiles = quantiles self.calibration_results = calibration_results - def obtain_preds(self): + def obtain_preds(self, + x: pd.DataFrame) -> np.ndarray: """Obtain predictions from fitted model.""" - preds_test = self.TabPFN.predict( - self.X, + preds = self.tabpfn.predict( + x, output_type="quantiles", quantiles=self.quantiles ) - self.preds_test = preds_test + return preds - def match_mask(self): + def match_mask(self, + x: pd.DataFrame) -> pd.DataFrame: """Add correction terms to the new masks from the test set.""" - mask_test = self.X.isnull().astype(int) + mask_test = x.isna().astype(int) mask_cols = list(mask_test.columns.values) mask_test_cor = mask_test.merge( self.calibration_results, on=mask_cols, - how='left' + how="left" ) # check if there are masks in the test set that are not in the calibration set - new_masks = mask_test_cor[mask_test_cor["correction_term"].isnull()][mask_cols] + new_masks = mask_test_cor[mask_test_cor["correction_term"].isna()][mask_cols] if new_masks.shape[0] > 0: warnings.warn( "The following masks are not in the calibration set:\n" f"{new_masks.to_string()}\n" - "The baseline quantile estimates will be returned for those cases." + "The baseline quantile estimates will be returned for those cases.", stacklevel=2 ) - self.mask_test_cor = mask_test_cor + return mask_test_cor - def perform_correction(self): + def perform_correction( + self, + preds: np.ndarray, + mask_test_cor: pd.DataFrame + ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """Apply correction terms to the prediction intervals.""" - preds_test = self.preds_test.copy() - lb_corr = preds_test[0] - self.mask_test_cor["correction_term"].values - ub_corr = preds_test[2] + self.mask_test_cor["correction_term"].values - - return lb_corr, preds_test[1], ub_corr, preds_test[0], preds_test[2] - - def fit(self): - """Convenience method to run the entire pipeline""" - self.obtain_preds() - self.match_mask() - CP_results = self.perform_correction() - return CP_results + + lb_corr = preds[0] - mask_test_cor["correction_term"].values + ub_corr = preds[2] + mask_test_cor["correction_term"].values + + return lb_corr, preds[1], ub_corr, preds[0], preds[2] + + def predict( + self, + x_new: ArrayLike + ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """Convenience method to run the entire pipeline + Parameters: + + x_new : matrix-like of shape (n_samples, n_predictors) + """ + + x = pd.DataFrame(x_new) + + preds = self.obtain_preds(x) + mask_test_cor = self.match_mask(x) + cp_results = self.perform_correction(preds, mask_test_cor) + + return cp_results diff --git a/src/tabpfn_extensions/CP_missing_data/__init__.py b/src/tabpfn_extensions/CP_missing_data/__init__.py index 4c76af49..18f6e08d 100644 --- a/src/tabpfn_extensions/CP_missing_data/__init__.py +++ b/src/tabpfn_extensions/CP_missing_data/__init__.py @@ -1,11 +1,11 @@ """Conformal prediction for missing data module for tabpfn_extensions package.""" -from .CP_missing_data import ( - CP_MDA_TabPFNRegressor, - CP_MDA_TabPFNRegressor_newdata, +from .cp_missing_data import ( + CPMDATabPFNRegressor, + CPMDATabPFNRegressorNewData, ) __all__ = [ - "CP_MDA_TabPFNRegressor", - "CP_MDA_TabPFNRegressor_newdata", -] + "CPMDATabPFNRegressor", + "CPMDATabPFNRegressorNewData", +] \ No newline at end of file diff --git a/tests/test_CP_missing_data.py b/tests/test_CP_missing_data.py index a24bbb5e..45b1e97d 100644 --- a/tests/test_CP_missing_data.py +++ b/tests/test_CP_missing_data.py @@ -1,6 +1,6 @@ """Tests for the CP_missing_data extension. -This file tests the CP_MDA_TabPFNRegressor and CP_MDA_TabPFNRegressor_newdata functions, +This file tests the CPMDATabPFNRegressor and CPMDATabPFNRegressorNewData functions, which attempts to obtain correct uncertainity estimates in case if missing data. """ @@ -13,9 +13,9 @@ from sklearn.model_selection import train_test_split try: - from tabpfn_extensions.CP_missing_data import ( - CP_MDA_TabPFNRegressor, - CP_MDA_TabPFNRegressor_newdata, + from tabpfn_extensions.cp_missing_data import ( + CPMDATabPFNRegressor, + CPMDATabPFNRegressorNewData, ) except ImportError: pytest.skip("Required libraries (tabpfn) not installed", allow_module_level=True) @@ -57,8 +57,8 @@ def seed(): def test_model_CP(X_train, Y_train, seed): """Tests if the calibration corrections are of the correct shape and type.""" - model = CP_MDA_TabPFNRegressor(X_train, Y_train, quantiles = [0.05, 0.5, 0.95], val_size = 0.5, seed = seed) - calibration_results, model_fit = model.fit() + model = CPMDATabPFNRegressor(quantiles = [0.05, 0.5, 0.95], val_size = 0.5, seed = seed) + calibration_results, model_fit = model.fit(X_train, Y_train) # Replicate the split to get the validation set and find its unique masks. _, X_val, _, _ = train_test_split(X_train, Y_train, test_size=0.5, random_state=seed) @@ -73,12 +73,12 @@ def test_model_CP(X_train, Y_train, seed): def test_reproducibility(X_train, Y_train, seed): """Tests that random_state ensures deterministic correction terms.""" - model_1 = CP_MDA_TabPFNRegressor(X_train, Y_train, quantiles=[0.05, 0.5, 0.95], val_size = 0.5, seed = seed) - calibration_results_1, model_fit_1 = model_1.fit() + model_1 = CPMDATabPFNRegressor(quantiles=[0.05, 0.5, 0.95], val_size = 0.5, seed = seed) + calibration_results_1, model_fit_1 = model_1.fit(X_train, Y_train) # Second model with the same seed - model_2 = CP_MDA_TabPFNRegressor(X_train, Y_train, quantiles=[0.05, 0.5, 0.95] , val_size = 0.5, seed = seed) - calibration_results_2, model_fit_2 = model_2.fit() + model_2 = CPMDATabPFNRegressor(quantiles=[0.05, 0.5, 0.95] , val_size = 0.5, seed = seed) + calibration_results_2, model_fit_2 = model_2.fit(X_train, Y_train) # Assert that the outputs are identical assert_array_equal(calibration_results_1, calibration_results_2) @@ -88,12 +88,12 @@ def test_predict(X_train, Y_train, seed, X_new): """Tests if the predictions have the correct shape and type.""" # fit model - model = CP_MDA_TabPFNRegressor(X_train, Y_train, quantiles=[0.05, 0.5, 0.95], val_size = 0.5, seed = seed) - calibration_results, model_fit = model.fit() + model = CPMDATabPFNRegressor(quantiles=[0.05, 0.5, 0.95], val_size = 0.5, seed = seed) + calibration_results, model_fit = model.fit(X_train, Y_train) # Apply the model to new cases - cp_apply = CP_MDA_TabPFNRegressor_newdata(model_fit, X_new = X_new, quantiles=[0.05, 0.5, 0.95], calibration_results=calibration_results) - CP_results = cp_apply.fit() + cp_apply = CPMDATabPFNRegressorNewData(model_fit, quantiles=[0.05, 0.5, 0.95], calibration_results=calibration_results) + CP_results = cp_apply.predict(X_new) assert CP_results[1].size== X_new.shape[0] assert isinstance(CP_results[1], np.ndarray) From ce539a418e0f47e78bd79e30fd2736c3696b235f Mon Sep 17 00:00:00 2001 From: Fdvanleeuwen <94541170+Fdvanleeuwen@users.noreply.github.com> Date: Tue, 3 Feb 2026 12:18:10 +0100 Subject: [PATCH 16/18] Fix folder capitalization --- .../cp_missing_data_example.py} | 0 .../{CP_missing_data => cp_missing_data}/__init__.py | 0 .../CP_missing_data.py => cp_missing_data/cp_missing_data.py} | 0 tests/{test_CP_missing_data.py => test_cp_missing_data.py} | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename examples/{CP_missing_data/CP_missing_data_example.py => cp_missing_data/cp_missing_data_example.py} (100%) rename src/tabpfn_extensions/{CP_missing_data => cp_missing_data}/__init__.py (100%) rename src/tabpfn_extensions/{CP_missing_data/CP_missing_data.py => cp_missing_data/cp_missing_data.py} (100%) rename tests/{test_CP_missing_data.py => test_cp_missing_data.py} (100%) diff --git a/examples/CP_missing_data/CP_missing_data_example.py b/examples/cp_missing_data/cp_missing_data_example.py similarity index 100% rename from examples/CP_missing_data/CP_missing_data_example.py rename to examples/cp_missing_data/cp_missing_data_example.py diff --git a/src/tabpfn_extensions/CP_missing_data/__init__.py b/src/tabpfn_extensions/cp_missing_data/__init__.py similarity index 100% rename from src/tabpfn_extensions/CP_missing_data/__init__.py rename to src/tabpfn_extensions/cp_missing_data/__init__.py diff --git a/src/tabpfn_extensions/CP_missing_data/CP_missing_data.py b/src/tabpfn_extensions/cp_missing_data/cp_missing_data.py similarity index 100% rename from src/tabpfn_extensions/CP_missing_data/CP_missing_data.py rename to src/tabpfn_extensions/cp_missing_data/cp_missing_data.py diff --git a/tests/test_CP_missing_data.py b/tests/test_cp_missing_data.py similarity index 100% rename from tests/test_CP_missing_data.py rename to tests/test_cp_missing_data.py From d42d29da76d6a9710b93ab7f9bf814a8bda63e5a Mon Sep 17 00:00:00 2001 From: Fdvanleeuwen <94541170+Fdvanleeuwen@users.noreply.github.com> Date: Tue, 3 Feb 2026 13:07:55 +0100 Subject: [PATCH 17/18] Changes based on the ruff linting & formatting --- .../cp_missing_data/cp_missing_data.py | 96 ++++++++----------- tests/test_cp_missing_data.py | 13 +-- 2 files changed, 45 insertions(+), 64 deletions(-) diff --git a/src/tabpfn_extensions/cp_missing_data/cp_missing_data.py b/src/tabpfn_extensions/cp_missing_data/cp_missing_data.py index 25a80a44..f9714efd 100644 --- a/src/tabpfn_extensions/cp_missing_data/cp_missing_data.py +++ b/src/tabpfn_extensions/cp_missing_data/cp_missing_data.py @@ -7,40 +7,38 @@ from __future__ import annotations import warnings -from typing import Optional +from typing import TYPE_CHECKING import numpy as np import pandas as pd -from numpy.typing import ArrayLike from sklearn.model_selection import train_test_split from tabpfn_extensions.utils import TabPFNRegressor +if TYPE_CHECKING: + from numpy.typing import ArrayLike + + class CPMDATabPFNRegressor: """Compute the correction terms for missing data masks using conformal prediction. Parameters: - quantiles : array with three arguments denoting the quantiles of interest. The default is [0.05, 0.5, 0.95], where the first indicates the lower bound, the second the median, and the third the upper bound. - val_size : float between 0 and 1, indicating the size of the validation set as a fraction of the training data. - Returns: - mask_unique: DataFrame with the correction terms for each mask. - - model: Fitted TabPFNRegressor model. - + mask_unique: DataFrame with the correction terms for each mask. + model: Fitted TabPFNRegressor model. """ def __init__( - self, - quantiles: list[float], - val_size: float, - seed: Optional[int] = None + self, + quantiles: list[float], + val_size: float, + seed: int | None = None ) -> None: self.quantiles = quantiles self.val_size = val_size @@ -49,8 +47,8 @@ def __init__( def calc_correction_term( self, - predictions: tuple[np.ndarray, np.ndarray, np.ndarray], - y_val: pd.Series, + predictions: tuple[np.ndarray, np.ndarray, np.ndarray], + y_val: pd.Series, alpha: float ) -> float: """Calculate the correction term for conformal prediction.""" @@ -74,7 +72,7 @@ def calc_correction_term( return correction_term def split_data(self, - x: pd.DataFrame, + x: pd.DataFrame, y: np.ndarray ) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.DataFrame, pd.DataFrame]: """Split data into training and validation sets.""" @@ -87,11 +85,11 @@ def split_data(self, # Convert y arrays back to pandas Series to maintain .iloc functionality y_train = pd.Series(y_train_arr, index=x_train.index) y_val = pd.Series(y_val_arr, index=x_val.index) - + return x_train, x_val, y_train, y_val, mask_train, mask_val - def run_TABPFN(self, - x_train: pd.DataFrame, + def run_TABPFN(self, + x_train: pd.DataFrame, y_train: pd.Series ) -> TabPFNRegressor: """Fit the TabPFN model.""" @@ -101,7 +99,7 @@ def run_TABPFN(self, return(model) def mask_preprocess( - self, + self, mask_val: pd.DataFrame ) -> tuple[pd.DataFrame, pd.DataFrame]: """Preprocess masks and identify nested relationships.""" @@ -152,7 +150,7 @@ def create_calibration_sets( # Using merge to add the id of the mask # use original index values df_with_ids = mask_val.reset_index().merge( - mask_unique, + mask_unique, on=mask_cols, how="left" ) @@ -160,9 +158,9 @@ def create_calibration_sets( for i in mask_unique["mask_id"]: # select the nested masks nested_masks = mask_nested[mask_nested["mask_id"] == i]["nested_masks"].values[0] - + # add the mask itself - nested_masks_with_self = [*nested_masks, i] + nested_masks_with_self = [*nested_masks, i] # obtain indexes for the rows indexes = df_with_ids[df_with_ids["mask_id"].isin(nested_masks_with_self)]["index"] @@ -193,27 +191,23 @@ def create_calibration_sets( mask_unique.loc[mask_unique["mask_id"] == i, "correction_term"] = correction_term mask_unique.loc[mask_unique["mask_id"] == i, "val_size"] = x_val_nested.shape[0] - return mask_unique, model def fit( - self, - x_train: ArrayLike, + self, + x_train: ArrayLike, y_train: ArrayLike ) -> tuple[pd.DataFrame, TabPFNRegressor]: - """Convenience method to run the entire pipeline + """Convenience method to run the entire pipeline. Parameters: x_train : matrix-like of shape (n_samples, n_predictors) - y_train : array-like of continuous outcome with shape (n_samples,) """ - # Store and parse the data - x = pd.DataFrame(x_train) y = y_train - + # Run trough all the functions x_train, x_val, y_train, y_val, mask_train, mask_val = self.split_data(x, y) model = self.run_TABPFN(x_train, y_train) @@ -223,36 +217,28 @@ def fit( return mask_unique, model + class CPMDATabPFNRegressorNewData: """Compute the correction terms for missing data masks using conformal prediction. Parameters: - - TabPFN: Fitted TabPFNRegressor model. - - quantiles : array with three arguments denoting the quantiles of interest used - in fitting the model. The default is [0.05, 0.5, 0.95]. - - calibration_results : matrix with the correction terms for each mask. - - - Returns: - CP_results: DataFrame with shape (n_samples, 5). Included are the corrected lower bound, - prediction, corrected upper bound, non-corrected lower bound, and non-corrected upper bound. - + tabpfn : Fitted TabPFNRegressor model. + quantiles : Array with three arguments denoting the quantiles of interest used + in fitting the model. The default is [0.05, 0.5, 0.95]. + calibration_results : Matrix with the correction terms for each mask. """ def __init__( - self, - tabpfn: TabPFNRegressor, - quantiles: list[float], + self, + tabpfn: TabPFNRegressor, + quantiles: list[float], calibration_results: pd.DataFrame ) -> None: self.tabpfn = tabpfn self.quantiles = quantiles self.calibration_results = calibration_results - def obtain_preds(self, + def obtain_preds(self, x: pd.DataFrame) -> np.ndarray: """Obtain predictions from fitted model.""" preds = self.tabpfn.predict( @@ -287,31 +273,29 @@ def match_mask(self, return mask_test_cor def perform_correction( - self, - preds: np.ndarray, + self, + preds: np.ndarray, mask_test_cor: pd.DataFrame ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """Apply correction terms to the prediction intervals.""" - lb_corr = preds[0] - mask_test_cor["correction_term"].values ub_corr = preds[2] + mask_test_cor["correction_term"].values return lb_corr, preds[1], ub_corr, preds[0], preds[2] def predict( - self, + self, x_new: ArrayLike ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: - """Convenience method to run the entire pipeline - Parameters: + """Convenience method to run the entire pipeline. + Parameters: x_new : matrix-like of shape (n_samples, n_predictors) """ - x = pd.DataFrame(x_new) preds = self.obtain_preds(x) mask_test_cor = self.match_mask(x) cp_results = self.perform_correction(preds, mask_test_cor) - return cp_results + return cp_results \ No newline at end of file diff --git a/tests/test_cp_missing_data.py b/tests/test_cp_missing_data.py index 45b1e97d..9eaa219d 100644 --- a/tests/test_cp_missing_data.py +++ b/tests/test_cp_missing_data.py @@ -1,7 +1,7 @@ """Tests for the CP_missing_data extension. This file tests the CPMDATabPFNRegressor and CPMDATabPFNRegressorNewData functions, -which attempts to obtain correct uncertainity estimates in case if missing data. +which attempts to obtain correct uncertainity estimates in case if missing data. """ from __future__ import annotations @@ -20,8 +20,8 @@ except ImportError: pytest.skip("Required libraries (tabpfn) not installed", allow_module_level=True) -# -------- Fixtures -------- +# -------- Fixtures -------- @pytest.fixture def X_train(): return np.array([ @@ -54,7 +54,6 @@ def seed(): # -- Test -- - def test_model_CP(X_train, Y_train, seed): """Tests if the calibration corrections are of the correct shape and type.""" model = CPMDATabPFNRegressor(quantiles = [0.05, 0.5, 0.95], val_size = 0.5, seed = seed) @@ -62,7 +61,7 @@ def test_model_CP(X_train, Y_train, seed): # Replicate the split to get the validation set and find its unique masks. _, X_val, _, _ = train_test_split(X_train, Y_train, test_size=0.5, random_state=seed) - missing_df = pd.DataFrame(X_val).isnull().astype(int).drop_duplicates() + missing_df = pd.DataFrame(X_val).isna().astype(int).drop_duplicates() # check type, size of the calibration results assert calibration_results.shape[0] == missing_df.shape[0] @@ -72,7 +71,6 @@ def test_model_CP(X_train, Y_train, seed): def test_reproducibility(X_train, Y_train, seed): """Tests that random_state ensures deterministic correction terms.""" - model_1 = CPMDATabPFNRegressor(quantiles=[0.05, 0.5, 0.95], val_size = 0.5, seed = seed) calibration_results_1, model_fit_1 = model_1.fit(X_train, Y_train) @@ -86,12 +84,11 @@ def test_reproducibility(X_train, Y_train, seed): def test_predict(X_train, Y_train, seed, X_new): """Tests if the predictions have the correct shape and type.""" - - # fit model + # fit model model = CPMDATabPFNRegressor(quantiles=[0.05, 0.5, 0.95], val_size = 0.5, seed = seed) calibration_results, model_fit = model.fit(X_train, Y_train) - # Apply the model to new cases + # Apply the model to new cases cp_apply = CPMDATabPFNRegressorNewData(model_fit, quantiles=[0.05, 0.5, 0.95], calibration_results=calibration_results) CP_results = cp_apply.predict(X_new) From f20d0fdb776c24793a3dcf59fd33d61014cf22bf Mon Sep 17 00:00:00 2001 From: Fdvanleeuwen <94541170+Fdvanleeuwen@users.noreply.github.com> Date: Tue, 3 Feb 2026 15:13:11 +0100 Subject: [PATCH 18/18] Apply ruff formatting --- .../cp_missing_data/__init__.py | 2 +- .../cp_missing_data/cp_missing_data.py | 109 ++++++++---------- tests/test_cp_missing_data.py | 73 ++++++++---- 3 files changed, 100 insertions(+), 84 deletions(-) diff --git a/src/tabpfn_extensions/cp_missing_data/__init__.py b/src/tabpfn_extensions/cp_missing_data/__init__.py index 18f6e08d..2391856f 100644 --- a/src/tabpfn_extensions/cp_missing_data/__init__.py +++ b/src/tabpfn_extensions/cp_missing_data/__init__.py @@ -8,4 +8,4 @@ __all__ = [ "CPMDATabPFNRegressor", "CPMDATabPFNRegressorNewData", -] \ No newline at end of file +] diff --git a/src/tabpfn_extensions/cp_missing_data/cp_missing_data.py b/src/tabpfn_extensions/cp_missing_data/cp_missing_data.py index f9714efd..40aaf9f4 100644 --- a/src/tabpfn_extensions/cp_missing_data/cp_missing_data.py +++ b/src/tabpfn_extensions/cp_missing_data/cp_missing_data.py @@ -35,10 +35,7 @@ class CPMDATabPFNRegressor: """ def __init__( - self, - quantiles: list[float], - val_size: float, - seed: int | None = None + self, quantiles: list[float], val_size: float, seed: int | None = None ) -> None: self.quantiles = quantiles self.val_size = val_size @@ -49,37 +46,37 @@ def calc_correction_term( self, predictions: tuple[np.ndarray, np.ndarray, np.ndarray], y_val: pd.Series, - alpha: float + alpha: float, ) -> float: """Calculate the correction term for conformal prediction.""" # obtain the lowerbound, median, and upperbound lb, pred, ub = predictions # calculate difference between bounds and observed values - error_lb = (lb - y_val) - error_ub = (y_val - ub) + error_lb = lb - y_val + error_ub = y_val - ub s = np.maximum(error_lb, error_ub) # obtain the emperical quantile - Q_use = (1 - alpha) * (1 + 1/len(s)) + Q_use = (1 - alpha) * (1 + 1 / len(s)) # Check is Q_use if not larger then 1 if Q_use > 1: Q_use = 1 - warnings.warn( - "Some masks have very small calibration sets", stacklevel=2) + warnings.warn("Some masks have very small calibration sets", stacklevel=2) correction_term = np.quantile(s, Q_use) return correction_term - def split_data(self, - x: pd.DataFrame, - y: np.ndarray - ) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.DataFrame, pd.DataFrame]: + def split_data( + self, x: pd.DataFrame, y: np.ndarray + ) -> tuple[ + pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.DataFrame, pd.DataFrame + ]: """Split data into training and validation sets.""" # create df with missing data indicator missing_bool_df = x.isna().astype(int) x_train, x_val, y_train_arr, y_val_arr, mask_train, mask_val = train_test_split( - x, y, missing_bool_df, test_size=self.val_size, random_state = self.seed + x, y, missing_bool_df, test_size=self.val_size, random_state=self.seed ) # Convert y arrays back to pandas Series to maintain .iloc functionality @@ -88,19 +85,15 @@ def split_data(self, return x_train, x_val, y_train, y_val, mask_train, mask_val - def run_TABPFN(self, - x_train: pd.DataFrame, - y_train: pd.Series - ) -> TabPFNRegressor: + def run_TABPFN(self, x_train: pd.DataFrame, y_train: pd.Series) -> TabPFNRegressor: """Fit the TabPFN model.""" # fit model model = TabPFNRegressor() model.fit(x_train, y_train) - return(model) + return model def mask_preprocess( - self, - mask_val: pd.DataFrame + self, mask_val: pd.DataFrame ) -> tuple[pd.DataFrame, pd.DataFrame]: """Preprocess masks and identify nested relationships.""" # drop duplicates masks @@ -126,10 +119,7 @@ def mask_preprocess( if ((mask_b == 1) & (mask_a == 0)).sum() == 0: nested_masks.append(mask_b_id) - results.append({ - "mask_id": mask_a_id, - "nested_masks": nested_masks - }) + results.append({"mask_id": mask_a_id, "nested_masks": nested_masks}) mask_nested = pd.DataFrame(results) return mask_unique, mask_nested @@ -141,7 +131,7 @@ def create_calibration_sets( mask_val: pd.DataFrame, mask_unique: pd.DataFrame, mask_nested: pd.DataFrame, - model: TabPFNRegressor + model: TabPFNRegressor, ) -> tuple[pd.DataFrame, TabPFNRegressor]: """Create calibration sets for each mask pattern.""" # obtain list of columns @@ -150,20 +140,22 @@ def create_calibration_sets( # Using merge to add the id of the mask # use original index values df_with_ids = mask_val.reset_index().merge( - mask_unique, - on=mask_cols, - how="left" + mask_unique, on=mask_cols, how="left" ) for i in mask_unique["mask_id"]: # select the nested masks - nested_masks = mask_nested[mask_nested["mask_id"] == i]["nested_masks"].values[0] + nested_masks = mask_nested[mask_nested["mask_id"] == i][ + "nested_masks" + ].values[0] # add the mask itself nested_masks_with_self = [*nested_masks, i] # obtain indexes for the rows - indexes = df_with_ids[df_with_ids["mask_id"].isin(nested_masks_with_self)]["index"] + indexes = df_with_ids[df_with_ids["mask_id"].isin(nested_masks_with_self)][ + "index" + ] # select the validation data based on the indices x_val_nested = x_val.loc[indexes] @@ -179,24 +171,26 @@ def create_calibration_sets( # obtain predictions predictions = model.predict( - x_val_nested, - output_type="quantiles", - quantiles=self.quantiles + x_val_nested, output_type="quantiles", quantiles=self.quantiles ) # calculate correction term - correction_term = self.calc_correction_term(predictions, y_val_nested, self.alpha) + correction_term = self.calc_correction_term( + predictions, y_val_nested, self.alpha + ) # save the correction term to the mask_unique dataframe - mask_unique.loc[mask_unique["mask_id"] == i, "correction_term"] = correction_term - mask_unique.loc[mask_unique["mask_id"] == i, "val_size"] = x_val_nested.shape[0] + mask_unique.loc[mask_unique["mask_id"] == i, "correction_term"] = ( + correction_term + ) + mask_unique.loc[mask_unique["mask_id"] == i, "val_size"] = ( + x_val_nested.shape[0] + ) return mask_unique, model def fit( - self, - x_train: ArrayLike, - y_train: ArrayLike + self, x_train: ArrayLike, y_train: ArrayLike ) -> tuple[pd.DataFrame, TabPFNRegressor]: """Convenience method to run the entire pipeline. @@ -213,7 +207,8 @@ def fit( model = self.run_TABPFN(x_train, y_train) mask_unique, mask_nested = self.mask_preprocess(mask_val) mask_unique, model = self.create_calibration_sets( - x_val, y_val, mask_val, mask_unique, mask_nested, model) + x_val, y_val, mask_val, mask_unique, mask_nested, model + ) return mask_unique, model @@ -232,33 +227,27 @@ def __init__( self, tabpfn: TabPFNRegressor, quantiles: list[float], - calibration_results: pd.DataFrame + calibration_results: pd.DataFrame, ) -> None: self.tabpfn = tabpfn self.quantiles = quantiles self.calibration_results = calibration_results - def obtain_preds(self, - x: pd.DataFrame) -> np.ndarray: + def obtain_preds(self, x: pd.DataFrame) -> np.ndarray: """Obtain predictions from fitted model.""" preds = self.tabpfn.predict( - x, - output_type="quantiles", - quantiles=self.quantiles + x, output_type="quantiles", quantiles=self.quantiles ) return preds - def match_mask(self, - x: pd.DataFrame) -> pd.DataFrame: + def match_mask(self, x: pd.DataFrame) -> pd.DataFrame: """Add correction terms to the new masks from the test set.""" mask_test = x.isna().astype(int) mask_cols = list(mask_test.columns.values) mask_test_cor = mask_test.merge( - self.calibration_results, - on=mask_cols, - how="left" - ) + self.calibration_results, on=mask_cols, how="left" + ) # check if there are masks in the test set that are not in the calibration set new_masks = mask_test_cor[mask_test_cor["correction_term"].isna()][mask_cols] @@ -267,15 +256,14 @@ def match_mask(self, warnings.warn( "The following masks are not in the calibration set:\n" f"{new_masks.to_string()}\n" - "The baseline quantile estimates will be returned for those cases.", stacklevel=2 + "The baseline quantile estimates will be returned for those cases.", + stacklevel=2, ) return mask_test_cor def perform_correction( - self, - preds: np.ndarray, - mask_test_cor: pd.DataFrame + self, preds: np.ndarray, mask_test_cor: pd.DataFrame ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """Apply correction terms to the prediction intervals.""" lb_corr = preds[0] - mask_test_cor["correction_term"].values @@ -284,8 +272,7 @@ def perform_correction( return lb_corr, preds[1], ub_corr, preds[0], preds[2] def predict( - self, - x_new: ArrayLike + self, x_new: ArrayLike ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """Convenience method to run the entire pipeline. @@ -298,4 +285,4 @@ def predict( mask_test_cor = self.match_mask(x) cp_results = self.perform_correction(preds, mask_test_cor) - return cp_results \ No newline at end of file + return cp_results diff --git a/tests/test_cp_missing_data.py b/tests/test_cp_missing_data.py index 9eaa219d..0530cbc1 100644 --- a/tests/test_cp_missing_data.py +++ b/tests/test_cp_missing_data.py @@ -24,28 +24,53 @@ # -------- Fixtures -------- @pytest.fixture def X_train(): - return np.array([ - [0.1, np.nan], [0.3, 0.4], [np.nan, 0.6], [0.7, 0.8], - [0.2, np.nan], [0.2, np.nan], [0.9, 0.4], [np.nan, 0.4], - [0.3, 0.2], [np.nan, 0.9], [0.8, np.nan], [0.1, 0.2], - [np.nan, 0.5], [0.3, 0.7], [0.7, np.nan], [0.7, np.nan], - [0.3, 0.4], [np.nan, 0.2], [0.9, 0.7], [np.nan, 0.3], - [0.3, 0.7], [0.4, 0.8], [0.5, 0.4], [0.7, 0.2], [0.8, 0.3], - ]) + return np.array( + [ + [0.1, np.nan], + [0.3, 0.4], + [np.nan, 0.6], + [0.7, 0.8], + [0.2, np.nan], + [0.2, np.nan], + [0.9, 0.4], + [np.nan, 0.4], + [0.3, 0.2], + [np.nan, 0.9], + [0.8, np.nan], + [0.1, 0.2], + [np.nan, 0.5], + [0.3, 0.7], + [0.7, np.nan], + [0.7, np.nan], + [0.3, 0.4], + [np.nan, 0.2], + [0.9, 0.7], + [np.nan, 0.3], + [0.3, 0.7], + [0.4, 0.8], + [0.5, 0.4], + [0.7, 0.2], + [0.8, 0.3], + ] + ) @pytest.fixture def Y_train(): - return np.array([1,3,1,2,3,4,5,6,1,2,3,4,5,6,7,2,3,5,6,8,4,2,1,2,3]) + return np.array( + [1, 3, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7, 2, 3, 5, 6, 8, 4, 2, 1, 2, 3] + ) @pytest.fixture def X_new(): - return np.array([ - [0.1, 0.1], - [0.3, np.nan], - [np.nan, 0.6], - ]) + return np.array( + [ + [0.1, 0.1], + [0.3, np.nan], + [np.nan, 0.6], + ] + ) @pytest.fixture @@ -56,11 +81,13 @@ def seed(): # -- Test -- def test_model_CP(X_train, Y_train, seed): """Tests if the calibration corrections are of the correct shape and type.""" - model = CPMDATabPFNRegressor(quantiles = [0.05, 0.5, 0.95], val_size = 0.5, seed = seed) + model = CPMDATabPFNRegressor(quantiles=[0.05, 0.5, 0.95], val_size=0.5, seed=seed) calibration_results, model_fit = model.fit(X_train, Y_train) # Replicate the split to get the validation set and find its unique masks. - _, X_val, _, _ = train_test_split(X_train, Y_train, test_size=0.5, random_state=seed) + _, X_val, _, _ = train_test_split( + X_train, Y_train, test_size=0.5, random_state=seed + ) missing_df = pd.DataFrame(X_val).isna().astype(int).drop_duplicates() # check type, size of the calibration results @@ -71,11 +98,11 @@ def test_model_CP(X_train, Y_train, seed): def test_reproducibility(X_train, Y_train, seed): """Tests that random_state ensures deterministic correction terms.""" - model_1 = CPMDATabPFNRegressor(quantiles=[0.05, 0.5, 0.95], val_size = 0.5, seed = seed) + model_1 = CPMDATabPFNRegressor(quantiles=[0.05, 0.5, 0.95], val_size=0.5, seed=seed) calibration_results_1, model_fit_1 = model_1.fit(X_train, Y_train) # Second model with the same seed - model_2 = CPMDATabPFNRegressor(quantiles=[0.05, 0.5, 0.95] , val_size = 0.5, seed = seed) + model_2 = CPMDATabPFNRegressor(quantiles=[0.05, 0.5, 0.95], val_size=0.5, seed=seed) calibration_results_2, model_fit_2 = model_2.fit(X_train, Y_train) # Assert that the outputs are identical @@ -85,13 +112,15 @@ def test_reproducibility(X_train, Y_train, seed): def test_predict(X_train, Y_train, seed, X_new): """Tests if the predictions have the correct shape and type.""" # fit model - model = CPMDATabPFNRegressor(quantiles=[0.05, 0.5, 0.95], val_size = 0.5, seed = seed) + model = CPMDATabPFNRegressor(quantiles=[0.05, 0.5, 0.95], val_size=0.5, seed=seed) calibration_results, model_fit = model.fit(X_train, Y_train) # Apply the model to new cases - cp_apply = CPMDATabPFNRegressorNewData(model_fit, quantiles=[0.05, 0.5, 0.95], calibration_results=calibration_results) + cp_apply = CPMDATabPFNRegressorNewData( + model_fit, quantiles=[0.05, 0.5, 0.95], calibration_results=calibration_results + ) CP_results = cp_apply.predict(X_new) - assert CP_results[1].size== X_new.shape[0] + assert CP_results[1].size == X_new.shape[0] assert isinstance(CP_results[1], np.ndarray) - assert len(CP_results)== 5 \ No newline at end of file + assert len(CP_results) == 5