From 2931ecc8de2216b245643a8ecbb56319fdacfd75 Mon Sep 17 00:00:00 2001 From: valots12 Date: Thu, 23 Nov 2023 20:20:44 +0100 Subject: [PATCH 1/3] Add SHAP support --- requirements.txt | 3 ++- src/ml.py | 55 +++++++++++++++++++++++++++++++++++------------- 2 files changed, 42 insertions(+), 16 deletions(-) diff --git a/requirements.txt b/requirements.txt index f91b70a..fb50382 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ numpy==1.20.3 pandas==1.3.4 scipy==1.7.1 scikit-learn==0.24.2 -jupyterlab==3.2.1 \ No newline at end of file +jupyterlab==3.2.1 +shap==0.43.0 \ No newline at end of file diff --git a/src/ml.py b/src/ml.py index 08ad333..2eed533 100644 --- a/src/ml.py +++ b/src/ml.py @@ -4,6 +4,7 @@ from typing import Tuple, List, Optional import numpy as np import pandas as pd +import shap from sklearn.preprocessing import StandardScaler, MinMaxScaler from sklearn.model_selection import KFold, train_test_split from sklearn.base import BaseEstimator @@ -61,7 +62,10 @@ def train_evaluate_model( def get_feature_importances( - trained_model: BaseEstimator, column_names: List[str] + trained_model: BaseEstimator, + x_train: pd.DataFrame, + column_names: List[str], + importance_type: str ) -> pd.DataFrame: """It computes the features importance, given a trained model. @@ -73,16 +77,31 @@ def get_feature_importances( - a DataFrame containing the feature importance (not sorted) as column and the name of the features as index """ - + + # SHAP importance + if importance_type == "shap": + explainer = shap.Explainer(trained_model, x_train) + shap_values = explainer.shap_values(x_train) + + shap_sum = np.abs(shap_values).mean(axis=0) + df_coef = pd.DataFrame([shap_sum.tolist()]).T + df_coef.columns = ['shap_importance'] + df_coef.index = column_names + # inspect coefficients - if hasattr(trained_model, "coef_"): - model_coefficients = trained_model.coef_ - elif hasattr(trained_model, "feature_importances_"): - model_coefficients = trained_model.feature_importances_ - else: - raise ValueError("Could not retrieve the feature importance") + elif importance_type == "model": + if hasattr(trained_model, "coef_"): + model_coefficients = trained_model.coef_ + elif hasattr(trained_model, "feature_importances_"): + model_coefficients = trained_model.feature_importances_ + else: + raise ValueError("Could not retrieve the feature importance") - df_coef = pd.DataFrame(model_coefficients, index=column_names) + df_coef = pd.DataFrame(model_coefficients, index=column_names) + + else: + raise ValueError("Allowed values for importance_type are model and shap") + return df_coef @@ -183,6 +202,7 @@ def train_with_kfold_splitting( labels: pd.DataFrame, model: BaseEstimator, scaler_type: BaseEstimator, + importance_type: str, verbose: bool, random_state: int, ) -> pd.DataFrame: @@ -216,12 +236,13 @@ def train_with_kfold_splitting( scaler_type, verbose, ) + if i == 0: - df_coefs = get_feature_importances(trained_model, x_trains[i].columns) + df_coefs = get_feature_importances(trained_model, x_trains[i], x_trains[i].columns, importance_type) df_coefs.columns = ["cycle_" + str(i + 1)] else: df_coefs["cycle_" + str(i + 1)] = get_feature_importances( - trained_model, x_trains[i].columns + trained_model, x_trains[i], x_trains[i].columns, importance_type ) df_coef = compute_mean_coefficients(df_coefs) @@ -233,6 +254,7 @@ def train_with_simple_splitting( labels: pd.DataFrame, model: BaseEstimator, scaler_type: BaseEstimator, + importance_type: str, verbose: bool, random_state: int, ) -> pd.DataFrame: @@ -259,7 +281,7 @@ def train_with_simple_splitting( trained_model = train_evaluate_model( x_train, y_train, x_test, y_test, model, scaler_type, verbose ) - df_coefs = get_feature_importances(trained_model, x_train.columns) + df_coefs = get_feature_importances(trained_model, x_train, x_train.columns, importance_type) df_coef = compute_mean_coefficients(df_coefs) @@ -274,6 +296,7 @@ def scan_features_pipeline( verbose: bool, random_state: int, noise_type: str, + importance_type: str ) -> pd.DataFrame: """This pipeline performs various operations: - train and evaluate the model @@ -308,11 +331,11 @@ def scan_features_pipeline( if splitting_type == "kfold": df_coef = train_with_kfold_splitting( - x_new, labels, model, scaler_type, verbose, random_state + x_new, labels, model, scaler_type, importance_type, verbose, random_state ) elif splitting_type == "simple": df_coef = train_with_simple_splitting( - x_new, labels, model, scaler_type, verbose, random_state + x_new, labels, model, scaler_type, importance_type, verbose, random_state ) else: raise ValueError("Choice not recognized. Possible choices are kfold or simple") @@ -330,6 +353,7 @@ def get_relevant_features( epochs: int, patience: int, noise_type: str = "gaussian", + importance_type: str = "model", verbose: bool = True, filename_output: Optional[str] = None, random_state: int = 42, @@ -372,6 +396,7 @@ def get_relevant_features( verbose, random_states[epoch], noise_type, + importance_type ) n_features_after = x_new.shape[1] @@ -390,4 +415,4 @@ def get_relevant_features( if filename_output is not None: x_new.to_csv(filename_output, index=False) - return x_new + return x_new \ No newline at end of file From f581db7e546f6f37c4a12d2aeecefdcd25c4723a Mon Sep 17 00:00:00 2001 From: Davide Valoti <63108350+valots12@users.noreply.github.com> Date: Thu, 23 Nov 2023 20:22:37 +0100 Subject: [PATCH 2/3] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 6df2323..8919015 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,7 @@ This function takes as arguments: - `patience`, number of epochs without any improvement of the features selection, before stopping the process (the idea is similar to the early stopping of Tensorflow/Keras) - `splitting_type`, it can be equal to `simple` (for simple train/test split) or `kfold` (for 5-fold splitting). If you choose `kfold`, the feature importance will be computed as the average feature importance for each train/test subset. - `noise_type`, it can be equal to `gaussian` for gaussian noise or `random` for flat random noise +- `importance_type`, it can be equal to `model` for using model coefficients or `shap` for extracting importance using Shapley values - `filename_output`, a string to indicate where to save the file. You can also choose `None` if you do not want to save it - `random_state`, set the random seed that it is used by the k-fold splitting From 55f51640ce88928ab13a84e9d9ea66c93302d042 Mon Sep 17 00:00:00 2001 From: valots12 Date: Thu, 23 Nov 2023 20:32:35 +0100 Subject: [PATCH 3/3] Add scaler before computing shap --- src/ml.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/ml.py b/src/ml.py index 2eed533..1e0ea21 100644 --- a/src/ml.py +++ b/src/ml.py @@ -65,13 +65,17 @@ def get_feature_importances( trained_model: BaseEstimator, x_train: pd.DataFrame, column_names: List[str], + scaler_type: BaseEstimator, importance_type: str ) -> pd.DataFrame: """It computes the features importance, given a trained model. Parameters: - trained_model: a scikit-learn ML trained model + - x_train: training features - column_names: the name of the columns associated to the features + - scaler_type: choose between StandardScaler or MinMaxScaler + - importance_type: the method for selecting feature importance Return: - a DataFrame containing the feature importance (not sorted) as column and @@ -80,6 +84,13 @@ def get_feature_importances( # SHAP importance if importance_type == "shap": + + if scaler_type == "StandardScaler": + scaler = StandardScaler() + elif scaler_type == "MinMaxScaler": + scaler = MinMaxScaler() + x_train = scaler.fit_transform(x_train) + explainer = shap.Explainer(trained_model, x_train) shap_values = explainer.shap_values(x_train) @@ -214,6 +225,7 @@ def train_with_kfold_splitting( - labels: the vector with labels, commonly called y - model: an untrained scikit-learn model - scaler_type: choose between StandardScaler or MinMaxScaler + - importance_type: the method for selecting feature importance - verbose: True or False to tune the level of verbosity - random_state: select the random state of the train/test splitting process @@ -238,11 +250,11 @@ def train_with_kfold_splitting( ) if i == 0: - df_coefs = get_feature_importances(trained_model, x_trains[i], x_trains[i].columns, importance_type) + df_coefs = get_feature_importances(trained_model, x_trains[i], x_trains[i].columns, scaler_type, importance_type) df_coefs.columns = ["cycle_" + str(i + 1)] else: df_coefs["cycle_" + str(i + 1)] = get_feature_importances( - trained_model, x_trains[i], x_trains[i].columns, importance_type + trained_model, x_trains[i], x_trains[i].columns, scaler_type, importance_type ) df_coef = compute_mean_coefficients(df_coefs) @@ -266,6 +278,7 @@ def train_with_simple_splitting( - labels: the vector with labels, commonly called y - model: an untrained scikit-learn model - scaler_type: choose between StandardScaler or MinMaxScaler + - importance_type: the method for selecting feature importance - verbose: True or False to tune the level of verbosity - random_state: select the random state of the train/test splitting process @@ -281,7 +294,7 @@ def train_with_simple_splitting( trained_model = train_evaluate_model( x_train, y_train, x_test, y_test, model, scaler_type, verbose ) - df_coefs = get_feature_importances(trained_model, x_train, x_train.columns, importance_type) + df_coefs = get_feature_importances(trained_model, x_train, x_train.columns, scaler_type, importance_type) df_coef = compute_mean_coefficients(df_coefs) @@ -312,6 +325,7 @@ def scan_features_pipeline( - verbose: True or False to tune the level of verbosity - random_state: select the random state of the train/test splitting process - noise_type: choose between "gaussian" noise or "random" (flat) noise + - importance_type: the method for selecting feature importance Return: - the simplified dataset, containing only the most relevant features @@ -370,6 +384,7 @@ def get_relevant_features( - patience: the number of cycles of non-improvement to wait before stopping the execution of the code - noise_type: choose between "gaussian" noise or "random" (flat) noise + - importance_type: the method for selecting feature importance - verbose: True or False, to tune the level of verbosity - filename_output: name of the simplified dataset if you want to export it, default is None - random_state: select the random seed