diff --git a/README.md b/README.md index 6df2323..8919015 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,7 @@ This function takes as arguments: - `patience`, number of epochs without any improvement of the features selection, before stopping the process (the idea is similar to the early stopping of Tensorflow/Keras) - `splitting_type`, it can be equal to `simple` (for simple train/test split) or `kfold` (for 5-fold splitting). If you choose `kfold`, the feature importance will be computed as the average feature importance for each train/test subset. - `noise_type`, it can be equal to `gaussian` for gaussian noise or `random` for flat random noise +- `importance_type`, it can be equal to `model` for using model coefficients or `shap` for extracting importance using Shapley values - `filename_output`, a string to indicate where to save the file. You can also choose `None` if you do not want to save it - `random_state`, set the random seed that it is used by the k-fold splitting diff --git a/requirements.txt b/requirements.txt index f91b70a..fb50382 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ numpy==1.20.3 pandas==1.3.4 scipy==1.7.1 scikit-learn==0.24.2 -jupyterlab==3.2.1 \ No newline at end of file +jupyterlab==3.2.1 +shap==0.43.0 \ No newline at end of file diff --git a/src/ml.py b/src/ml.py index 08ad333..1e0ea21 100644 --- a/src/ml.py +++ b/src/ml.py @@ -4,6 +4,7 @@ from typing import Tuple, List, Optional import numpy as np import pandas as pd +import shap from sklearn.preprocessing import StandardScaler, MinMaxScaler from sklearn.model_selection import KFold, train_test_split from sklearn.base import BaseEstimator @@ -61,28 +62,57 @@ def train_evaluate_model( def get_feature_importances( - trained_model: BaseEstimator, column_names: List[str] + trained_model: BaseEstimator, + x_train: pd.DataFrame, + column_names: List[str], + scaler_type: BaseEstimator, + importance_type: str ) -> pd.DataFrame: """It computes the features importance, given a trained model. Parameters: - trained_model: a scikit-learn ML trained model + - x_train: training features - column_names: the name of the columns associated to the features + - scaler_type: choose between StandardScaler or MinMaxScaler + - importance_type: the method for selecting feature importance Return: - a DataFrame containing the feature importance (not sorted) as column and the name of the features as index """ - + + # SHAP importance + if importance_type == "shap": + + if scaler_type == "StandardScaler": + scaler = StandardScaler() + elif scaler_type == "MinMaxScaler": + scaler = MinMaxScaler() + x_train = scaler.fit_transform(x_train) + + explainer = shap.Explainer(trained_model, x_train) + shap_values = explainer.shap_values(x_train) + + shap_sum = np.abs(shap_values).mean(axis=0) + df_coef = pd.DataFrame([shap_sum.tolist()]).T + df_coef.columns = ['shap_importance'] + df_coef.index = column_names + # inspect coefficients - if hasattr(trained_model, "coef_"): - model_coefficients = trained_model.coef_ - elif hasattr(trained_model, "feature_importances_"): - model_coefficients = trained_model.feature_importances_ - else: - raise ValueError("Could not retrieve the feature importance") + elif importance_type == "model": + if hasattr(trained_model, "coef_"): + model_coefficients = trained_model.coef_ + elif hasattr(trained_model, "feature_importances_"): + model_coefficients = trained_model.feature_importances_ + else: + raise ValueError("Could not retrieve the feature importance") - df_coef = pd.DataFrame(model_coefficients, index=column_names) + df_coef = pd.DataFrame(model_coefficients, index=column_names) + + else: + raise ValueError("Allowed values for importance_type are model and shap") + return df_coef @@ -183,6 +213,7 @@ def train_with_kfold_splitting( labels: pd.DataFrame, model: BaseEstimator, scaler_type: BaseEstimator, + importance_type: str, verbose: bool, random_state: int, ) -> pd.DataFrame: @@ -194,6 +225,7 @@ def train_with_kfold_splitting( - labels: the vector with labels, commonly called y - model: an untrained scikit-learn model - scaler_type: choose between StandardScaler or MinMaxScaler + - importance_type: the method for selecting feature importance - verbose: True or False to tune the level of verbosity - random_state: select the random state of the train/test splitting process @@ -216,12 +248,13 @@ def train_with_kfold_splitting( scaler_type, verbose, ) + if i == 0: - df_coefs = get_feature_importances(trained_model, x_trains[i].columns) + df_coefs = get_feature_importances(trained_model, x_trains[i], x_trains[i].columns, scaler_type, importance_type) df_coefs.columns = ["cycle_" + str(i + 1)] else: df_coefs["cycle_" + str(i + 1)] = get_feature_importances( - trained_model, x_trains[i].columns + trained_model, x_trains[i], x_trains[i].columns, scaler_type, importance_type ) df_coef = compute_mean_coefficients(df_coefs) @@ -233,6 +266,7 @@ def train_with_simple_splitting( labels: pd.DataFrame, model: BaseEstimator, scaler_type: BaseEstimator, + importance_type: str, verbose: bool, random_state: int, ) -> pd.DataFrame: @@ -244,6 +278,7 @@ def train_with_simple_splitting( - labels: the vector with labels, commonly called y - model: an untrained scikit-learn model - scaler_type: choose between StandardScaler or MinMaxScaler + - importance_type: the method for selecting feature importance - verbose: True or False to tune the level of verbosity - random_state: select the random state of the train/test splitting process @@ -259,7 +294,7 @@ def train_with_simple_splitting( trained_model = train_evaluate_model( x_train, y_train, x_test, y_test, model, scaler_type, verbose ) - df_coefs = get_feature_importances(trained_model, x_train.columns) + df_coefs = get_feature_importances(trained_model, x_train, x_train.columns, scaler_type, importance_type) df_coef = compute_mean_coefficients(df_coefs) @@ -274,6 +309,7 @@ def scan_features_pipeline( verbose: bool, random_state: int, noise_type: str, + importance_type: str ) -> pd.DataFrame: """This pipeline performs various operations: - train and evaluate the model @@ -289,6 +325,7 @@ def scan_features_pipeline( - verbose: True or False to tune the level of verbosity - random_state: select the random state of the train/test splitting process - noise_type: choose between "gaussian" noise or "random" (flat) noise + - importance_type: the method for selecting feature importance Return: - the simplified dataset, containing only the most relevant features @@ -308,11 +345,11 @@ def scan_features_pipeline( if splitting_type == "kfold": df_coef = train_with_kfold_splitting( - x_new, labels, model, scaler_type, verbose, random_state + x_new, labels, model, scaler_type, importance_type, verbose, random_state ) elif splitting_type == "simple": df_coef = train_with_simple_splitting( - x_new, labels, model, scaler_type, verbose, random_state + x_new, labels, model, scaler_type, importance_type, verbose, random_state ) else: raise ValueError("Choice not recognized. Possible choices are kfold or simple") @@ -330,6 +367,7 @@ def get_relevant_features( epochs: int, patience: int, noise_type: str = "gaussian", + importance_type: str = "model", verbose: bool = True, filename_output: Optional[str] = None, random_state: int = 42, @@ -346,6 +384,7 @@ def get_relevant_features( - patience: the number of cycles of non-improvement to wait before stopping the execution of the code - noise_type: choose between "gaussian" noise or "random" (flat) noise + - importance_type: the method for selecting feature importance - verbose: True or False, to tune the level of verbosity - filename_output: name of the simplified dataset if you want to export it, default is None - random_state: select the random seed @@ -372,6 +411,7 @@ def get_relevant_features( verbose, random_states[epoch], noise_type, + importance_type ) n_features_after = x_new.shape[1] @@ -390,4 +430,4 @@ def get_relevant_features( if filename_output is not None: x_new.to_csv(filename_output, index=False) - return x_new + return x_new \ No newline at end of file