From 2931ecc8de2216b245643a8ecbb56319fdacfd75 Mon Sep 17 00:00:00 2001
From: valots12 <valotidavide@gmail.com>
Date: Thu, 23 Nov 2023 20:20:44 +0100
Subject: [PATCH 1/3] Add SHAP support

---
 requirements.txt |  3 ++-
 src/ml.py        | 55 +++++++++++++++++++++++++++++++++++-------------
 2 files changed, 42 insertions(+), 16 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index f91b70a..fb50382 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,5 @@ numpy==1.20.3
 pandas==1.3.4
 scipy==1.7.1
 scikit-learn==0.24.2
-jupyterlab==3.2.1
\ No newline at end of file
+jupyterlab==3.2.1
+shap==0.43.0
\ No newline at end of file
diff --git a/src/ml.py b/src/ml.py
index 08ad333..2eed533 100644
--- a/src/ml.py
+++ b/src/ml.py
@@ -4,6 +4,7 @@
 from typing import Tuple, List, Optional
 import numpy as np
 import pandas as pd
+import shap
 from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from sklearn.model_selection import KFold, train_test_split
 from sklearn.base import BaseEstimator
@@ -61,7 +62,10 @@ def train_evaluate_model(
 
 
 def get_feature_importances(
-    trained_model: BaseEstimator, column_names: List[str]
+    trained_model: BaseEstimator, 
+    x_train: pd.DataFrame,
+    column_names: List[str],
+    importance_type: str
 ) -> pd.DataFrame:
     """It computes the features importance, given a trained model.
 
@@ -73,16 +77,31 @@ def get_feature_importances(
         - a DataFrame containing the feature importance (not sorted) as column and
         the name of the features as index
     """
-
+    
+    # SHAP importance
+    if importance_type == "shap":
+        explainer = shap.Explainer(trained_model, x_train)
+        shap_values = explainer.shap_values(x_train)
+        
+        shap_sum = np.abs(shap_values).mean(axis=0)
+        df_coef = pd.DataFrame([shap_sum.tolist()]).T
+        df_coef.columns = ['shap_importance']
+        df_coef.index = column_names
+    
     # inspect coefficients
-    if hasattr(trained_model, "coef_"):
-        model_coefficients = trained_model.coef_
-    elif hasattr(trained_model, "feature_importances_"):
-        model_coefficients = trained_model.feature_importances_
-    else:
-        raise ValueError("Could not retrieve the feature importance")
+    elif importance_type == "model":
+        if hasattr(trained_model, "coef_"):
+            model_coefficients = trained_model.coef_
+        elif hasattr(trained_model, "feature_importances_"):
+            model_coefficients = trained_model.feature_importances_
+        else:
+            raise ValueError("Could not retrieve the feature importance")
 
-    df_coef = pd.DataFrame(model_coefficients, index=column_names)
+        df_coef = pd.DataFrame(model_coefficients, index=column_names)
+        
+    else:
+        raise ValueError("Allowed values for importance_type are model and shap")
+        
 
     return df_coef
 
@@ -183,6 +202,7 @@ def train_with_kfold_splitting(
     labels: pd.DataFrame,
     model: BaseEstimator,
     scaler_type: BaseEstimator,
+    importance_type: str,
     verbose: bool,
     random_state: int,
 ) -> pd.DataFrame:
@@ -216,12 +236,13 @@ def train_with_kfold_splitting(
             scaler_type,
             verbose,
         )
+
         if i == 0:
-            df_coefs = get_feature_importances(trained_model, x_trains[i].columns)
+            df_coefs = get_feature_importances(trained_model, x_trains[i], x_trains[i].columns, importance_type)
             df_coefs.columns = ["cycle_" + str(i + 1)]
         else:
             df_coefs["cycle_" + str(i + 1)] = get_feature_importances(
-                trained_model, x_trains[i].columns
+                trained_model, x_trains[i], x_trains[i].columns, importance_type
             )
 
     df_coef = compute_mean_coefficients(df_coefs)
@@ -233,6 +254,7 @@ def train_with_simple_splitting(
     labels: pd.DataFrame,
     model: BaseEstimator,
     scaler_type: BaseEstimator,
+    importance_type: str,
     verbose: bool,
     random_state: int,
 ) -> pd.DataFrame:
@@ -259,7 +281,7 @@ def train_with_simple_splitting(
     trained_model = train_evaluate_model(
         x_train, y_train, x_test, y_test, model, scaler_type, verbose
     )
-    df_coefs = get_feature_importances(trained_model, x_train.columns)
+    df_coefs = get_feature_importances(trained_model, x_train, x_train.columns, importance_type)
 
     df_coef = compute_mean_coefficients(df_coefs)
 
@@ -274,6 +296,7 @@ def scan_features_pipeline(
     verbose: bool,
     random_state: int,
     noise_type: str,
+    importance_type: str
 ) -> pd.DataFrame:
     """This pipeline performs various operations:
     - train and evaluate the model
@@ -308,11 +331,11 @@ def scan_features_pipeline(
 
     if splitting_type == "kfold":
         df_coef = train_with_kfold_splitting(
-            x_new, labels, model, scaler_type, verbose, random_state
+            x_new, labels, model, scaler_type, importance_type, verbose, random_state
         )
     elif splitting_type == "simple":
         df_coef = train_with_simple_splitting(
-            x_new, labels, model, scaler_type, verbose, random_state
+            x_new, labels, model, scaler_type, importance_type, verbose, random_state
         )
     else:
         raise ValueError("Choice not recognized. Possible choices are kfold or simple")
@@ -330,6 +353,7 @@ def get_relevant_features(
     epochs: int,
     patience: int,
     noise_type: str = "gaussian",
+    importance_type: str = "model",
     verbose: bool = True,
     filename_output: Optional[str] = None,
     random_state: int = 42,
@@ -372,6 +396,7 @@ def get_relevant_features(
             verbose,
             random_states[epoch],
             noise_type,
+            importance_type
         )
         n_features_after = x_new.shape[1]
 
@@ -390,4 +415,4 @@ def get_relevant_features(
     if filename_output is not None:
         x_new.to_csv(filename_output, index=False)
 
-    return x_new
+    return x_new
\ No newline at end of file

From f581db7e546f6f37c4a12d2aeecefdcd25c4723a Mon Sep 17 00:00:00 2001
From: Davide Valoti <63108350+valots12@users.noreply.github.com>
Date: Thu, 23 Nov 2023 20:22:37 +0100
Subject: [PATCH 2/3] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 6df2323..8919015 100644
--- a/README.md
+++ b/README.md
@@ -43,6 +43,7 @@ This function takes as arguments:
 - `patience`, number of epochs without any improvement of the features selection, before stopping the process (the idea is similar to the early stopping of Tensorflow/Keras)
 - `splitting_type`, it can be equal to `simple` (for simple train/test split) or `kfold` (for 5-fold splitting). If you choose `kfold`, the feature importance will be computed as the average feature importance for each train/test subset.
 - `noise_type`, it can be equal to `gaussian` for gaussian noise or `random` for flat random noise
+- `importance_type`, it can be equal to `model` to use the importance exposed by the trained model (its coefficients or feature importances) or `shap` to compute the importance from Shapley (SHAP) values. The default is `model`
 - `filename_output`, a string to indicate where to save the file. You can also choose `None` if you do not want to save it
 - `random_state`, set the random seed that it is used by the k-fold splitting
 

From 55f51640ce88928ab13a84e9d9ea66c93302d042 Mon Sep 17 00:00:00 2001
From: valots12 <valotidavide@gmail.com>
Date: Thu, 23 Nov 2023 20:32:35 +0100
Subject: [PATCH 3/3] Add scaler before computing shap

---
 src/ml.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/src/ml.py b/src/ml.py
index 2eed533..1e0ea21 100644
--- a/src/ml.py
+++ b/src/ml.py
@@ -65,13 +65,17 @@ def get_feature_importances(
     trained_model: BaseEstimator, 
     x_train: pd.DataFrame,
     column_names: List[str],
+    scaler_type: BaseEstimator,
     importance_type: str
 ) -> pd.DataFrame:
     """It computes the features importance, given a trained model.
 
     Parameters:
         - trained_model: a scikit-learn ML trained model
+        - x_train: the training features (only used when importance_type is "shap")
         - column_names: the name of the columns associated to the features
+        - scaler_type: choose between StandardScaler or MinMaxScaler
+        - importance_type: how the feature importance is computed, either "model" or "shap"
 
     Return:
         - a DataFrame containing the feature importance (not sorted) as column and
@@ -80,6 +84,13 @@ def get_feature_importances(
     
     # SHAP importance
     if importance_type == "shap":
+
+        if scaler_type == "StandardScaler":
+            scaler = StandardScaler()
+        elif scaler_type == "MinMaxScaler":
+            scaler = MinMaxScaler()
+        x_train = scaler.fit_transform(x_train)
+
         explainer = shap.Explainer(trained_model, x_train)
         shap_values = explainer.shap_values(x_train)
         
@@ -214,6 +225,7 @@ def train_with_kfold_splitting(
         - labels: the vector with labels, commonly called y
         - model: an untrained scikit-learn model
         - scaler_type: choose between StandardScaler or MinMaxScaler
+        - importance_type: how the feature importance is computed, either "model" or "shap"
         - verbose: True or False to tune the level of verbosity
         - random_state: select the random state of the train/test splitting process
 
@@ -238,11 +250,11 @@ def train_with_kfold_splitting(
         )
 
         if i == 0:
-            df_coefs = get_feature_importances(trained_model, x_trains[i], x_trains[i].columns, importance_type)
+            df_coefs = get_feature_importances(trained_model, x_trains[i], x_trains[i].columns, scaler_type, importance_type)
             df_coefs.columns = ["cycle_" + str(i + 1)]
         else:
             df_coefs["cycle_" + str(i + 1)] = get_feature_importances(
-                trained_model, x_trains[i], x_trains[i].columns, importance_type
+                trained_model, x_trains[i], x_trains[i].columns, scaler_type, importance_type
             )
 
     df_coef = compute_mean_coefficients(df_coefs)
@@ -266,6 +278,7 @@ def train_with_simple_splitting(
         - labels: the vector with labels, commonly called y
         - model: an untrained scikit-learn model
         - scaler_type: choose between StandardScaler or MinMaxScaler
+        - importance_type: how the feature importance is computed, either "model" or "shap"
         - verbose: True or False to tune the level of verbosity
         - random_state: select the random state of the train/test splitting process
 
@@ -281,7 +294,7 @@ def train_with_simple_splitting(
     trained_model = train_evaluate_model(
         x_train, y_train, x_test, y_test, model, scaler_type, verbose
     )
-    df_coefs = get_feature_importances(trained_model, x_train, x_train.columns, importance_type)
+    df_coefs = get_feature_importances(trained_model, x_train, x_train.columns, scaler_type, importance_type)
 
     df_coef = compute_mean_coefficients(df_coefs)
 
@@ -312,6 +325,7 @@ def scan_features_pipeline(
         - verbose: True or False to tune the level of verbosity
         - random_state: select the random state of the train/test splitting process
         - noise_type: choose between "gaussian" noise or "random" (flat) noise
+        - importance_type: how the feature importance is computed, either "model" or "shap"
 
     Return:
         - the simplified dataset, containing only the most relevant features
@@ -370,6 +384,7 @@ def get_relevant_features(
         - patience: the number of cycles of non-improvement to wait before stopping
         the execution of the code
         - noise_type: choose between "gaussian" noise or "random" (flat) noise
+        - importance_type: how the feature importance is computed, either "model" or "shap", default is "model"
         - verbose: True or False, to tune the level of verbosity
         - filename_output:  name of the simplified dataset if you want to export it, default is None
         - random_state: select the random seed