From 1beebd51bb143fc00cfd6c7da79865870febe224 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Thu, 2 Oct 2025 16:50:47 +0200 Subject: [PATCH] suggestions --- summary_examples/blogpost1.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/summary_examples/blogpost1.py b/summary_examples/blogpost1.py index f8a4cb0..247937a 100644 --- a/summary_examples/blogpost1.py +++ b/summary_examples/blogpost1.py @@ -18,7 +18,7 @@ # We construct a synthetic problem for which a linear model is not able to predict better than # the empty model, i.e., the model predicting the observed average of the target. # -# As we can see, the model assigns a zero coefficient to features 1 and 2, indicating that it can learn +# As we can see, the model assigns a zero coefficient to features $X_1$ and $X_2$, indicating that it can learn # nothing from them. We might want to conclude that these features hold no information on the target. # However, since the performance of our model is really bad on external data, we should not draw any conclusion # on the underlying process and focus on improving the model first. @@ -40,10 +40,11 @@ linear_regressor = LassoCV(random_state=rng) linear_regressor.fit(X_train, y_train) +# maybe a dataframe feature | coef will be better looking ? print("Features with non zero coef:") print( [ - f"Feature {idx}, coef: {linear_regressor.coef_[idx]:.2f}" + f"x{idx} coef={linear_regressor.coef_[idx]:.2f}" for idx in range(3) if linear_regressor.coef_[idx] != 0 ] @@ -63,9 +64,10 @@ # # Discarding $X_1$ at the previous step would have been a mistake: it is used by the properly specified # model. It was ignored by the simpler model as its impact on the target is only through its square and its -# interaction with $X_0$. We can now say that $X_1$ is important for the underlying process. Some features involving -# $X_2$ are receiving low but non zero coefficients in the second model. 
It is not clear from just these coefficients -# if $X_2$ has a low impact on the target or if the model is overfitting on it. +# interaction with $X_0$. We can now say that $X_1$ is important for the underlying process. Some features involving +# $X_2$ are receiving low but nonzero coefficients in the second model. In our synthetic case, we know +# that the target $Y = X_0 + (X_0+X_1)^2 + \text{noise}$ does not depend on $X_2$, so these small nonzero coefficients +# are only finite-sample errors. # %% @@ -161,14 +163,14 @@ ax1.set_xlabel("Importance") ax1.set_title("Train Set") -idx_test = np.argsort(rf_pi_test.importances_mean) +# same ordering for the barplots ax2.barh( range(n_features), - rf_pi_test.importances_mean[idx_test], - xerr=rf_pi_test.importances_std[idx_test], + rf_pi_test.importances_mean[idx_train], + xerr=rf_pi_test.importances_std[idx_train], ) ax2.set_yticks(range(n_features)) -ax2.set_yticklabels(np.array(feature_names)[idx_test]) +ax2.set_yticklabels(np.array(feature_names)[idx_train]) ax2.set_xlabel("Importance") ax2.set_title("Test Set") @@ -196,6 +198,8 @@ # features to improve the quality of our model. Recursive Feature Elimination with Cross # Validation (`RFECV`) provides a good way to trim down irrelevant features. # [Justify that permutation importance is sensible by citing Reyero Lobo et al. ?] +# [Yes! Maybe explain that feature $j$ being irrelevant means $X_j \perp Y \mid X_{-j}$, and that PFI +# (in the optimal setting) is able to detect such irrelevant features] from sklearn.feature_selection import RFECV @@ -298,6 +302,7 @@ def permutation_importance_getter(model, feature_indices, X_val, y_val, random_s ) # %% [markdown] +# [I feel the summary/recap is a bit dry, maybe give more details?] # We showed here that we should be careful with feature importance: it can lead to misleading # conclusions when the model is not optimal. We also showed how to use feature importance to # perform feature selection and how that can lead to a better model.