Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ Changed
- Updated CI configuration to follow tubular's python-package.yml (https://github.com/azukds/model_interpreter/pull/8)
- Switched to prek for pre-commit checks (https://github.com/azukds/model_interpreter/pull/8)
- Fixed spelling mistakes and removed stale tubular references (https://github.com/azukds/model_interpreter/pull/8)
Comment thread
cjmwills marked this conversation as resolved.
- Added example doctests to interpreter.py (https://github.com/azukds/model_interpreter/pull/10)

1.0.0 (2024-08-06)
-------------------
Expand Down
205 changes: 203 additions & 2 deletions model_interpreter/interpreter.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,16 @@ class ModelInterpreter:
of fields with a name beginning with "colour_" and then return the
aggregated contributions for "colour"

Examples:
---------
>>> features = ['feat_a', 'feat_b', 'feat_c', 'feat_d']
>>> mi = ModelInterpreter(features)
>>> mi.feature_names
['feat_a', 'feat_b', 'feat_c', 'feat_d']

>>> mi = ModelInterpreter(features, one_hot_cols=['feat_a'])
>>> mi.one_hot_cols
['feat_a']
"""

def __init__(
Expand Down Expand Up @@ -158,6 +168,27 @@ def fit(self, model, X_train=None, n_samples=50, is_classification=None):
Returns:
-----------
model explainer

Examples:
---------
Fit on a tree-based model (used here for demonstration purposes):

>>> from sklearn.ensemble import RandomForestClassifier
>>> from sklearn.datasets import make_classification
>>> from sklearn.model_selection import train_test_split
>>> X, y = make_classification(
... n_samples=100, n_features=4, n_informative=2,
... n_redundant=0, random_state=0, shuffle=False,
... )
>>> X_train, X_test, y_train, y_test = train_test_split(
... X, y, test_size=0.33, random_state=42,
... )
>>> clf = RandomForestClassifier(max_depth=2, random_state=0, n_estimators=10)
>>> _ = clf.fit(X_train, y_train)
>>> mi = ModelInterpreter(['a', 'b', 'c', 'd'])
>>> explainer = mi.fit(clf)
>>> type(explainer).__name__
'TreeExplainer'
"""

logging.debug(f"Model interpreter version {self.version_} initialised")
Expand Down Expand Up @@ -257,13 +288,12 @@ def _get_single_model_contribution(
# this condition hits for multi-classification
if shap_vals.shape[2] > 2:
if predict_class_index not in np.arange(
len(shap_vals) + 1
shap_vals.shape[2]
Comment thread
cjmwills marked this conversation as resolved.
) or not isinstance(predict_class_index, int):
raise ValueError("predicted class not a valid int")

# this condition hits for binary classification
elif shap_vals.shape[2] == 2:
print(shap_vals)
if predict_class_index not in [0, 1]:
raise ValueError("predicted class must be 0 or 1 for binary data")

Expand Down Expand Up @@ -305,6 +335,26 @@ def _get_grouped_contribution(dict_value, dict_contrib, feature_mappings):
Returns:
----------
dict_resp_contrib: dict of feature contributions

Examples:
---------
Rename features with a one-to-one mapping:

>>> dict_value = {'a': 1.0, 'b': 2.0, 'c': 3.0}
>>> dict_contrib = {'a': 0.5, 'b': -0.3, 'c': 0.1}
>>> mapping = {'a': 'Alpha', 'b': 'Beta', 'c': 'Gamma'}
>>> ModelInterpreter._get_grouped_contribution(
... dict_value, dict_contrib, mapping,
... )
({'Alpha': 1.0, 'Beta': 2.0, 'Gamma': 3.0}, {'Alpha': 0.5, 'Beta': -0.3, 'Gamma': 0.1})

Group features together (contributions are summed):

>>> grouping = {'a': 'G1', 'b': 'G1', 'c': 'G2'}
>>> ModelInterpreter._get_grouped_contribution(
... dict_value, dict_contrib, grouping,
... )
(None, {'G1': 0.2, 'G2': 0.1})
"""

if not isinstance(feature_mappings, dict):
Expand Down Expand Up @@ -413,6 +463,137 @@ def transform(
-----------
Features and their contributions in structure according to specified
return_type

Examples:
Comment thread
cjmwills marked this conversation as resolved.
---------
**Binary classification (RandomForest / tree-based):**

>>> from sklearn.ensemble import RandomForestClassifier
>>> from sklearn.datasets import make_classification
>>> from sklearn.model_selection import train_test_split
>>> X, y = make_classification(
... n_samples=1000, n_features=4, n_informative=2,
... n_redundant=0, random_state=0, shuffle=False,
... )
>>> X_train, X_test, y_train, y_test = train_test_split(
... X, y, test_size=0.33, random_state=42,
... )
>>> clf = RandomForestClassifier(max_depth=2, random_state=0)
>>> _ = clf.fit(X_train, y_train)
>>> features = ['feature1', 'feature2', 'feature3', 'feature4']
>>> mi = ModelInterpreter(features)
>>> _ = mi.fit(clf)
>>> single_row = X[0]

Default return type (name_value_dicts), sorted by absolute contribution:

>>> mi.transform(single_row, return_precision=4)
[{'Name': 'feature2', 'Value': -0.3491}, {'Name': 'feature1', 'Value': -0.0039}, {'Name': 'feature4', 'Value': 0.0032}, {'Name': 'feature3', 'Value': 0.0014}]

Rename features using a one-to-one mapping. You can provide a `feature_mappings` dictionary, which can either map feature names to more interpretable names or group features together:

>>> mapping = {
... 'feature1': 'feature 1 was mapped', 'feature2': 'feature 2 was mapped',
... 'feature3': 'feature 3 was mapped', 'feature4': 'feature 4 was mapped',
... }
>>> mi.transform(
... single_row, feature_mappings=mapping,
... )
[{'Name': 'feature 2 was mapped', 'Value': -0.3491295830480707}, {'Name': 'feature 1 was mapped', 'Value': -0.0039231513013799}, {'Name': 'feature 4 was mapped', 'Value': 0.0031653931724603}, {'Name': 'feature 3 was mapped', 'Value': 0.0013787609499949}]

**Multiclass classification (RandomForest):**

Use predict_class to select which class to get contributions for.
To find the class with the highest predicted probability, use
``np.argmax`` on ``predict_proba``:

>>> import numpy as np
>>> X, y = make_classification(
... n_samples=1000, n_features=4, n_informative=2,
... n_redundant=0, random_state=0, shuffle=False,
... )
>>> y[200:500] = 2
>>> X_train, X_test, y_train, y_test = train_test_split(
... X, y, test_size=0.33, random_state=42,
... )
>>> clf = RandomForestClassifier(max_depth=2, random_state=0)
>>> _ = clf.fit(X_train, y_train)
>>> mi = ModelInterpreter(features)
>>> _ = mi.fit(clf)
>>> single_row = X[0]

Find the class with the highest predicted probability and use it
as predict_class.

>>> probs = clf.predict_proba(single_row.reshape(1, -1))
>>> predict_class = int(clf.classes_[np.argmax(probs)])
>>> predict_class
0

Get contributions for predicted class (0) and other classes individually:

>>> mi.transform(
... single_row, predict_class=0,
... return_type='single_dict', return_precision=4,
... )
{'feature2': 0.3113, 'feature1': -0.0253, 'feature3': -0.0048, 'feature4': 0.0014}

>>> mi.transform(
... single_row, predict_class=1,
... return_type='single_dict', return_precision=4,
... )
{'feature2': -0.1429, 'feature1': -0.083, 'feature3': 0.0016, 'feature4': -0.0014}

>>> mi.transform(
... single_row, predict_class=2,
... return_type='single_dict', return_precision=4,
... )
{'feature2': -0.1684, 'feature1': 0.1082, 'feature3': 0.0032, 'feature4': 0.0}

**XGBoost regression (tree-based):**

>>> import xgboost as xgb
>>> from sklearn.datasets import fetch_california_housing
>>> X, y = fetch_california_housing(return_X_y=True, as_frame=True)
>>> X_train, X_test, y_train, y_test = train_test_split(
... X, y, test_size=0.33, random_state=42,
... )
>>> dtrain = xgb.DMatrix(data=X_train, label=y_train)
>>> feature_names = list(X.columns)
>>> xgb_model = xgb.train(
... params={"seed": 1, "max_depth": 6, "min_child_weight": 20},
... dtrain=dtrain,
... )
>>> mi = ModelInterpreter(feature_names)
>>> _ = mi.fit(xgb_model)
>>> single_row = X_test[feature_names].head(1)
>>> mi.transform(
... single_row, return_type='single_dict', return_precision=4,
... )
{'MedInc': -0.7147, 'AveOccup': -0.2794, 'Latitude': -0.2189, 'Longitude': -0.1588, 'AveRooms': 0.0442, 'Population': -0.0064, 'HouseAge': -0.0029, 'AveBedrms': -0.0015}

**Non-standard models (KernelExplainer):**

For non-standard models that require shap.KernelExplainer, you must
provide X_train and is_classification when calling fit().

>>> from sklearn.neighbors import KNeighborsClassifier
>>> X, y = make_classification(
... n_samples=1000, n_features=4, n_informative=2,
... n_redundant=0, random_state=0, shuffle=False,
... )
>>> X_train, X_test, y_train, y_test = train_test_split(
... X, y, test_size=0.33, random_state=42,
... )
>>> model = KNeighborsClassifier(n_neighbors=5)
>>> _ = model.fit(X_train, y_train)
>>> mi = ModelInterpreter(features)
>>> _ = mi.fit(model, X_train=X_train, is_classification=True)
>>> single_row = X_test[0].reshape(1, -1)
>>> mi.transform(
... single_row, return_type='single_dict', return_precision=4,
... )
{'feature2': -0.3783, 'feature3': -0.033, 'feature1': -0.0177, 'feature4': 0.005}
"""

if sorting not in ["abs", "label", "positive"]:
Expand Down Expand Up @@ -503,6 +684,16 @@ def _name_value_dicts_return(dict_response):
Return:
-----------
list of dictionaries with structure [{"Name": feature, "Value": contribution}, ... ]

Examples:
---------
>>> ModelInterpreter._name_value_dicts_return({'a': 0.5, 'b': -0.3})
[{'Name': 'a', 'Value': 0.5}, {'Name': 'b', 'Value': -0.3}]

Tuple values (feature_value, contribution) are reduced to contribution only:

>>> ModelInterpreter._name_value_dicts_return({'a': (1.0, 0.5), 'b': (2.0, -0.3)})
[{'Name': 'a', 'Value': 0.5}, {'Name': 'b', 'Value': -0.3}]
"""

dicts = []
Expand All @@ -523,6 +714,11 @@ def _dicts_return(dict_response):
Return:
-----------
list of dictionaries with structure [{feature: contribution}, ... ]

Examples:
---------
>>> ModelInterpreter._dicts_return({'a': 0.5, 'b': -0.3})
[{'a': 0.5}, {'b': -0.3}]
"""

dicts = []
Expand All @@ -539,6 +735,11 @@ def _tups_return(dict_response):
Return:
-----------
list of tuples with structure [(feature, contribution), ... ]

Examples:
---------
>>> ModelInterpreter._tups_return({'a': 0.5, 'b': -0.3})
[('a', 0.5), ('b', -0.3)]
"""

tups = []
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ dev = [

exclude = ["docs", "examples"]

[tool.pytest.ini_options]
addopts = "--doctest-modules"

[tool.ruff.lint]
select = [
"E",
Expand Down
Loading