From b9121525c4c7dfa9b8da9ad3c9db5bfcd4bf08ac Mon Sep 17 00:00:00 2001
From: Michele Gentili <michele.gentili93@gmail.com>
Date: Thu, 23 Apr 2020 20:17:49 +0200
Subject: [PATCH] parallelized the prediction

---
 treeinterpreter/treeinterpreter.py | 455 ++++++++++++++---------------
 1 file changed, 222 insertions(+), 233 deletions(-)

diff --git a/treeinterpreter/treeinterpreter.py b/treeinterpreter/treeinterpreter.py
index bba0427..428664f 100644
--- a/treeinterpreter/treeinterpreter.py
+++ b/treeinterpreter/treeinterpreter.py
@@ -1,233 +1,222 @@
-# -*- coding: utf-8 -*-
-import numpy as np
-import sklearn
-
-from sklearn.ensemble.forest import ForestClassifier, ForestRegressor
-from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, _tree
-from distutils.version import LooseVersion
-if LooseVersion(sklearn.__version__) < LooseVersion("0.17"):
-    raise Exception("treeinterpreter requires scikit-learn 0.17 or later")
-
-
-def _get_tree_paths(tree, node_id, depth=0):
-    """
-    Returns all paths through the tree as list of node_ids
-    """
-    if node_id == _tree.TREE_LEAF:
-        raise ValueError("Invalid node_id %s" % _tree.TREE_LEAF)
-
-    left_child = tree.children_left[node_id]
-    right_child = tree.children_right[node_id]
-
-    if left_child != _tree.TREE_LEAF:
-        left_paths = _get_tree_paths(tree, left_child, depth=depth + 1)
-        right_paths = _get_tree_paths(tree, right_child, depth=depth + 1)
-
-        for path in left_paths:
-            path.append(node_id)
-        for path in right_paths:
-            path.append(node_id)
-        paths = left_paths + right_paths
-    else:
-        paths = [[node_id]]
-    return paths
-
-
-def _predict_tree(model, X, joint_contribution=False):
-    """
-    For a given DecisionTreeRegressor, DecisionTreeClassifier,
-    ExtraTreeRegressor, or ExtraTreeClassifier,
-    returns a triple of [prediction, bias and feature_contributions], such
-    that prediction ≈ bias + feature_contributions.
-    """
-    leaves = model.apply(X)
-    paths = _get_tree_paths(model.tree_, 0)
-
-    for path in paths:
-        path.reverse()
-
-    leaf_to_path = {}
-    #map leaves to paths
-    for path in paths:
-        leaf_to_path[path[-1]] = path         
-    
-    # remove the single-dimensional inner arrays
-    values = model.tree_.value.squeeze(axis=1)
-    # reshape if squeezed into a single float
-    if len(values.shape) == 0:
-        values = np.array([values])
-    if isinstance(model, DecisionTreeRegressor):
-        biases = np.full(X.shape[0], values[paths[0][0]])
-        line_shape = X.shape[1]
-    elif isinstance(model, DecisionTreeClassifier):
-        # scikit stores category counts, we turn them into probabilities
-        normalizer = values.sum(axis=1)[:, np.newaxis]
-        normalizer[normalizer == 0.0] = 1.0
-        values /= normalizer
-
-        biases = np.tile(values[paths[0][0]], (X.shape[0], 1))
-        line_shape = (X.shape[1], model.n_classes_)
-    direct_prediction = values[leaves]
-    
-    
-    #make into python list, accessing values will be faster
-    values_list = list(values)
-    feature_index = list(model.tree_.feature)
-    
-    contributions = []
-    if joint_contribution:
-        for row, leaf in enumerate(leaves):
-            path = leaf_to_path[leaf]
-            
-            
-            path_features = set()
-            contributions.append({})
-            for i in range(len(path) - 1):
-                path_features.add(feature_index[path[i]])
-                contrib = values_list[path[i+1]] - \
-                         values_list[path[i]]
-                #path_features.sort()
-                contributions[row][tuple(sorted(path_features))] = \
-                    contributions[row].get(tuple(sorted(path_features)), 0) + contrib
-        return direct_prediction, biases, contributions
-        
-    else:
-        unique_leaves = np.unique(leaves)
-        unique_contributions = {}
-        
-        for row, leaf in enumerate(unique_leaves):
-            for path in paths:
-                if leaf == path[-1]:
-                    break
-            
-            contribs = np.zeros(line_shape)
-            for i in range(len(path) - 1):
-                
-                contrib = values_list[path[i+1]] - \
-                         values_list[path[i]]
-                contribs[feature_index[path[i]]] += contrib
-            unique_contributions[leaf] = contribs
-            
-        for row, leaf in enumerate(leaves):
-            contributions.append(unique_contributions[leaf])
-
-        return direct_prediction, biases, np.array(contributions)
-
-
-def _iterative_mean(iter, current_mean, x):
-    """
-    Iteratively calculates mean using
-    http://www.heikohoffmann.de/htmlthesis/node134.html
-    :param iter: non-negative integer, iteration
-    :param current_mean: numpy array, current value of mean
-    :param x: numpy array, new value to be added to mean
-    :return: numpy array, updated mean
-    """
-    return current_mean + ((x - current_mean) / (iter + 1))
-
-
-def _predict_forest(model, X, joint_contribution=False):
-    """
-    For a given RandomForestRegressor, RandomForestClassifier,
-    ExtraTreesRegressor, or ExtraTreesClassifier returns a triple of
-    [prediction, bias and feature_contributions], such that prediction ≈ bias +
-    feature_contributions.
-    """
-
-    if joint_contribution:
-        biases = []
-        contributions = []
-        predictions = []
-        
-        for tree in model.estimators_:
-            pred, bias, contribution = _predict_tree(tree, X, joint_contribution=joint_contribution)
-
-            biases.append(bias)
-            contributions.append(contribution)
-            predictions.append(pred)
-        
-        
-        total_contributions = []
-        
-        for i in range(len(X)):
-            contr = {}
-            for j, dct in enumerate(contributions):
-                for k in set(dct[i]).union(set(contr.keys())):
-                    contr[k] = (contr.get(k, 0)*j + dct[i].get(k,0) ) / (j+1)
-
-            total_contributions.append(contr)    
-            
-        for i, item in enumerate(contribution):
-            total_contributions[i]
-            sm = sum([v for v in contribution[i].values()])
-                
-
-        
-        return (np.mean(predictions, axis=0), np.mean(biases, axis=0),
-            total_contributions)
-    else:
-        mean_pred = None
-        mean_bias = None
-        mean_contribution = None
-
-        for i, tree in enumerate(model.estimators_):
-            pred, bias, contribution = _predict_tree(tree, X)
-
-            if i < 1: # first iteration
-                mean_bias = bias
-                mean_contribution = contribution
-                mean_pred = pred
-            else:
-                mean_bias = _iterative_mean(i, mean_bias, bias)
-                mean_contribution = _iterative_mean(i, mean_contribution, contribution)
-                mean_pred = _iterative_mean(i, mean_pred, pred)
-
-        return mean_pred, mean_bias, mean_contribution
-
-
-def predict(model, X, joint_contribution=False):
-    """ Returns a triple (prediction, bias, feature_contributions), such
-    that prediction ≈ bias + feature_contributions.
-    Parameters
-    ----------
-    model : DecisionTreeRegressor, DecisionTreeClassifier,
-        ExtraTreeRegressor, ExtraTreeClassifier,
-        RandomForestRegressor, RandomForestClassifier,
-        ExtraTreesRegressor, ExtraTreesClassifier
-    Scikit-learn model on which the prediction should be decomposed.
-
-    X : array-like, shape = (n_samples, n_features)
-    Test samples.
-    
-    joint_contribution : boolean
-    Specifies if contributions are given individually from each feature,
-    or jointly over them
-
-    Returns
-    -------
-    decomposed prediction : triple of
-    * prediction, shape = (n_samples) for regression and (n_samples, n_classes)
-        for classification
-    * bias, shape = (n_samples) for regression and (n_samples, n_classes) for
-        classification
-    * contributions, If joint_contribution is False then returns and  array of 
-        shape = (n_samples, n_features) for regression or
-        shape = (n_samples, n_features, n_classes) for classification, denoting
-        contribution from each feature.
-        If joint_contribution is True, then shape is array of size n_samples,
-        where each array element is a dict from a tuple of feature indices to
-        to a value denoting the contribution from that feature tuple.
-    """
-    # Only single out response variable supported,
-    if model.n_outputs_ > 1:
-        raise ValueError("Multilabel classification trees not supported")
-
-    if (isinstance(model, DecisionTreeClassifier) or
-        isinstance(model, DecisionTreeRegressor)):
-        return _predict_tree(model, X, joint_contribution=joint_contribution)
-    elif (isinstance(model, ForestClassifier) or
-          isinstance(model, ForestRegressor)):
-        return _predict_forest(model, X, joint_contribution=joint_contribution)
-    else:
-        raise ValueError("Wrong model type. Base learner needs to be a "
-                         "DecisionTreeClassifier or DecisionTreeRegressor.")
+# -*- coding: utf-8 -*-
+import numpy as np
+import sklearn
+from functools import partial
+
+from sklearn.ensemble.forest import ForestClassifier, ForestRegressor
+from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, _tree
+from distutils.version import LooseVersion
+import multiprocessing
+
+if LooseVersion(sklearn.__version__) < LooseVersion("0.17"):
+    raise Exception("treeinterpreter requires scikit-learn 0.17 or later")
+
+
+def _get_tree_paths(tree, node_id, depth=0):
+    """Returns all paths from node_id down to the leaves as lists of node_ids,
+    each ordered leaf first and node_id last. depth only feeds the recursion.
+    """
+    if node_id == _tree.TREE_LEAF:
+        raise ValueError("Invalid node_id %s" % _tree.TREE_LEAF)
+
+    left_child = tree.children_left[node_id]
+    right_child = tree.children_right[node_id]
+
+    if left_child != _tree.TREE_LEAF:  # internal node: collect both subtrees
+        left_paths = _get_tree_paths(tree, left_child, depth=depth + 1)
+        right_paths = _get_tree_paths(tree, right_child, depth=depth + 1)
+
+        for path in left_paths:
+            path.append(node_id)  # this node goes after its descendants
+        for path in right_paths:
+            path.append(node_id)
+        paths = left_paths + right_paths
+    else:
+        paths = [[node_id]]  # leaf: a single path holding only this node
+    return paths
+
+
+def _predict_tree(model, X, joint_contribution=False):
+    """
+    For a given DecisionTreeRegressor, DecisionTreeClassifier,
+    ExtraTreeRegressor, or ExtraTreeClassifier,
+    returns a triple of [prediction, bias and feature_contributions], such
+    that prediction ≈ bias + feature_contributions.
+
+    The bias is the root node's value repeated for every row of X.
+    With joint_contribution=True the contributions are a list holding one
+    dict per row of X, keyed by the sorted tuple of features used so far on
+    the decision path; otherwise they are an array with one slot per feature.
+    """
+    leaves = model.apply(X)
+    paths = _get_tree_paths(model.tree_, 0)
+
+    # _get_tree_paths yields leaf-first paths; flip them to root-first and
+    # index them by their leaf so each sample's path is a dict lookup.
+    leaf_to_path = {}
+    for path in paths:
+        path.reverse()
+        leaf_to_path[path[-1]] = path
+
+    # remove the single-dimensional inner arrays
+    values = model.tree_.value.squeeze(axis=1)
+    # reshape if squeezed into a single float
+    if len(values.shape) == 0:
+        values = np.array([values])
+    if isinstance(model, DecisionTreeRegressor):
+        biases = np.full(X.shape[0], values[paths[0][0]])
+        line_shape = X.shape[1]
+    elif isinstance(model, DecisionTreeClassifier):
+        # scikit stores category counts, we turn them into probabilities
+        normalizer = values.sum(axis=1)[:, np.newaxis]
+        normalizer[normalizer == 0.0] = 1.0
+        # Divide out of place: squeeze can return a view, so an in-place
+        # division could write through to the values stored in the model.
+        values = values / normalizer
+
+        biases = np.tile(values[paths[0][0]], (X.shape[0], 1))
+        line_shape = (X.shape[1], model.n_classes_)
+    direct_prediction = values[leaves]
+
+    # make into python list, accessing values will be faster
+    values_list = list(values)
+    feature_index = list(model.tree_.feature)
+
+    if joint_contribution:
+        contributions = []
+        for leaf in leaves:
+            path = leaf_to_path[leaf]
+            path_features = set()
+            row_contributions = {}
+            # Credit each step's change in node value to the set of features
+            # split on so far along the path.
+            for i in range(len(path) - 1):
+                path_features.add(feature_index[path[i]])
+                contrib = values_list[path[i + 1]] - values_list[path[i]]
+                key = tuple(sorted(path_features))
+                row_contributions[key] = row_contributions.get(key, 0) + contrib
+            contributions.append(row_contributions)
+        return direct_prediction, biases, contributions
+
+    # Samples that land in the same leaf share a path, so the per-feature
+    # contributions are computed once per distinct leaf and then reused.
+    unique_contributions = {}
+    for leaf in np.unique(leaves):
+        path = leaf_to_path[leaf]
+        contribs = np.zeros(line_shape)
+        for i in range(len(path) - 1):
+            # Credit the change in node value to the feature split on here.
+            contrib = values_list[path[i + 1]] - values_list[path[i]]
+            contribs[feature_index[path[i]]] += contrib
+        unique_contributions[leaf] = contribs
+
+    contributions = [unique_contributions[leaf] for leaf in leaves]
+    return direct_prediction, biases, np.array(contributions)
+
+
+def _predict_forest(model, X, n_cores=1, joint_contribution=False):
+    """
+    For a given RandomForestRegressor, RandomForestClassifier,
+    ExtraTreesRegressor, or ExtraTreesClassifier returns a triple of
+    [prediction, bias and feature_contributions], such that prediction ≈ bias +
+    feature_contributions.
+
+    Every tree in model.estimators_ is decomposed with _predict_tree and the
+    per-tree results are averaged.
+
+    Parameters
+    ----------
+    model : fitted forest
+    Ensemble whose estimators_ are the trees to decompose.
+
+    X : array-like, shape = (n_samples, n_features)
+    Test samples.
+
+    n_cores : int
+    When greater than 1 the trees are mapped over a multiprocessing.Pool
+    with n_cores workers, otherwise they are processed sequentially.
+
+    joint_contribution : boolean
+    Forwarded to _predict_tree; when True the averaged contributions are
+    returned as one dict per row of X instead of an array.
+    """
+    tree_predictor = partial(_predict_tree, X=X,
+                             joint_contribution=joint_contribution)
+
+    if n_cores > 1:
+        pool = multiprocessing.Pool(n_cores)
+        try:
+            per_tree = pool.map(tree_predictor, model.estimators_)
+        finally:
+            # Always shut the workers down, even if a tree fails, so that
+            # repeated calls do not accumulate idle processes.
+            pool.close()
+            pool.join()
+    else:
+        per_tree = [tree_predictor(tree) for tree in model.estimators_]
+
+    predictions, biases, contributions = zip(*per_tree)
+
+    if not joint_contribution:
+        return (np.mean(predictions, axis=0), np.mean(biases, axis=0),
+                np.mean(contributions, axis=0))
+
+    total_contributions = []
+    for i in range(len(X)):
+        contr = {}
+        # Running mean over trees; a key missing from a tree counts as 0.
+        for j, dct in enumerate(contributions):
+            for k in set(dct[i]).union(set(contr.keys())):
+                contr[k] = (contr.get(k, 0) * j + dct[i].get(k, 0)) / (j + 1)
+        total_contributions.append(contr)
+
+    return (np.mean(predictions, axis=0), np.mean(biases, axis=0),
+            total_contributions)
+
+
+def predict(model, X, n_cores=1, joint_contribution=False):
+    """ Returns a triple (prediction, bias, feature_contributions), such
+    that prediction ≈ bias + feature_contributions.
+    Parameters
+    ----------
+    model : DecisionTreeRegressor, DecisionTreeClassifier,
+        ExtraTreeRegressor, ExtraTreeClassifier,
+        RandomForestRegressor, RandomForestClassifier,
+        ExtraTreesRegressor, ExtraTreesClassifier
+    Scikit-learn model on which the prediction should be decomposed.
+
+    X : array-like, shape = (n_samples, n_features)
+    Test samples.
+
+    n_cores : int, default 1 (precedes joint_contribution, so use keywords)
+    Passed on to the forest decomposition only; single trees ignore it.
+
+    joint_contribution : boolean
+    Specifies if contributions are given individually from each feature,
+    or jointly over them
+
+    Returns
+    -------
+    decomposed prediction : triple of
+    * prediction, shape = (n_samples) for regression and (n_samples, n_classes)
+        for classification
+    * bias, same shape as prediction
+    * contributions, if joint_contribution is False an array of per-feature
+        contributions with shape = (n_samples, n_features) for regression or
+        (n_samples, n_features, n_classes) for classification.
+        If joint_contribution is True, a list of n_samples dicts, each
+        mapping a tuple of feature indices to that tuple's contribution.
+    """
+    # Only a single output (response variable) is supported.
+    if model.n_outputs_ > 1:
+        raise ValueError("Multilabel classification trees not supported")
+
+    if (isinstance(model, DecisionTreeClassifier) or
+            isinstance(model, DecisionTreeRegressor)):
+        return _predict_tree(model, X, joint_contribution=joint_contribution)
+    elif (isinstance(model, ForestClassifier) or
+          isinstance(model, ForestRegressor)):
+        return _predict_forest(model, X, n_cores=n_cores, joint_contribution=joint_contribution)
+    else:
+        raise ValueError("Wrong model type. Base learner needs to be a "
+                         "DecisionTreeClassifier or DecisionTreeRegressor.")