diff --git a/lineartree/_classes.py b/lineartree/_classes.py index 8b11d09..8d25f2a 100644 --- a/lineartree/_classes.py +++ b/lineartree/_classes.py @@ -1,52 +1,56 @@ import numbers +from copy import deepcopy +from inspect import signature + import numpy as np import scipy.sparse as sp - -from copy import deepcopy +import sklearn from joblib import Parallel, effective_n_jobs # , delayed - +from sklearn.base import BaseEstimator, TransformerMixin, is_regressor from sklearn.dummy import DummyClassifier -from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestRegressor +from sklearn.tree import DecisionTreeRegressor +from sklearn.utils.validation import check_is_fitted, has_fit_parameter, validate_data -from sklearn.base import is_regressor -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils.validation import has_fit_parameter, check_is_fitted +from ._criterion import SCORING, crossentropy, hamming, mae, mse, poisson, rmse -from ._criterion import SCORING -from ._criterion import mse, rmse, mae, poisson -from ._criterion import hamming, crossentropy +_sklearn_v1 = eval(sklearn.__version__.split(".")[0]) > 0 -import sklearn -_sklearn_v1 = eval(sklearn.__version__.split('.')[0]) > 0 +CRITERIA = { + "mse": mse, + "rmse": rmse, + "mae": mae, + "poisson": poisson, + "hamming": hamming, + "crossentropy": crossentropy, +} -CRITERIA = {"mse": mse, - "rmse": rmse, - "mae": mae, - "poisson": poisson, - "hamming": hamming, - "crossentropy": crossentropy} +import functools +from functools import update_wrapper ######################################################################### ### remove when https://github.com/joblib/joblib/issues/1071 is fixed ### ######################################################################### -from sklearn import get_config, config_context -from functools import update_wrapper -import functools +from sklearn import config_context, get_config + # from sklearn.utils.fixes def delayed(function): """Decorator used to capture the arguments of a function.""" + @functools.wraps(function) def delayed_function(*args, **kwargs): return _FuncWrapper(function), args, kwargs + return delayed_function + # from sklearn.utils.fixes class _FuncWrapper: - """"Load the global configuration before calling the function.""" + """ "Load the global configuration before calling the function.""" + def __init__(self, function): self.function = function self.config = get_config() @@ -55,6 +59,8 @@ def __init__(self, function): def __call__(self, *args, **kwargs): with config_context(**self.config): return self.function(*args, **kwargs) + + ######################################################################### ######################################################################### ######################################################################### @@ -68,7 +74,7 @@ def _partition_columns(columns, n_jobs): # Partition columns between jobs n_columns_per_job = np.full(n_jobs, n_columns // n_jobs, dtype=int) - n_columns_per_job[:n_columns % n_jobs] += 1 + n_columns_per_job[: n_columns % n_jobs] += 1 columns_per_job = np.cumsum(n_columns_per_job) columns_per_job = np.split(columns, columns_per_job) columns_per_job = columns_per_job[:-1] @@ -76,9 +82,9 @@ def _partition_columns(columns, n_jobs): return n_jobs, columns_per_job -def _parallel_binning_fit(split_feat, _self, X, y, - weights, support_sample_weight, - bins, loss): +def _parallel_binning_fit( + split_feat, _self, X, y, weights, support_sample_weight, bins, loss +): """Private function to find the best column splittings within a job.""" n_sample, n_feat = X.shape feval = CRITERIA[_self.criterion] @@ -87,18 +93,16 @@ def _parallel_binning_fit(split_feat, _self, X, y, split_col = None left_node = (None, None, None, None) right_node = (None, None, None, None) - largs_left = {'classes': None} - largs_right = {'classes': None} + largs_left = {"classes": None} + largs_right = {"classes": None} if n_sample < _self._min_samples_split: return loss, split_t, split_col, left_node, right_node for col, _bin in zip(split_feat, bins): - for q in _bin: - # create 1D bool mask for right/left children - mask = (X[:, col] > q) + mask = X[:, col] > q n_left, n_right = (~mask).sum(), mask.sum() @@ -112,44 +116,50 @@ def _parallel_binning_fit(split_feat, _self, X, y, model_left = deepcopy(_self.base_estimator) model_right = deepcopy(_self.base_estimator) - if hasattr(_self, 'classes_'): - largs_left['classes'] = np.unique(y[~mask]) - largs_right['classes'] = np.unique(y[mask]) - if len(largs_left['classes']) == 1: + if hasattr(_self, "classes_"): + largs_left["classes"] = np.unique(y[~mask]) + largs_right["classes"] = np.unique(y[mask]) + if len(largs_left["classes"]) == 1: model_left = DummyClassifier(strategy="most_frequent") - if len(largs_right['classes']) == 1: + if len(largs_right["classes"]) == 1: model_right = DummyClassifier(strategy="most_frequent") if weights is None: model_left.fit(X[left_mesh], y[~mask]) - loss_left = feval(model_left, X[left_mesh], y[~mask], - **largs_left) + loss_left = feval(model_left, X[left_mesh], y[~mask], **largs_left) wloss_left = loss_left * (n_left / n_sample) model_right.fit(X[right_mesh], y[mask]) - loss_right = feval(model_right, X[right_mesh], y[mask], - **largs_right) + loss_right = feval(model_right, X[right_mesh], y[mask], **largs_right) wloss_right = loss_right * (n_right / n_sample) else: if support_sample_weight: - model_left.fit(X[left_mesh], y[~mask], - sample_weight=weights[~mask]) + model_left.fit(X[left_mesh], y[~mask], sample_weight=weights[~mask]) - model_right.fit(X[right_mesh], y[mask], - sample_weight=weights[mask]) + model_right.fit(X[right_mesh], y[mask], sample_weight=weights[mask]) else: model_left.fit(X[left_mesh], y[~mask]) model_right.fit(X[right_mesh], y[mask]) - loss_left = feval(model_left, X[left_mesh], y[~mask], - weights=weights[~mask], **largs_left) + loss_left = feval( + model_left, + X[left_mesh], + y[~mask], + weights=weights[~mask], + **largs_left + ) wloss_left = loss_left * (weights[~mask].sum() / weights.sum()) - loss_right = feval(model_right, X[right_mesh], y[mask], - weights=weights[mask], **largs_right) + loss_right = feval( + model_right, + X[right_mesh], + y[mask], + weights=weights[mask], + **largs_right + ) wloss_right = loss_right * (weights[mask].sum() / weights.sum()) total_loss = round(wloss_left + wloss_right, 5) @@ -159,20 +169,30 @@ def _parallel_binning_fit(split_feat, _self, X, y, split_t = q split_col = col loss = total_loss - left_node = (model_left, loss_left, wloss_left, - n_left, largs_left['classes']) - right_node = (model_right, loss_right, wloss_right, - n_right, largs_right['classes']) + left_node = ( + model_left, + loss_left, + wloss_left, + n_left, + largs_left["classes"], + ) + right_node = ( + model_right, + loss_right, + wloss_right, + n_right, + largs_right["classes"], + ) return loss, split_t, split_col, left_node, right_node def _map_node(X, feat, direction, split): """Utility to map samples to nodes""" - if direction == 'L': - mask = (X[:, feat] <= split) + if direction == "L": + mask = X[:, feat] <= split else: - mask = (X[:, feat] > split) + mask = X[:, feat] > split return mask @@ -190,11 +210,18 @@ def _predict_branch(X, branch_history, mask=None): class Node: - - def __init__(self, id=None, threshold=[], - parent=None, children=None, - n_samples=None, w_loss=None, - loss=None, model=None, classes=None): + def __init__( + self, + id=None, + threshold=[], + parent=None, + children=None, + n_samples=None, + w_loss=None, + loss=None, + model=None, + classes=None, + ): self.id = id self.threshold = threshold self.parent = parent @@ -212,11 +239,22 @@ class _LinearTree(BaseEstimator): Warning: This class should not be used directly. Use derived classes instead. """ - def __init__(self, base_estimator, *, criterion, max_depth, - min_samples_split, min_samples_leaf, max_bins, - min_impurity_decrease, categorical_features, - split_features, linear_features, n_jobs): + def __init__( + self, + base_estimator, + *, + criterion, + max_depth, + min_samples_split, + min_samples_leaf, + max_bins, + min_impurity_decrease, + categorical_features, + split_features, + linear_features, + n_jobs + ): self.base_estimator = base_estimator self.criterion = criterion self.max_depth = max_depth @@ -232,10 +270,7 @@ def __init__(self, base_estimator, *, criterion, max_depth, def _parallel_args(self): return {} - def _split(self, X, y, bins, - support_sample_weight, - weights=None, - loss=None): + def _split(self, X, y, bins, support_sample_weight, weights=None, loss=None): """Evaluate optimal splits in a given node (in a specific partition of X and y). @@ -273,16 +308,19 @@ def _split(self, X, y, bins, n_jobs, split_feat = _partition_columns(self._split_features, self.n_jobs) # partition columns splittings between jobs - all_results = Parallel(n_jobs=n_jobs, verbose=0, - **self._parallel_args())( + all_results = Parallel(n_jobs=n_jobs, verbose=0, **self._parallel_args())( delayed(_parallel_binning_fit)( feat, - self, X, y, - weights, support_sample_weight, + self, + X, + y, + weights, + support_sample_weight, [bins[i] for i in feat], - loss + loss, ) - for feat in split_feat) + for feat in split_feat + ) # extract results from parallel loops _losses, split_t, split_col = [], [], [] @@ -337,40 +375,37 @@ def _grow(self, X, y, weights=None): bins = np.linspace(0, 1, self.max_bins)[1:-1] bins = np.quantile(X, bins, axis=0) bins = list(bins.T) - bins = [np.unique(X[:, c]) if c in self._categorical_features - else np.unique(q) for c, q in enumerate(bins)] + bins = [ + np.unique(X[:, c]) if c in self._categorical_features else np.unique(q) + for c, q in enumerate(bins) + ] # check if base_estimator supports fitting with sample_weights - support_sample_weight = has_fit_parameter(self.base_estimator, - "sample_weight") + support_sample_weight = has_fit_parameter(self.base_estimator, "sample_weight") - queue = [''] # queue of the nodes to evaluate for splitting + queue = [""] # queue of the nodes to evaluate for splitting # store the results of each node in dicts self._nodes = {} self._leaves = {} # initialize first fit - largs = {'classes': None} + largs = {"classes": None} model = deepcopy(self.base_estimator) if weights is None or not support_sample_weight: model.fit(X[:, self._linear_features], y) else: model.fit(X[:, self._linear_features], y, sample_weight=weights) - if hasattr(self, 'classes_'): - largs['classes'] = self.classes_ + if hasattr(self, "classes_"): + largs["classes"] = self.classes_ loss = CRITERIA[self.criterion]( - model, X[:, self._linear_features], y, - weights=weights, **largs) + model, X[:, self._linear_features], y, weights=weights, **largs + ) loss = round(loss, 5) - self._nodes[''] = Node( - id=0, - n_samples=n_sample, - model=model, - loss=loss, - classes=largs['classes'] + self._nodes[""] = Node( + id=0, n_samples=n_sample, model=model, loss=loss, classes=largs["classes"] ) # in the beginning consider all the samples @@ -379,17 +414,19 @@ def _grow(self, X, y, weights=None): i = 1 while len(queue) > 0: - if weights is None: split_t, split_col, left_node, right_node = self._split( - X[mask], y[mask], bins, - support_sample_weight, - loss=loss) + X[mask], y[mask], bins, support_sample_weight, loss=loss + ) else: split_t, split_col, left_node, right_node = self._split( - X[mask], y[mask], bins, - support_sample_weight, weights[mask], - loss=loss) + X[mask], + y[mask], + bins, + support_sample_weight, + weights[mask], + loss=loss, + ) # no utility in splitting if split_col is None or len(queue[-1]) >= self.max_depth: @@ -397,50 +434,48 @@ def _grow(self, X, y, weights=None): del self._nodes[queue[-1]] queue.pop() else: - model_left, loss_left, wloss_left, n_left, class_left = \ - left_node - model_right, loss_right, wloss_right, n_right, class_right = \ - right_node - self.feature_importances_[split_col] += \ - loss - wloss_left - wloss_right - - self._nodes[queue[-1] + 'L'] = Node( - id=i, parent=queue[-1], + model_left, loss_left, wloss_left, n_left, class_left = left_node + model_right, loss_right, wloss_right, n_right, class_right = right_node + self.feature_importances_[split_col] += loss - wloss_left - wloss_right + + self._nodes[queue[-1] + "L"] = Node( + id=i, + parent=queue[-1], model=model_left, loss=loss_left, w_loss=wloss_left, n_samples=n_left, - threshold=self._nodes[queue[-1]].threshold[:] + [ - (split_col, 'L', split_t) - ] + threshold=self._nodes[queue[-1]].threshold[:] + + [(split_col, "L", split_t)], ) - self._nodes[queue[-1] + 'R'] = Node( - id=i + 1, parent=queue[-1], + self._nodes[queue[-1] + "R"] = Node( + id=i + 1, + parent=queue[-1], model=model_right, loss=loss_right, w_loss=wloss_right, n_samples=n_right, - threshold=self._nodes[queue[-1]].threshold[:] + [ - (split_col, 'R', split_t) - ] + threshold=self._nodes[queue[-1]].threshold[:] + + [(split_col, "R", split_t)], ) - if hasattr(self, 'classes_'): - self._nodes[queue[-1] + 'L'].classes = class_left - self._nodes[queue[-1] + 'R'].classes = class_right + if hasattr(self, "classes_"): + self._nodes[queue[-1] + "L"].classes = class_left + self._nodes[queue[-1] + "R"].classes = class_right - self._nodes[queue[-1]].children = (queue[-1] + 'L', queue[-1] + 'R') + self._nodes[queue[-1]].children = (queue[-1] + "L", queue[-1] + "R") i += 2 q = queue[-1] queue.pop() - queue.extend([q + 'R', q + 'L']) + queue.extend([q + "R", q + "L"]) if len(queue) > 0: loss = self._nodes[queue[-1]].loss mask = _predict_branch( - X, self._nodes[queue[-1]].threshold, start.copy()) + X, self._nodes[queue[-1]].threshold, start.copy() + ) self.node_count = i @@ -470,20 +505,25 @@ def _fit(self, X, y, sample_weight=None): self : object """ n_sample, n_feat = X.shape + self.base_estimator = self.estimator if isinstance(self.min_samples_split, numbers.Integral): if self.min_samples_split < 6: raise ValueError( "min_samples_split must be an integer greater than 5 or " "a float in (0.0, 1.0); got the integer {}".format( - self.min_samples_split)) + self.min_samples_split + ) + ) self._min_samples_split = self.min_samples_split else: - if not 0. < self.min_samples_split < 1.: + if not 0.0 < self.min_samples_split < 1.0: raise ValueError( "min_samples_split must be an integer greater than 5 or " "a float in (0.0, 1.0); got the float {}".format( - self.min_samples_split)) + self.min_samples_split + ) + ) self._min_samples_split = int(np.ceil(self.min_samples_split * n_sample)) self._min_samples_split = max(6, self._min_samples_split) @@ -493,14 +533,18 @@ def _fit(self, X, y, sample_weight=None): raise ValueError( "min_samples_leaf must be an integer greater than 2 or " "a float in (0.0, 1.0); got the integer {}".format( - self.min_samples_leaf)) + self.min_samples_leaf + ) + ) self._min_samples_leaf = self.min_samples_leaf else: - if not 0. < self.min_samples_leaf < 1.: + if not 0.0 < self.min_samples_leaf < 1.0: raise ValueError( "min_samples_leaf must be an integer greater than 2 or " "a float in (0.0, 1.0); got the float {}".format( - self.min_samples_leaf)) + self.min_samples_leaf + ) + ) self._min_samples_leaf = int(np.ceil(self.min_samples_leaf * n_sample)) self._min_samples_leaf = max(3, self._min_samples_leaf) @@ -511,10 +555,11 @@ def _fit(self, X, y, sample_weight=None): if not 10 <= self.max_bins <= 120: raise ValueError("max_bins must be an integer in [10, 120].") - if not hasattr(self.base_estimator, 'fit_intercept'): + if not hasattr(self.base_estimator, "fit_intercept"): raise ValueError( "Only linear models are accepted as base_estimator. " - "Select one from linear_model class of scikit-learn.") + "Select one from linear_model class of scikit-learn." + ) if self.categorical_features is not None: cat_features = np.unique(self.categorical_features) @@ -522,17 +567,19 @@ def _fit(self, X, y, sample_weight=None): if not issubclass(cat_features.dtype.type, numbers.Integral): raise ValueError( "No valid specification of categorical columns. " - "Only a scalar, list or array-like of integers is allowed.") + "Only a scalar, list or array-like of integers is allowed." + ) if (cat_features < 0).any() or (cat_features >= n_feat).any(): raise ValueError( - 'Categorical features must be in [0, {}].'.format( - n_feat - 1)) + "Categorical features must be in [0, {}].".format(n_feat - 1) + ) if len(cat_features) == n_feat: raise ValueError( "Only categorical features detected. " - "No features available for fitting.") + "No features available for fitting." + ) else: cat_features = [] self._categorical_features = cat_features @@ -543,12 +590,13 @@ def _fit(self, X, y, sample_weight=None): if not issubclass(split_features.dtype.type, numbers.Integral): raise ValueError( "No valid specification of split_features. " - "Only a scalar, list or array-like of integers is allowed.") + "Only a scalar, list or array-like of integers is allowed." + ) if (split_features < 0).any() or (split_features >= n_feat).any(): raise ValueError( - 'Splitting features must be in [0, {}].'.format( - n_feat - 1)) + "Splitting features must be in [0, {}].".format(n_feat - 1) + ) else: split_features = np.arange(n_feat) self._split_features = split_features @@ -559,16 +607,16 @@ def _fit(self, X, y, sample_weight=None): if not issubclass(linear_features.dtype.type, numbers.Integral): raise ValueError( "No valid specification of linear_features. " - "Only a scalar, list or array-like of integers is allowed.") + "Only a scalar, list or array-like of integers is allowed." + ) if (linear_features < 0).any() or (linear_features >= n_feat).any(): raise ValueError( - 'Linear features must be in [0, {}].'.format( - n_feat - 1)) + "Linear features must be in [0, {}].".format(n_feat - 1) + ) if np.isin(linear_features, cat_features).any(): - raise ValueError( - "Linear features cannot be categorical features.") + raise ValueError("Linear features cannot be categorical features.") else: linear_features = np.setdiff1d(np.arange(n_feat), cat_features) self._linear_features = linear_features @@ -619,60 +667,55 @@ def summary(self, feature_names=None, only_leaves=False, max_depth=None): (^): Only for split nodes. (^^): Only for leaf nodes. """ - check_is_fitted(self, attributes='_nodes') + check_is_fitted(self, attributes="_nodes") if max_depth is None: max_depth = 20 if max_depth < 1: - raise ValueError( - "max_depth must be > 0, got {}".format(max_depth)) + raise ValueError("max_depth must be > 0, got {}".format(max_depth)) summary = {} if len(self._nodes) > 0 and not only_leaves: - - if (feature_names is not None and - len(feature_names) != self.n_features_in_): + if feature_names is not None and len(feature_names) != self.n_features_in_: raise ValueError( "feature_names must contain {} elements, got {}".format( - self.n_features_in_, len(feature_names))) + self.n_features_in_, len(feature_names) + ) + ) if feature_names is None: feature_names = np.arange(self.n_features_in_) for n, N in self._nodes.items(): - if len(n) >= max_depth: continue cl, cr = N.children - Cl = (self._nodes[cl] if cl in self._nodes - else self._leaves[cl]) - Cr = (self._nodes[cr] if cr in self._nodes - else self._leaves[cr]) + Cl = self._nodes[cl] if cl in self._nodes else self._leaves[cl] + Cr = self._nodes[cr] if cr in self._nodes else self._leaves[cr] summary[N.id] = { - 'col': feature_names[Cl.threshold[-1][0]], - 'th': round(Cl.threshold[-1][-1], 5), - 'loss': round(Cl.w_loss + Cr.w_loss, 5), - 'samples': Cl.n_samples + Cr.n_samples, - 'children': (Cl.id, Cr.id), - 'models': (Cl.model, Cr.model) + "col": feature_names[Cl.threshold[-1][0]], + "th": round(Cl.threshold[-1][-1], 5), + "loss": round(Cl.w_loss + Cr.w_loss, 5), + "samples": Cl.n_samples + Cr.n_samples, + "children": (Cl.id, Cr.id), + "models": (Cl.model, Cr.model), } for l, L in self._leaves.items(): - if len(l) > max_depth: continue summary[L.id] = { - 'loss': round(L.loss, 5), - 'samples': L.n_samples, - 'models': L.model + "loss": round(L.loss, 5), + "samples": L.n_samples, + "models": L.model, } - if hasattr(self, 'classes_'): - summary[L.id]['classes'] = L.classes + if hasattr(self, "classes_"): + summary[L.id]["classes"] = L.classes return summary @@ -692,23 +735,23 @@ def apply(self, X): ``[0; n_nodes)``, possibly with gaps in the numbering. """ - check_is_fitted(self, attributes='_nodes') + check_is_fitted(self, attributes="_nodes") X = self._validate_data( + self.base_estimator, X, reset=False, accept_sparse=False, - dtype='float32', - force_all_finite=True, + dtype="float32", + ensure_all_finite=True, ensure_2d=True, allow_nd=False, - ensure_min_features=self.n_features_in_ + ensure_min_features=self.n_features_in_, ) - X_leaves = np.zeros(X.shape[0], dtype='int64') + X_leaves = np.zeros(X.shape[0], dtype="int64") for L in self._leaves.values(): - mask = _predict_branch(X, L.threshold) if (~mask).all(): continue @@ -731,23 +774,23 @@ def decision_path(self, X): Return a node indicator CSR matrix where non zero elements indicates that the samples goes through the nodes. """ - check_is_fitted(self, attributes='_nodes') + check_is_fitted(self, attributes="_nodes") X = self._validate_data( + self.base_estimator, X, reset=False, accept_sparse=False, - dtype='float32', - force_all_finite=True, + dtype="float32", + ensure_all_finite=True, ensure_2d=True, allow_nd=False, - ensure_min_features=self.n_features_in_ + ensure_min_features=self.n_features_in_, ) - indicator = np.zeros((X.shape[0], self.node_count), dtype='int64') + indicator = np.zeros((X.shape[0], self.node_count), dtype="int64") for L in self._leaves.values(): - mask = _predict_branch(X, L.threshold) if (~mask).all(): continue @@ -789,36 +832,39 @@ def model_to_dot(self, feature_names=None, max_depth=None): import pydot summary = self.summary(feature_names=feature_names, max_depth=max_depth) - graph = pydot.Dot('linear_tree', graph_type='graph') + graph = pydot.Dot("linear_tree", graph_type="graph") # create nodes for n in summary: - if 'col' in summary[n]: - if isinstance(summary[n]['col'], str): + if "col" in summary[n]: + if isinstance(summary[n]["col"], str): msg = "id_node: {}\n{} <= {}\nloss: {:.4f}\nsamples: {}" else: msg = "id_node: {}\nX[{}] <= {}\nloss: {:.4f}\nsamples: {}" msg = msg.format( - n, summary[n]['col'], summary[n]['th'], - summary[n]['loss'], summary[n]['samples'] + n, + summary[n]["col"], + summary[n]["th"], + summary[n]["loss"], + summary[n]["samples"], ) - graph.add_node(pydot.Node(n, label=msg, shape='rectangle')) + graph.add_node(pydot.Node(n, label=msg, shape="rectangle")) - for c in summary[n]['children']: + for c in summary[n]["children"]: if c not in summary: - graph.add_node(pydot.Node(c, label="...", - shape='rectangle')) + graph.add_node(pydot.Node(c, label="...", shape="rectangle")) else: msg = "id_node: {}\nloss: {:.4f}\nsamples: {}".format( - n, summary[n]['loss'], summary[n]['samples']) + n, summary[n]["loss"], summary[n]["samples"] + ) graph.add_node(pydot.Node(n, label=msg)) # add edges for n in summary: - if 'children' in summary[n]: - for c in summary[n]['children']: + if "children" in summary[n]: + for c in summary[n]["children"]: graph.add_edge(pydot.Edge(n, c)) return graph @@ -858,12 +904,23 @@ class _LinearBoosting(TransformerMixin, BaseEstimator): Warning: This class should not be used directly. Use derived classes instead. """ - def __init__(self, base_estimator, *, loss, n_estimators, - max_depth, min_samples_split, min_samples_leaf, - min_weight_fraction_leaf, max_features, - random_state, max_leaf_nodes, - min_impurity_decrease, ccp_alpha): + def __init__( + self, + base_estimator, + *, + loss, + n_estimators, + max_depth, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + max_features, + random_state, + max_leaf_nodes, + min_impurity_decrease, + ccp_alpha + ): self.base_estimator = base_estimator self.loss = loss self.n_estimators = n_estimators @@ -897,13 +954,18 @@ def _fit(self, X, y, sample_weight=None): ------- self : object """ - if not hasattr(self.base_estimator, 'fit_intercept'): - raise ValueError("Only linear models are accepted as base_estimator. " - "Select one from linear_model class of scikit-learn.") + self.base_estimator = self.estimator + if not hasattr(self.base_estimator, "fit_intercept"): + raise ValueError( + "Only linear models are accepted as base_estimator. " + "Select one from linear_model class of scikit-learn." + ) if self.n_estimators <= 0: - raise ValueError("n_estimators must be an integer greater than 0 but " - "got {}".format(self.n_estimators)) + raise ValueError( + "n_estimators must be an integer greater than 0 but " + "got {}".format(self.n_estimators) + ) n_sample, self.n_features_in_ = X.shape @@ -911,16 +973,22 @@ def _fit(self, X, y, sample_weight=None): self._leaves = [] for i in range(self.n_estimators): - estimator = deepcopy(self.base_estimator) - estimator.fit(X, y, sample_weight=sample_weight) - if self.loss == 'entropy': + sig = signature(estimator.fit) + estimator_fit_params = sig.parameters + + if "sample_weight" in estimator_fit_params: + estimator.fit(X, y, sample_weight=sample_weight) + else: + estimator.fit(X, y) + + if self.loss == "entropy": pred = estimator.predict_proba(X) else: pred = estimator.predict(X) - if hasattr(self, 'classes_'): + if hasattr(self, "classes_"): resid = SCORING[self.loss](y, pred, self.classes_) else: resid = SCORING[self.loss](y, pred) @@ -928,10 +996,11 @@ def _fit(self, X, y, sample_weight=None): if resid.ndim > 1: resid = resid.mean(1) - criterion = 'squared_error' if _sklearn_v1 else 'mse' + criterion = "squared_error" if _sklearn_v1 else "mse" tree = DecisionTreeRegressor( - criterion=criterion, max_depth=self.max_depth, + criterion=criterion, + max_depth=self.max_depth, min_samples_split=self.min_samples_split, min_samples_leaf=self.min_samples_leaf, min_weight_fraction_leaf=self.min_weight_fraction_leaf, @@ -939,7 +1008,7 @@ def _fit(self, X, y, sample_weight=None): random_state=self.random_state, max_leaf_nodes=self.max_leaf_nodes, min_impurity_decrease=self.min_impurity_decrease, - ccp_alpha=self.ccp_alpha + ccp_alpha=self.ccp_alpha, ) tree.fit(X, resid, sample_weight=sample_weight, check_input=False) @@ -954,12 +1023,19 @@ def _fit(self, X, y, sample_weight=None): X = np.concatenate([X, pred_tree], axis=1) self.base_estimator_ = deepcopy(self.base_estimator) - self.base_estimator_.fit(X, y, sample_weight=sample_weight) - if hasattr(self.base_estimator_, 'coef_'): + sig = signature(self.base_estimator_.fit) + estimator_fit_params = sig.parameters + + if "sample_weight" in estimator_fit_params: + self.base_estimator_.fit(X, y, sample_weight=sample_weight) + else: + self.base_estimator_.fit(X, y) + + if hasattr(self.base_estimator_, "coef_"): self.coef_ = self.base_estimator_.coef_ - if hasattr(self.base_estimator_, 'intercept_'): + if hasattr(self.base_estimator_, "intercept_"): self.intercept_ = self.base_estimator_.intercept_ self.n_features_out_ = X.shape[1] @@ -981,17 +1057,18 @@ def transform(self, X): Transformed dataset. `n_out` is equal to `n_features` + `n_estimators` """ - check_is_fitted(self, attributes='base_estimator_') + check_is_fitted(self, attributes="base_estimator_") X = self._validate_data( + self.base_estimator, X, reset=False, accept_sparse=False, - dtype='float32', - force_all_finite=True, + dtype="float32", + ensure_all_finite=True, ensure_2d=True, allow_nd=False, - ensure_min_features=self.n_features_in_ + ensure_min_features=self.n_features_in_, ) for tree, leaf in zip(self._trees, self._leaves): @@ -1009,12 +1086,26 @@ class _LinearForest(BaseEstimator): Warning: This class should not be used directly. Use derived classes instead. """ - def __init__(self, base_estimator, *, n_estimators, max_depth, - min_samples_split, min_samples_leaf, min_weight_fraction_leaf, - max_features, max_leaf_nodes, min_impurity_decrease, - bootstrap, oob_score, n_jobs, random_state, - ccp_alpha, max_samples): + def __init__( + self, + base_estimator, + *, + n_estimators, + max_depth, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + max_features, + max_leaf_nodes, + min_impurity_decrease, + bootstrap, + oob_score, + n_jobs, + random_state, + ccp_alpha, + max_samples + ): self.base_estimator = base_estimator self.n_estimators = n_estimators self.max_depth = max_depth @@ -1083,16 +1174,19 @@ def _fit(self, X, y, sample_weight=None): ------- self : object """ - if not hasattr(self.base_estimator, 'fit_intercept'): - raise ValueError("Only linear models are accepted as base_estimator. " - "Select one from linear_model class of scikit-learn.") + self.base_estimator = self.estimator + if not hasattr(self.base_estimator, "fit_intercept"): + raise ValueError( + "Only linear models are accepted as base_estimator. " + "Select one from linear_model class of scikit-learn." + ) if not is_regressor(self.base_estimator): raise ValueError("Select a regressor linear model as base_estimator.") n_sample, self.n_features_in_ = X.shape - if hasattr(self, 'classes_'): + if hasattr(self, "classes_"): class_to_int = dict(map(reversed, enumerate(self.classes_))) y = np.array([class_to_int[i] for i in y]) y = self._inv_sigmoid(y) @@ -1101,7 +1195,7 @@ def _fit(self, X, y, sample_weight=None): self.base_estimator_.fit(X, y, sample_weight) resid = y - self.base_estimator_.predict(X) - criterion = 'squared_error' if _sklearn_v1 else 'mse' + criterion = "squared_error" if _sklearn_v1 else "mse" self.forest_estimator_ = RandomForestRegressor( n_estimators=self.n_estimators, @@ -1118,14 +1212,14 @@ def _fit(self, X, y, sample_weight=None): n_jobs=self.n_jobs, random_state=self.random_state, ccp_alpha=self.ccp_alpha, - max_samples=self.max_samples + max_samples=self.max_samples, ) self.forest_estimator_.fit(X, resid, sample_weight) - if hasattr(self.base_estimator_, 'coef_'): + if hasattr(self.base_estimator_, "coef_"): self.coef_ = self.base_estimator_.coef_ - if hasattr(self.base_estimator_, 'intercept_'): + if hasattr(self.base_estimator_, "intercept_"): self.intercept_ = self.base_estimator_.intercept_ self.feature_importances_ = self.forest_estimator_.feature_importances_ @@ -1146,7 +1240,7 @@ def apply(self, X): For each datapoint x in X and for each tree in the forest, return the index of the leaf x ends up in. """ - check_is_fitted(self, attributes='base_estimator_') + check_is_fitted(self, attributes="base_estimator_") return self.forest_estimator_.apply(X) @@ -1169,6 +1263,6 @@ def decision_path(self, X): The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]] gives the indicator value for the i-th estimator. """ - check_is_fitted(self, attributes='base_estimator_') + check_is_fitted(self, attributes="base_estimator_") return self.forest_estimator_.decision_path(X) \ No newline at end of file diff --git a/lineartree/lineartree.py b/lineartree/lineartree.py index b75f10f..9f4bab4 100644 --- a/lineartree/lineartree.py +++ b/lineartree/lineartree.py @@ -1,7 +1,8 @@ +import warnings import numpy as np from sklearn.base import ClassifierMixin, RegressorMixin -from sklearn.utils.validation import check_is_fitted, _check_sample_weight +from sklearn.utils.validation import check_is_fitted, _check_sample_weight, validate_data from ._classes import _predict_branch from ._classes import _LinearTree, _LinearBoosting, _LinearForest @@ -20,7 +21,7 @@ class LinearTreeRegressor(_LinearTree, RegressorMixin): Parameters ---------- - base_estimator : object + estimator : object The base estimator to fit on dataset splits. The base estimator must be a sklearn.linear_model. @@ -96,6 +97,12 @@ class LinearTreeRegressor(_LinearTree, RegressorMixin): The number of jobs to run in parallel for model fitting. ``None`` means 1 using one processor. ``-1`` means using all processors. + + base_estimator : object, default="deprecated" + Use `estimator` instead. + .. deprecated:: 0.3.6 + `base_estimator` is deprecated and will be removed in 1.0.0 + Use `estimator` instead. Attributes ---------- @@ -116,17 +123,17 @@ class LinearTreeRegressor(_LinearTree, RegressorMixin): >>> X, y = make_regression(n_samples=100, n_features=4, ... n_informative=2, n_targets=1, ... random_state=0, shuffle=False) - >>> regr = LinearTreeRegressor(base_estimator=LinearRegression()) + >>> regr = LinearTreeRegressor(estimator=LinearRegression()) >>> regr.fit(X, y) >>> regr.predict([[0, 0, 0, 0]]) array([8.8817842e-16]) """ - def __init__(self, base_estimator, *, criterion='mse', max_depth=5, + def __init__(self, estimator=None, *, criterion='mse', max_depth=5, min_samples_split=6, min_samples_leaf=0.1, max_bins=25, min_impurity_decrease=0.0, categorical_features=None, - split_features=None, linear_features=None, n_jobs=None): + split_features=None, linear_features=None, n_jobs=None, base_estimator="deprecated"): - self.base_estimator = base_estimator + self.estimator = estimator self.criterion = criterion self.max_depth = max_depth self.min_samples_split = min_samples_split @@ -137,6 +144,7 @@ def __init__(self, base_estimator, *, criterion='mse', max_depth=5, self.split_features = split_features self.linear_features = linear_features self.n_jobs = n_jobs + self.base_estimator = base_estimator def fit(self, X, y, sample_weight=None): """Build a Linear Tree of a linear estimator from the training @@ -159,6 +167,15 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ + + if self.base_estimator != "deprecated": + warnings.warn( + "`base_estimator` was renamed to `estimator` in version 0.3.6 and " + "will be removed in 1.0", + FutureWarning, + ) + self.estimator = self.base_estimator + reg_criterions = ('mse', 'rmse', 'mae', 'poisson') if self.criterion not in reg_criterions: @@ -166,12 +183,13 @@ def fit(self, X, y, sample_weight=None): "got '{}'.".format(reg_criterions, self.criterion)) # Convert data (X is required to be 2d and indexable) - X, y = self._validate_data( + X, y = validate_data( + self.estimator, X, y, reset=True, accept_sparse=False, dtype='float32', - force_all_finite=True, + ensure_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=True, @@ -204,12 +222,13 @@ def predict(self, X): """ check_is_fitted(self, attributes='_nodes') - X = self._validate_data( + X = validate_data( + self.estimator, X, reset=False, accept_sparse=False, dtype='float32', - force_all_finite=True, + ensure_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_features=self.n_features_in_ @@ -244,7 +263,7 @@ class LinearTreeClassifier(_LinearTree, ClassifierMixin): Parameters ---------- - base_estimator : object + estimator : object The base estimator to fit on dataset splits. The base estimator must be a sklearn.linear_model. The selected base estimator is automatically substituted by a @@ -323,6 +342,12 @@ class LinearTreeClassifier(_LinearTree, ClassifierMixin): The number of jobs to run in parallel for model fitting. ``None`` means 1 using one processor. ``-1`` means using all processors. + + base_estimator : object, default="deprecated" + Use `estimator` instead. + .. deprecated:: 0.3.6 + `base_estimator` is deprecated and will be removed in 1.0.0 + Use `estimator` instead. Attributes ---------- @@ -343,17 +368,18 @@ class LinearTreeClassifier(_LinearTree, ClassifierMixin): >>> X, y = make_classification(n_samples=100, n_features=4, ... n_informative=2, n_redundant=0, ... random_state=0, shuffle=False) - >>> clf = LinearTreeClassifier(base_estimator=RidgeClassifier()) + >>> clf = LinearTreeClassifier(estimator=RidgeClassifier()) >>> clf.fit(X, y) >>> clf.predict([[0, 0, 0, 0]]) array([1]) """ - def __init__(self, base_estimator, *, criterion='hamming', max_depth=5, + def __init__(self, estimator=None, *, criterion='hamming', max_depth=5, min_samples_split=6, min_samples_leaf=0.1, max_bins=25, min_impurity_decrease=0.0, categorical_features=None, - split_features=None, linear_features=None, n_jobs=None): + split_features=None, linear_features=None, n_jobs=None, + base_estimator="deprecated"): - self.base_estimator = base_estimator + self.estimator = estimator self.criterion = criterion self.max_depth = max_depth self.min_samples_split = min_samples_split @@ -364,6 +390,7 @@ def __init__(self, base_estimator, *, criterion='hamming', max_depth=5, self.split_features = split_features self.linear_features = linear_features self.n_jobs = n_jobs + self.base_estimator = base_estimator def fit(self, X, y, sample_weight=None): """Build a Linear Tree of a linear estimator from the training @@ -386,24 +413,33 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ + if self.base_estimator != "deprecated": + warnings.warn( + "`base_estimator` was renamed to `estimator` in version 0.3.6 and " + "will be removed in 1.0", + FutureWarning, + ) + self.estimator = self.base_estimator + clas_criterions = ('hamming', 'crossentropy') if self.criterion not in clas_criterions: raise ValueError("Classification tasks support only criterion in {}, " "got '{}'.".format(clas_criterions, self.criterion)) - if (not hasattr(self.base_estimator, 'predict_proba') and + if (not hasattr(self.estimator, 'predict_proba') and self.criterion == 'crossentropy'): - raise ValueError("The 'crossentropy' criterion requires a base_estimator " + raise ValueError("The 'crossentropy' criterion requires a estimator " "with predict_proba method.") # Convert data (X is required to be 2d and indexable) - X, y = self._validate_data( + X, y = validate_data( + self.estimator, X, y, reset=True, accept_sparse=False, dtype='float32', - force_all_finite=True, + ensure_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=False, @@ -431,12 +467,13 @@ def predict(self, X): """ check_is_fitted(self, attributes='_nodes') - X = self._validate_data( + X = validate_data( + self.estimator, X, reset=False, accept_sparse=False, dtype='float32', - force_all_finite=True, + ensure_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_features=self.n_features_in_ @@ -473,12 +510,13 @@ def predict_proba(self, X): """ check_is_fitted(self, attributes='_nodes') - X = self._validate_data( + X = validate_data( + self.estimator, X, reset=False, accept_sparse=False, dtype='float32', - force_all_finite=True, + ensure_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_features=self.n_features_in_ @@ -486,7 +524,7 @@ def predict_proba(self, X): pred = np.zeros((X.shape[0], len(self.classes_))) - if hasattr(self.base_estimator, 'predict_proba'): + if hasattr(self.estimator, 'predict_proba'): for L in self._leaves.values(): mask = _predict_branch(X, L.threshold) @@ -536,7 +574,7 @@ class LinearBoostRegressor(_LinearBoosting, RegressorMixin): Parameters ---------- - base_estimator : object + estimator : object The base estimator iteratively fitted. The base estimator must be a sklearn.linear_model. @@ -577,14 +615,13 @@ class LinearBoostRegressor(_LinearBoosting, RegressorMixin): the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - max_features : int, float or {"auto", "sqrt", "log2"}, default=None + max_features : int, float or {"sqrt", "log2"}, default=None The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and `int(max_features * n_features)` features are considered at each split. - - If "auto", then `max_features=n_features`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. @@ -610,7 +647,12 @@ class LinearBoostRegressor(_LinearBoosting, RegressorMixin): subtree with the largest cost complexity that is smaller than ``ccp_alpha`` will be chosen. By default, no pruning is performed. See :ref:`minimal_cost_complexity_pruning` for details. - + + base_estimator : object, default="deprecated" + Use `estimator` instead. + .. deprecated:: 0.3.6 + `base_estimator` is deprecated and will be removed in 1.0.0 + Use `estimator` instead. Attributes ---------- n_features_in_ : int @@ -629,7 +671,7 @@ class LinearBoostRegressor(_LinearBoosting, RegressorMixin): intercept_ : float or array of shape (n_targets, ) Independent term in the linear model. Set to 0 if `fit_intercept = False` - in `base_estimator` + in `estimator` Examples -------- @@ -639,7 +681,7 @@ class LinearBoostRegressor(_LinearBoosting, RegressorMixin): >>> X, y = make_regression(n_samples=100, n_features=4, ... n_informative=2, n_targets=1, ... random_state=0, shuffle=False) - >>> regr = LinearBoostRegressor(base_estimator=LinearRegression()) + >>> regr = LinearBoostRegressor(estimator=LinearRegression()) >>> regr.fit(X, y) >>> regr.predict([[0, 0, 0, 0]]) array([8.8817842e-16]) @@ -650,13 +692,14 @@ class LinearBoostRegressor(_LinearBoosting, RegressorMixin): Authors: Igor Ilic, Berk Gorgulu, Mucahit Cevik, Mustafa Gokce Baydogan. (https://arxiv.org/abs/2009.09110) """ - def __init__(self, base_estimator, *, loss='linear', n_estimators=10, + def __init__(self, estimator=None, *, loss='linear', n_estimators=10, max_depth=3, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, - min_impurity_decrease=0.0, ccp_alpha=0.0): + min_impurity_decrease=0.0, ccp_alpha=0.0, + base_estimator="deprecated"): - self.base_estimator = base_estimator + self.estimator = estimator self.loss = loss self.n_estimators = n_estimators self.max_depth = max_depth @@ -668,6 +711,7 @@ def __init__(self, base_estimator, *, loss='linear', n_estimators=10, self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease self.ccp_alpha = ccp_alpha + self.base_estimator = base_estimator def fit(self, X, y, sample_weight=None): """Build a Linear Boosting from the training set (X, y). @@ -687,6 +731,14 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ + if self.base_estimator != "deprecated": + warnings.warn( + "`base_estimator` was renamed to `estimator` in version 0.3.6 and " + "will be removed in 1.0", + FutureWarning, + ) + self.estimator = self.base_estimator + reg_losses = ('linear', 'square', 'absolute', 'exponential') if self.loss not in reg_losses: @@ -694,12 +746,13 @@ def fit(self, X, y, sample_weight=None): "got '{}'.".format(reg_losses, self.loss)) # Convert data (X is required to be 2d and indexable) - X, y = self._validate_data( + X, y = validate_data( + self.estimator, X, y, reset=True, accept_sparse=False, dtype='float32', - force_all_finite=True, + ensure_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=True, @@ -730,9 +783,9 @@ def predict(self, X): multitarget regression. The predicted values. """ - check_is_fitted(self, attributes='base_estimator_') + check_is_fitted(self, attributes='estimator_') - return self.base_estimator_.predict(self.transform(X)) + return self.estimator_.predict(self.transform(X)) class LinearBoostClassifier(_LinearBoosting, ClassifierMixin): @@ -747,13 +800,13 @@ class LinearBoostClassifier(_LinearBoosting, ClassifierMixin): Parameters ---------- - base_estimator : object + estimator : object The base estimator iteratively fitted. The base estimator must be a sklearn.linear_model. loss : {"hamming", "entropy"}, default="entropy" The function used to calculate the residuals of each sample. - `"entropy"` can be used only if `base_estimator` has `predict_proba` + `"entropy"` can be used only if `estimator` has `predict_proba` method. n_estimators : int, default=10 @@ -790,14 +843,13 @@ class LinearBoostClassifier(_LinearBoosting, ClassifierMixin): the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - max_features : int, float or {"auto", "sqrt", "log2"}, default=None + max_features : int, float or {"sqrt", "log2"}, default=None The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and `int(max_features * n_features)` features are considered at each split. - - If "auto", then `max_features=n_features`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. @@ -824,6 +876,12 @@ class LinearBoostClassifier(_LinearBoosting, ClassifierMixin): ``ccp_alpha`` will be chosen. By default, no pruning is performed. See :ref:`minimal_cost_complexity_pruning` for details. + base_estimator : object, default="deprecated" + Use `estimator` instead. + .. deprecated:: 0.3.6 + `base_estimator` is deprecated and will be removed in 1.0.0 + Use `estimator` instead. + Attributes ---------- n_features_in_ : int @@ -839,7 +897,7 @@ class LinearBoostClassifier(_LinearBoosting, ClassifierMixin): intercept_ : float or array of shape (n_classes, ) Independent term in the linear model. Set to 0 if `fit_intercept = False` - in `base_estimator` + in `estimator` classes_ : ndarray of shape (n_classes, ) A list of class labels known to the classifier. @@ -852,7 +910,7 @@ class LinearBoostClassifier(_LinearBoosting, ClassifierMixin): >>> X, y = make_classification(n_samples=100, n_features=4, ... n_informative=2, n_redundant=0, ... random_state=0, shuffle=False) - >>> clf = LinearBoostClassifier(base_estimator=RidgeClassifier()) + >>> clf = LinearBoostClassifier(estimator=RidgeClassifier()) >>> clf.fit(X, y) >>> clf.predict([[0, 0, 0, 0]]) array([1]) @@ -863,13 +921,14 @@ class LinearBoostClassifier(_LinearBoosting, ClassifierMixin): Authors: Igor Ilic, Berk Gorgulu, Mucahit Cevik, Mustafa Gokce Baydogan. (https://arxiv.org/abs/2009.09110) """ - def __init__(self, base_estimator, *, loss='hamming', n_estimators=10, + def __init__(self, estimator=None, *, loss='hamming', n_estimators=10, max_depth=3, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, - min_impurity_decrease=0.0, ccp_alpha=0.0): + min_impurity_decrease=0.0, ccp_alpha=0.0, + base_estimator="deprecated"): - self.base_estimator = base_estimator + self.estimator = estimator self.loss = loss self.n_estimators = n_estimators self.max_depth = max_depth @@ -881,6 +940,7 @@ def __init__(self, base_estimator, *, loss='hamming', n_estimators=10, self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease self.ccp_alpha = ccp_alpha + self.base_estimator = base_estimator def fit(self, X, y, sample_weight=None): """Build a Linear Boosting from the training set (X, y). @@ -900,24 +960,33 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ + if self.base_estimator != "deprecated": + warnings.warn( + "`base_estimator` was renamed to `estimator` in version 0.3.6 and " + "will be removed in 1.0", + FutureWarning, + ) + self.estimator = self.base_estimator + clas_losses = ('hamming', 'entropy') if self.loss not in clas_losses: raise ValueError("Classification tasks support only loss in {}, " "got '{}'.".format(clas_losses, self.loss)) - if (not hasattr(self.base_estimator, 'predict_proba') and + if (not hasattr(self.estimator, 'predict_proba') and self.loss == 'entropy'): - raise ValueError("The 'entropy' loss requires a base_estimator " + raise ValueError("The 'entropy' loss requires a estimator " "with predict_proba method.") # Convert data (X is required to be 2d and indexable) - X, y = self._validate_data( + X, y = validate_data( + self.estimator, X, y, reset=True, accept_sparse=False, dtype='float32', - force_all_finite=True, + ensure_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=False, @@ -943,9 +1012,9 @@ def predict(self, X): pred : ndarray of shape (n_samples, ) The predicted classes. """ - check_is_fitted(self, attributes='base_estimator_') + check_is_fitted(self, attributes='estimator_') - return self.base_estimator_.predict(self.transform(X)) + return self.estimator_.predict(self.transform(X)) def predict_proba(self, X): """Predict class probabilities for X. @@ -964,9 +1033,9 @@ def predict_proba(self, X): The class probabilities of the input samples. The order of the classes corresponds to that in the attribute :term:`classes_`. """ - if hasattr(self.base_estimator, 'predict_proba'): - check_is_fitted(self, attributes='base_estimator_') - pred = self.base_estimator_.predict_proba(self.transform(X)) + if hasattr(self.estimator, 'predict_proba'): + check_is_fitted(self, attributes='estimator_') + pred = self.estimator_.predict_proba(self.transform(X)) else: pred_class = self.predict(X) @@ -1011,7 +1080,7 @@ class LinearForestRegressor(_LinearForest, RegressorMixin): Parameters ---------- - base_estimator : object + estimator : object The linear estimator fitted on the raw target. The linear estimator must be a regressor from sklearn.linear_model. @@ -1048,14 +1117,13 @@ class LinearForestRegressor(_LinearForest, RegressorMixin): the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - max_features : {"auto", "sqrt", "log2"}, int or float, default="auto" + max_features : {"sqrt", "log2"}, int or float, default=1.0 The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and `round(max_features * n_features)` features are considered at each split. - - If "auto", then `max_features=n_features`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. @@ -1107,6 +1175,11 @@ class LinearForestRegressor(_LinearForest, RegressorMixin): - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. Thus, `max_samples` should be in the interval `(0, 1]`. + base_estimator : object, default="deprecated" + Use `estimator` instead. + .. deprecated:: 0.3.6 + `base_estimator` is deprecated and will be removed in 1.0.0 + Use `estimator` instead. Attributes ---------- @@ -1128,9 +1201,9 @@ class LinearForestRegressor(_LinearForest, RegressorMixin): intercept_ : float or array of shape (n_targets,) Independent term in the linear model. Set to 0 if `fit_intercept = False` - in `base_estimator`. + in `estimator`. - base_estimator_ : object + estimator_ : object A fitted linear model instance. forest_estimator_ : object @@ -1144,7 +1217,7 @@ class LinearForestRegressor(_LinearForest, RegressorMixin): >>> X, y = make_regression(n_samples=100, n_features=4, ... n_informative=2, n_targets=1, ... random_state=0, shuffle=False) - >>> regr = LinearForestRegressor(base_estimator=LinearRegression()) + >>> regr = LinearForestRegressor(estimator=LinearRegression()) >>> regr.fit(X, y) >>> regr.predict([[0, 0, 0, 0]]) array([8.8817842e-16]) @@ -1155,14 +1228,15 @@ class LinearForestRegressor(_LinearForest, RegressorMixin): Authors: Haozhe Zhang, Dan Nettleton, Zhengyuan Zhu. (https://arxiv.org/abs/1904.10416) """ - def __init__(self, base_estimator, *, n_estimators=100, + def __init__(self, estimator=None, *, n_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=1, - min_weight_fraction_leaf=0., max_features="auto", + min_weight_fraction_leaf=0., max_features=1.0, max_leaf_nodes=None, min_impurity_decrease=0., bootstrap=True, oob_score=False, n_jobs=None, - random_state=None, ccp_alpha=0.0, max_samples=None): + random_state=None, ccp_alpha=0.0, max_samples=None, + base_estimator="deprecated"): - self.base_estimator = base_estimator + self.estimator = estimator self.n_estimators = n_estimators self.max_depth = max_depth self.min_samples_split = min_samples_split @@ -1177,6 +1251,7 @@ def __init__(self, base_estimator, *, n_estimators=100, self.random_state = random_state self.ccp_alpha = ccp_alpha self.max_samples = max_samples + self.base_estimator = base_estimator def fit(self, X, y, sample_weight=None): """Build a Linear Forest from the training set (X, y). @@ -1196,13 +1271,22 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ + if self.base_estimator != "deprecated": + warnings.warn( + "`base_estimator` was renamed to `estimator` in version 0.3.6 and " + "will be removed in 1.0", + FutureWarning, + ) + self.estimator = self.base_estimator + # Convert data (X is required to be 2d and indexable) - X, y = self._validate_data( + X, y = validate_data( + self.estimator, X, y, reset=True, accept_sparse=True, dtype='float32', - force_all_finite=True, + ensure_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=True, @@ -1233,20 +1317,21 @@ def predict(self, X): multitarget regression. The predicted values. """ - check_is_fitted(self, attributes='base_estimator_') + check_is_fitted(self, attributes='estimator_') - X = self._validate_data( + X = validate_data( + self.estimator, X, reset=False, accept_sparse=True, dtype='float32', - force_all_finite=True, + ensure_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_features=self.n_features_in_ ) - linear_pred = self.base_estimator_.predict(X) + linear_pred = self.estimator_.predict(X) forest_pred = self.forest_estimator_.predict(X) return linear_pred + forest_pred @@ -1274,7 +1359,7 @@ class LinearForestClassifier(_LinearForest, ClassifierMixin): Parameters ---------- - base_estimator : object + estimator : object The linear estimator fitted on the raw target. The linear estimator must be a regressor from sklearn.linear_model. @@ -1311,14 +1396,13 @@ class LinearForestClassifier(_LinearForest, ClassifierMixin): the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - max_features : {"auto", "sqrt", "log2"}, int or float, default="auto" + max_features : {"sqrt", "log2"}, int or float, default=1.0 The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and `round(max_features * n_features)` features are considered at each split. - - If "auto", then `max_features=n_features`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. @@ -1371,6 +1455,12 @@ class LinearForestClassifier(_LinearForest, ClassifierMixin): - If float, then draw `max_samples * X.shape[0]` samples. Thus, `max_samples` should be in the interval `(0, 1]`. + base_estimator : object, default="deprecated" + Use `estimator` instead. + .. deprecated:: 0.3.6 + `base_estimator` is deprecated and will be removed in 1.0.0 + Use `estimator` instead. + Attributes ---------- n_features_in_ : int @@ -1388,12 +1478,12 @@ class LinearForestClassifier(_LinearForest, ClassifierMixin): intercept_ : float Independent term in the linear model. Set to 0 if `fit_intercept = False` - in `base_estimator`. + in `estimator`. classes_ : ndarray of shape (n_classes, ) A list of class labels known to the classifier. - base_estimator_ : object + estimator_ : object A fitted linear model instance. forest_estimator_ : object @@ -1407,7 +1497,7 @@ class LinearForestClassifier(_LinearForest, ClassifierMixin): >>> X, y = make_classification(n_samples=100, n_classes=2, n_features=4, ... n_informative=2, n_redundant=0, ... random_state=0, shuffle=False) - >>> clf = LinearForestClassifier(base_estimator=LinearRegression()) + >>> clf = LinearForestClassifier(estimator=LinearRegression()) >>> clf.fit(X, y) >>> clf.predict([[0, 0, 0, 0]]) array([1]) @@ -1418,14 +1508,15 @@ class LinearForestClassifier(_LinearForest, ClassifierMixin): Authors: Haozhe Zhang, Dan Nettleton, Zhengyuan Zhu. (https://arxiv.org/abs/1904.10416) """ - def __init__(self, base_estimator, *, n_estimators=100, + def __init__(self, estimator=None, *, n_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=1, - min_weight_fraction_leaf=0., max_features="auto", + min_weight_fraction_leaf=0., max_features=1.0, max_leaf_nodes=None, min_impurity_decrease=0., bootstrap=True, oob_score=False, n_jobs=None, - random_state=None, ccp_alpha=0.0, max_samples=None): + random_state=None, ccp_alpha=0.0, max_samples=None, + base_estimator="deprecated"): - self.base_estimator = base_estimator + self.estimator = estimator self.n_estimators = n_estimators self.max_depth = max_depth self.min_samples_split = min_samples_split @@ -1440,6 +1531,7 @@ def __init__(self, base_estimator, *, n_estimators=100, self.random_state = random_state self.ccp_alpha = ccp_alpha self.max_samples = max_samples + self.base_estimator = base_estimator def fit(self, X, y, sample_weight=None): """Build a Linear Forest from the training set (X, y). @@ -1459,13 +1551,22 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ + if self.base_estimator != "deprecated": + warnings.warn( + "`base_estimator` was renamed to `estimator` in version 0.3.6 and " + "will be removed in 1.0", + FutureWarning, + ) + self.estimator = self.base_estimator + # Convert data (X is required to be 2d and indexable) - X, y = self._validate_data( + X, y = validate_data( + self.estimator, X, y, reset=True, accept_sparse=True, dtype='float32', - force_all_finite=True, + ensure_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=False, @@ -1502,20 +1603,21 @@ def decision_function(self, X): Confidence score for self.classes_[1] where >0 means this class would be predicted. """ - check_is_fitted(self, attributes='base_estimator_') + check_is_fitted(self, attributes='estimator_') - X = self._validate_data( + X = validate_data( + self.estimator, X, reset=False, accept_sparse=True, dtype='float32', - force_all_finite=True, + ensure_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_features=self.n_features_in_ ) - linear_pred = self.base_estimator_.predict(X) + linear_pred = self.estimator_.predict(X) forest_pred = self.forest_estimator_.predict(X) return linear_pred + forest_pred @@ -1576,4 +1678,4 @@ def predict_log_proba(self, X): The class log-probabilities of the input samples. The order of the classes corresponds to that in the attribute :term:`classes_`. """ - return np.log(self.predict_proba(X)) + return np.log(self.predict_proba(X)) \ No newline at end of file