diff --git a/docs/api/linear.rst b/docs/api/linear.rst index 888db896..8629a1ec 100644 --- a/docs/api/linear.rst +++ b/docs/api/linear.rst @@ -34,12 +34,17 @@ The simplest usage is:: .. autofunction:: get_positive_labels +.. autofunction:: linear_test + .. autoclass:: FlatModel :members: .. autoclass:: TreeModel :members: +.. autoclass:: EnsembleTreeModel + :members: + Load Dataset ^^^^^^^^^^^^ @@ -101,3 +106,18 @@ Grid Search with Sklearn Estimators :members: .. automethod:: __init__ + +Grid Search for Tree-Based Linear Method +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: TreeGridParameter + :members: + + .. automethod:: __init__ + +.. autoclass:: TreeGridSearch + :members: + + .. automethod:: __init__ + + .. automethod:: __call__ \ No newline at end of file diff --git a/docs/examples/plot_linear_gridsearch_tutorial.py b/docs/examples/plot_linear_gridsearch_tutorial.py index d1a239e7..01c1e069 100644 --- a/docs/examples/plot_linear_gridsearch_tutorial.py +++ b/docs/examples/plot_linear_gridsearch_tutorial.py @@ -1,7 +1,14 @@ """ -Hyperparameter Search for Linear Methods +Hyperparameter Search for One-vs-rest Linear Methods ============================================================= +.. warning:: + + If you are using the tree-based linear method, + please check `Hyperparameter Search for Tree-Based Linear Method <../auto_examples/plot_tree_gridsearch_tutorial.html>`_. + This guide helps users to tune the hyperparameters of the feature generation step and the linear model. +In this guide, the following methods are available: +``1vsrest``, ``thresholding``, ``cost_sensitive``, ``cost_sensitive_micro``, and ``binary_and_multiclass``. Here we show an example of tuning a linear text classifier with the `rcv1 dataset `_. Starting with loading and preprocessing of the data without using ``Preprocessor``: diff --git a/docs/examples/plot_tree_gridsearch_tutorial.py b/docs/examples/plot_tree_gridsearch_tutorial.py new file mode 100644 index 00000000..d95dbfdb --- /dev/null +++ b/docs/examples/plot_tree_gridsearch_tutorial.py @@ -0,0 +1,141 @@ +""" +Hyperparameter Search for Tree-Based Linear Method +============================================================= +.. warning:: + + If you are using the one-vs-rest linear methods, + please check `Hyperparameter Search for One-vs-rest Linear Methods <../auto_examples/plot_linear_gridsearch_tutorial.html>`_. + +To apply tree-based linear methods, +we first convert raw text into numerical TF-IDF features. +During training, the method builds a label tree and trains linear classifiers. +At inference, the model traverses the tree and selects +only a few candidate labels at each level to speed up prediction. + +To improve model performance, we need to search the hyperparameter space. +Therefore, in this guide, we help users tune the hyperparameters of the tree-based linear method. + +.. seealso:: + + `Implementation Document `_: + For more details about the implementation of tree-based linear methods and hyperparameter search. + +Here we show an example of tuning a tree-based linear text classifier with the `rcv1 dataset `_. +Starting with loading the data: +""" + +import logging + +from libmultilabel import linear + +logging.basicConfig(level=logging.INFO) + +datasets = linear.load_dataset("txt", "data/rcv1/train.txt", "data/rcv1/test.txt") +L = len(datasets["train"]["y"]) + +###################################################################### +# Next, we set up the search space. 
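+#
+# A note on the formula used for ``K`` below: following the implementation
+# document, the number of clusters is
+# ``K = max(2, round(L ** (1 / dmax) * 2 ** alpha + 0.5))``, where ``L`` is the
+# size of the training set loaded above. As a worked example, rcv1 has 23,149
+# training instances, so ``alpha = 5`` with ``dmax = 10`` gives
+# ``round(23149 ** 0.1 * 32 + 0.5) = 88`` clusters, while ``alpha = -2`` falls
+# back to the minimum of 2 (these match the ``K`` values searched below).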
+
+import numpy as np
+
+dmax = 10
+K_factors = [-2, 5]
+search_space_dict = {
+    "ngram_range": [(1, 1), (1, 2), (1, 3)],
+    "stop_words": ["english"],
+    "dmax": [dmax],
+    "K": [max(2, int(np.round(np.power(L, 1 / dmax) * np.power(2.0, alpha) + 0.5))) for alpha in K_factors],
+    "s": [1],
+    "c": [0.5, 1, 2],
+    "B": [1],
+    "beam_width": [10],
+    "prob_A": [3],
+}
+
+######################################################################
+# Following the suggestions in the `implementation document `_,
+# we define 18 configurations to build a simple yet strong baseline.
+#
+# The search space covers several key stages of the training and prediction pipeline:
+#
+# - Text feature extraction: (``ngram_range``, ``stop_words``)
+#
+#   - We use the vectorizer ``TfidfVectorizer`` from ``sklearn`` to generate features from raw text.
+#
+# - Label tree structure: (``dmax``, ``K``)
+#
+#   - The depth and node degree of the label tree. Note that ``K`` is the number of clusters and is calculated with the formula from the `implementation document `_.
+#
+# - Linear classifier: (``s``, ``c``, ``B``)
+#
+#   - We combine them into a LIBLINEAR option string for training linear classifiers (see *train Usage* in the `liblinear `__ README).
+#
+# - Prediction: (``beam_width``, ``prob_A``)
+#
+#   - The number of candidates considered and the parameter of the probability estimation function at each level during prediction.
+#
+# .. tip::
+#
+#     Available hyperparameters (and their defaults) are defined in the class variables of :py:class:`~libmultilabel.linear.TreeGridParameter`.
+#
+# In :py:class:`~libmultilabel.linear.TreeGridSearch`, we perform cross-validation for evaluation.
+# Specifically, we split the training data into ``n_folds`` folds,
+# sequentially using each fold as the validation set while training on the remaining folds.
+# Finally, we aggregate the validation outputs from all folds and compute the ``monitor_metrics``.
+# Initialization requires the dataset, the number of cross-validation folds, and the evaluation metrics.
+
+n_folds = 3
+monitor_metrics = ["P@1", "P@3", "P@5"]
+search = linear.TreeGridSearch(datasets, n_folds, monitor_metrics)
+cv_scores = search(search_space_dict)
+
+######################################################################
+# ``cv_scores`` is a dictionary whose keys are :py:class:`~libmultilabel.linear.TreeGridParameter` instances and whose values are the ``monitor_metrics`` results.
+#
+# Here we sort the results in descending order by the first metric in ``monitor_metrics``.
+# You can retrieve the best parameters after the grid search with the following code:
+
+sorted_cv_scores = sorted(cv_scores.items(), key=lambda x: x[1][monitor_metrics[0]], reverse=True)
+print(sorted_cv_scores)
+
+best_params, best_cv_scores = sorted_cv_scores[0]
+print(best_params, best_cv_scores)
+
+######################################################################
+# The best parameters are::
+#
+#     {'ngram_range': (1, 3), 'stop_words': 'english', 'dmax': 10, 'K': 88, 's': 1, 'c': 1, 'B': 1, 'beam_width': 10, 'prob_A': 3}
+#
+# with best cross-validation scores::
+#
+#     {'P@1': 0.9669, 'P@3': 0.8137, 'P@5': 0.5640}
+#
+# We can then retrain using the best parameters,
+# and use :py:func:`~libmultilabel.linear.linear_test` and :py:func:`~libmultilabel.linear.get_metrics` to compute test performance.
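+#
+# Note that ``best_params`` groups the hyperparameters into four dataclasses
+# (``tfidf``, ``tree``, ``linear``, and ``predict``; see
+# :py:class:`~libmultilabel.linear.TreeGridParameter`), so below each group is
+# passed to the corresponding step, either through ``dataclasses.asdict`` or,
+# for the linear classifier, through the assembled ``linear_options`` string.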
+
+from dataclasses import asdict
+
+preprocessor = linear.Preprocessor(tfidf_params=asdict(best_params.tfidf))
+transformed_dataset = preprocessor.fit_transform(datasets)
+
+model = linear.train_tree(
+    transformed_dataset["train"]["y"],
+    transformed_dataset["train"]["x"],
+    best_params.linear_options,
+    **asdict(best_params.tree),
+)
+
+_, metric_dict, _, _ = linear.linear_test(
+    y = transformed_dataset["test"]["y"],
+    x = transformed_dataset["test"]["x"],
+    model = model,
+    metrics = linear.get_metrics(monitor_metrics, num_classes=-1),
+    predict_kwargs = asdict(best_params.predict),
+)
+
+print(metric_dict)
+
+######################################################################
+# The result of the best parameters will look similar to::
+#
+#     {'P@1': 0.9554, 'P@3': 0.7968, 'P@5': 0.5576}
diff --git a/docs/search_retrain.rst b/docs/search_retrain.rst
index 26acfccb..9242b8ce 100644
--- a/docs/search_retrain.rst
+++ b/docs/search_retrain.rst
@@ -7,4 +7,5 @@ Hyperparameter Search
 
    ../auto_examples/plot_linear_gridsearch_tutorial
+   ../auto_examples/plot_tree_gridsearch_tutorial
    tutorials/Parameter_Selection_for_Neural_Networks
diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py
index 7f1ce851..f8411b68 100644
--- a/libmultilabel/linear/tree.py
+++ b/libmultilabel/linear/tree.py
@@ -4,6 +4,7 @@
 
 import numpy as np
 import scipy.sparse as sparse
+from scipy.special import log_expit
 from sparsekmeans import LloydKmeans, ElkanKmeans
 import sklearn.preprocessing
 from tqdm import tqdm
@@ -58,16 +59,35 @@ def __init__(
         self.multiclass = False
         self._model_separated = False  # Indicates whether the model has been separated for pruning tree.
 
+    def sigmoid_A(self, x: np.ndarray, prob_A: int) -> np.ndarray:
+        """
+        Calculate log(sigmoid(prob_A * x)), the log of the estimated probability of the positive class in binary classification.
+
+        Args:
+            x (np.ndarray): The decision value matrix with dimension number of instances * number of classes.
+            prob_A (int):
+                The hyperparameter used in the probability estimation function for
+                binary classification: sigmoid(prob_A * x).
+
+        Returns:
+            np.ndarray: A matrix of log-probability estimates with dimension number of instances * number of classes.
+        """
+        return log_expit(prob_A * x)
+
     def predict_values(
         self,
         x: sparse.csr_matrix,
         beam_width: int = 10,
+        prob_A: int = 3,
     ) -> np.ndarray:
         """Calculate the probability estimates associated with x.
 
         Args:
             x (sparse.csr_matrix): A matrix with dimension number of instances * number of features.
-            beam_width (int, optional): Number of candidates considered during beam search. Defaults to 10.
+            beam_width (int, optional): Number of candidates considered during beam search. Defaults to 10.
+            prob_A (int, optional):
+                The hyperparameter used in the probability estimation function for
+                binary classification: sigmoid(prob_A * decision_value_matrix). Defaults to 3.
 
         Returns:
             np.ndarray: A matrix with dimension number of instances * number of classes.
@@ -81,8 +101,8 @@ def predict_values(
         if not self._model_separated:
             self._separate_model_for_pruning_tree()
             self._model_separated = True
-        all_preds = self._prune_tree_and_predict_values(x, beam_width)  # number of instances * (number of labels + total number of metalabels)
-        return np.vstack([self._beam_search(all_preds[i], beam_width) for i in range(all_preds.shape[0])])
+        all_preds = self._prune_tree_and_predict_values(x, beam_width, prob_A)  # number of instances * (number of labels + total number of metalabels)
+        return np.vstack([self._beam_search(all_preds[i], beam_width, prob_A) for i in range(all_preds.shape[0])])
 
     def _separate_model_for_pruning_tree(self):
         """
@@ -113,7 +133,7 @@ def _separate_model_for_pruning_tree(self):
         )
         self.subtree_models.append(subtree_flatmodel)
 
-    def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) -> np.ndarray:
+    def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int, prob_A: int) -> np.ndarray:
         """Calculates the selective decision values associated with instances x by evaluating only the most relevant subtrees.
 
         Only subtrees corresponding to the top beam_width candidates from the root are evaluated,
@@ -122,6 +142,9 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int)
         Args:
             x (sparse.csr_matrix): A matrix with dimension number of instances * number of features.
             beam_width (int): Number of top candidate branches considered for prediction.
+            prob_A (int):
+                The hyperparameter used in the probability estimation function for
+                binary classification: sigmoid(prob_A * decision_value_matrix).
 
         Returns:
             np.ndarray: A matrix with dimension number of instances * (number of labels + total number of metalabels).
@@ -132,7 +155,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int)
 
         # Calculate root decision values and scores
         root_preds = linear.predict_values(self.root_model, x)
-        children_scores = 0.0 - np.square(np.maximum(0, 1 - root_preds))
+        children_scores = 0.0 + self.sigmoid_A(root_preds, prob_A)
 
         slice = np.s_[:, self.node_ptr[self.root.index] : self.node_ptr[self.root.index + 1]]
         all_preds[slice] = root_preds
@@ -159,12 +182,15 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int)
 
         return all_preds
 
-    def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarray:
+    def _beam_search(self, instance_preds: np.ndarray, beam_width: int, prob_A: int) -> np.ndarray:
         """Predict with beam search using cached probability estimates for a single instance.
 
         Args:
             instance_preds (np.ndarray): A vector of cached probability estimates of each node, has dimension number of labels + total number of metalabels.
             beam_width (int): Number of candidates considered.
+            prob_A (int):
+                The hyperparameter used in the probability estimation function for
+                binary classification: sigmoid(prob_A * decision_value_matrix).
 
         Returns:
             np.ndarray: A vector with dimension number of classes.
@@ -182,7 +208,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra continue slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - children_score = score - np.square(np.maximum(0, 1 - pred)) + children_score = score + self.sigmoid_A(pred, prob_A) next_level.extend(zip(node.children, children_score.tolist())) cur_level = sorted(next_level, key=lambda pair: -pair[1])[:beam_width] @@ -193,7 +219,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra for node, score in cur_level: slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - scores[node.label_map] = np.exp(score - np.square(np.maximum(0, 1 - pred))) + scores[node.label_map] = np.exp(score + self.sigmoid_A(pred, prob_A)) return scores @@ -204,6 +230,7 @@ def train_tree( K=DEFAULT_K, dmax=DEFAULT_DMAX, verbose: bool = True, + root: Node = None, ) -> TreeModel: """Train a linear model for multi-label data using a divide-and-conquer strategy. The algorithm used is based on https://github.com/xmc-aalto/bonsai. @@ -215,14 +242,16 @@ def train_tree( K (int, optional): Maximum degree of nodes in the tree. Defaults to 100. dmax (int, optional): Maximum depth of the tree. Defaults to 10. verbose (bool, optional): Output extra progress information. Defaults to True. + root (Node, optional): Pre-built tree root. Defaults to None. Returns: TreeModel: A model which can be used in predict_values. """ - label_representation = (y.T * x).tocsr() - label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) - root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax) - root.is_root = True + if root is None: + label_representation = (y.T * x).tocsr() + label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) + root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax) + root.is_root = True num_nodes = 0 # Both type(x) and type(y) are sparse.csr_matrix diff --git a/libmultilabel/linear/utils.py b/libmultilabel/linear/utils.py index 3324a896..74761179 100644 --- a/libmultilabel/linear/utils.py +++ b/libmultilabel/linear/utils.py @@ -1,10 +1,17 @@ from __future__ import annotations import os +import sys +import math +import itertools +import logging import pathlib import pickle import re -from typing import Any +from math import ceil +from tqdm import tqdm +from typing import Any, Callable +from dataclasses import make_dataclass, field, fields, asdict import numpy as np import scipy.sparse as sparse @@ -12,12 +19,14 @@ import sklearn.model_selection import sklearn.pipeline import sklearn.utils +import sklearn.preprocessing import libmultilabel.linear as linear from .preprocessor import Preprocessor +from .tree import _build_tree -__all__ = ["save_pipeline", "load_pipeline", "MultiLabelEstimator", "GridSearchCV"] +__all__ = ["save_pipeline", "load_pipeline", "MultiLabelEstimator", "GridSearchCV", "linear_test", "TreeGridParameter", "TreeGridSearch"] LINEAR_TECHNIQUES = { @@ -143,3 +152,386 @@ def _set_singlecore_options(self, estimator, param_grid: dict): key = f"{name}__options" param_grid[key] = [f"{re.sub(regex, '', v)} -m 1" for v in param_grid[key]] return param_grid + + +def linear_test( + y: sparse.csr_matrix, + x: sparse.csr_matrix, + model: linear.FlatModel | linear.TreeModel | linear.EnsembleTreeModel, + eval_batch_size: int = 256, + monitor_metrics: list[str] | None = None, + 
metrics: linear.MetricCollection | None = None,
+    predict_kwargs: dict | None = None,
+    beam_width: int | None = None,
+    prob_A: float | None = None,
+    label_mapping: np.ndarray | None = None,
+    save_k_predictions: int | None = None,
+    save_positive_predictions: bool | None = None,
+) -> tuple[linear.MetricCollection, dict, list | np.ndarray, list | np.ndarray]:
+    """
+    Evaluate a linear model on test data with batched prediction and compute metrics.
+
+    Args:
+        y (scipy.sparse.csr_matrix): The labels of the test data with dimensions number of instances * number of classes.
+        x (scipy.sparse.csr_matrix): The features of the test data with dimensions number of instances * number of features.
+        model (linear.FlatModel | linear.TreeModel | linear.EnsembleTreeModel): The trained model.
+        eval_batch_size (int): Batch size used during evaluation. Defaults to 256.
+        monitor_metrics (list[str], optional): The evaluation metrics to monitor. Defaults to ``["P@1", "P@3", "P@5"]``.
+        metrics (linear.MetricCollection, optional): An existing metric collection to update.
+            If not given, a new one is created from ``monitor_metrics``.
+        predict_kwargs (dict, optional): Extra parameters passed to model.predict_values.
+        beam_width (int, optional): Number of candidates considered during beam search. Only used for tree-based models.
+        prob_A (float, optional):
+            The hyperparameter used in the probability estimation function for
+            binary classification: sigmoid(prob_A * decision_value_matrix). Only used for tree-based models.
+        label_mapping (np.ndarray, optional): A np.ndarray of class labels that maps each index (from 0 to ``num_class-1``) to its label.
+        save_k_predictions (int, optional): The number of top-scoring classes to save per instance.
+        save_positive_predictions (bool, optional): Whether to save the labels and scores with positive decision values.
+
+    Returns:
+        tuple[linear.MetricCollection, dict, list | np.ndarray, list | np.ndarray]:
+            A tuple containing:
+            metrics (linear.MetricCollection)
+                The updated metric values.
+            metric_dict (dict[str, float])
+                The computed metric results.
+            labels (list or np.ndarray)
+                If ``save_k_predictions`` is set, an np.ndarray containing the labels of
+                the top k predictions from decision values.
+                Else if ``save_positive_predictions`` is True, a list of the labels with
+                positive decision values.
+                Otherwise, an empty list.
+            scores (list or np.ndarray)
+                If ``save_k_predictions`` is set, an np.ndarray containing the scores of
+                the top k predictions from decision values.
+                Else if ``save_positive_predictions`` is True, a list of the scores with
+                positive decision values.
+                Otherwise, an empty list.
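+
+    Example:
+        A minimal sketch, assuming ``model`` was trained beforehand (e.g., with
+        ``train_tree``) and ``dataset`` holds preprocessed sparse matrices::
+
+            metrics, metric_dict, labels, scores = linear_test(
+                y = dataset["test"]["y"],
+                x = dataset["test"]["x"],
+                model = model,
+            )
+            print(metric_dict)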
+    """
+    if monitor_metrics is None:
+        monitor_metrics = ["P@1", "P@3", "P@5"]
+    if metrics is None:
+        metrics = linear.get_metrics(monitor_metrics, y.shape[1], multiclass=model.multiclass)
+    num_instance = x.shape[0]
+    k = save_k_predictions
+    if k is not None and k > 0:
+        labels = np.zeros((num_instance, k), dtype=object)
+        scores = np.zeros((num_instance, k), dtype="d")
+    else:
+        labels = []
+        scores = []
+
+    if predict_kwargs is None:
+        predict_kwargs = {}
+    if isinstance(model, (linear.TreeModel, linear.EnsembleTreeModel)):
+        if beam_width is not None:
+            predict_kwargs["beam_width"] = beam_width
+        if prob_A is not None:
+            predict_kwargs["prob_A"] = prob_A
+
+    for i in tqdm(range(ceil(num_instance / eval_batch_size))):
+        slice = np.s_[i * eval_batch_size : (i + 1) * eval_batch_size]
+        preds = model.predict_values(x[slice], **predict_kwargs)
+        target = y[slice].toarray()
+        metrics.update(preds, target)
+        if k is not None and label_mapping is not None and k > 0:
+            labels[slice], scores[slice] = linear.get_topk_labels(preds, label_mapping, save_k_predictions)
+        elif save_positive_predictions and label_mapping is not None:
+            res = linear.get_positive_labels(preds, label_mapping)
+            labels.append(res[0])
+            scores.append(res[1])
+    metric_dict = metrics.compute()
+    return metrics, metric_dict, labels, scores
+
+
+# suppress inevitable outputs from sparsekmeans and sklearn preprocessors
+class __silent__:
+    def __init__(self):
+        self.stderr = os.dup(2)
+        self.devnull = os.open(os.devnull, os.O_WRONLY)
+
+    def __enter__(self):
+        os.dup2(self.devnull, 2)
+        self.stdout = sys.stdout
+        sys.stdout = open(os.devnull, "w")
+
+    def __exit__(self, type, value, traceback):
+        os.dup2(self.stderr, 2)
+        os.close(self.devnull)
+        os.close(self.stderr)
+        sys.stdout.close()
+        sys.stdout = self.stdout
+
+
+class TreeGridParameter:
+    """A tree-based linear method hyperparameter class for TreeGridSearch.
+    Transforms the given parameter dict into dataclass instances, one per hyperparameter group.
+    Parameters not in the dict will be set to default values.
+
+    Args:
+        params (dict, optional): The keys are the parameter names, and the values are the parameter values.
+    """
+
+    _tfidf_fields = [
+        ("ngram_range", tuple[int, int], field(default=(1, 1))),
+        ("max_features", int, field(default=None)),
+        ("min_df", float | int, field(default=1)),
+        ("stop_words", str | list, field(default=None)),
+        ("strip_accents", str | Callable, field(default=None)),
+        ("tokenizer", Callable, field(default=None)),
+    ]
+    _tree_fields = [
+        ("dmax", int, field(default=10)),
+        ("K", int, field(default=8)),
+    ]
+    _linear_fields = [
+        ("s", int, field(default=1)),
+        ("c", float, field(default=1)),
+        ("B", int, field(default=-1)),
+    ]
+    _predict_fields = [
+        ("beam_width", int, field(default=10)),
+        ("prob_A", int, field(default=3)),
+    ]
+
+    # set frozen=True to make instances hashable.
+    # set order=True to enable comparison operations.
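+    # e.g., param_types["tree"](dmax=5, K=100) constructs a frozen, ordered TreeParams instance.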
+    param_types = {
+        "tfidf": make_dataclass("TfidfParams", _tfidf_fields, frozen=True, order=True),
+        "tree": make_dataclass("TreeParams", _tree_fields, frozen=True, order=True),
+        "linear": make_dataclass("LinearParams", _linear_fields, frozen=True, order=True),
+        "predict": make_dataclass("PredictParams", _predict_fields, frozen=True, order=True),
+    }
+    _param_field_names = {
+        param_type: {f.name for f in fields(class_name)} for param_type, class_name in param_types.items()
+    }
+
+    def __init__(self, params: dict | None = None):
+        self.params = params or {}
+
+        params_set = set(self.params)
+        for param_type, class_name in self.param_types.items():
+            field_names = self._param_field_names[param_type]
+            filtered_keys = params_set & field_names
+            params_set -= field_names
+
+            filtered_params = {k: self.params[k] for k in filtered_keys}
+            setattr(self, param_type, class_name(**filtered_params))
+
+    @property
+    def linear_options(self):
+        # iterate over the dataclass fields (not the unordered name set) so the
+        # generated option string is deterministic across runs
+        options = ""
+        for f in fields(self.linear):
+            options += f" -{f.name} {getattr(self.linear, f.name)}"
+        return options.strip()
+
+    def __repr__(self):  # provide a readable string representation of the object
+        return str(self.params)
+
+    def __eq__(self, other):  # compare instance attributes to define equality.
+        return all(getattr(self, t) == getattr(other, t) for t in self.param_types)
+
+    def __lt__(self, other):  # define ordering for sorting.
+        # "<" for tuples is automatically lexicographic ordering
+        my_values = tuple(getattr(self, t) for t in self.param_types)
+        other_values = tuple(getattr(other, t) for t in self.param_types)
+        return my_values < other_values
+
+    def __hash__(self):  # make instances hashable for use as dict keys
+        return hash(tuple(getattr(self, t) for t in self.param_types))
+
+
+class TreeGridSearch:
+    """Grid-search the hyperparameter space of the tree-based linear method and find the best parameters,
+    according to the monitored metrics.
+
+    Args:
+        datasets (dict[str, dict[str, list[str]]]): The training and/or test data, with keys 'train' and 'test' respectively,
+            e.g., as returned by ``linear.load_dataset`` (the 'data_format' key is also required).
+            The data has keys 'x' for input features and 'y' for labels.
+        n_folds (int, optional): The number of cross-validation folds. Defaults to 3.
+        monitor_metrics (list[str], optional): The evaluation metrics to monitor. Defaults to ``["P@1", "P@3", "P@5"]``.
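+
+    Example:
+        A minimal sketch, assuming ``datasets`` was loaded with ``linear.load_dataset``::
+
+            search = TreeGridSearch(datasets, n_folds=3, monitor_metrics=["P@1", "P@5"])
+            cv_scores = search({"c": [0.5, 1, 2], "dmax": [10]})
+            best_params = max(cv_scores, key=lambda params: cv_scores[params]["P@1"])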
+ """ + + def __init__( + self, + datasets: dict[str, dict[str, list[str]]], + n_folds: int = 3, + monitor_metrics: list[str] = ["P@1", "P@3", "P@5"], + ): + self.datasets = datasets + self.n_folds = n_folds + self.monitor_metrics = monitor_metrics + + self._cached_params = TreeGridParameter() + for param_type in self._cached_params.param_types: + setattr(self._cached_params, param_type, None) + self._cached_transformed_dataset = None + self._cached_tree_root = None + self._cached_fold_data = None + self._cached_model = None + self.no_cache = True + + self.num_instances = len(self.datasets["train"]["y"]) + + def get_fold_dataset(self, train_idx, valid_idx): + def take(data, idx): + if isinstance(data, list): + return [data[i] for i in idx] + else: + return data[idx] + + return { + "data_format": self.datasets["data_format"], + "train": { + "y": take(self.datasets["train"]["y"], train_idx), + "x": take(self.datasets["train"]["x"], train_idx), + }, + "test": { + "y": take(self.datasets["train"]["y"], valid_idx), + "x": take(self.datasets["train"]["x"], valid_idx), + }, + } + + def get_transformed_dataset( + self, dataset: dict[str, dict[str, list[str]]], params: TreeGridParameter + ) -> dict[str, dict[str, sparse.csr_matrix]]: + """ + Get and cache the dataset for the given TF-IDF params. + If we have processed the coming params, return the cached dataset directly without computation. + + Args: + dataset (dict[str, dict[str, list[str]]]): The training and/or test data, with keys 'train' and 'test' respectively. + The data has keys 'x' for input features and 'y' for labels. + params (TreeGridParameter): The params to build the dataset. + + Returns: + dict[str, dict[str, sparse.csr_matrix]]: The transformed dataset. + """ + tfidf_params = params.tfidf + self.no_cache = tfidf_params != self._cached_params.tfidf + if self.no_cache: + logging.info(f"TFIDF - Preprocessing: {tfidf_params}") + if self.datasets["data_format"] not in {"txt", "dataframe"}: + logging.info( + "Please make sure the data format is 'txt' or 'dataframe'. Otherwise, the TF-IDF parameters have no effect on the dataset." + ) + with __silent__(): + preprocessor = linear.Preprocessor(tfidf_params=asdict(tfidf_params)) + self._cached_params.tfidf = tfidf_params + self._cached_transformed_dataset = preprocessor.fit_transform(dataset) + else: + logging.info(f"TFIDF - Using cached data: {tfidf_params}") + + return self._cached_transformed_dataset + + def get_tree(self, y, x, params): + tree_params = params.tree + self.no_cache |= tree_params != self._cached_params.tree + if self.no_cache: + logging.info(f"Tree - Preprocessing: {tree_params}") + with __silent__(): + label_representation = (y.T * x).tocsr() + label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) + self._cached_params.tree = tree_params + self._cached_tree_root = _build_tree( + label_representation, np.arange(y.shape[1]), 0, **asdict(tree_params) + ) + self._cached_tree_root.is_root = True + else: + logging.info(f"Tree - Using cached data: {tree_params}") + + return self._cached_tree_root + + def get_model(self, y: sparse.csr_matrix, x: sparse.csr_matrix, params: TreeGridParameter) -> linear.TreeModel: + """ + Get and cache the model for the given params. + If we have processed the coming params, return the cached model directly without computation. + + Args: + y (sparse.csr_matrix): The labels of the training data. + x (sparse.csr_matrix): The features of the training data. 
+            params (TreeGridParameter): The params to build the model.
+
+        Returns:
+            linear.TreeModel: The model for the given params.
+        """
+        root = self.get_tree(y, x, params)
+
+        linear_params = params.linear
+
+        if self.no_cache or (linear_params != self._cached_params.linear):
+            logging.info(f"Model - Training: {linear_params}")
+            with __silent__():
+                self._cached_params.linear = linear_params
+                self._cached_model = linear.train_tree(
+                    y,
+                    x,
+                    root=root,
+                    options=params.linear_options,
+                )
+        else:
+            logging.info(f"Model - Using cached data: {linear_params}")
+
+        return self._cached_model
+
+    def __call__(self, search_space_dict: dict[str, list]) -> dict[TreeGridParameter, dict[str, float]]:
+        """
+        Run the grid search on the search space.
+
+        Args:
+            search_space_dict (dict[str, list]): The search space for the grid search.
+
+        Returns:
+            dict[TreeGridParameter, dict[str, float]]: The cross-validation scores for each TreeGridParameter in the search space.
+        """
+        param_names = search_space_dict.keys()
+
+        # To avoid redundant computation (e.g., building the same tree multiple times across different params),
+        # we group configurations with identical settings in a field and process them consecutively.
+        # This is implemented by sorting the params in the order of the four fields:
+        # TF-IDF, tree, linear, and predict. We then cache and reuse the most recent result of each field.
+        self.search_space = sorted(
+            [
+                TreeGridParameter(dict(zip(param_names, param_values)))
+                for param_values in itertools.product(*search_space_dict.values())
+            ],
+            reverse=True,
+        )
+
+        # When the number of labels is large, evaluation often focuses on top-ranked
+        # metrics (e.g., Precision@K), which do not depend on num_classes.
+        # We therefore use -1 as a placeholder.
+        self.param_metrics = {
+            params: linear.get_metrics(self.monitor_metrics, num_classes=-1) for params in self.search_space
+        }
+
+        permutation = np.random.permutation(self.num_instances)
+        index_per_fold = []
+        for fold in range(self.n_folds):
+            index = permutation[
+                int(fold * self.num_instances / self.n_folds) : int((fold + 1) * self.num_instances / self.n_folds)
+            ]
+            index_per_fold.append(index)
+
+        for fold in range(self.n_folds):
+            train_idx = np.concatenate(index_per_fold[:fold] + index_per_fold[fold + 1 :])
+            valid_idx = index_per_fold[fold]
+            fold_dataset = self.get_fold_dataset(train_idx, valid_idx)
+
+            self._cached_params.tfidf = None
+            for params in self.search_space:
+                logging.info(f"Status - Running fold {fold}, params: {params}")
+
+                transformed_dataset = self.get_transformed_dataset(fold_dataset, params)
+                model = self.get_model(transformed_dataset["train"]["y"], transformed_dataset["train"]["x"], params)
+
+                logging.info(f"Metric - Scoring: {params.predict}\n")
+                with __silent__():
+                    self.param_metrics[params], _, _, _ = linear_test(
+                        y = transformed_dataset["test"]["y"],
+                        x = transformed_dataset["test"]["x"],
+                        model = model,
+                        metrics = self.param_metrics[params],
+                        predict_kwargs = asdict(params.predict),
+                    )
+
+        return {params: metrics.compute() for params, metrics in self.param_metrics.items()}
diff --git a/linear_trainer.py b/linear_trainer.py
index b9133857..763e7f37 100644
--- a/linear_trainer.py
+++ b/linear_trainer.py
@@ -6,38 +6,8 @@
 import libmultilabel.linear as linear
 from libmultilabel.common_utils import dump_log, is_multiclass_dataset
-from libmultilabel.linear.tree import EnsembleTreeModel, TreeModel, train_ensemble_tree
-from libmultilabel.linear.utils import LINEAR_TECHNIQUES
-
-
-def linear_test(config, model, datasets, 
label_mapping): - metrics = linear.get_metrics(config.monitor_metrics, datasets["test"]["y"].shape[1], multiclass=model.multiclass) - num_instance = datasets["test"]["x"].shape[0] - k = config.save_k_predictions - if k > 0: - labels = np.zeros((num_instance, k), dtype=object) - scores = np.zeros((num_instance, k), dtype="d") - else: - labels = [] - scores = [] - - predict_kwargs = {} - if isinstance(model, (TreeModel, EnsembleTreeModel)): - predict_kwargs["beam_width"] = config.beam_width - - for i in tqdm(range(ceil(num_instance / config.eval_batch_size))): - slice = np.s_[i * config.eval_batch_size : (i + 1) * config.eval_batch_size] - preds = model.predict_values(datasets["test"]["x"][slice], **predict_kwargs) - target = datasets["test"]["y"][slice].toarray() - metrics.update(preds, target) - if k > 0: - labels[slice], scores[slice] = linear.get_topk_labels(preds, label_mapping, config.save_k_predictions) - elif config.save_positive_predictions: - res = linear.get_positive_labels(preds, label_mapping) - labels.append(res[0]) - scores.append(res[1]) - metric_dict = metrics.compute() - return metric_dict, labels, scores +from libmultilabel.linear.tree import train_ensemble_tree +from libmultilabel.linear.utils import LINEAR_TECHNIQUES, linear_test def linear_train(datasets, config): @@ -103,7 +73,18 @@ def linear_run(config): ), """ If save_k_predictions is larger than 0, only top k labels are saved. Save all labels with decision value larger than 0 by using save_positive_predictions and save_k_predictions=0.""" - metric_dict, labels, scores = linear_test(config, model, datasets, preprocessor.label_mapping) + metrics, metric_dict, labels, scores = linear_test( + y = datasets["test"]["y"], + x = datasets["test"]["x"], + model = model, + eval_batch_size = config.eval_batch_size, + monitor_metrics = config.monitor_metrics, + beam_width = config.beam_width, + prob_A = config.prob_A, + label_mapping = preprocessor.label_mapping, + save_k_predictions = config.save_k_predictions, + save_positive_predictions = config.save_positive_predictions, + ) dump_log(config=config, metrics=metric_dict, split="test", log_path=config.log_path) print(linear.tabulate_metrics(metric_dict, "test")) if config.save_k_predictions > 0: diff --git a/main.py b/main.py index 7a523f1f..59e3e379 100644 --- a/main.py +++ b/main.py @@ -252,6 +252,12 @@ def add_all_arguments(parser): default=10, help="The width of the beam search (default: %(default)s)", ) + parser.add_argument( + "--prob_A", + type=int, + default=3, + help="The hyperparameter used in the probability estimation function for binary classification: sigmoid(prob_A * decision_value_matrix). (default: %(default)s)", + ) # AttentionXML parser.add_argument( "--cluster_size", diff --git a/setup.cfg b/setup.cfg index 14ec42f4..8b610116 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = libmultilabel -version = 0.9.0 +version = 0.10.0 author = LibMultiLabel Team license = MIT License license_file = LICENSE diff --git a/tests/docs/test_changed_document.sh b/tests/docs/test_changed_document.sh index fa5ce6f3..05be5cd9 100644 --- a/tests/docs/test_changed_document.sh +++ b/tests/docs/test_changed_document.sh @@ -50,6 +50,7 @@ main() { rm $REPORT_PATH TEST_FILES_WithoutOutput=( "plot_linear_gridsearch_tutorial.py" + "plot_tree_gridsearch_tutorial.py" "plot_dataset_tutorial.py" ) for file_name in "${TEST_FILES_WithoutOutput[@]}"; do