diff --git a/docs/api/linear.rst b/docs/api/linear.rst index 888db896..8629a1ec 100644 --- a/docs/api/linear.rst +++ b/docs/api/linear.rst @@ -34,12 +34,17 @@ The simplest usage is:: .. autofunction:: get_positive_labels +.. autofunction:: linear_test + .. autoclass:: FlatModel :members: .. autoclass:: TreeModel :members: +.. autoclass:: EnsembleTreeModel + :members: + Load Dataset ^^^^^^^^^^^^ @@ -101,3 +106,18 @@ Grid Search with Sklearn Estimators :members: .. automethod:: __init__ + +Grid Search for Tree-Based Linear Method +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: TreeGridParameter + :members: + + .. automethod:: __init__ + +.. autoclass:: TreeGridSearch + :members: + + .. automethod:: __init__ + + .. automethod:: __call__ \ No newline at end of file diff --git a/docs/examples/plot_linear_gridsearch_tutorial.py b/docs/examples/plot_linear_gridsearch_tutorial.py index d1a239e7..01c1e069 100644 --- a/docs/examples/plot_linear_gridsearch_tutorial.py +++ b/docs/examples/plot_linear_gridsearch_tutorial.py @@ -1,7 +1,14 @@ """ -Hyperparameter Search for Linear Methods +Hyperparameter Search for One-vs-rest Linear Methods ============================================================= +.. warning:: + + If you are using the tree-based linear method, + please check `Hyperparameter Search for Tree-Based Linear Method <../auto_examples/plot_tree_gridsearch_tutorial.html>`_. + This guide helps users to tune the hyperparameters of the feature generation step and the linear model. +In this guide, the following methods are available: +``1vsrest``, ``thresholding``, ``cost_sensitive``, ``cost_sensitive_micro``, and ``binary_and_multiclass``. Here we show an example of tuning a linear text classifier with the `rcv1 dataset `_. Starting with loading and preprocessing of the data without using ``Preprocessor``: diff --git a/docs/examples/plot_tree_gridsearch_tutorial.py b/docs/examples/plot_tree_gridsearch_tutorial.py new file mode 100644 index 00000000..d95dbfdb --- /dev/null +++ b/docs/examples/plot_tree_gridsearch_tutorial.py @@ -0,0 +1,141 @@ +""" +Hyperparameter Search for Tree-Based Linear Method +============================================================= +.. warning:: + + If you are using the one-vs-rest linear methods, + please check `Hyperparameter Search for One-vs-rest Linear Methods <../auto_examples/plot_linear_gridsearch_tutorial.html>`_. + +To apply tree-based linear methods, +we first convert raw text into numerical TF-IDF features. +During training, the method builds a label tree and trains linear classifiers. +At inference, the model traverses the tree and selects +only a few candidate labels at each level to speed up prediction. + +To improve model performance, we need to search the hyperparameter space. +Therefore, in this guide, we help users tune the hyperparameters of the tree-based linear method. + +.. seealso:: + + `Implementation Document `_: + For more details about the implementation of tree-based linear methods and hyperparameter search. + +Here we show an example of tuning a tree-based linear text classifier with the `rcv1 dataset `_. +Starting with loading the data: +""" + +import logging + +from libmultilabel import linear + +logging.basicConfig(level=logging.INFO) + +datasets = linear.load_dataset("txt", "data/rcv1/train.txt", "data/rcv1/test.txt") +L = len(datasets["train"]["y"]) + +###################################################################### +# Next, we set up the search space. 
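+#
+# A note on the formula used for ``K`` below: following the implementation
+# document, the number of clusters is
+# ``K = max(2, round(L ** (1 / dmax) * 2 ** alpha + 0.5))``, where ``L`` is the
+# size of the training set loaded above. As a worked example, rcv1 has 23,149
+# training instances, so ``alpha = 5`` with ``dmax = 10`` gives
+# ``round(23149 ** 0.1 * 32 + 0.5) = 88`` clusters, while ``alpha = -2`` falls
+# back to the minimum of 2 (these match the ``K`` values searched below).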
+
+import numpy as np
+
+dmax = 10
+K_factors = [-2, 5]
+search_space_dict = {
+    "ngram_range": [(1, 1), (1, 2), (1, 3)],
+    "stop_words": ["english"],
+    "dmax": [dmax],
+    "K": [max(2, int(np.round(np.power(L, 1 / dmax) * np.power(2.0, alpha) + 0.5))) for alpha in K_factors],
+    "s": [1],
+    "c": [0.5, 1, 2],
+    "B": [1],
+    "beam_width": [10],
+    "prob_A": [3],
+}
+
+######################################################################
+# Following the suggestions in the `implementation document `_,
+# we define 18 configurations to build a simple yet strong baseline.
+#
+# The search space covers several key stages of the training and prediction pipeline:
+#
+# - Text feature extraction: (``ngram_range``, ``stop_words``)
+#
+#   - We use the vectorizer ``TfidfVectorizer`` from ``sklearn`` to generate features from raw text.
+#
+# - Label tree structure: (``dmax``, ``K``)
+#
+#   - The depth and node degree of the label tree. Note that ``K`` is the number of clusters and is calculated with the formula from the `implementation document `_.
+#
+# - Linear classifier: (``s``, ``c``, ``B``)
+#
+#   - We combine them into a LIBLINEAR option string for training linear classifiers (see *train Usage* in the `liblinear `__ README).
+#
+# - Prediction: (``beam_width``, ``prob_A``)
+#
+#   - The number of candidates considered and the parameter of the probability estimation function at each level during prediction.
+#
+# .. tip::
+#
+#     Available hyperparameters (and their defaults) are defined in the class variables of :py:class:`~libmultilabel.linear.TreeGridParameter`.
+#
+# In :py:class:`~libmultilabel.linear.TreeGridSearch`, we perform cross-validation for evaluation.
+# Specifically, we split the training data into ``n_folds`` folds,
+# sequentially using each fold as the validation set while training on the remaining folds.
+# Finally, we aggregate the validation outputs from all folds and compute the ``monitor_metrics``.
+# Initialization requires the dataset, the number of cross-validation folds, and the evaluation metrics.
+
+n_folds = 3
+monitor_metrics = ["P@1", "P@3", "P@5"]
+search = linear.TreeGridSearch(datasets, n_folds, monitor_metrics)
+cv_scores = search(search_space_dict)
+
+######################################################################
+# ``cv_scores`` is a dictionary whose keys are :py:class:`~libmultilabel.linear.TreeGridParameter` instances and whose values are the ``monitor_metrics`` results.
+#
+# Here we sort the results in descending order by the first metric in ``monitor_metrics``.
+# You can retrieve the best parameters after the grid search with the following code:
+
+sorted_cv_scores = sorted(cv_scores.items(), key=lambda x: x[1][monitor_metrics[0]], reverse=True)
+print(sorted_cv_scores)
+
+best_params, best_cv_scores = sorted_cv_scores[0]
+print(best_params, best_cv_scores)
+
+######################################################################
+# The best parameters are::
+#
+#     {'ngram_range': (1, 3), 'stop_words': 'english', 'dmax': 10, 'K': 88, 's': 1, 'c': 1, 'B': 1, 'beam_width': 10, 'prob_A': 3}
+#
+# with best cross-validation scores::
+#
+#     {'P@1': 0.9669, 'P@3': 0.8137, 'P@5': 0.5640}
+#
+# We can then retrain using the best parameters,
+# and use :py:func:`~libmultilabel.linear.linear_test` and :py:func:`~libmultilabel.linear.get_metrics` to compute test performance.
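+#
+# Note that ``best_params`` groups the hyperparameters into four dataclasses
+# (``tfidf``, ``tree``, ``linear``, and ``predict``; see
+# :py:class:`~libmultilabel.linear.TreeGridParameter`), so below each group is
+# passed to the corresponding step, either through ``dataclasses.asdict`` or,
+# for the linear classifier, through the assembled ``linear_options`` string.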
+
+from dataclasses import asdict
+
+preprocessor = linear.Preprocessor(tfidf_params=asdict(best_params.tfidf))
+transformed_dataset = preprocessor.fit_transform(datasets)
+
+model = linear.train_tree(
+    transformed_dataset["train"]["y"],
+    transformed_dataset["train"]["x"],
+    best_params.linear_options,
+    **asdict(best_params.tree),
+)
+
+_, metric_dict, _, _ = linear.linear_test(
+    y = transformed_dataset["test"]["y"],
+    x = transformed_dataset["test"]["x"],
+    model = model,
+    metrics = linear.get_metrics(monitor_metrics, num_classes=-1),
+    predict_kwargs = asdict(best_params.predict),
+)
+
+print(metric_dict)
+
+######################################################################
+# The result of the best parameters will look similar to::
+#
+#     {'P@1': 0.9554, 'P@3': 0.7968, 'P@5': 0.5576}
diff --git a/docs/search_retrain.rst b/docs/search_retrain.rst
index 26acfccb..9242b8ce 100644
--- a/docs/search_retrain.rst
+++ b/docs/search_retrain.rst
@@ -7,4 +7,5 @@ Hyperparameter Search
 
    ../auto_examples/plot_linear_gridsearch_tutorial
+   ../auto_examples/plot_tree_gridsearch_tutorial
    tutorials/Parameter_Selection_for_Neural_Networks
diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py
index 7f1ce851..f8411b68 100644
--- a/libmultilabel/linear/tree.py
+++ b/libmultilabel/linear/tree.py
@@ -4,6 +4,7 @@
 
 import numpy as np
 import scipy.sparse as sparse
+from scipy.special import log_expit
 from sparsekmeans import LloydKmeans, ElkanKmeans
 import sklearn.preprocessing
 from tqdm import tqdm
@@ -58,16 +59,35 @@ def __init__(
         self.multiclass = False
         self._model_separated = False  # Indicates whether the model has been separated for pruning tree.
 
+    def sigmoid_A(self, x: np.ndarray, prob_A: int) -> np.ndarray:
+        """
+        Calculate log(sigmoid(prob_A * x)), the log of the estimated probability of the positive class in binary classification.
+
+        Args:
+            x (np.ndarray): The decision value matrix with dimension number of instances * number of classes.
+            prob_A (int):
+                The hyperparameter used in the probability estimation function for
+                binary classification: sigmoid(prob_A * x).
+
+        Returns:
+            np.ndarray: A matrix of log-probability estimates with dimension number of instances * number of classes.
+        """
+        return log_expit(prob_A * x)
+
     def predict_values(
         self,
         x: sparse.csr_matrix,
         beam_width: int = 10,
+        prob_A: int = 3,
     ) -> np.ndarray:
         """Calculate the probability estimates associated with x.
 
         Args:
             x (sparse.csr_matrix): A matrix with dimension number of instances * number of features.
-            beam_width (int, optional): Number of candidates considered during beam search. Defaults to 10.
+            beam_width (int, optional): Number of candidates considered during beam search. Defaults to 10.
+            prob_A (int, optional):
+                The hyperparameter used in the probability estimation function for
+                binary classification: sigmoid(prob_A * decision_value_matrix). Defaults to 3.
 
         Returns:
             np.ndarray: A matrix with dimension number of instances * number of classes.
@@ -81,8 +101,8 @@ def predict_values(
         if not self._model_separated:
             self._separate_model_for_pruning_tree()
             self._model_separated = True
-        all_preds = self._prune_tree_and_predict_values(x, beam_width)  # number of instances * (number of labels + total number of metalabels)
-        return np.vstack([self._beam_search(all_preds[i], beam_width) for i in range(all_preds.shape[0])])
+        all_preds = self._prune_tree_and_predict_values(x, beam_width, prob_A)  # number of instances * (number of labels + total number of metalabels)
+        return np.vstack([self._beam_search(all_preds[i], beam_width, prob_A) for i in range(all_preds.shape[0])])
 
     def _separate_model_for_pruning_tree(self):
         """
@@ -113,7 +133,7 @@ def _separate_model_for_pruning_tree(self):
         )
         self.subtree_models.append(subtree_flatmodel)
 
-    def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) -> np.ndarray:
+    def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int, prob_A: int) -> np.ndarray:
         """Calculates the selective decision values associated with instances x by evaluating only the most relevant subtrees.
 
         Only subtrees corresponding to the top beam_width candidates from the root are evaluated,
@@ -122,6 +142,9 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int)
         Args:
             x (sparse.csr_matrix): A matrix with dimension number of instances * number of features.
             beam_width (int): Number of top candidate branches considered for prediction.
+            prob_A (int):
+                The hyperparameter used in the probability estimation function for
+                binary classification: sigmoid(prob_A * decision_value_matrix).
 
         Returns:
             np.ndarray: A matrix with dimension number of instances * (number of labels + total number of metalabels).
@@ -132,7 +155,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int)
 
         # Calculate root decision values and scores
         root_preds = linear.predict_values(self.root_model, x)
-        children_scores = 0.0 - np.square(np.maximum(0, 1 - root_preds))
+        children_scores = 0.0 + self.sigmoid_A(root_preds, prob_A)
 
         slice = np.s_[:, self.node_ptr[self.root.index] : self.node_ptr[self.root.index + 1]]
         all_preds[slice] = root_preds
@@ -159,12 +182,15 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int)
 
         return all_preds
 
-    def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarray:
+    def _beam_search(self, instance_preds: np.ndarray, beam_width: int, prob_A: int) -> np.ndarray:
         """Predict with beam search using cached probability estimates for a single instance.
 
         Args:
             instance_preds (np.ndarray): A vector of cached probability estimates of each node, has dimension number of labels + total number of metalabels.
             beam_width (int): Number of candidates considered.
+            prob_A (int):
+                The hyperparameter used in the probability estimation function for
+                binary classification: sigmoid(prob_A * decision_value_matrix).
 
         Returns:
             np.ndarray: A vector with dimension number of classes.
@@ -182,7 +208,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra continue slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - children_score = score - np.square(np.maximum(0, 1 - pred)) + children_score = score + self.sigmoid_A(pred, prob_A) next_level.extend(zip(node.children, children_score.tolist())) cur_level = sorted(next_level, key=lambda pair: -pair[1])[:beam_width] @@ -193,7 +219,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra for node, score in cur_level: slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - scores[node.label_map] = np.exp(score - np.square(np.maximum(0, 1 - pred))) + scores[node.label_map] = np.exp(score + self.sigmoid_A(pred, prob_A)) return scores @@ -204,6 +230,7 @@ def train_tree( K=DEFAULT_K, dmax=DEFAULT_DMAX, verbose: bool = True, + root: Node = None, ) -> TreeModel: """Train a linear model for multi-label data using a divide-and-conquer strategy. The algorithm used is based on https://github.com/xmc-aalto/bonsai. @@ -215,14 +242,16 @@ def train_tree( K (int, optional): Maximum degree of nodes in the tree. Defaults to 100. dmax (int, optional): Maximum depth of the tree. Defaults to 10. verbose (bool, optional): Output extra progress information. Defaults to True. + root (Node, optional): Pre-built tree root. Defaults to None. Returns: TreeModel: A model which can be used in predict_values. """ - label_representation = (y.T * x).tocsr() - label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) - root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax) - root.is_root = True + if root is None: + label_representation = (y.T * x).tocsr() + label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) + root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax) + root.is_root = True num_nodes = 0 # Both type(x) and type(y) are sparse.csr_matrix diff --git a/libmultilabel/linear/utils.py b/libmultilabel/linear/utils.py index 3324a896..74761179 100644 --- a/libmultilabel/linear/utils.py +++ b/libmultilabel/linear/utils.py @@ -1,10 +1,17 @@ from __future__ import annotations import os +import sys +import math +import itertools +import logging import pathlib import pickle import re -from typing import Any +from math import ceil +from tqdm import tqdm +from typing import Any, Callable +from dataclasses import make_dataclass, field, fields, asdict import numpy as np import scipy.sparse as sparse @@ -12,12 +19,14 @@ import sklearn.model_selection import sklearn.pipeline import sklearn.utils +import sklearn.preprocessing import libmultilabel.linear as linear from .preprocessor import Preprocessor +from .tree import _build_tree -__all__ = ["save_pipeline", "load_pipeline", "MultiLabelEstimator", "GridSearchCV"] +__all__ = ["save_pipeline", "load_pipeline", "MultiLabelEstimator", "GridSearchCV", "linear_test", "TreeGridParameter", "TreeGridSearch"] LINEAR_TECHNIQUES = { @@ -143,3 +152,386 @@ def _set_singlecore_options(self, estimator, param_grid: dict): key = f"{name}__options" param_grid[key] = [f"{re.sub(regex, '', v)} -m 1" for v in param_grid[key]] return param_grid + + +def linear_test( + y: sparse.csr_matrix, + x: sparse.csr_matrix, + model: linear.FlatModel | linear.TreeModel | linear.EnsembleTreeModel, + eval_batch_size: int = 256, + monitor_metrics: list[str] | None = None, + 
metrics: linear.MetricCollection | None = None,
+    predict_kwargs: dict | None = None,
+    beam_width: int | None = None,
+    prob_A: float | None = None,
+    label_mapping: np.ndarray | None = None,
+    save_k_predictions: int | None = None,
+    save_positive_predictions: bool | None = None,
+) -> tuple[linear.MetricCollection, dict, list | np.ndarray, list | np.ndarray]:
+    """
+    Evaluate a linear model on test data with batched prediction and compute metrics.
+
+    Args:
+        y (scipy.sparse.csr_matrix): The labels of the test data with dimensions number of instances * number of classes.
+        x (scipy.sparse.csr_matrix): The features of the test data with dimensions number of instances * number of features.
+        model (linear.FlatModel | linear.TreeModel | linear.EnsembleTreeModel): The trained model.
+        eval_batch_size (int): Batch size used during evaluation. Defaults to 256.
+        monitor_metrics (list[str], optional): The evaluation metrics to monitor. Defaults to ``["P@1", "P@3", "P@5"]``.
+        metrics (linear.MetricCollection, optional): An existing metric collection to update.
+            If not given, a new one is created from ``monitor_metrics``.
+        predict_kwargs (dict, optional): Extra parameters passed to model.predict_values.
+        beam_width (int, optional): Number of candidates considered during beam search. Only used for tree-based models.
+        prob_A (float, optional):
+            The hyperparameter used in the probability estimation function for
+            binary classification: sigmoid(prob_A * decision_value_matrix). Only used for tree-based models.
+        label_mapping (np.ndarray, optional): A np.ndarray of class labels that maps each index (from 0 to ``num_class-1``) to its label.
+        save_k_predictions (int, optional): The number of top-scoring classes to save per instance.
+        save_positive_predictions (bool, optional): Whether to save the labels and scores with positive decision values.
+
+    Returns:
+        tuple[linear.MetricCollection, dict, list | np.ndarray, list | np.ndarray]:
+            A tuple containing:
+            metrics (linear.MetricCollection)
+                The updated metric values.
+            metric_dict (dict[str, float])
+                The computed metric results.
+            labels (list or np.ndarray)
+                If ``save_k_predictions`` is set, an np.ndarray containing the labels of
+                the top k predictions from decision values.
+                Else if ``save_positive_predictions`` is True, a list of the labels with
+                positive decision values.
+                Otherwise, an empty list.
+            scores (list or np.ndarray)
+                If ``save_k_predictions`` is set, an np.ndarray containing the scores of
+                the top k predictions from decision values.
+                Else if ``save_positive_predictions`` is True, a list of the scores with
+                positive decision values.
+                Otherwise, an empty list.
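+
+    Example:
+        A minimal sketch, assuming ``model`` was trained beforehand (e.g., with
+        ``train_tree``) and ``dataset`` holds preprocessed sparse matrices::
+
+            metrics, metric_dict, labels, scores = linear_test(
+                y = dataset["test"]["y"],
+                x = dataset["test"]["x"],
+                model = model,
+            )
+            print(metric_dict)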
+    """
+    if monitor_metrics is None:
+        monitor_metrics = ["P@1", "P@3", "P@5"]
+    if metrics is None:
+        metrics = linear.get_metrics(monitor_metrics, y.shape[1], multiclass=model.multiclass)
+    num_instance = x.shape[0]
+    k = save_k_predictions
+    if k is not None and k > 0:
+        labels = np.zeros((num_instance, k), dtype=object)
+        scores = np.zeros((num_instance, k), dtype="d")
+    else:
+        labels = []
+        scores = []
+
+    if predict_kwargs is None:
+        predict_kwargs = {}
+    if isinstance(model, (linear.TreeModel, linear.EnsembleTreeModel)):
+        if beam_width is not None:
+            predict_kwargs["beam_width"] = beam_width
+        if prob_A is not None:
+            predict_kwargs["prob_A"] = prob_A
+
+    for i in tqdm(range(ceil(num_instance / eval_batch_size))):
+        slice = np.s_[i * eval_batch_size : (i + 1) * eval_batch_size]
+        preds = model.predict_values(x[slice], **predict_kwargs)
+        target = y[slice].toarray()
+        metrics.update(preds, target)
+        if k is not None and label_mapping is not None and k > 0:
+            labels[slice], scores[slice] = linear.get_topk_labels(preds, label_mapping, save_k_predictions)
+        elif save_positive_predictions and label_mapping is not None:
+            res = linear.get_positive_labels(preds, label_mapping)
+            labels.append(res[0])
+            scores.append(res[1])
+    metric_dict = metrics.compute()
+    return metrics, metric_dict, labels, scores
+
+
+# suppress inevitable outputs from sparsekmeans and sklearn preprocessors
+class __silent__:
+    def __init__(self):
+        self.stderr = os.dup(2)
+        self.devnull = os.open(os.devnull, os.O_WRONLY)
+
+    def __enter__(self):
+        os.dup2(self.devnull, 2)
+        self.stdout = sys.stdout
+        sys.stdout = open(os.devnull, "w")
+
+    def __exit__(self, type, value, traceback):
+        os.dup2(self.stderr, 2)
+        os.close(self.devnull)
+        os.close(self.stderr)
+        sys.stdout.close()
+        sys.stdout = self.stdout
+
+
+class TreeGridParameter:
+    """A tree-based linear method hyperparameter class for TreeGridSearch.
+    Transforms the given parameter dict into dataclass instances, one per hyperparameter group.
+    Parameters not in the dict will be set to default values.
+
+    Args:
+        params (dict, optional): The keys are the parameter names, and the values are the parameter values.
+    """
+
+    _tfidf_fields = [
+        ("ngram_range", tuple[int, int], field(default=(1, 1))),
+        ("max_features", int, field(default=None)),
+        ("min_df", float | int, field(default=1)),
+        ("stop_words", str | list, field(default=None)),
+        ("strip_accents", str | Callable, field(default=None)),
+        ("tokenizer", Callable, field(default=None)),
+    ]
+    _tree_fields = [
+        ("dmax", int, field(default=10)),
+        ("K", int, field(default=8)),
+    ]
+    _linear_fields = [
+        ("s", int, field(default=1)),
+        ("c", float, field(default=1)),
+        ("B", int, field(default=-1)),
+    ]
+    _predict_fields = [
+        ("beam_width", int, field(default=10)),
+        ("prob_A", int, field(default=3)),
+    ]
+
+    # set frozen=True to make instances hashable.
+    # set order=True to enable comparison operations.
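+    # e.g., param_types["tree"](dmax=5, K=100) constructs a frozen, ordered TreeParams instance.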
+    param_types = {
+        "tfidf": make_dataclass("TfidfParams", _tfidf_fields, frozen=True, order=True),
+        "tree": make_dataclass("TreeParams", _tree_fields, frozen=True, order=True),
+        "linear": make_dataclass("LinearParams", _linear_fields, frozen=True, order=True),
+        "predict": make_dataclass("PredictParams", _predict_fields, frozen=True, order=True),
+    }
+    _param_field_names = {
+        param_type: {f.name for f in fields(class_name)} for param_type, class_name in param_types.items()
+    }
+
+    def __init__(self, params: dict | None = None):
+        self.params = params or {}
+
+        params_set = set(self.params)
+        for param_type, class_name in self.param_types.items():
+            field_names = self._param_field_names[param_type]
+            filtered_keys = params_set & field_names
+            params_set -= field_names
+
+            filtered_params = {k: self.params[k] for k in filtered_keys}
+            setattr(self, param_type, class_name(**filtered_params))
+
+    @property
+    def linear_options(self):
+        # iterate over the dataclass fields (not the unordered name set) so the
+        # generated option string is deterministic across runs
+        options = ""
+        for f in fields(self.linear):
+            options += f" -{f.name} {getattr(self.linear, f.name)}"
+        return options.strip()
+
+    def __repr__(self):  # provide a readable string representation of the object
+        return str(self.params)
+
+    def __eq__(self, other):  # compare instance attributes to define equality.
+        return all(getattr(self, t) == getattr(other, t) for t in self.param_types)
+
+    def __lt__(self, other):  # define ordering for sorting.
+        # "<" for tuples is automatically lexicographic ordering
+        my_values = tuple(getattr(self, t) for t in self.param_types)
+        other_values = tuple(getattr(other, t) for t in self.param_types)
+        return my_values < other_values
+
+    def __hash__(self):  # make instances hashable for use as dict keys
+        return hash(tuple(getattr(self, t) for t in self.param_types))
+
+
+class TreeGridSearch:
+    """Grid-search the hyperparameter space of the tree-based linear method and find the best parameters,
+    according to the monitored metrics.
+
+    Args:
+        datasets (dict[str, dict[str, list[str]]]): The training and/or test data, with keys 'train' and 'test' respectively,
+            e.g., as returned by ``linear.load_dataset`` (the 'data_format' key is also required).
+            The data has keys 'x' for input features and 'y' for labels.
+        n_folds (int, optional): The number of cross-validation folds. Defaults to 3.
+        monitor_metrics (list[str], optional): The evaluation metrics to monitor. Defaults to ``["P@1", "P@3", "P@5"]``.
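+
+    Example:
+        A minimal sketch, assuming ``datasets`` was loaded with ``linear.load_dataset``::
+
+            search = TreeGridSearch(datasets, n_folds=3, monitor_metrics=["P@1", "P@5"])
+            cv_scores = search({"c": [0.5, 1, 2], "dmax": [10]})
+            best_params = max(cv_scores, key=lambda params: cv_scores[params]["P@1"])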
+ """ + + def __init__( + self, + datasets: dict[str, dict[str, list[str]]], + n_folds: int = 3, + monitor_metrics: list[str] = ["P@1", "P@3", "P@5"], + ): + self.datasets = datasets + self.n_folds = n_folds + self.monitor_metrics = monitor_metrics + + self._cached_params = TreeGridParameter() + for param_type in self._cached_params.param_types: + setattr(self._cached_params, param_type, None) + self._cached_transformed_dataset = None + self._cached_tree_root = None + self._cached_fold_data = None + self._cached_model = None + self.no_cache = True + + self.num_instances = len(self.datasets["train"]["y"]) + + def get_fold_dataset(self, train_idx, valid_idx): + def take(data, idx): + if isinstance(data, list): + return [data[i] for i in idx] + else: + return data[idx] + + return { + "data_format": self.datasets["data_format"], + "train": { + "y": take(self.datasets["train"]["y"], train_idx), + "x": take(self.datasets["train"]["x"], train_idx), + }, + "test": { + "y": take(self.datasets["train"]["y"], valid_idx), + "x": take(self.datasets["train"]["x"], valid_idx), + }, + } + + def get_transformed_dataset( + self, dataset: dict[str, dict[str, list[str]]], params: TreeGridParameter + ) -> dict[str, dict[str, sparse.csr_matrix]]: + """ + Get and cache the dataset for the given TF-IDF params. + If we have processed the coming params, return the cached dataset directly without computation. + + Args: + dataset (dict[str, dict[str, list[str]]]): The training and/or test data, with keys 'train' and 'test' respectively. + The data has keys 'x' for input features and 'y' for labels. + params (TreeGridParameter): The params to build the dataset. + + Returns: + dict[str, dict[str, sparse.csr_matrix]]: The transformed dataset. + """ + tfidf_params = params.tfidf + self.no_cache = tfidf_params != self._cached_params.tfidf + if self.no_cache: + logging.info(f"TFIDF - Preprocessing: {tfidf_params}") + if self.datasets["data_format"] not in {"txt", "dataframe"}: + logging.info( + "Please make sure the data format is 'txt' or 'dataframe'. Otherwise, the TF-IDF parameters have no effect on the dataset." + ) + with __silent__(): + preprocessor = linear.Preprocessor(tfidf_params=asdict(tfidf_params)) + self._cached_params.tfidf = tfidf_params + self._cached_transformed_dataset = preprocessor.fit_transform(dataset) + else: + logging.info(f"TFIDF - Using cached data: {tfidf_params}") + + return self._cached_transformed_dataset + + def get_tree(self, y, x, params): + tree_params = params.tree + self.no_cache |= tree_params != self._cached_params.tree + if self.no_cache: + logging.info(f"Tree - Preprocessing: {tree_params}") + with __silent__(): + label_representation = (y.T * x).tocsr() + label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) + self._cached_params.tree = tree_params + self._cached_tree_root = _build_tree( + label_representation, np.arange(y.shape[1]), 0, **asdict(tree_params) + ) + self._cached_tree_root.is_root = True + else: + logging.info(f"Tree - Using cached data: {tree_params}") + + return self._cached_tree_root + + def get_model(self, y: sparse.csr_matrix, x: sparse.csr_matrix, params: TreeGridParameter) -> linear.TreeModel: + """ + Get and cache the model for the given params. + If we have processed the coming params, return the cached model directly without computation. + + Args: + y (sparse.csr_matrix): The labels of the training data. + x (sparse.csr_matrix): The features of the training data. 
+            params (TreeGridParameter): The params to build the model.
+
+        Returns:
+            linear.TreeModel: The model for the given params.
+        """
+        root = self.get_tree(y, x, params)
+
+        linear_params = params.linear
+
+        if self.no_cache or (linear_params != self._cached_params.linear):
+            logging.info(f"Model - Training: {linear_params}")
+            with __silent__():
+                self._cached_params.linear = linear_params
+                self._cached_model = linear.train_tree(
+                    y,
+                    x,
+                    root=root,
+                    options=params.linear_options,
+                )
+        else:
+            logging.info(f"Model - Using cached data: {linear_params}")
+
+        return self._cached_model
+
+    def __call__(self, search_space_dict: dict[str, list]) -> dict[TreeGridParameter, dict[str, float]]:
+        """
+        Run the grid search on the search space.
+
+        Args:
+            search_space_dict (dict[str, list]): The search space for the grid search.
+
+        Returns:
+            dict[TreeGridParameter, dict[str, float]]: The cross-validation scores for each TreeGridParameter in the search space.
+        """
+        param_names = search_space_dict.keys()
+
+        # To avoid redundant computation (e.g., building the same tree multiple times across different params),
+        # we group configurations with identical settings in a field and process them consecutively.
+        # This is implemented by sorting the params in the order of the four fields:
+        # TF-IDF, tree, linear, and predict. We then cache and reuse the most recent result of each field.
+        self.search_space = sorted(
+            [
+                TreeGridParameter(dict(zip(param_names, param_values)))
+                for param_values in itertools.product(*search_space_dict.values())
+            ],
+            reverse=True,
+        )
+
+        # When the number of labels is large, evaluation often focuses on top-ranked
+        # metrics (e.g., Precision@K), which do not depend on num_classes.
+        # We therefore use -1 as a placeholder.
+        self.param_metrics = {
+            params: linear.get_metrics(self.monitor_metrics, num_classes=-1) for params in self.search_space
+        }
+
+        permutation = np.random.permutation(self.num_instances)
+        index_per_fold = []
+        for fold in range(self.n_folds):
+            index = permutation[
+                int(fold * self.num_instances / self.n_folds) : int((fold + 1) * self.num_instances / self.n_folds)
+            ]
+            index_per_fold.append(index)
+
+        for fold in range(self.n_folds):
+            train_idx = np.concatenate(index_per_fold[:fold] + index_per_fold[fold + 1 :])
+            valid_idx = index_per_fold[fold]
+            fold_dataset = self.get_fold_dataset(train_idx, valid_idx)
+
+            self._cached_params.tfidf = None
+            for params in self.search_space:
+                logging.info(f"Status - Running fold {fold}, params: {params}")
+
+                transformed_dataset = self.get_transformed_dataset(fold_dataset, params)
+                model = self.get_model(transformed_dataset["train"]["y"], transformed_dataset["train"]["x"], params)
+
+                logging.info(f"Metric - Scoring: {params.predict}\n")
+                with __silent__():
+                    self.param_metrics[params], _, _, _ = linear_test(
+                        y = transformed_dataset["test"]["y"],
+                        x = transformed_dataset["test"]["x"],
+                        model = model,
+                        metrics = self.param_metrics[params],
+                        predict_kwargs = asdict(params.predict),
+                    )
+
+        return {params: metrics.compute() for params, metrics in self.param_metrics.items()}
diff --git a/linear_trainer.py b/linear_trainer.py
index b9133857..763e7f37 100644
--- a/linear_trainer.py
+++ b/linear_trainer.py
@@ -6,38 +6,8 @@
 import libmultilabel.linear as linear
 from libmultilabel.common_utils import dump_log, is_multiclass_dataset
-from libmultilabel.linear.tree import EnsembleTreeModel, TreeModel, train_ensemble_tree
-from libmultilabel.linear.utils import LINEAR_TECHNIQUES
-
-
-def linear_test(config, model, datasets, 
label_mapping): - metrics = linear.get_metrics(config.monitor_metrics, datasets["test"]["y"].shape[1], multiclass=model.multiclass) - num_instance = datasets["test"]["x"].shape[0] - k = config.save_k_predictions - if k > 0: - labels = np.zeros((num_instance, k), dtype=object) - scores = np.zeros((num_instance, k), dtype="d") - else: - labels = [] - scores = [] - - predict_kwargs = {} - if isinstance(model, (TreeModel, EnsembleTreeModel)): - predict_kwargs["beam_width"] = config.beam_width - - for i in tqdm(range(ceil(num_instance / config.eval_batch_size))): - slice = np.s_[i * config.eval_batch_size : (i + 1) * config.eval_batch_size] - preds = model.predict_values(datasets["test"]["x"][slice], **predict_kwargs) - target = datasets["test"]["y"][slice].toarray() - metrics.update(preds, target) - if k > 0: - labels[slice], scores[slice] = linear.get_topk_labels(preds, label_mapping, config.save_k_predictions) - elif config.save_positive_predictions: - res = linear.get_positive_labels(preds, label_mapping) - labels.append(res[0]) - scores.append(res[1]) - metric_dict = metrics.compute() - return metric_dict, labels, scores +from libmultilabel.linear.tree import train_ensemble_tree +from libmultilabel.linear.utils import LINEAR_TECHNIQUES, linear_test def linear_train(datasets, config): @@ -103,7 +73,18 @@ def linear_run(config): ), """ If save_k_predictions is larger than 0, only top k labels are saved. Save all labels with decision value larger than 0 by using save_positive_predictions and save_k_predictions=0.""" - metric_dict, labels, scores = linear_test(config, model, datasets, preprocessor.label_mapping) + metrics, metric_dict, labels, scores = linear_test( + y = datasets["test"]["y"], + x = datasets["test"]["x"], + model = model, + eval_batch_size = config.eval_batch_size, + monitor_metrics = config.monitor_metrics, + beam_width = config.beam_width, + prob_A = config.prob_A, + label_mapping = preprocessor.label_mapping, + save_k_predictions = config.save_k_predictions, + save_positive_predictions = config.save_positive_predictions, + ) dump_log(config=config, metrics=metric_dict, split="test", log_path=config.log_path) print(linear.tabulate_metrics(metric_dict, "test")) if config.save_k_predictions > 0: diff --git a/main.py b/main.py index 7a523f1f..59e3e379 100644 --- a/main.py +++ b/main.py @@ -252,6 +252,12 @@ def add_all_arguments(parser): default=10, help="The width of the beam search (default: %(default)s)", ) + parser.add_argument( + "--prob_A", + type=int, + default=3, + help="The hyperparameter used in the probability estimation function for binary classification: sigmoid(prob_A * decision_value_matrix). (default: %(default)s)", + ) # AttentionXML parser.add_argument( "--cluster_size", diff --git a/setup.cfg b/setup.cfg index 14ec42f4..8b610116 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = libmultilabel -version = 0.9.0 +version = 0.10.0 author = LibMultiLabel Team license = MIT License license_file = LICENSE diff --git a/tests/docs/test_changed_document.sh b/tests/docs/test_changed_document.sh index fa5ce6f3..05be5cd9 100644 --- a/tests/docs/test_changed_document.sh +++ b/tests/docs/test_changed_document.sh @@ -50,6 +50,7 @@ main() { rm $REPORT_PATH TEST_FILES_WithoutOutput=( "plot_linear_gridsearch_tutorial.py" + "plot_tree_gridsearch_tutorial.py" "plot_dataset_tutorial.py" ) for file_name in "${TEST_FILES_WithoutOutput[@]}"; do