From bbbb33f36c4b7abe95c36b9e63434c92ca171f4e Mon Sep 17 00:00:00 2001 From: Sushobhan Parajuli Date: Thu, 5 Mar 2026 15:42:13 -0500 Subject: [PATCH 1/5] add graded rbp metric class --- src/lenskit/metrics/__init__.py | 2 + src/lenskit/metrics/ranking/__init__.py | 3 +- src/lenskit/metrics/ranking/_rbp.py | 94 +++++++++++++++++++++++++ 3 files changed, 98 insertions(+), 1 deletion(-) diff --git a/src/lenskit/metrics/__init__.py b/src/lenskit/metrics/__init__.py index 4cb237724..f7691c308 100644 --- a/src/lenskit/metrics/__init__.py +++ b/src/lenskit/metrics/__init__.py @@ -25,6 +25,7 @@ RBP, ExposureGini, GeometricRankWeight, + GradedRBP, Hit, ListGini, LogRankWeight, @@ -58,6 +59,7 @@ "NDCG", "DCG", "RBP", + "GradedRBP", "Hit", "Precision", "Recall", diff --git a/src/lenskit/metrics/ranking/__init__.py b/src/lenskit/metrics/ranking/__init__.py index d0f11abf0..8d57a1890 100644 --- a/src/lenskit/metrics/ranking/__init__.py +++ b/src/lenskit/metrics/ranking/__init__.py @@ -17,7 +17,7 @@ from ._map import AveragePrecision from ._pop import MeanPopRank from ._pr import Precision, Recall -from ._rbp import RBP, rank_biased_precision +from ._rbp import RBP, GradedRBP, rank_biased_precision from ._recip import RecipRank from ._weighting import GeometricRankWeight, LogRankWeight, RankWeight @@ -33,6 +33,7 @@ "NDCG", "DCG", "RBP", + "GradedRBP", "rank_biased_precision", "MeanPopRank", "AveragePrecision", diff --git a/src/lenskit/metrics/ranking/_rbp.py b/src/lenskit/metrics/ranking/_rbp.py index 373aa3935..f61bf6f15 100644 --- a/src/lenskit/metrics/ranking/_rbp.py +++ b/src/lenskit/metrics/ranking/_rbp.py @@ -159,3 +159,97 @@ def measure_list(self, recs: ItemList, test: ItemList) -> float: normalization = np.sum(weights).item() return rank_biased_precision(good, weights, normalization) + + +class GradedRBP(RBP): + """ + Rank-Biased Precision with graded relevance. + + Extends RBP by allowing relevance grades :math:`r_{ui} \\in \\[0, 1\\]`. + Grades are read from a field in the test ItemList. If the item is + unknown, a default grade of `0.25` is assigned. If the grade field + is absent, this metric defaults to binary RBP. 
+ """ + + grade_field: str + scale: bool + unknown_grade: float + + def __init__( + self, + n: int | None = None, + *, + k: int | None = None, + weight: RankWeight | None = None, + patience: float = 0.85, + normalize: bool = False, + weight_field: str | None = None, + grade_field: str = "grade", + scale: bool = False, + unknown_grade: float = 0.25, + ): + super().__init__( + n, + k=k, + weight=weight, + patience=patience, + normalize=normalize, + weight_field=weight_field, + ) + + self.grade_field = grade_field + self.scale = scale + self.unknown_grade = unknown_grade + + @property + def label(self): + if self.n is not None: + return f"GradedRBP@{self.n}" + else: + return "GradedRBP" + + @override + def measure_list(self, recs: ItemList, test: ItemList) -> float: + recs = self.truncate(recs) + k = len(recs) + + if len(test) == 0: + return np.nan + + # fallback to binary RBP if grade field is missing + if self.grade_field not in test.fields: + return super().measure_list(recs, test) + + # build grade lookup + grades = test.field(self.grade_field) + + if self.scale and len(grades) > 0: + max_grade = np.max(grades) + if max_grade > 0: + grades = grades / max_grade + + # map item and grade + grade_map = dict(zip(test.ids(), grades)) + + rel = np.array([grade_map.get(item, self.unknown_grade) for item in recs.ids()]) + + if self.weight_field is not None: + weights = recs.field(self.weight_field) + normalization = np.sum(weights).item() + + else: + ranks = recs.ranks() + assert ranks is not None + + weights = self.weight.weight(ranks) + + wmax = self.weight.series_sum() + + if self.normalize: + normalization = np.sum(weights[: min(len(test), k)]).item() + elif wmax is not None: + normalization = wmax + else: + normalization = np.sum(weights).item() + + return rank_biased_precision(rel, weights, normalization) From a155e405724c7c69f0024b2ae10bee06827c299d Mon Sep 17 00:00:00 2001 From: Sushobhan Parajuli Date: Thu, 5 Mar 2026 16:31:12 -0500 Subject: [PATCH 2/5] add helper method and initial test' --- src/lenskit/metrics/ranking/_rbp.py | 25 +++++++++++++++++++++++-- tests/eval/test_rank_grbp.py | 21 +++++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) create mode 100644 tests/eval/test_rank_grbp.py diff --git a/src/lenskit/metrics/ranking/_rbp.py b/src/lenskit/metrics/ranking/_rbp.py index f61bf6f15..4c443e411 100644 --- a/src/lenskit/metrics/ranking/_rbp.py +++ b/src/lenskit/metrics/ranking/_rbp.py @@ -36,6 +36,27 @@ def rank_biased_precision( return rbp / normalization +def graded_rank_biased_precision( + relevance: np.ndarray, weights: np.ndarray, normalization: float = 1.0 +) -> float: + """ + Compute graded rank-biased precision. + + Args: + relevance: + Float array of relevance/grade scores at each position + weights: + Weight for each item position (same length as relevance) + normalization: + Optional normalization factor, defaults to 1.0 + + Returns: + Graded RBP score + """ + score = np.sum(weights * relevance).item() + return score / normalization + + class RBP(ListMetric, RankingMetricBase): """ Evaluate recommendations with rank-biased precision :cite:p:`rbp`. 
@@ -217,7 +238,7 @@ def measure_list(self, recs: ItemList, test: ItemList) -> float: return np.nan # fallback to binary RBP if grade field is missing - if self.grade_field not in test.fields: + if self.grade_field not in test._fields: return super().measure_list(recs, test) # build grade lookup @@ -252,4 +273,4 @@ def measure_list(self, recs: ItemList, test: ItemList) -> float: else: normalization = np.sum(weights).item() - return rank_biased_precision(rel, weights, normalization) + return graded_rank_biased_precision(rel, weights, normalization) diff --git a/tests/eval/test_rank_grbp.py b/tests/eval/test_rank_grbp.py new file mode 100644 index 000000000..77577910b --- /dev/null +++ b/tests/eval/test_rank_grbp.py @@ -0,0 +1,21 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University. +# Copyright (C) 2023-2025 Drexel University. +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +import numpy as np + +from pytest import approx + +from lenskit.data import ItemList +from lenskit.metrics import call_metric +from lenskit.metrics.ranking import GradedRBP + + +def test_grbp_empty(): + recs = ItemList([], ordered=True) + truth = ItemList(item_ids=[1, 2, 3], grade=[1.0, 1.0, 1.0]) + + metric = GradedRBP() + assert metric.measure_list(recs, truth) == approx(0.0) From 314437ec13737155102c4ec37e7042f8de1a0ca3 Mon Sep 17 00:00:00 2001 From: Sushobhan Parajuli Date: Thu, 5 Mar 2026 16:50:19 -0500 Subject: [PATCH 3/5] add basic tests --- tests/eval/test_rank_grbp.py | 40 ++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/tests/eval/test_rank_grbp.py b/tests/eval/test_rank_grbp.py index 77577910b..22d7bec0d 100644 --- a/tests/eval/test_rank_grbp.py +++ b/tests/eval/test_rank_grbp.py @@ -9,8 +9,7 @@ from pytest import approx from lenskit.data import ItemList -from lenskit.metrics import call_metric -from lenskit.metrics.ranking import GradedRBP +from lenskit.metrics.ranking import RBP, GradedRBP def test_grbp_empty(): @@ -19,3 +18,40 @@ def test_grbp_empty(): metric = GradedRBP() assert metric.measure_list(recs, truth) == approx(0.0) + + +def test_grbp_unknown_grade(): + recs = ItemList([1, 2], ordered=True) + truth = ItemList(item_ids=[1], grade=[1.0]) + + p = 0.5 + metric = GradedRBP(patience=p, unknown_grade=0.25) + + expected = (1 - p) * (1 + 0.25 * p) + + assert metric.measure_list(recs, truth) == approx(expected) + + +def test_grbp_scaling(): + recs = ItemList([1, 2], ordered=True) + truth = ItemList(item_ids=[1, 2], grade=[2.0, 4.0]) + + p = 0.5 + metric = GradedRBP(patience=p, scale=True) + + scaled = np.array([0.5, 1.0]) + expected = (1 - p) * (scaled[0] + scaled[1] * p) + + assert metric.measure_list(recs, truth) == approx(expected) + + +def test_grbp_binary(): + recs = ItemList([1, 2, 3], ordered=True) + + graded_truth = ItemList(item_ids=[1, 3], graded=[1.0, 1.0]) + binary_truth = ItemList([1, 3]) # no grade field + + grbp = GradedRBP() + rbp = RBP() + + assert grbp.measure_list(recs, graded_truth) == approx(rbp.measure_list(recs, binary_truth)) From 9544ad727acda4a3136106c138a713711e742581 Mon Sep 17 00:00:00 2001 From: Sushobhan Parajuli Date: Thu, 5 Mar 2026 17:35:25 -0500 Subject: [PATCH 4/5] keep `RBP` configurable and remove `GradedRBP` --- src/lenskit/metrics/__init__.py | 2 - src/lenskit/metrics/ranking/__init__.py | 3 +- src/lenskit/metrics/ranking/_rbp.py | 113 +++++------------------- tests/eval/test_rank_grbp.py | 57 ------------ tests/eval/test_rank_rbp.py | 
35 ++++++++ 5 files changed, 56 insertions(+), 154 deletions(-) delete mode 100644 tests/eval/test_rank_grbp.py diff --git a/src/lenskit/metrics/__init__.py b/src/lenskit/metrics/__init__.py index 48af58187..7bc603a30 100644 --- a/src/lenskit/metrics/__init__.py +++ b/src/lenskit/metrics/__init__.py @@ -26,7 +26,6 @@ Entropy, ExposureGini, GeometricRankWeight, - GradedRBP, Hit, ListGini, LogRankWeight, @@ -59,7 +58,6 @@ "NDCG", "DCG", "RBP", - "GradedRBP", "Hit", "Precision", "Recall", diff --git a/src/lenskit/metrics/ranking/__init__.py b/src/lenskit/metrics/ranking/__init__.py index 276a3d54d..8d66ef7a9 100644 --- a/src/lenskit/metrics/ranking/__init__.py +++ b/src/lenskit/metrics/ranking/__init__.py @@ -17,7 +17,7 @@ from ._map import AveragePrecision from ._pop import MeanPopRank from ._pr import Precision, Recall -from ._rbp import RBP, GradedRBP, rank_biased_precision +from ._rbp import RBP, rank_biased_precision from ._recip import RecipRank from ._weighting import GeometricRankWeight, LogRankWeight, RankWeight @@ -33,7 +33,6 @@ "NDCG", "DCG", "RBP", - "GradedRBP", "rank_biased_precision", "MeanPopRank", "AveragePrecision", diff --git a/src/lenskit/metrics/ranking/_rbp.py b/src/lenskit/metrics/ranking/_rbp.py index b58c01cee..5401ac941 100644 --- a/src/lenskit/metrics/ranking/_rbp.py +++ b/src/lenskit/metrics/ranking/_rbp.py @@ -84,6 +84,9 @@ class RBP(ListMetric, RankingMetricBase): in the paper; however, RBP with high patience should be no worse than nDCG (and perhaps even better) in this regard. + This metric class supports relevance grades :math:`r_{ui} \\in \\[0, 1\\]` + via an optional ``grade_field``. + In recommender evaluation, we usually have a small test set, so the maximum achievable RBP is significantly less than the theoretical maximum, and is a function of the number of test items. With ``normalize=True``, the RBP @@ -120,6 +123,8 @@ class RBP(ListMetric, RankingMetricBase): patience: float normalize: bool weight_field: str | None + grade_field: str | None + unknown_grade: float def __init__( self, @@ -130,6 +135,8 @@ def __init__( patience: float = 0.85, normalize: bool = False, weight_field: str | None = None, + grade_field: str | None = None, + unknown_grade: float = 0.25, ): super().__init__(n, k=k) self.patience = patience @@ -138,13 +145,16 @@ def __init__( self.weight = weight self.normalize = normalize self.weight_field = weight_field + self.grade_field = grade_field + self.unknown_grade = unknown_grade @property def label(self): + base = "RBP" if self.grade_field is None else "GradedRBP" if self.n is not None: - return f"RBP@{self.n}" + return f"{base}@{self.n}" else: - return "RBP" + return base @override def measure_list(self, recs: ItemList, test: ItemList) -> float: @@ -155,8 +165,6 @@ def measure_list(self, recs: ItemList, test: ItemList) -> float: if nrel == 0: return np.nan - good = recs.isin(test) - if self.weight_field is not None: # use custom weights from field weights = recs.field(self.weight_field) @@ -179,98 +187,17 @@ def measure_list(self, recs: ItemList, test: ItemList) -> float: else: normalization = np.sum(weights).item() - return rank_biased_precision(good, weights, normalization) - - -class GradedRBP(RBP): - """ - Rank-Biased Precision with graded relevance. - - Extends RBP by allowing relevance grades :math:`r_{ui} \\in \\[0, 1\\]`. - Grades are read from a field in the test ItemList. If the item is - unknown, a default grade of `0.25` is assigned. If the grade field - is absent, this metric defaults to binary RBP. 
- """ - - grade_field: str - scale: bool - unknown_grade: float + # Binary relevance + if self.grade_field is None: + good = recs.isin(test) + return rank_biased_precision(good, weights, normalization) - def __init__( - self, - n: int | None = None, - *, - k: int | None = None, - weight: RankWeight | None = None, - patience: float = 0.85, - normalize: bool = False, - weight_field: str | None = None, - grade_field: str = "grade", - scale: bool = False, - unknown_grade: float = 0.25, - ): - super().__init__( - n, - k=k, - weight=weight, - patience=patience, - normalize=normalize, - weight_field=weight_field, - ) - - self.grade_field = grade_field - self.scale = scale - self.unknown_grade = unknown_grade - - @property - def label(self): - if self.n is not None: - return f"GradedRBP@{self.n}" - else: - return "GradedRBP" - - @override - def measure_list(self, recs: ItemList, test: ItemList) -> float: - recs = self.truncate(recs) - k = len(recs) - - if len(test) == 0: - return np.nan - - # fallback to binary RBP if grade field is missing + # Graded relevance if self.grade_field not in test._fields: - return super().measure_list(recs, test) + raise ValueError(f"Grade field '{self.grade_field}' not found in test ItemList") - # build grade lookup grades = test.field(self.grade_field) - - if self.scale and len(grades) > 0: - max_grade = np.max(grades) - if max_grade > 0: - grades = grades / max_grade - - # map item and grade grade_map = dict(zip(test.ids(), grades)) + relevance = np.array([grade_map.get(item, self.unknown_grade) for item in recs.ids()]) - rel = np.array([grade_map.get(item, self.unknown_grade) for item in recs.ids()]) - - if self.weight_field is not None: - weights = recs.field(self.weight_field) - normalization = np.sum(weights).item() - - else: - ranks = recs.ranks() - assert ranks is not None - - weights = self.weight.weight(ranks) - - wmax = self.weight.series_sum() - - if self.normalize: - normalization = np.sum(weights[: min(len(test), k)]).item() - elif wmax is not None: - normalization = wmax - else: - normalization = np.sum(weights).item() - - return graded_rank_biased_precision(rel, weights, normalization) + return graded_rank_biased_precision(relevance, weights, normalization) diff --git a/tests/eval/test_rank_grbp.py b/tests/eval/test_rank_grbp.py deleted file mode 100644 index 22d7bec0d..000000000 --- a/tests/eval/test_rank_grbp.py +++ /dev/null @@ -1,57 +0,0 @@ -# This file is part of LensKit. -# Copyright (C) 2018-2023 Boise State University. -# Copyright (C) 2023-2025 Drexel University. -# Licensed under the MIT license, see LICENSE.md for details. 
-# SPDX-License-Identifier: MIT - -import numpy as np - -from pytest import approx - -from lenskit.data import ItemList -from lenskit.metrics.ranking import RBP, GradedRBP - - -def test_grbp_empty(): - recs = ItemList([], ordered=True) - truth = ItemList(item_ids=[1, 2, 3], grade=[1.0, 1.0, 1.0]) - - metric = GradedRBP() - assert metric.measure_list(recs, truth) == approx(0.0) - - -def test_grbp_unknown_grade(): - recs = ItemList([1, 2], ordered=True) - truth = ItemList(item_ids=[1], grade=[1.0]) - - p = 0.5 - metric = GradedRBP(patience=p, unknown_grade=0.25) - - expected = (1 - p) * (1 + 0.25 * p) - - assert metric.measure_list(recs, truth) == approx(expected) - - -def test_grbp_scaling(): - recs = ItemList([1, 2], ordered=True) - truth = ItemList(item_ids=[1, 2], grade=[2.0, 4.0]) - - p = 0.5 - metric = GradedRBP(patience=p, scale=True) - - scaled = np.array([0.5, 1.0]) - expected = (1 - p) * (scaled[0] + scaled[1] * p) - - assert metric.measure_list(recs, truth) == approx(expected) - - -def test_grbp_binary(): - recs = ItemList([1, 2, 3], ordered=True) - - graded_truth = ItemList(item_ids=[1, 3], graded=[1.0, 1.0]) - binary_truth = ItemList([1, 3]) # no grade field - - grbp = GradedRBP() - rbp = RBP() - - assert grbp.measure_list(recs, graded_truth) == approx(rbp.measure_list(recs, binary_truth)) diff --git a/tests/eval/test_rank_rbp.py b/tests/eval/test_rank_rbp.py index 4887dcc53..c87a7f2a4 100644 --- a/tests/eval/test_rank_rbp.py +++ b/tests/eval/test_rank_rbp.py @@ -128,3 +128,38 @@ def test_rank_biased_precision(): weights = np.array([1.0, 0.8, 0.6, 0.4, 0.2]) result = rank_biased_precision(good, weights, normalization=3.0) assert result == approx(1.2 / 3.0) + + +# test for graded rbp + + +def test_rbp_empty_graded(): + recs = ItemList([], ordered=True) + truth = ItemList(item_ids=[1, 2, 3], grade=[1.0, 1.0, 1.0]) + + metric = RBP(grade_field="grade") + assert metric.measure_list(recs, truth) == approx(0.0) + + +def test_rbp_unknown_grade(): + recs = ItemList([1, 2], ordered=True) + truth = ItemList(item_ids=[1], grade=[1.0]) + + p = 0.5 + metric = RBP(patience=p, grade_field="grade", unknown_grade=0.25) + + # RBP = (1-p)*(relevance[0] + relevance[1]*p) + expected = (1 - p) * (1 + 0.25 * p) + assert metric.measure_list(recs, truth) == approx(expected) + + +def test_rbp_binary_vs_graded_equivalent(): + recs = ItemList([1, 3], ordered=True) + + graded_truth = ItemList(item_ids=[1, 3], grade=[1.0, 1.0]) + binary_truth = ItemList([1, 3]) # no grade field + + grbp = RBP(grade_field="grade") + rbp = RBP() # binary + + assert grbp.measure_list(recs, graded_truth) == approx(rbp.measure_list(recs, binary_truth)) From 11b733ac28c031f6c6c05563bb7caa8548972622 Mon Sep 17 00:00:00 2001 From: Sushobhan Parajuli Date: Tue, 10 Mar 2026 14:38:52 -0400 Subject: [PATCH 5/5] add a test --- tests/eval/test_rank_rbp.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/eval/test_rank_rbp.py b/tests/eval/test_rank_rbp.py index c87a7f2a4..9222abbf5 100644 --- a/tests/eval/test_rank_rbp.py +++ b/tests/eval/test_rank_rbp.py @@ -141,7 +141,7 @@ def test_rbp_empty_graded(): assert metric.measure_list(recs, truth) == approx(0.0) -def test_rbp_unknown_grade(): +def test_rbp_unknown_grade_default(): recs = ItemList([1, 2], ordered=True) truth = ItemList(item_ids=[1], grade=[1.0]) @@ -153,6 +153,18 @@ def test_rbp_unknown_grade(): assert metric.measure_list(recs, truth) == approx(expected) +def test_rbp_unknown_grade(): + recs = ItemList([1, 2], ordered=True) + truth = 
ItemList(item_ids=[1], grade=[1.0]) + + p = 0.5 + metric = RBP(patience=p, grade_field="grade", unknown_grade=0.30) + + # RBP = (1-p)*(relevance[0] + relevance[1]*p) + expected = (1 - p) * (1 + 0.30 * p) + assert metric.measure_list(recs, truth) == approx(expected) + + def test_rbp_binary_vs_graded_equivalent(): recs = ItemList([1, 3], ordered=True)
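
A minimal usage sketch of the graded option these patches leave on RBP (the
grade_field and unknown_grade parameters from patches 4-5). It is not part of
the patches above: the item IDs and grade values are illustrative, and the API
calls and expected-score arithmetic mirror tests/eval/test_rank_rbp.py
(geometric rank weighting with the given patience, normalized by the weight
series sum of 1).

    import numpy as np

    from lenskit.data import ItemList
    from lenskit.metrics.ranking import RBP

    # Ranked recommendations and graded truth; item 20 is absent from the
    # test list, so it receives unknown_grade.
    recs = ItemList([10, 20, 30], ordered=True)
    truth = ItemList(item_ids=[10, 30], grade=[1.0, 0.5])

    p = 0.5
    metric = RBP(patience=p, grade_field="grade", unknown_grade=0.0)
    score = metric.measure_list(recs, truth)

    # With the default geometric weighting, rank k gets weight (1 - p) * p**(k - 1),
    # so the score is the patience-discounted sum of the grades at each rank
    # (same arithmetic as test_rbp_unknown_grade above).
    expected = (1 - p) * (1.0 + 0.0 * p + 0.5 * p**2)
    assert np.isclose(score, expected)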