diff --git a/src/lenskit/metrics/ranking/_rbp.py b/src/lenskit/metrics/ranking/_rbp.py
index a6c8fb2aa..5401ac941 100644
--- a/src/lenskit/metrics/ranking/_rbp.py
+++ b/src/lenskit/metrics/ranking/_rbp.py
@@ -36,6 +36,27 @@ def rank_biased_precision(
     return rbp / normalization
 
 
+def graded_rank_biased_precision(
+    relevance: np.ndarray, weights: np.ndarray, normalization: float = 1.0
+) -> float:
+    """
+    Compute rank-biased precision with graded (non-binary) relevance.
+
+    Args:
+        relevance:
+            Float array of relevance/grade scores at each position.
+        weights:
+            Weight for each position (same length as ``relevance``).
+        normalization:
+            Optional normalization factor, defaults to 1.0.
+
+    Returns:
+        The graded RBP score.
+    """
+    score = np.sum(weights * relevance).item()
+    return score / normalization
+
+
 class RBP(ListMetric, RankingMetricBase):
     """
     Evaluate recommendations with rank-biased precision :cite:p:`rbp`.
@@ -63,6 +84,9 @@ class RBP(ListMetric, RankingMetricBase):
     in the paper; however, RBP with high patience should be no worse than
     nDCG (and perhaps even better) in this regard.
 
+    This metric class supports relevance grades :math:`r_{ui} \\in [0, 1]`
+    via an optional ``grade_field``; missing items receive ``unknown_grade``.
+
     In recommender evaluation, we usually have a small test set, so the
     maximum achievable RBP is significantly less than the theoretical
     maximum, and is a function of the number of test items. With ``normalize=True``, the RBP
@@ -99,6 +123,8 @@ class RBP(ListMetric, RankingMetricBase):
     patience: float
     normalize: bool
     weight_field: str | None
+    grade_field: str | None
+    unknown_grade: float
 
     def __init__(
         self,
@@ -109,6 +135,8 @@ def __init__(
         patience: float = 0.85,
         normalize: bool = False,
         weight_field: str | None = None,
+        grade_field: str | None = None,
+        unknown_grade: float = 0.25,
     ):
         super().__init__(n, k=k)
         self.patience = patience
@@ -117,13 +145,16 @@ def __init__(
         self.weight = weight
         self.normalize = normalize
         self.weight_field = weight_field
+        self.grade_field = grade_field
+        self.unknown_grade = unknown_grade
 
     @property
     def label(self):
+        base = "RBP" if self.grade_field is None else "GradedRBP"
         if self.n is not None:
-            return f"RBP@{self.n}"
+            return f"{base}@{self.n}"
         else:
-            return "RBP"
+            return base
 
     @override
     def measure_list(self, recs: ItemList, test: ItemList) -> float:
@@ -134,8 +165,6 @@ def measure_list(self, recs: ItemList, test: ItemList) -> float:
         if nrel == 0:
             return np.nan
 
-        good = recs.isin(test)
-
         if self.weight_field is not None:
             # use custom weights from field
             weights = recs.field(self.weight_field)
@@ -158,4 +187,17 @@ def measure_list(self, recs: ItemList, test: ItemList) -> float:
         else:
             normalization = np.sum(weights).item()
 
-        return rank_biased_precision(good, weights, normalization)
+        # Binary relevance: an item is relevant iff it appears in the test list
+        if self.grade_field is None:
+            good = recs.isin(test)
+            return rank_biased_precision(good, weights, normalization)
+
+        # Graded relevance: fall back to unknown_grade for unjudged items
+        grades = test.field(self.grade_field)
+        if grades is None:
+            raise ValueError(f"grade field '{self.grade_field}' not found in test items")
+
+        grade_map = dict(zip(test.ids(), grades))
+        relevance = np.array([grade_map.get(item, self.unknown_grade) for item in recs.ids()])
+
+        return graded_rank_biased_precision(relevance, weights, normalization)
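A quick usage sketch of the new option (illustrative, not part of the patch: the import paths assume LensKit's public `lenskit.data` and `lenskit.metrics` modules, and the item IDs and grades are invented):

```python
from lenskit.data import ItemList
from lenskit.metrics import RBP

# Ranked recommendations and a graded test list; item 7 is absent from
# the test list, so it falls back to unknown_grade (0.25 by default).
recs = ItemList([7, 3, 9], ordered=True)
truth = ItemList(item_ids=[3, 9], grade=[1.0, 0.5])

metric = RBP(patience=0.85, grade_field="grade")
print(metric.label)  # "GradedRBP" once grade_field is set

# Per the closed form exercised in the tests below, with p = 0.85 this
# evaluates to (1 - p) * (0.25 + 1.0 * p + 0.5 * p**2).
print(metric.measure_list(recs, truth))
```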
diff --git a/tests/eval/test_rank_rbp.py b/tests/eval/test_rank_rbp.py
index 4887dcc53..9222abbf5 100644
--- a/tests/eval/test_rank_rbp.py
+++ b/tests/eval/test_rank_rbp.py
@@ -128,3 +128,50 @@ def test_rank_biased_precision():
     weights = np.array([1.0, 0.8, 0.6, 0.4, 0.2])
     result = rank_biased_precision(good, weights, normalization=3.0)
     assert result == approx(1.2 / 3.0)
+
+
+# tests for graded RBP
+
+
+def test_rbp_empty_graded():
+    recs = ItemList([], ordered=True)
+    truth = ItemList(item_ids=[1, 2, 3], grade=[1.0, 1.0, 1.0])
+
+    metric = RBP(grade_field="grade")
+    assert metric.measure_list(recs, truth) == approx(0.0)
+
+
+def test_rbp_unknown_grade_default():
+    recs = ItemList([1, 2], ordered=True)
+    truth = ItemList(item_ids=[1], grade=[1.0])
+
+    p = 0.5
+    metric = RBP(patience=p, grade_field="grade", unknown_grade=0.25)
+
+    # item 2 is unjudged: graded RBP = (1-p) * (1.0 + unknown_grade * p)
+    expected = (1 - p) * (1 + 0.25 * p)
+    assert metric.measure_list(recs, truth) == approx(expected)
+
+
+def test_rbp_unknown_grade():
+    recs = ItemList([1, 2], ordered=True)
+    truth = ItemList(item_ids=[1], grade=[1.0])
+
+    p = 0.5
+    metric = RBP(patience=p, grade_field="grade", unknown_grade=0.30)
+
+    # item 2 is unjudged: graded RBP = (1-p) * (1.0 + unknown_grade * p)
+    expected = (1 - p) * (1 + 0.30 * p)
+    assert metric.measure_list(recs, truth) == approx(expected)
+
+
+def test_rbp_binary_vs_graded_equivalent():
+    recs = ItemList([1, 3], ordered=True)
+
+    graded_truth = ItemList(item_ids=[1, 3], grade=[1.0, 1.0])
+    binary_truth = ItemList([1, 3])  # no grade field
+
+    grbp = RBP(grade_field="grade")
+    rbp = RBP()  # binary
+
+    assert grbp.measure_list(recs, graded_truth) == approx(rbp.measure_list(recs, binary_truth))
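As a hand check of the arithmetic in `test_rbp_unknown_grade_default`, the new helper can be exercised directly. A minimal sketch: the geometric position weights `(1 - p) * p**(k - 1)` are an assumption consistent with the expected values in the tests, and the private import path mirrors this patch's module layout.

```python
import math

import numpy as np

from lenskit.metrics.ranking._rbp import graded_rank_biased_precision

p = 0.5
grades = np.array([1.0, 0.25])  # known item at rank 1, unknown_grade fill at rank 2
weights = (1 - p) * p ** np.arange(len(grades))  # assumed RBP position weights

score = graded_rank_biased_precision(grades, weights)
# Matches the expected value in test_rbp_unknown_grade_default
assert math.isclose(score, (1 - p) * (1.0 + 0.25 * p))
```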