diff --git a/src/lenskit/metrics/ranking/_dcg.py b/src/lenskit/metrics/ranking/_dcg.py
index bfe10a5fd..69efd8d62 100644
--- a/src/lenskit/metrics/ranking/_dcg.py
+++ b/src/lenskit/metrics/ranking/_dcg.py
@@ -45,6 +45,11 @@ class NDCG(ListMetric, RankingMetricBase):
         \\mathrm{nDCG}(L, u) & = \\frac{\\mathrm{DCG}(L,u)}{\\mathrm{DCG}(L_{\\mathrm{ideal}}, u)}
         \\end{align*}
 
+    .. note::
+        Negative gains are clipped to zero before computing NDCG.
+        This keeps the metric bounded between 0 and 1 and prevents cases where
+        negative gains can lead to misleading positive scores due to
+        cancellation effects.
 
     Args:
         n: The maximum recommendation list length to consider (longer lists are
@@ -105,6 +110,7 @@ def measure_list(self, recs: ItemList, test: ItemList) -> float:
             gains = test.field(self.gain, "pandas", index="ids")
             if gains is None:
                 raise KeyError(f"test items have no field {self.gain}")
+            gains = gains.clip(lower=0)
             if self.n:
                 gains = gains.nlargest(n=self.n)
             else:
@@ -112,6 +118,9 @@ def measure_list(self, recs: ItemList, test: ItemList) -> float:
             iweight = self.weight.weight(np.arange(1, len(gains) + 1))
             ideal = np.dot(gains.values, iweight).item()  # type: ignore
 
+            if ideal == 0:
+                return 0.0
+
         else:
             realized = _binary_dcg(recs, test, self.weight)
             n = len(test)
@@ -201,6 +210,8 @@ def _graded_dcg(
     if gains is None:
         raise KeyError(f"test items have no field {field}")
 
+    gains = gains.clip(lower=0)
+
     ranks = recs.ranks(format="pandas")
     if ranks is None:
         raise TypeError("item list is not ordered")
diff --git a/tests/eval/test_rank_ndcg.py b/tests/eval/test_rank_ndcg.py
index 72ae82c7a..d9bb479a2 100644
--- a/tests/eval/test_rank_ndcg.py
+++ b/tests/eval/test_rank_ndcg.py
@@ -110,3 +110,17 @@ def test_ndcg_alt_discount(items, n):
         e.add_note(f"recs: {recs}")
         e.add_note(f"truth: {truth}")
         raise e
+
+
+@mark.parametrize(
+    "ratings, expected_ndcg",
+    [
+        ([-1, -2, -3, -4, -5], 0.0),  # all negative
+        ([-6, -2, 3, 1, -3], 0.5982),  # mixed
+    ],
+)
+def test_ndcg_negative_gains(ratings, expected_ndcg):
+    recs = ItemList([1, 2, 3, 4, 5], ordered=True)
+    truth = ItemList([1, 2, 3, 4, 5], rating=ratings)
+    val = call_metric(NDCG, recs, truth, gain="rating")
+    assert val == approx(expected_ndcg, rel=1e-3)