From ca5ecb1ed03c32b07918890498e6001219c5f06a Mon Sep 17 00:00:00 2001
From: ML Metrics Team <ml-metrics-dev@google.com>
Date: Wed, 7 May 2025 14:03:03 -0700
Subject: [PATCH] Adds Covariance Signal.

PiperOrigin-RevId: 755997596
---
 ml_metrics/_src/aggregates/rolling_stats.py   | 107 +++++++++++++-----
 .../_src/aggregates/rolling_stats_test.py     |  74 ++++++++++--
 2 files changed, 144 insertions(+), 37 deletions(-)

diff --git a/ml_metrics/_src/aggregates/rolling_stats.py b/ml_metrics/_src/aggregates/rolling_stats.py
index ec582e7c..e78e2c00 100644
--- a/ml_metrics/_src/aggregates/rolling_stats.py
+++ b/ml_metrics/_src/aggregates/rolling_stats.py
@@ -641,29 +641,9 @@ def result(self) -> types.NumbersT:
     )
 
 
-@dataclasses.dataclass(slots=True)
-class RRegression(base.MergeableMetric):
-  """Computes the Pearson Correlation Coefficient (PCC).
-
-  The Pearson correlation coefficient (PCC) is a correlation coefficient that
-  measures linear correlation between two sets of data. It is the ratio between
-  the covariance of two variables and the product of their standard deviations;
-  thus, it is essentially a normalized measurement of the covariance, such that
-  the result always has a value between -1 and 1. As with covariance itself, the
-  measure can only reflect a linear correlation of variables, and ignores many
-  other types of relationships or correlations. As a simple example, one would
-  expect the age and height of a sample of teenagers from a high school to have
-  a Pearson correlation coefficient significantly greater than 0, but less than
-  1 (as 1 would represent an unrealistically perfect correlation).
-
-  https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
-  """
-
-  # If True, center the data matrix x and the target vector y.
-  # The centered r-regression is the "Pearson's Correlation".
-  # The not-centered r-regression is the "Reflective Correlation".
-  center: bool = True
-
+@dataclasses.dataclass
+class _PartialCrossFeatureStats(abc.ABC, base.MergeableMetric):
+  """Partial cross feature statistics."""
   num_samples: int = 0
   sum_x: types.NumbersT = 0
   sum_y: float = 0
@@ -671,10 +651,7 @@ class RRegression(base.MergeableMetric):
   sum_yy: float = 0  # sum(y**2)
   sum_xy: types.NumbersT = 0  # sum(x * y)
 
-  def as_agg_fn(self) -> base.AggregateFn:
-    return base.as_agg_fn(self.__class__, self.center)
-
-  def __eq__(self, other: 'RRegression') -> bool:
+  def __eq__(self, other: '_PartialCrossFeatureStats') -> bool:
     return (
         self.num_samples == other.num_samples
         and self.sum_x == other.sum_x
@@ -684,7 +661,9 @@ def __eq__(self, other: 'RRegression') -> bool:
         and self.sum_xy == other.sum_xy
     )
 
-  def add(self, x: types.NumbersT, y: types.NumbersT) -> 'RRegression':
+  def add(
+      self, x: types.NumbersT, y: types.NumbersT
+  ) -> '_PartialCrossFeatureStats':
     """Updates the Class with the given batch.
 
     Args:
@@ -706,7 +685,9 @@ def add(self, x: types.NumbersT, y: types.NumbersT) -> 'RRegression':
 
     return self
 
-  def merge(self, other: 'RRegression') -> 'RRegression':
+  def merge(
+      self, other: '_PartialCrossFeatureStats'
+  ) -> '_PartialCrossFeatureStats':
     self.num_samples += other.num_samples
     self.sum_x += other.sum_x
     self.sum_y += other.sum_y
@@ -716,6 +697,74 @@ def merge(self, other: 'RRegression') -> 'RRegression':
 
     return self
 
+  @abc.abstractmethod
+  def result(self) -> types.NumbersT:
+    """Computes the final metric; must be overridden by each concrete metric."""
+    pass
+
+
+class Covariance(_PartialCrossFeatureStats):
+  """Computes the population covariance (ddof=0) of two sets of data.
+
+  Covariance = E[(X-E[X]) * (Y-E[Y])] = E[XY] - E[X] * E[Y]
+  https://en.wikipedia.org/wiki/Covariance
+  """
+
+  def as_agg_fn(self) -> base.AggregateFn:
+    """Returns an AggregateFn for this class; forwards the sums as kwargs."""
+    return base.as_agg_fn(
+        self.__class__,
+        sum_x=self.sum_x,
+        sum_y=self.sum_y,
+        sum_xx=self.sum_xx,
+        sum_yy=self.sum_yy,
+        sum_xy=self.sum_xy,
+    )
+
+  def result(self) -> types.NumbersT:
+    """Returns the covariance, or NaN (shaped like sum_xy) with no samples."""
+    # TODO: b/417267344 - Implement Delta Degrees of Freedom (ddof is always 0).
+    n = self.num_samples
+    if not n:
+      return self.sum_xy * float('nan')  # Avoids ZeroDivisionError on 0 / 0.
+    # Covariance = E[XY] - E[X] * E[Y] = [sum(XY) - sum(X) * sum(Y) / n] / n
+    return (self.sum_xy - self.sum_x * self.sum_y / n) / n
+
+
+@dataclasses.dataclass
+class RRegression(_PartialCrossFeatureStats):
+  """Computes the Pearson Correlation Coefficient (PCC).
+
+  The Pearson correlation coefficient (PCC) is a correlation coefficient that
+  measures linear correlation between two sets of data. It is the ratio between
+  the covariance of two variables and the product of their standard deviations;
+  thus, it is essentially a normalized measurement of the covariance, such that
+  the result always has a value between -1 and 1. As with covariance itself, the
+  measure can only reflect a linear correlation of variables, and ignores many
+  other types of relationships or correlations. As a simple example, one would
+  expect the age and height of a sample of teenagers from a high school to have
+  a Pearson correlation coefficient significantly greater than 0, but less than
+  1 (as 1 would represent an unrealistically perfect correlation).
+
+  https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
+  """
+
+  # If True, center the data matrix x and the target vector y.
+  # The centered r-regression is the "Pearson's Correlation".
+  # The not-centered r-regression is the "Reflective Correlation".
+  center: bool = True
+
+  def as_agg_fn(self) -> base.AggregateFn:
+    return base.as_agg_fn(
+        self.__class__,
+        sum_x=self.sum_x,
+        sum_y=self.sum_y,
+        sum_xx=self.sum_xx,
+        sum_yy=self.sum_yy,
+        sum_xy=self.sum_xy,
+        center=self.center,
+    )
+
   def result(self) -> types.NumbersT:
     """Calculates the Pearson Correlation Coefficient (PCC).
 
diff --git a/ml_metrics/_src/aggregates/rolling_stats_test.py b/ml_metrics/_src/aggregates/rolling_stats_test.py
index 08b802fe..b294ec72 100644
--- a/ml_metrics/_src/aggregates/rolling_stats_test.py
+++ b/ml_metrics/_src/aggregates/rolling_stats_test.py
@@ -1457,21 +1457,17 @@ def test_r2_tjur_relative_returns_nan(self, y_true, y_pred):
     )
 
 
-class RRegressionTest(parameterized.TestCase):
+class PartialCrossFeatureStatsTest(absltest.TestCase):
 
-  @parameterized.named_parameters(
-      dict(testcase_name='centered', center=True),
-      dict(testcase_name='not_centered', center=False),
-  )
-  def test_r_regression_merge(self, center):
+  def test_partial_cross_feature_stats_merge(self):
     x_1 = (1, 2, 3, 4)
     y_1 = (10, 9, 2.5, 6)
 
     x_2 = (5, 6, 7)
     y_2 = (4, 3, 2)
 
-    state_1 = rolling_stats.RRegression(center=center).add(x_1, y_1)
-    state_2 = rolling_stats.RRegression(center=center).add(x_2, y_2)
+    state_1 = rolling_stats.RRegression().add(x_1, y_1)
+    state_2 = rolling_stats.RRegression().add(x_2, y_2)
     result = state_1.merge(state_2)
 
     expected_result = rolling_stats.RRegression(
@@ -1485,6 +1481,68 @@ def test_r_regression_merge(self, center):
 
     self.assertEqual(result, expected_result)
 
+
+class CovarianceTest(absltest.TestCase):
+
+  def test_covariance_single_output(self):
+    x = (1, 2, 3, 4, 5, 6, 7)
+    y = (10, 9, 2.5, 6, 4, 3, 2)
+
+    # Population covariance (ddof=0): [sum(XY) - sum(X) * sum(Y) / n] / n, with
+    # n = 7: (111.5 - 28 * 36.5 / 7) / 7 = -4.928571428571429
+    expected_result = -4.928571428571429
+
+    actual_result = rolling_stats.Covariance().add(x, y).result()
+
+    self.assertAlmostEqual(actual_result, expected_result, places=10)
+
+  def test_covariance_single_output_as_agg_fn(self):
+    x = (1, 2, 3, 4, 5, 6, 7)
+    y = (10, 9, 2.5, 6, 4, 3, 2)
+
+    # Same data as test_covariance_single_output, but computed by calling the
+    # result of as_agg_fn(): (111.5 - 28 * 36.5 / 7) / 7 = -4.928571428571429
+    expected_result = -4.928571428571429
+
+    actual_result = rolling_stats.Covariance().as_agg_fn()(x, y)
+
+    self.assertAlmostEqual(actual_result, expected_result, places=10)
+
+  def test_covariance_multi_output(self):
+    x1 = (10, 9, 2.5, 6, 4, 3, 2)
+    x2 = (8, 6, 7, 5, 3, 0, 9)
+    y = (1, 2, 3, 4, 5, 6, 7)
+
+    # Population covariance (ddof=0) per column of x, n = 7, sum(y) = 28:
+    # covariance(x1, y) = (111.5 - 36.5 * 28 / 7) / 7 = -4.928571428571429
+    # covariance(x2, y) = (139 - 38 * 28 / 7) / 7 = -1.8571428571428572
+    expected_result = (-4.928571428571429, -1.8571428571428572)
+
+    x_all = np.array((x1, x2)).T  # Shape (7, 2); columns are x1 and x2.
+
+    actual_result = rolling_stats.Covariance().add(x_all, y).result()
+
+    np.testing.assert_almost_equal(actual_result, expected_result)
+
+  def test_covariance_multi_output_as_agg_fn(self):
+    x1 = (10, 9, 2.5, 6, 4, 3, 2)
+    x2 = (8, 6, 7, 5, 3, 0, 9)
+    y = (1, 2, 3, 4, 5, 6, 7)
+
+    # Same data as test_covariance_multi_output, via the as_agg_fn() callable:
+    # covariance(x1, y) = (111.5 - 36.5 * 28 / 7) / 7 = -4.928571428571429
+    # covariance(x2, y) = (139 - 38 * 28 / 7) / 7 = -1.8571428571428572
+    expected_result = (-4.928571428571429, -1.8571428571428572)
+
+    x_all = np.array((x1, x2)).T  # Shape (7, 2); columns are x1 and x2.
+
+    actual_result = rolling_stats.Covariance().as_agg_fn()(x_all, y)
+
+    np.testing.assert_almost_equal(actual_result, expected_result)
+
+
+class RRegressionTest(parameterized.TestCase):
+
   @parameterized.named_parameters(
       dict(
           testcase_name='centered',