From ca5ecb1ed03c32b07918890498e6001219c5f06a Mon Sep 17 00:00:00 2001 From: ML Metrics Team Date: Wed, 7 May 2025 14:03:03 -0700 Subject: [PATCH] Adds Covariance Signal. PiperOrigin-RevId: 755997596 --- ml_metrics/_src/aggregates/rolling_stats.py | 107 +++++++++++++----- .../_src/aggregates/rolling_stats_test.py | 74 ++++++++++-- 2 files changed, 144 insertions(+), 37 deletions(-) diff --git a/ml_metrics/_src/aggregates/rolling_stats.py b/ml_metrics/_src/aggregates/rolling_stats.py index ec582e7c..e78e2c00 100644 --- a/ml_metrics/_src/aggregates/rolling_stats.py +++ b/ml_metrics/_src/aggregates/rolling_stats.py @@ -641,29 +641,9 @@ def result(self) -> types.NumbersT: ) -@dataclasses.dataclass(slots=True) -class RRegression(base.MergeableMetric): - """Computes the Pearson Correlation Coefficient (PCC). - - The Pearson correlation coefficient (PCC) is a correlation coefficient that - measures linear correlation between two sets of data. It is the ratio between - the covariance of two variables and the product of their standard deviations; - thus, it is essentially a normalized measurement of the covariance, such that - the result always has a value between -1 and 1. As with covariance itself, the - measure can only reflect a linear correlation of variables, and ignores many - other types of relationships or correlations. As a simple example, one would - expect the age and height of a sample of teenagers from a high school to have - a Pearson correlation coefficient significantly greater than 0, but less than - 1 (as 1 would represent an unrealistically perfect correlation). - - https://en.wikipedia.org/wiki/Pearson_correlation_coefficient - """ - - # If True, center the data matrix x and the target vector y. - # The centered r-regression is the "Pearson's Correlation". - # The not-centered r-regression is the "Reflective Correlation". - center: bool = True - +@dataclasses.dataclass +class _PartialCrossFeatureStats(abc.ABC, base.MergeableMetric): + """Partial cross feature statistics.""" num_samples: int = 0 sum_x: types.NumbersT = 0 sum_y: float = 0 @@ -671,10 +651,7 @@ class RRegression(base.MergeableMetric): sum_yy: float = 0 # sum(y**2) sum_xy: types.NumbersT = 0 # sum(x * y) - def as_agg_fn(self) -> base.AggregateFn: - return base.as_agg_fn(self.__class__, self.center) - - def __eq__(self, other: 'RRegression') -> bool: + def __eq__(self, other: '_PartialCrossFeatureStats') -> bool: return ( self.num_samples == other.num_samples and self.sum_x == other.sum_x @@ -684,7 +661,9 @@ def __eq__(self, other: 'RRegression') -> bool: and self.sum_xy == other.sum_xy ) - def add(self, x: types.NumbersT, y: types.NumbersT) -> 'RRegression': + def add( + self, x: types.NumbersT, y: types.NumbersT + ) -> '_PartialCrossFeatureStats': """Updates the Class with the given batch. Args: @@ -706,7 +685,9 @@ def add(self, x: types.NumbersT, y: types.NumbersT) -> 'RRegression': return self - def merge(self, other: 'RRegression') -> 'RRegression': + def merge( + self, other: '_PartialCrossFeatureStats' + ) -> '_PartialCrossFeatureStats': self.num_samples += other.num_samples self.sum_x += other.sum_x self.sum_y += other.sum_y @@ -716,6 +697,74 @@ def merge(self, other: 'RRegression') -> 'RRegression': return self + @abc.abstractmethod + def result(self) -> types.NumbersT: + """Must be overwritten by the specific metric.""" + pass + + +class Covariance(_PartialCrossFeatureStats): + """Computes the covariance of two sets of data. + + Covariance = E[(X-E[X]) * (Y-E[Y])] = E[XY] - E[X] * E[Y] + https://en.wikipedia.org/wiki/Covariance + """ + + def as_agg_fn(self) -> base.AggregateFn: + return base.as_agg_fn( + self.__class__, + sum_x=self.sum_x, + sum_y=self.sum_y, + sum_xx=self.sum_xx, + sum_yy=self.sum_yy, + sum_xy=self.sum_xy, + ) + + def result(self) -> types.NumbersT: + # TODO: b/417267344 - Implement Delta Degrees of Freedom. Here, ddof is + # always 0. + + # Covariance = E[(X-E[X]) * (Y-E[Y])] = E[XY] - E[X] * E[Y] + # = [sum(XY) - sum(X) * sum(Y) / num_samples] / num_samples + return ( + self.sum_xy - self.sum_x * self.sum_y / self.num_samples + ) / self.num_samples + + +@dataclasses.dataclass +class RRegression(_PartialCrossFeatureStats): + """Computes the Pearson Correlation Coefficient (PCC). + + The Pearson correlation coefficient (PCC) is a correlation coefficient that + measures linear correlation between two sets of data. It is the ratio between + the covariance of two variables and the product of their standard deviations; + thus, it is essentially a normalized measurement of the covariance, such that + the result always has a value between -1 and 1. As with covariance itself, the + measure can only reflect a linear correlation of variables, and ignores many + other types of relationships or correlations. As a simple example, one would + expect the age and height of a sample of teenagers from a high school to have + a Pearson correlation coefficient significantly greater than 0, but less than + 1 (as 1 would represent an unrealistically perfect correlation). + + https://en.wikipedia.org/wiki/Pearson_correlation_coefficient + """ + + # If True, center the data matrix x and the target vector y. + # The centered r-regression is the "Pearson's Correlation". + # The not-centered r-regression is the "Reflective Correlation". + center: bool = True + + def as_agg_fn(self) -> base.AggregateFn: + return base.as_agg_fn( + self.__class__, + sum_x=self.sum_x, + sum_y=self.sum_y, + sum_xx=self.sum_xx, + sum_yy=self.sum_yy, + sum_xy=self.sum_xy, + center=self.center, + ) + def result(self) -> types.NumbersT: """Calculates the Pearson Correlation Coefficient (PCC). diff --git a/ml_metrics/_src/aggregates/rolling_stats_test.py b/ml_metrics/_src/aggregates/rolling_stats_test.py index 08b802fe..b294ec72 100644 --- a/ml_metrics/_src/aggregates/rolling_stats_test.py +++ b/ml_metrics/_src/aggregates/rolling_stats_test.py @@ -1457,21 +1457,17 @@ def test_r2_tjur_relative_returns_nan(self, y_true, y_pred): ) -class RRegressionTest(parameterized.TestCase): +class PartialCrossFeatureStatsTest(absltest.TestCase): - @parameterized.named_parameters( - dict(testcase_name='centered', center=True), - dict(testcase_name='not_centered', center=False), - ) - def test_r_regression_merge(self, center): + def test_partial_cross_feature_stats_merge(self): x_1 = (1, 2, 3, 4) y_1 = (10, 9, 2.5, 6) x_2 = (5, 6, 7) y_2 = (4, 3, 2) - state_1 = rolling_stats.RRegression(center=center).add(x_1, y_1) - state_2 = rolling_stats.RRegression(center=center).add(x_2, y_2) + state_1 = rolling_stats.RRegression().add(x_1, y_1) + state_2 = rolling_stats.RRegression().add(x_2, y_2) result = state_1.merge(state_2) expected_result = rolling_stats.RRegression( @@ -1485,6 +1481,68 @@ def test_r_regression_merge(self, center): self.assertEqual(result, expected_result) + +class CovarianceTest(absltest.TestCase): + + def test_covariance_single_output(self): + x = (1, 2, 3, 4, 5, 6, 7) + y = (10, 9, 2.5, 6, 4, 3, 2) + + # covariance(X, Y) = [sum(XY) - sum(X) * sum(Y) / num_samples] / num_samples + # = (111.5 - 28 * 36.5 / 7) / 7 = -4.928571428571429 + expected_result = -4.928571428571429 + + actual_result = rolling_stats.Covariance().add(x, y).result() + + self.assertAlmostEqual(actual_result, expected_result, places=10) + + def test_covariance_single_output_as_agg_fn(self): + x = (1, 2, 3, 4, 5, 6, 7) + y = (10, 9, 2.5, 6, 4, 3, 2) + + # covariance(X, Y) = [sum(XY) - sum(X) * sum(Y) / num_samples] / num_samples + # = (111.5 - 28 * 36.5 / 7) / 7 = -4.928571428571429 + expected_result = -4.928571428571429 + + actual_result = rolling_stats.Covariance().as_agg_fn()(x, y) + + self.assertAlmostEqual(actual_result, expected_result, places=10) + + def test_covariance_multi_output(self): + x1 = (10, 9, 2.5, 6, 4, 3, 2) + x2 = (8, 6, 7, 5, 3, 0, 9) + y = (1, 2, 3, 4, 5, 6, 7) + + # covariance(X, Y) = [sum(XY) - sum(X) * sum(Y) / num_samples] / num_samples + # covariance(x1, y) = (111.5 - 36.5 * 28 / 7) / 7 = -4.928571428571429 + # covariance(x2, y) = (139 - 38 * 28 / 7) / 7 = -1.8571428571428572 + expected_result = (-4.928571428571429, -1.8571428571428572) + + x_all = np.array((x1, x2)).T + + actual_result = rolling_stats.Covariance().add(x_all, y).result() + + np.testing.assert_almost_equal(actual_result, expected_result) + + def test_covariance_multi_output_as_agg_fn(self): + x1 = (10, 9, 2.5, 6, 4, 3, 2) + x2 = (8, 6, 7, 5, 3, 0, 9) + y = (1, 2, 3, 4, 5, 6, 7) + + # covariance(X, Y) = [sum(XY) - sum(X) * sum(Y) / num_samples] / num_samples + # covariance(x1, y) = (111.5 - 36.5 * 28 / 7) / 7 = -4.928571428571429 + # covariance(x2, y) = (139 - 38 * 28 / 7) / 7 = -1.8571428571428572 + expected_result = (-4.928571428571429, -1.8571428571428572) + + x_all = np.array((x1, x2)).T + + actual_result = rolling_stats.Covariance().as_agg_fn()(x_all, y) + + np.testing.assert_almost_equal(actual_result, expected_result) + + +class RRegressionTest(parameterized.TestCase): + @parameterized.named_parameters( dict( testcase_name='centered',