Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 78 additions & 29 deletions ml_metrics/_src/aggregates/rolling_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -641,40 +641,17 @@ def result(self) -> types.NumbersT:
)


@dataclasses.dataclass(slots=True)
class RRegression(base.MergeableMetric):
"""Computes the Pearson Correlation Coefficient (PCC).

The Pearson correlation coefficient (PCC) is a correlation coefficient that
measures linear correlation between two sets of data. It is the ratio between
the covariance of two variables and the product of their standard deviations;
thus, it is essentially a normalized measurement of the covariance, such that
the result always has a value between -1 and 1. As with covariance itself, the
measure can only reflect a linear correlation of variables, and ignores many
other types of relationships or correlations. As a simple example, one would
expect the age and height of a sample of teenagers from a high school to have
a Pearson correlation coefficient significantly greater than 0, but less than
1 (as 1 would represent an unrealistically perfect correlation).

https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
"""

# If True, center the data matrix x and the target vector y.
# The centered r-regression is the "Pearson's Correlation".
# The not-centered r-regression is the "Reflective Correlation".
center: bool = True

@dataclasses.dataclass
class _PartialCrossFeatureStats(abc.ABC, base.MergeableMetric):
"""Partial cross feature statistics."""
num_samples: int = 0
sum_x: types.NumbersT = 0
sum_y: float = 0
sum_xx: types.NumbersT = 0 # sum(x**2)
sum_yy: float = 0 # sum(y**2)
sum_xy: types.NumbersT = 0 # sum(x * y)

def as_agg_fn(self) -> base.AggregateFn:
return base.as_agg_fn(self.__class__, self.center)

def __eq__(self, other: 'RRegression') -> bool:
def __eq__(self, other: '_PartialCrossFeatureStats') -> bool:
return (
self.num_samples == other.num_samples
and self.sum_x == other.sum_x
Expand All @@ -684,7 +661,9 @@ def __eq__(self, other: 'RRegression') -> bool:
and self.sum_xy == other.sum_xy
)

def add(self, x: types.NumbersT, y: types.NumbersT) -> 'RRegression':
def add(
self, x: types.NumbersT, y: types.NumbersT
) -> '_PartialCrossFeatureStats':
"""Updates the Class with the given batch.

Args:
Expand All @@ -706,7 +685,9 @@ def add(self, x: types.NumbersT, y: types.NumbersT) -> 'RRegression':

return self

def merge(self, other: 'RRegression') -> 'RRegression':
def merge(
self, other: '_PartialCrossFeatureStats'
) -> '_PartialCrossFeatureStats':
self.num_samples += other.num_samples
self.sum_x += other.sum_x
self.sum_y += other.sum_y
Expand All @@ -716,6 +697,74 @@ def merge(self, other: 'RRegression') -> 'RRegression':

return self

  @abc.abstractmethod
  def result(self) -> types.NumbersT:
    """Computes the final metric value from the accumulated partial sums.

    Subclasses combine `num_samples`, `sum_x`, `sum_y`, `sum_xx`, `sum_yy`,
    and `sum_xy` into their specific cross-feature statistic (e.g. covariance
    or Pearson correlation).

    Returns:
      The metric value; scalar or vector depending on the shape of the
      accumulated sums.
    """
    pass


class Covariance(_PartialCrossFeatureStats):
  """Streaming covariance of two paired sets of data.

  Covariance = E[(X-E[X]) * (Y-E[Y])] = E[XY] - E[X] * E[Y]
  https://en.wikipedia.org/wiki/Covariance
  """

  def as_agg_fn(self) -> base.AggregateFn:
    """Wraps this metric as an AggregateFn seeded with the current sums."""
    partial_sums = dict(
        sum_x=self.sum_x,
        sum_y=self.sum_y,
        sum_xx=self.sum_xx,
        sum_yy=self.sum_yy,
        sum_xy=self.sum_xy,
    )
    return base.as_agg_fn(self.__class__, **partial_sums)

  def result(self) -> types.NumbersT:
    """Returns the covariance accumulated so far (population, ddof=0)."""
    # TODO: b/417267344 - Implement Delta Degrees of Freedom. Here, ddof is
    # always 0.

    # Covariance = E[(X-E[X]) * (Y-E[Y])] = E[XY] - E[X] * E[Y]
    # = [sum(XY) - sum(X) * sum(Y) / num_samples] / num_samples
    n = self.num_samples
    return (self.sum_xy - self.sum_x * self.sum_y / n) / n


@dataclasses.dataclass
class RRegression(_PartialCrossFeatureStats):
"""Computes the Pearson Correlation Coefficient (PCC).

The Pearson correlation coefficient (PCC) is a correlation coefficient that
measures linear correlation between two sets of data. It is the ratio between
the covariance of two variables and the product of their standard deviations;
thus, it is essentially a normalized measurement of the covariance, such that
the result always has a value between -1 and 1. As with covariance itself, the
measure can only reflect a linear correlation of variables, and ignores many
other types of relationships or correlations. As a simple example, one would
expect the age and height of a sample of teenagers from a high school to have
a Pearson correlation coefficient significantly greater than 0, but less than
1 (as 1 would represent an unrealistically perfect correlation).

https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
"""

# If True, center the data matrix x and the target vector y.
# The centered r-regression is the "Pearson's Correlation".
# The not-centered r-regression is the "Reflective Correlation".
center: bool = True

def as_agg_fn(self) -> base.AggregateFn:
return base.as_agg_fn(
self.__class__,
sum_x=self.sum_x,
sum_y=self.sum_y,
sum_xx=self.sum_xx,
sum_yy=self.sum_yy,
sum_xy=self.sum_xy,
center=self.center,
)

def result(self) -> types.NumbersT:
"""Calculates the Pearson Correlation Coefficient (PCC).

Expand Down
74 changes: 66 additions & 8 deletions ml_metrics/_src/aggregates/rolling_stats_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1457,21 +1457,17 @@ def test_r2_tjur_relative_returns_nan(self, y_true, y_pred):
)


class RRegressionTest(parameterized.TestCase):
class PartialCrossFeatureStatsTest(absltest.TestCase):

@parameterized.named_parameters(
dict(testcase_name='centered', center=True),
dict(testcase_name='not_centered', center=False),
)
def test_r_regression_merge(self, center):
def test_partial_cross_feature_stats_merge(self):
x_1 = (1, 2, 3, 4)
y_1 = (10, 9, 2.5, 6)

x_2 = (5, 6, 7)
y_2 = (4, 3, 2)

state_1 = rolling_stats.RRegression(center=center).add(x_1, y_1)
state_2 = rolling_stats.RRegression(center=center).add(x_2, y_2)
state_1 = rolling_stats.RRegression().add(x_1, y_1)
state_2 = rolling_stats.RRegression().add(x_2, y_2)
result = state_1.merge(state_2)

expected_result = rolling_stats.RRegression(
Expand All @@ -1485,6 +1481,68 @@ def test_r_regression_merge(self, center):

self.assertEqual(result, expected_result)


class CovarianceTest(absltest.TestCase):
  """Tests for rolling_stats.Covariance."""

  def test_covariance_single_output(self):
    xs = (1, 2, 3, 4, 5, 6, 7)
    ys = (10, 9, 2.5, 6, 4, 3, 2)

    # covariance(X, Y) = [sum(XY) - sum(X) * sum(Y) / num_samples] / num_samples
    # = (111.5 - 28 * 36.5 / 7) / 7 = -4.928571428571429
    metric = rolling_stats.Covariance()
    metric.add(xs, ys)

    self.assertAlmostEqual(metric.result(), -4.928571428571429, places=10)

  def test_covariance_single_output_as_agg_fn(self):
    xs = (1, 2, 3, 4, 5, 6, 7)
    ys = (10, 9, 2.5, 6, 4, 3, 2)

    # Same expectation as test_covariance_single_output, but driven through
    # the AggregateFn wrapper:
    # covariance(X, Y) = (111.5 - 28 * 36.5 / 7) / 7 = -4.928571428571429
    agg_fn = rolling_stats.Covariance().as_agg_fn()

    self.assertAlmostEqual(agg_fn(xs, ys), -4.928571428571429, places=10)

  def test_covariance_multi_output(self):
    x1 = (10, 9, 2.5, 6, 4, 3, 2)
    x2 = (8, 6, 7, 5, 3, 0, 9)
    ys = (1, 2, 3, 4, 5, 6, 7)
    # Columns of the stacked matrix are the two features.
    stacked_x = np.array((x1, x2)).T

    # covariance(X, Y) = [sum(XY) - sum(X) * sum(Y) / num_samples] / num_samples
    # covariance(x1, y) = (111.5 - 36.5 * 28 / 7) / 7 = -4.928571428571429
    # covariance(x2, y) = (139 - 38 * 28 / 7) / 7 = -1.8571428571428572
    expected = (-4.928571428571429, -1.8571428571428572)

    actual = rolling_stats.Covariance().add(stacked_x, ys).result()

    np.testing.assert_almost_equal(actual, expected)

  def test_covariance_multi_output_as_agg_fn(self):
    x1 = (10, 9, 2.5, 6, 4, 3, 2)
    x2 = (8, 6, 7, 5, 3, 0, 9)
    ys = (1, 2, 3, 4, 5, 6, 7)
    stacked_x = np.array((x1, x2)).T

    # covariance(X, Y) = [sum(XY) - sum(X) * sum(Y) / num_samples] / num_samples
    # covariance(x1, y) = (111.5 - 36.5 * 28 / 7) / 7 = -4.928571428571429
    # covariance(x2, y) = (139 - 38 * 28 / 7) / 7 = -1.8571428571428572
    expected = (-4.928571428571429, -1.8571428571428572)

    actual = rolling_stats.Covariance().as_agg_fn()(stacked_x, ys)

    np.testing.assert_almost_equal(actual, expected)


class RRegressionTest(parameterized.TestCase):

@parameterized.named_parameters(
dict(
testcase_name='centered',
Expand Down
Loading