Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions dpsynth/discrete_mechanisms/independent.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,11 @@ def __call__(
attributes = len(data.domain)
per_query_sigma = self.gdp_sigma * attributes**0.5
measurements = initial_measurements or []
existing_cliques = {m.clique for m in measurements}
for attr in data.domain:
clique = (attr,)
if clique in existing_cliques:
continue
marginal = data.project(clique).datavector()
noisy_marginal = (
marginal + rng.normal(size=marginal.shape) * per_query_sigma
Expand Down
2 changes: 1 addition & 1 deletion dpsynth/domain.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def standardize(self, value: Any) -> int | float | None:
low_value = invalid_value = self.min_value
high_value = self.max_value
else:
low_value = high_value = invalid_value = None
low_value = high_value = invalid_value = float('nan')
try:
value = float(value) # works for anything that converts to float.
if math.isnan(value):
Expand Down
11 changes: 10 additions & 1 deletion dpsynth/local_mode/primitives.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ def _median(
"""
if lower > upper:
raise ValueError(f'{lower=} cannot be greater than {upper=}.')
if lower == upper:
return lower

clamped_data = np.clip(data, lower, upper)
n = clamped_data.size
Expand Down Expand Up @@ -526,8 +528,15 @@ def __call__(self, rng: np.random.Generator, data: np.ndarray) -> list[float]:
"""Computes differentially private quantiles."""
if self._epsilon_levels is None:
raise ValueError(_UNCALIBRATED_MSG.format(param='_epsilon_levels'))
# Filter non-finite values. NaN represents missing data and cannot
# participate in the exponential mechanism's interval scoring; note that
# np.isfinite also drops +/-inf, not only NaN.
finite_data = data[np.isfinite(data.astype(float))]
return _quantiles(
rng, data, self.lower, self.upper, np.asarray(self._epsilon_levels)
rng,
finite_data,
self.lower,
self.upper,
np.asarray(self._epsilon_levels),
)


Expand Down
7 changes: 5 additions & 2 deletions dpsynth/pipeline_transformations/dp_auto_discretizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

"""Use DP mechanism to automatically dsicretize numerical data."""

import math
from typing import Any

from dpsynth import domain
Expand Down Expand Up @@ -102,9 +103,11 @@ def _quantiles(
def extract_and_normalize_fields(row):
for field_name, attribute in field_name_to_attribute.items():
value = attribute.standardize(row[field_name])
if value is not None:
if value is not None and not (
isinstance(value, float) and math.isnan(value)
):
normalizer = attribute.max_value - attribute.min_value
yield field_name, (row[field_name] - attribute.min_value) / normalizer
yield field_name, (value - attribute.min_value) / normalizer

extracted_fields = backend.flat_map(
pcol, extract_and_normalize_fields, "Extract and scale fields"
Expand Down
2 changes: 1 addition & 1 deletion dpsynth/transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def create_discretize_transformation(

def transform(value: Any) -> pd.Interval | None:
value = attribute_domain.standardize(value)
if value is None:
if value is None or (isinstance(value, float) and math.isnan(value)):
return None
return intervals[intervals.get_loc(value)]

Expand Down
18 changes: 16 additions & 2 deletions tests/discrete_mechanisms/independent_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
class IndependentTest(absltest.TestCase):

def test_fits_one_way_marginals(self):
data = mbi.Dataset.synthetic(mbi.Domain(["a", "b", "c"], [3, 4, 5]), N=1000)
data = mbi.Dataset.synthetic(mbi.Domain(['a', 'b', 'c'], [3, 4, 5]), N=1000)

config = independent.IndependentMechanism(pgm_iters=500)
synthetic = config.calibrate(zcdp_rho=10000)(np.random.default_rng(0), data)
Expand All @@ -31,6 +31,20 @@ def test_fits_one_way_marginals(self):
actual = synthetic.project([col]).datavector()
np.testing.assert_allclose(actual, expected, atol=0.1)

def test_skips_duplicate_cliques_from_initial_measurements(self):
"""IndependentMechanism should not re-measure pre-measured cliques."""
data = mbi.Dataset.synthetic(mbi.Domain(['a', 'b', 'c'], [3, 4, 5]), N=100)
# Pre-measure column 'a'.
marginal_a = data.project(('a',)).datavector()
initial = [mbi.LinearMeasurement(marginal_a, ('a',), stddev=1.0)]

if __name__ == "__main__":
config = independent.IndependentMechanism(pgm_iters=500)
# This should not raise 'Cliques must be unique'.
model = config.calibrate(zcdp_rho=100.0)(
np.random.default_rng(0), data, initial_measurements=initial
)
self.assertIsNotNone(model)


if __name__ == '__main__':
absltest.main()
2 changes: 1 addition & 1 deletion tests/domain_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def test_standardize_numerical(self):
self.assertEqual(attribute.standardize(value), value)

for value in ood_values:
self.assertIsNone(attribute.standardize(value))
self.assertTrue(math.isnan(attribute.standardize(value)))

def test_freeform_text_defaults(self):
attribute = domain.FreeFormTextAttribute()
Expand Down
22 changes: 22 additions & 0 deletions tests/local_mode/primitives_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,28 @@ def test_dp_event_single_partition(self):
self.assertIsInstance(event, dp_accounting.ComposedDpEvent)
self.assertEmpty(event.events)

def test_quantiles_filters_nan(self):
  """Checks that DPQuantiles runs on input containing NaN entries.

  The calibrated mechanism is invoked on an array mixing finite values and
  NaNs; the test expects three edges, each inside the configured bounds.
  """
  lower_bound, upper_bound = 0.0, 10.0
  samples = np.array([1.0, np.nan, 3.0, np.nan, 5.0])
  calibrated = primitives.DPQuantiles(
      lower=lower_bound, upper=upper_bound, num_partitions=4
  ).calibrate(zcdp_rho=10.0)

  edges = calibrated(self.rng, samples)

  # Four requested partitions are expected to yield three edges.
  self.assertLen(edges, 3)
  for edge in edges:
    self.assertBetween(edge, lower_bound, upper_bound)

def test_median_zero_length_intervals(self):
  """Checks _median on a degenerate interval where lower equals upper.

  With both bounds set to the same value, the result is expected to be
  exactly that value.
  """
  bound = 5.0
  samples = np.array([bound, bound, bound, bound])
  result = primitives._median(
      self.rng, samples, lower=bound, upper=bound, epsilon=0.01
  )
  self.assertEqual(result, bound)

def test_median_data_at_boundaries(self):
  """Checks _median stays within bounds when data sits on the boundaries.

  Most samples coincide with the lower or upper bound; the returned median
  must still fall inside [lower, upper].
  """
  lower_bound, upper_bound = 0.0, 10.0
  samples = np.array(
      [lower_bound, lower_bound, lower_bound, 5.0, upper_bound, upper_bound]
  )
  result = primitives._median(
      self.rng, samples, lower=lower_bound, upper=upper_bound, epsilon=1.0
  )
  self.assertBetween(result, lower_bound, upper_bound)


class DPGaussianHistogramTest(absltest.TestCase):

Expand Down
Loading