diff --git a/dpsynth/discrete_mechanisms/independent.py b/dpsynth/discrete_mechanisms/independent.py index 6485669..0791c58 100644 --- a/dpsynth/discrete_mechanisms/independent.py +++ b/dpsynth/discrete_mechanisms/independent.py @@ -72,8 +72,11 @@ def __call__( attributes = len(data.domain) per_query_sigma = self.gdp_sigma * attributes**0.5 measurements = initial_measurements or [] + existing_cliques = {m.clique for m in measurements} for attr in data.domain: clique = (attr,) + if clique in existing_cliques: + continue marginal = data.project(clique).datavector() noisy_marginal = ( marginal + rng.normal(size=marginal.shape) * per_query_sigma diff --git a/dpsynth/domain.py b/dpsynth/domain.py index 67458df..a3793cf 100644 --- a/dpsynth/domain.py +++ b/dpsynth/domain.py @@ -193,7 +193,7 @@ def standardize(self, value: Any) -> int | float | None: low_value = invalid_value = self.min_value high_value = self.max_value else: - low_value = high_value = invalid_value = None + low_value = high_value = invalid_value = float('nan') try: value = float(value) # works for anything that converts to float. if math.isnan(value): diff --git a/dpsynth/local_mode/primitives.py b/dpsynth/local_mode/primitives.py index 37b65eb..03bfd7b 100644 --- a/dpsynth/local_mode/primitives.py +++ b/dpsynth/local_mode/primitives.py @@ -130,6 +130,8 @@ def _median( """ if lower > upper: raise ValueError(f'{lower=} cannot be greater than {upper=}.') + if lower == upper: + return lower clamped_data = np.clip(data, lower, upper) n = clamped_data.size @@ -526,8 +528,15 @@ def __call__(self, rng: np.random.Generator, data: np.ndarray) -> list[float]: """Computes differentially private quantiles.""" if self._epsilon_levels is None: raise ValueError(_UNCALIBRATED_MSG.format(param='_epsilon_levels')) + # Filter non-finite values (NaN and ±inf) — NaN represents missing data, and + # none of them can participate in the exponential mechanism's interval scoring. 
+ finite_data = data[np.isfinite(data.astype(float))] return _quantiles( - rng, data, self.lower, self.upper, np.asarray(self._epsilon_levels) + rng, + finite_data, + self.lower, + self.upper, + np.asarray(self._epsilon_levels), ) diff --git a/dpsynth/pipeline_transformations/dp_auto_discretizer.py b/dpsynth/pipeline_transformations/dp_auto_discretizer.py index 7e03cdc..c95f5c9 100644 --- a/dpsynth/pipeline_transformations/dp_auto_discretizer.py +++ b/dpsynth/pipeline_transformations/dp_auto_discretizer.py @@ -14,6 +14,7 @@ """Use DP mechanism to automatically dsicretize numerical data.""" +import math from typing import Any from dpsynth import domain @@ -102,9 +103,11 @@ def _quantiles( def extract_and_normalize_fields(row): for field_name, attribute in field_name_to_attribute.items(): value = attribute.standardize(row[field_name]) - if value is not None: + if value is not None and not ( + isinstance(value, float) and math.isnan(value) + ): normalizer = attribute.max_value - attribute.min_value - yield field_name, (row[field_name] - attribute.min_value) / normalizer + yield field_name, (value - attribute.min_value) / normalizer extracted_fields = backend.flat_map( pcol, extract_and_normalize_fields, "Extract and scale fields" diff --git a/dpsynth/transformations.py b/dpsynth/transformations.py index 3f53cab..c7fe937 100644 --- a/dpsynth/transformations.py +++ b/dpsynth/transformations.py @@ -173,7 +173,7 @@ def create_discretize_transformation( def transform(value: Any) -> pd.Interval | None: value = attribute_domain.standardize(value) - if value is None: + if value is None or (isinstance(value, float) and math.isnan(value)): return None return intervals[intervals.get_loc(value)] diff --git a/tests/discrete_mechanisms/independent_test.py b/tests/discrete_mechanisms/independent_test.py index e3668b7..6a1cf86 100644 --- a/tests/discrete_mechanisms/independent_test.py +++ b/tests/discrete_mechanisms/independent_test.py @@ -21,7 +21,7 @@ class 
IndependentTest(absltest.TestCase): def test_fits_one_way_marginals(self): - data = mbi.Dataset.synthetic(mbi.Domain(["a", "b", "c"], [3, 4, 5]), N=1000) + data = mbi.Dataset.synthetic(mbi.Domain(['a', 'b', 'c'], [3, 4, 5]), N=1000) config = independent.IndependentMechanism(pgm_iters=500) synthetic = config.calibrate(zcdp_rho=10000)(np.random.default_rng(0), data) @@ -31,6 +31,20 @@ def test_fits_one_way_marginals(self): actual = synthetic.project([col]).datavector() np.testing.assert_allclose(actual, expected, atol=0.1) + def test_skips_duplicate_cliques_from_initial_measurements(self): + """IndependentMechanism should not re-measure pre-measured cliques.""" + data = mbi.Dataset.synthetic(mbi.Domain(['a', 'b', 'c'], [3, 4, 5]), N=100) + # Pre-measure column 'a'. + marginal_a = data.project(('a',)).datavector() + initial = [mbi.LinearMeasurement(marginal_a, ('a',), stddev=1.0)] -if __name__ == "__main__": + config = independent.IndependentMechanism(pgm_iters=500) + # This should not raise 'Cliques must be unique'. 
+ model = config.calibrate(zcdp_rho=100.0)( + np.random.default_rng(0), data, initial_measurements=initial + ) + self.assertIsNotNone(model) + + +if __name__ == '__main__': absltest.main() diff --git a/tests/domain_test.py b/tests/domain_test.py index 5cae77a..046fd64 100644 --- a/tests/domain_test.py +++ b/tests/domain_test.py @@ -98,7 +98,7 @@ def test_standardize_numerical(self): self.assertEqual(attribute.standardize(value), value) for value in ood_values: - self.assertIsNone(attribute.standardize(value)) + self.assertTrue(math.isnan(attribute.standardize(value))) def test_freeform_text_defaults(self): attribute = domain.FreeFormTextAttribute() diff --git a/tests/local_mode/primitives_test.py b/tests/local_mode/primitives_test.py index 5257c68..2a7617c 100644 --- a/tests/local_mode/primitives_test.py +++ b/tests/local_mode/primitives_test.py @@ -354,6 +354,28 @@ def test_dp_event_single_partition(self): self.assertIsInstance(event, dp_accounting.ComposedDpEvent) self.assertEmpty(event.events) + def test_quantiles_filters_nan(self): + """DPQuantiles should handle NaN in input data without crashing.""" + data = np.array([1.0, np.nan, 3.0, np.nan, 5.0]) + mechanism = primitives.DPQuantiles(lower=0.0, upper=10.0, num_partitions=4) + calibrated = mechanism.calibrate(zcdp_rho=10.0) + edges = calibrated(self.rng, data) + self.assertLen(edges, 3) + for e in edges: + self.assertBetween(e, 0.0, 10.0) + + def test_median_zero_length_intervals(self): + """_median should handle degenerate lower == upper from recursive splits.""" + data = np.array([5.0, 5.0, 5.0, 5.0]) + med = primitives._median(self.rng, data, lower=5.0, upper=5.0, epsilon=0.01) + self.assertEqual(med, 5.0) + + def test_median_data_at_boundaries(self): + """_median handles data near boundaries where jitter gets clipped back.""" + data = np.array([0.0, 0.0, 0.0, 5.0, 10.0, 10.0]) + med = primitives._median(self.rng, data, lower=0.0, upper=10.0, epsilon=1.0) + self.assertBetween(med, 0.0, 10.0) + class 
DPGaussianHistogramTest(absltest.TestCase):