diff --git a/dpsynth/local_mode/initialization.py b/dpsynth/local_mode/initialization.py index cf05f3d..3c43495 100644 --- a/dpsynth/local_mode/initialization.py +++ b/dpsynth/local_mode/initialization.py @@ -157,11 +157,13 @@ class OpenSetCategoricalInitializer(primitives.DPMechanism): name: Attribute name used as the clique key in the measurement. attribute: The OpenSetCategoricalAttribute specifying the default value. delta: Failure probability for the partition selection threshold. + min_count: Minimum true count for a partition to be discovered. """ name: str attribute: domain.OpenSetCategoricalAttribute delta: float + min_count: int = 1 mechanism: primitives.DPPartitionSelection | None = dataclasses.field( default=None, repr=False ) @@ -170,6 +172,7 @@ def calibrate(self, *, zcdp_rho: float) -> OpenSetCategoricalInitializer: """Returns a copy calibrated to the given zCDP budget.""" mechanism = primitives.DPPartitionSelection( delta=self.delta, + min_count=self.min_count, ).calibrate(zcdp_rho=zcdp_rho) return dataclasses.replace(self, mechanism=mechanism) diff --git a/dpsynth/local_mode/primitives.py b/dpsynth/local_mode/primitives.py index 37b65eb..51d6976 100644 --- a/dpsynth/local_mode/primitives.py +++ b/dpsynth/local_mode/primitives.py @@ -272,6 +272,7 @@ def select_partitions_gaussian_thresholding( data: np.ndarray, gdp_budget: float, delta: float, + min_count: int = 1, ) -> tuple[np.ndarray, np.ndarray, float]: """Selects partitions using Gaussian Thresholding (Weighted Gaussian). @@ -285,8 +286,16 @@ def select_partitions_gaussian_thresholding( Under item-level DP each record is treated as a distinct user contributing to exactly one partition, so the histogram has L2 sensitivity 1. The - threshold is T = 1 + sigma * Phi^{-1}(1 - delta), following the paper's - formula with max_part = 1. 
+ threshold is T = min_count + sigma * Phi^{-1}(1 - delta), following the + paper's formula with max_part = 1 and a shift of (min_count - 1) to + account for the minimum count guarantee. + + When ``min_count > 1``, partitions with true count below ``min_count`` + are pre-filtered and the threshold shifts up accordingly. The privacy + guarantee is preserved: partitions where both neighboring datasets are + at or above ``min_count`` are covered by the Gaussian mechanism, and the + boundary case (one dataset at ``min_count - 1``, the other at + ``min_count``) is covered by the same additive delta. Args: rng: A numpy random number generator. @@ -294,6 +303,8 @@ def select_partitions_gaussian_thresholding( gdp_budget: Privacy budget in terms of squared Gaussian DP mu parameter (gdp_budget = mu^2 = 1 / sigma^2). delta: Failure probability (false positive bound per empty partition). + min_count: Minimum true count for a partition to be eligible. Partitions + with fewer occurrences in the data are never returned. Must be >= 1. Returns: A tuple containing: @@ -305,6 +316,8 @@ def select_partitions_gaussian_thresholding( """ if gdp_budget <= 0 or delta <= 0: raise ValueError(f'{gdp_budget=} and {delta=} must be positive.') + if min_count < 1: + raise ValueError(f'{min_count=} must be >= 1.') sigma = 1.0 / np.sqrt(gdp_budget) @@ -312,12 +325,20 @@ def select_partitions_gaussian_thresholding( return np.empty(0, dtype=data.dtype), np.empty(0, dtype=float), sigma unique_parts, counts = np.unique(data, return_counts=True) + + # Filter partitions below the minimum count before adding noise. + above_min = counts >= min_count + unique_parts, counts = unique_parts[above_min], counts[above_min] + if unique_parts.size == 0: + return np.empty(0, dtype=data.dtype), np.empty(0, dtype=float), sigma + noisy_counts = counts + rng.normal(scale=sigma, size=counts.size) - # Threshold: ensures that an empty partition (true count 0) passes with - # probability at most delta. 
For max_part=1 this simplifies to: - # T = 1/sqrt(1) + sigma * Phi^{-1}(1 - delta) = 1 + sigma * ppf(1-delta) - threshold = 1.0 + sigma * scipy.stats.norm.ppf(1.0 - delta) + # Threshold shifted by (min_count - 1) relative to the base formula. + # Base: T = 1 + sigma * ppf(1 - delta) gives Pr[1 + N(0, sigma^2) >= T] = delta. + # With min_count, the smallest eligible count is min_count, so + # T' = min_count + sigma * ppf(1 - delta) keeps that bound at delta. + threshold = float(min_count) + sigma * scipy.stats.norm.ppf(1.0 - delta) passed = noisy_counts >= threshold return unique_parts[passed], noisy_counts[passed], sigma @@ -574,10 +595,12 @@ class DPPartitionSelection(DPMechanism): Attributes: delta: Failure probability for the thresholding step. + min_count: Minimum true count for a partition to be returned. sigma: Gaussian noise standard deviation. Set directly or via ``calibrate``. """ delta: float + min_count: int = 1 sigma: float | None = None def calibrate(self, *, zcdp_rho: float) -> DPPartitionSelection: @@ -596,18 +619,10 @@ def dp_event(self) -> dp_accounting.DpEvent: def __call__( self, rng: np.random.Generator, data: np.ndarray ) -> tuple[np.ndarray, np.ndarray, float]: - """Runs partition selection on integer-encoded partition IDs. - - Args: - rng: A numpy random number generator. - data: 1D array of integer partition IDs. - - Returns: - A tuple of (selected_partitions, noisy_counts, sigma). 
- """ + """Runs partition selection on integer-encoded partition IDs.""" if self.sigma is None: raise ValueError(_UNCALIBRATED_MSG.format(param='sigma')) gdp_budget = np.inf if self.sigma == 0.0 else 1.0 / (self.sigma**2) return select_partitions_gaussian_thresholding( - rng, data, gdp_budget, self.delta + rng, data, gdp_budget, self.delta, min_count=self.min_count ) diff --git a/tests/local_mode/primitives_test.py b/tests/local_mode/primitives_test.py index 5257c68..3fecf32 100644 --- a/tests/local_mode/primitives_test.py +++ b/tests/local_mode/primitives_test.py @@ -272,6 +272,52 @@ def test_string_data_type(self): ) self.assertTrue(all(isinstance(p, str) for p in selected)) + def test_min_count_filters_low_count_partitions(self): + # Partition 1 has count 50, partition 2 has count 3. + data = np.array([1] * 50 + [2] * 3) + selected, _, _ = primitives.select_partitions_gaussian_thresholding( + self.rng, data, gdp_budget=10.0, delta=1e-5, min_count=5 + ) + self.assertIn(1, selected) + self.assertNotIn(2, selected) + + def test_min_count_one_matches_default(self): + data = np.array([1] * 50 + [2] * 5) + rng1 = np.random.default_rng(42) + rng2 = np.random.default_rng(42) + result1 = primitives.select_partitions_gaussian_thresholding( + rng1, data, gdp_budget=10.0, delta=1e-5 + ) + result2 = primitives.select_partitions_gaussian_thresholding( + rng2, data, gdp_budget=10.0, delta=1e-5, min_count=1 + ) + np.testing.assert_array_equal(result1[0], result2[0]) + np.testing.assert_array_equal(result1[1], result2[1]) + + def test_min_count_all_filtered_returns_empty(self): + data = np.array([1, 2, 3]) + selected, counts, _ = primitives.select_partitions_gaussian_thresholding( + self.rng, data, gdp_budget=10.0, delta=1e-5, min_count=5 + ) + self.assertEmpty(selected) + self.assertEmpty(counts) + + def test_min_count_zero_raises(self): + data = np.array([1, 2, 3]) + with self.assertRaises(ValueError): + primitives.select_partitions_gaussian_thresholding( + self.rng, data, 
gdp_budget=1.0, delta=1e-5, min_count=0 + ) + + def test_min_count_increases_threshold(self): + # With very high budget (no noise), threshold is approximately min_count. + # Partitions with count exactly at min_count should pass. + data = np.array([1] * 10 + [2] * 10) + selected, _, _ = primitives.select_partitions_gaussian_thresholding( + self.rng, data, gdp_budget=np.inf, delta=0.1, min_count=10 + ) + self.assertCountEqual(selected, [1, 2]) + class GaussianHistogramTest(absltest.TestCase):