Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions dpsynth/domain.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,13 @@ def exclusive_min_value(self) -> float:
return self.min_value - 1
return math.nextafter(self.min_value, -math.inf)

@property
def exclusive_max_value(self) -> float:
    """Returns the smallest value strictly greater than ``max_value``.

    Integer attributes step up by one; every other dtype steps to the next
    representable float in the direction of +inf.
    """
    if self.dtype != 'int':
        return math.nextafter(self.max_value, math.inf)
    return self.max_value + 1

def standardize(self, value: Any) -> int | float | None:
"""Standardizes a value to one of the possible values."""
if self.clip_to_range:
Expand Down
64 changes: 58 additions & 6 deletions dpsynth/local_mode/initialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,25 +79,77 @@ def calibrate(self, *, zcdp_rho: float) -> NumericalInitializer:
"""Returns a copy calibrated to the given zCDP budget."""
mechanism = primitives.DPQuantiles(
lower=self.attribute.min_value,
upper=self.attribute.max_value,
upper=self.attribute.exclusive_max_value,
num_partitions=self.num_partitions,
# Infer from attribute, not data.dtype: NaN promotes int to float.
integer_jitter=self.attribute.dtype == 'int',
).calibrate(zcdp_rho=zcdp_rho)
return dataclasses.replace(self, mechanism=mechanism)

@property
def _zcdp_rho(self) -> float:
    """Total zCDP rho of the calibrated mechanism.

    Each sub-event of the composed privacy event contributes
    ``epsilon ** 2 / 8``; the contributions are summed.
    """
    # Reading dp_event raises if the initializer has not been calibrated.
    composed = self.dp_event
    # Narrow the type: only a composed event exposes ``.events``.
    assert isinstance(composed, dp_accounting.ComposedDpEvent)
    per_event_rho = (sub.epsilon**2 / 8.0 for sub in composed.events)
    return sum(per_event_rho)

@property
def dp_event(self) -> dp_accounting.DpEvent:
    """Privacy event of the underlying quantile mechanism.

    The mechanism is passed through ``_validate_mechanism`` first, and the
    ``dp_event`` of whatever that returns is handed back unchanged.
    """
    checked_mechanism = _validate_mechanism(self.mechanism)
    return checked_mechanism.dp_event

def __call__(
    self,
    rng: np.random.Generator,
    data: np.ndarray,
    *,
    estimated_total: float | None = None,
) -> ColumnMeasurement:
    """Returns a ColumnMeasurement with the discretization transform.

    Args:
      rng: A numpy random number generator.
      data: 1D array of numerical data.
      estimated_total: If provided, a heuristic one-way measurement is included
        assuming a uniform distribution over the original bins.

    Returns:
      A ColumnMeasurement with bin edges and optionally a heuristic measurement.
    """
    raw_edges = _validate_mechanism(self.mechanism)(rng, data)
    raw_edges = np.asarray(raw_edges, dtype=float)
    if self.attribute.dtype == 'int':
        # Snap edges to the integer lattice. Bins are right-closed (left,
        # right] and discretize uses searchsorted with side='left', so
        # floor preserves the partition: edge 4.7 → floor 4 gives the
        # same integer split {≤4} | {≥5} via (…, 4] | (4, …].
        raw_edges = np.floor(raw_edges)

    # Dedup: concentrated data can make quantiles return duplicate edges.
    # The multiplicity of each surviving edge records how many original
    # quantile bins were merged into the bin that ends at that edge.
    bin_edges, edge_counts = np.unique(raw_edges, return_counts=True)

    # The mechanism's upper bound is the *exclusive* maximum, so edges can
    # land at or above max_value (e.g. max_value and max_value + 1 after
    # floor). Drop every such edge -- not only the last one -- and fold their
    # counts into the final bin, so categorical_attribute_from_edges never
    # sees a degenerate (max_value, max_value] or inverted tail bin.
    tail_mask = bin_edges >= self.attribute.max_value
    tail_count = edge_counts[tail_mask].sum()
    bin_edges = bin_edges[~tail_mask]
    edge_counts = edge_counts[~tail_mask]
    # The final bin always accounts for itself (+1) plus any absorbed edges.
    bin_weights = np.append(edge_counts, tail_count + 1)

    cat_attr = vtx.categorical_attribute_from_edges(bin_edges, self.attribute)

    measurement = None
    if estimated_total is not None:
        rho = self._zcdp_rho
        # Spread the estimated total uniformly over the original quantile
        # bins; merged bins receive proportionally more mass.
        # NOTE(review): bin_weights has one entry per interval; confirm this
        # matches cat_attr's size when an extra out-of-domain category is
        # present (clip_to_range=False).
        uniform_counts = bin_weights * (estimated_total / self.num_partitions)
        stddev = 1.0 / np.sqrt(rho)
        measurement = mbi.LinearMeasurement(
            uniform_counts, (self.name,), stddev=stddev
        )

    return ColumnMeasurement(cat_attr, bin_edges, measurement=measurement)


@dataclasses.dataclass
Expand Down
34 changes: 28 additions & 6 deletions dpsynth/local_mode/primitives.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def _median(
upper: float,
epsilon: float,
jitter_multiple: float = 1e-4,
integer_jitter: bool = False,
) -> float:
"""Computes a differentially private median using the exponential mechanism.

Expand All @@ -124,6 +125,8 @@ def _median(
upper: Upper bound for the data.
epsilon: Exponential mechanism privacy parameter.
jitter_multiple: Multiplier for the jitter scale, relative to upper-lower.
integer_jitter: If True, use positive jitter U(0, 0.5) so that integer data
points are never pushed across an integer boundary before floor.

Returns:
A differentially private median estimate.
Expand All @@ -139,10 +142,13 @@ def _median(
return (lower + upper) / 2
return float(np.median(clamped_data))

# Jitter size proportional to range. A small jitter makes duplicates unique
# and gives them non-zero length intervals, allowing them to be sampled.
jitter_scale = (upper - lower) * jitter_multiple
jitter = rng.uniform(-jitter_scale, jitter_scale, size=clamped_data.size)
# Jitter breaks ties, giving duplicate data points non-zero length intervals.
if integer_jitter:
# Positive jitter keeps floor() from shifting integers to a neighbor.
jitter = rng.uniform(0, 0.5, size=clamped_data.size)
else:
jitter_scale = (upper - lower) * jitter_multiple
jitter = rng.uniform(-jitter_scale, jitter_scale, size=clamped_data.size)
jittered_data = np.clip(clamped_data + jitter, lower, upper)

sorted_data = np.sort(jittered_data)
Expand Down Expand Up @@ -203,6 +209,7 @@ def _quantiles(
lower: float,
upper: float,
epsilon_levels: np.ndarray,
integer_jitter: bool = False,
) -> list[float]:
"""Computes uniformly spaced differentially private quantiles.

Expand All @@ -217,6 +224,7 @@ def _quantiles(
upper: Upper bound for the data.
epsilon_levels: Per-level exponential mechanism epsilons, as returned by
``_quantile_epsilon_levels``.
integer_jitter: If True, use positive jitter U(0, 0.5) for integer data.

Returns:
A list of ``2 ** len(epsilon_levels) - 1`` sorted private quantile
Expand All @@ -231,7 +239,14 @@ def quantiles_rec(current_data, curr_lower, curr_upper, current_depth):
return []

eps = epsilon_levels[current_depth - 1]
med = _median(rng, current_data, curr_lower, curr_upper, eps)
med = _median(
rng,
current_data,
curr_lower,
curr_upper,
eps,
integer_jitter=integer_jitter,
)

left_mask = current_data <= med
left_data = current_data[left_mask]
Expand Down Expand Up @@ -471,11 +486,13 @@ class DPQuantiles(DPMechanism):
lower: Lower bound for the data domain.
upper: Upper bound for the data domain.
num_partitions: Number of partitions (must be a power of 2).
integer_jitter: If True, use positive jitter U(0, 0.5) for integer data.
"""

lower: float
upper: float
num_partitions: int
integer_jitter: bool = False
_epsilon_levels: Sequence[float] | None = dataclasses.field(
default=None, repr=False
)
Expand Down Expand Up @@ -527,7 +544,12 @@ def __call__(self, rng: np.random.Generator, data: np.ndarray) -> list[float]:
if self._epsilon_levels is None:
raise ValueError(_UNCALIBRATED_MSG.format(param='_epsilon_levels'))
return _quantiles(
rng, data, self.lower, self.upper, np.asarray(self._epsilon_levels)
rng,
data,
self.lower,
self.upper,
np.asarray(self._epsilon_levels),
integer_jitter=self.integer_jitter,
)


Expand Down
6 changes: 5 additions & 1 deletion dpsynth/local_mode/vectorized_transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,11 @@ def categorical_attribute_from_edges(
"""
min_, max_ = attribute_domain.exclusive_min_value, attribute_domain.max_value
full_edges = np.r_[min_, bin_edges, max_]
intervals = [f'({l}, {r}]' for l, r in zip(full_edges[:-1], full_edges[1:])]
if attribute_domain.dtype == 'int':
e = full_edges.astype(int)
intervals = [f'[{l+1}, {r}]' for l, r in zip(e[:-1], e[1:])]
else:
intervals = [f'({l}, {r}]' for l, r in zip(full_edges[:-1], full_edges[1:])]
if not attribute_domain.clip_to_range:
intervals = ['OUT_OF_DOMAIN'] + intervals
return domain.CategoricalAttribute(intervals)
Expand Down
Loading
Loading