Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.15.4
hooks:
- id: ruff
args: [--fix]
- id: ruff-format

- repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook
rev: v9.21.0
hooks:
- id: commitlint
stages: [commit-msg]
additional_dependencies: ["@commitlint/config-conventional"]
20 changes: 10 additions & 10 deletions benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,11 +194,11 @@ def benchmark_single(forge: DataForge, iterations: int = 10_000) -> None:
("llm.token_count()", forge.llm.token_count),
("llm.cost_estimate()", forge.llm.cost_estimate),
("llm.rate_limit_header()", forge.llm.rate_limit_header),
# AI Chat (compound)
("ai_chat.chat_role()", forge.ai_chat.chat_role),
("ai_chat.chat_model()", forge.ai_chat.chat_model),
("ai_chat.chat_content()", forge.ai_chat.chat_content),
("ai_chat.chat_tokens()", forge.ai_chat.chat_tokens),
# AI Chat (on LLM provider)
("llm.chat_role()", forge.llm.chat_role),
("llm.chat_model()", forge.llm.chat_model),
("llm.chat_content()", forge.llm.chat_content),
("llm.chat_tokens()", forge.llm.chat_tokens),
# Social Media
("social_media.platform()", forge.social_media.platform),
("social_media.username()", forge.social_media.username),
Expand Down Expand Up @@ -384,11 +384,11 @@ def benchmark_batch(forge: DataForge) -> None:
("llm.token_count(count=N)", forge.llm.token_count),
("llm.cost_estimate(count=N)", forge.llm.cost_estimate),
("llm.rate_limit_header(count=N)", forge.llm.rate_limit_header),
# AI Chat (compound)
("ai_chat.chat_role(count=N)", forge.ai_chat.chat_role),
("ai_chat.chat_model(count=N)", forge.ai_chat.chat_model),
("ai_chat.chat_content(count=N)", forge.ai_chat.chat_content),
("ai_chat.chat_tokens(count=N)", forge.ai_chat.chat_tokens),
# AI Chat (on LLM provider)
("llm.chat_role(count=N)", forge.llm.chat_role),
("llm.chat_model(count=N)", forge.llm.chat_model),
("llm.chat_content(count=N)", forge.llm.chat_content),
("llm.chat_tokens(count=N)", forge.llm.chat_tokens),
# Social Media
("social_media.platform(count=N)", forge.social_media.platform),
("social_media.username(count=N)", forge.social_media.username),
Expand Down
9 changes: 7 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,18 @@ science = "dataforge.providers.science"
text = "dataforge.providers.text"
ai_prompt = "dataforge.providers.ai_prompt"
llm = "dataforge.providers.llm"
ai_chat = "dataforge.providers.ai_chat"

[project.entry-points.pytest11]
dataforge = "dataforge.pytest_plugin"

[dependency-groups]
dev = ["pytest>=8.0", "pytest-asyncio>=0.24", "ruff>=0.9"]
dev = ["pytest>=8.0", "pytest-asyncio>=0.24", "ruff>=0.15.4,<0.16"]

[tool.ruff]
target-version = "py312"

[tool.ruff.lint]
select = ["E4", "E7", "E9", "F"]

[tool.uv.build-backend]
module-name = "dataforge"
Expand Down
100 changes: 5 additions & 95 deletions src/dataforge/anonymizer.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,4 @@
"""Data anonymization — deterministic PII replacement with referential integrity.

Replaces personally identifiable information (PII) with realistic fake
data using deterministic HMAC-SHA256 seeding for consistency: the same
real value always maps to the same fake value across tables and runs.

Usage::

from dataforge import DataForge
from dataforge.anonymizer import Anonymizer

forge = DataForge(seed=42)
anon = Anonymizer(forge, secret="my-secret-key")

# Anonymize a list of dicts
original = [
{"name": "Alice Smith", "email": "alice@real.com", "ssn": "123-45-6789"},
{"name": "Bob Jones", "email": "bob@real.com", "ssn": "987-65-4321"},
]
anonymized = anon.anonymize(original, fields={
"name": "full_name",
"email": "email",
"ssn": "ssn",
})

# Streaming CSV anonymization
anon.anonymize_csv("input.csv", "output.csv", fields={...})
"""
"""Data anonymization — deterministic PII replacement with referential integrity."""

from __future__ import annotations

Expand All @@ -38,36 +11,20 @@


class Anonymizer:
"""Deterministic PII anonymizer with consistent value mappings.

Uses HMAC-SHA256 to derive deterministic seeds from (secret + original_value),
ensuring the same input always produces the same fake output. This
preserves referential integrity across tables automatically.

Parameters
----------
forge : DataForge
The DataForge instance for generating fake values.
secret : str
Secret key for HMAC derivation. Different secrets produce
different anonymizations. Keep this secret to prevent
de-anonymization.
"""
"""Deterministic PII anonymizer with consistent value mappings."""

__slots__ = ("_forge", "_secret", "_cache", "_field_methods")

def __init__(self, forge: DataForge, secret: str = "dataforge-anonymizer") -> None:
self._forge = forge
self._secret = secret.encode("utf-8")
self._cache: dict[tuple[str, str], Any] = {} # (field, original) → fake
# Cache resolved field methods to avoid repeated _resolve_field calls
self._cache: dict[tuple[str, str], Any] = {}
self._field_methods: dict[str, Any] = {}

def _derive_seed(self, field: str, value: str) -> int:
"""Derive a deterministic integer seed from field name and value."""
msg = f"{field}:{value}".encode("utf-8")
digest = _hmac.new(self._secret, msg, _hashlib.sha256).digest()
# Use first 8 bytes as seed (64-bit)
return int.from_bytes(digest[:8], "big")

def _get_method(self, field: str) -> Any:
Expand Down Expand Up @@ -95,17 +52,11 @@ def _generate_fake(self, field: str, original_value: Any) -> Any:

seed = self._derive_seed(field, str_val)

# Instead of creating a full DataForge copy, re-seed the RNG
# of a lightweight forge copy. We use copy() only once and
# rely on the cache to amortize the cost.
method = self._get_method(field)
if method is not None:
# Save and restore the forge's RNG state to get deterministic output
# without creating a new forge instance.
import random as _random_mod

temp_rng = _random_mod.Random(seed)
# Swap the engine's RNG temporarily for deterministic generation
engine = self._forge._engine
orig_rng = engine._rng
engine._rng = temp_rng
Expand All @@ -114,7 +65,6 @@ def _generate_fake(self, field: str, original_value: Any) -> Any:
finally:
engine._rng = orig_rng
else:
# Fallback: just hash the value
fake_val = (
_hmac.new(
self._secret, str_val.encode("utf-8"), _hashlib.sha256
Expand All @@ -123,11 +73,9 @@ def _generate_fake(self, field: str, original_value: Any) -> Any:
else ""
)

# Format-preserving for emails
if field in ("email", "internet.email") and isinstance(original_value, str):
fake_val = self._format_preserve_email(fake_val, original_value)

# Format-preserving for phone numbers
if field in ("phone_number", "phone.phone_number") and isinstance(
original_value, str
):
Expand All @@ -142,7 +90,6 @@ def _format_preserve_email(fake: Any, original: str) -> str:
fake_str = str(fake)
if "@" in fake_str:
return fake_str
# If fake doesn't have @, construct one
if "@" in original:
_, domain = original.rsplit("@", 1)
return f"{fake_str}@{domain}"
Expand All @@ -152,10 +99,8 @@ def _format_preserve_email(fake: Any, original: str) -> str:
def _format_preserve_phone(fake: Any, original: str) -> str:
"""Try to preserve phone number format (length and separators)."""
fake_str = str(fake)
# If lengths match, return as-is
if len(fake_str) == len(original):
return fake_str
# Try to match the original format
result = []
fake_digits = [c for c in fake_str if c.isdigit()]
d_idx = 0
Expand All @@ -175,21 +120,7 @@ def anonymize(
rows: list[dict[str, Any]],
fields: dict[str, str],
) -> list[dict[str, Any]]:
"""Anonymize a list of row dicts.

Parameters
----------
rows : list[dict[str, Any]]
Input rows (not modified in place).
fields : dict[str, str]
Mapping of column name → DataForge field name.
Only specified columns are anonymized; others pass through.

Returns
-------
list[dict[str, Any]]
Anonymized rows.
"""
"""Anonymize a list of row dicts."""
result: list[dict[str, Any]] = []
for row in rows:
new_row = dict(row)
Expand All @@ -210,28 +141,7 @@ def anonymize_csv(
encoding: str = "utf-8",
batch_size: int = 1000,
) -> int:
"""Anonymize a CSV file in streaming fashion.

Parameters
----------
input_path : str
Path to input CSV.
output_path : str
Path to output CSV.
fields : dict[str, str]
Column → DataForge field mappings.
delimiter : str
CSV delimiter.
encoding : str
File encoding.
batch_size : int
Rows to process per batch.

Returns
-------
int
Number of rows processed.
"""
"""Anonymize a CSV file in streaming fashion."""
import csv

total = 0
Expand Down
Loading
Loading