diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..183d708 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,14 @@ +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.15.4 + hooks: + - id: ruff + args: [--fix] + - id: ruff-format + + - repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook + rev: v9.21.0 + hooks: + - id: commitlint + stages: [commit-msg] + additional_dependencies: ["@commitlint/config-conventional"] diff --git a/benchmark.py b/benchmark.py index f148be7..b3bc78b 100644 --- a/benchmark.py +++ b/benchmark.py @@ -194,11 +194,11 @@ def benchmark_single(forge: DataForge, iterations: int = 10_000) -> None: ("llm.token_count()", forge.llm.token_count), ("llm.cost_estimate()", forge.llm.cost_estimate), ("llm.rate_limit_header()", forge.llm.rate_limit_header), - # AI Chat (compound) - ("ai_chat.chat_role()", forge.ai_chat.chat_role), - ("ai_chat.chat_model()", forge.ai_chat.chat_model), - ("ai_chat.chat_content()", forge.ai_chat.chat_content), - ("ai_chat.chat_tokens()", forge.ai_chat.chat_tokens), + # AI Chat (on LLM provider) + ("llm.chat_role()", forge.llm.chat_role), + ("llm.chat_model()", forge.llm.chat_model), + ("llm.chat_content()", forge.llm.chat_content), + ("llm.chat_tokens()", forge.llm.chat_tokens), # Social Media ("social_media.platform()", forge.social_media.platform), ("social_media.username()", forge.social_media.username), @@ -384,11 +384,11 @@ def benchmark_batch(forge: DataForge) -> None: ("llm.token_count(count=N)", forge.llm.token_count), ("llm.cost_estimate(count=N)", forge.llm.cost_estimate), ("llm.rate_limit_header(count=N)", forge.llm.rate_limit_header), - # AI Chat (compound) - ("ai_chat.chat_role(count=N)", forge.ai_chat.chat_role), - ("ai_chat.chat_model(count=N)", forge.ai_chat.chat_model), - ("ai_chat.chat_content(count=N)", forge.ai_chat.chat_content), - ("ai_chat.chat_tokens(count=N)", forge.ai_chat.chat_tokens), + # AI Chat (on LLM 
provider) + ("llm.chat_role(count=N)", forge.llm.chat_role), + ("llm.chat_model(count=N)", forge.llm.chat_model), + ("llm.chat_content(count=N)", forge.llm.chat_content), + ("llm.chat_tokens(count=N)", forge.llm.chat_tokens), # Social Media ("social_media.platform(count=N)", forge.social_media.platform), ("social_media.username(count=N)", forge.social_media.username), diff --git a/pyproject.toml b/pyproject.toml index c7681ae..1532853 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,13 +51,18 @@ science = "dataforge.providers.science" text = "dataforge.providers.text" ai_prompt = "dataforge.providers.ai_prompt" llm = "dataforge.providers.llm" -ai_chat = "dataforge.providers.ai_chat" [project.entry-points.pytest11] dataforge = "dataforge.pytest_plugin" [dependency-groups] -dev = ["pytest>=8.0", "pytest-asyncio>=0.24", "ruff>=0.9"] +dev = ["pytest>=8.0", "pytest-asyncio>=0.24", "ruff>=0.15.4,<0.16"] + +[tool.ruff] +target-version = "py312" + +[tool.ruff.lint] +select = ["E4", "E7", "E9", "F"] [tool.uv.build-backend] module-name = "dataforge" diff --git a/src/dataforge/anonymizer.py b/src/dataforge/anonymizer.py index c90d5fc..9221b69 100644 --- a/src/dataforge/anonymizer.py +++ b/src/dataforge/anonymizer.py @@ -1,31 +1,4 @@ -"""Data anonymization — deterministic PII replacement with referential integrity. - -Replaces personally identifiable information (PII) with realistic fake -data using deterministic HMAC-SHA256 seeding for consistency: the same -real value always maps to the same fake value across tables and runs. 
- -Usage:: - - from dataforge import DataForge - from dataforge.anonymizer import Anonymizer - - forge = DataForge(seed=42) - anon = Anonymizer(forge, secret="my-secret-key") - - # Anonymize a list of dicts - original = [ - {"name": "Alice Smith", "email": "alice@real.com", "ssn": "123-45-6789"}, - {"name": "Bob Jones", "email": "bob@real.com", "ssn": "987-65-4321"}, - ] - anonymized = anon.anonymize(original, fields={ - "name": "full_name", - "email": "email", - "ssn": "ssn", - }) - - # Streaming CSV anonymization - anon.anonymize_csv("input.csv", "output.csv", fields={...}) -""" +"""Data anonymization — deterministic PII replacement with referential integrity.""" from __future__ import annotations @@ -38,36 +11,20 @@ class Anonymizer: - """Deterministic PII anonymizer with consistent value mappings. - - Uses HMAC-SHA256 to derive deterministic seeds from (secret + original_value), - ensuring the same input always produces the same fake output. This - preserves referential integrity across tables automatically. - - Parameters - ---------- - forge : DataForge - The DataForge instance for generating fake values. - secret : str - Secret key for HMAC derivation. Different secrets produce - different anonymizations. Keep this secret to prevent - de-anonymization. 
- """ + """Deterministic PII anonymizer with consistent value mappings.""" __slots__ = ("_forge", "_secret", "_cache", "_field_methods") def __init__(self, forge: DataForge, secret: str = "dataforge-anonymizer") -> None: self._forge = forge self._secret = secret.encode("utf-8") - self._cache: dict[tuple[str, str], Any] = {} # (field, original) → fake - # Cache resolved field methods to avoid repeated _resolve_field calls + self._cache: dict[tuple[str, str], Any] = {} self._field_methods: dict[str, Any] = {} def _derive_seed(self, field: str, value: str) -> int: """Derive a deterministic integer seed from field name and value.""" msg = f"{field}:{value}".encode("utf-8") digest = _hmac.new(self._secret, msg, _hashlib.sha256).digest() - # Use first 8 bytes as seed (64-bit) return int.from_bytes(digest[:8], "big") def _get_method(self, field: str) -> Any: @@ -95,17 +52,11 @@ def _generate_fake(self, field: str, original_value: Any) -> Any: seed = self._derive_seed(field, str_val) - # Instead of creating a full DataForge copy, re-seed the RNG - # of a lightweight forge copy. We use copy() only once and - # rely on the cache to amortize the cost. method = self._get_method(field) if method is not None: - # Save and restore the forge's RNG state to get deterministic output - # without creating a new forge instance. 
import random as _random_mod temp_rng = _random_mod.Random(seed) - # Swap the engine's RNG temporarily for deterministic generation engine = self._forge._engine orig_rng = engine._rng engine._rng = temp_rng @@ -114,7 +65,6 @@ def _generate_fake(self, field: str, original_value: Any) -> Any: finally: engine._rng = orig_rng else: - # Fallback: just hash the value fake_val = ( _hmac.new( self._secret, str_val.encode("utf-8"), _hashlib.sha256 @@ -123,11 +73,9 @@ def _generate_fake(self, field: str, original_value: Any) -> Any: else "" ) - # Format-preserving for emails if field in ("email", "internet.email") and isinstance(original_value, str): fake_val = self._format_preserve_email(fake_val, original_value) - # Format-preserving for phone numbers if field in ("phone_number", "phone.phone_number") and isinstance( original_value, str ): @@ -142,7 +90,6 @@ def _format_preserve_email(fake: Any, original: str) -> str: fake_str = str(fake) if "@" in fake_str: return fake_str - # If fake doesn't have @, construct one if "@" in original: _, domain = original.rsplit("@", 1) return f"{fake_str}@{domain}" @@ -152,10 +99,8 @@ def _format_preserve_email(fake: Any, original: str) -> str: def _format_preserve_phone(fake: Any, original: str) -> str: """Try to preserve phone number format (length and separators).""" fake_str = str(fake) - # If lengths match, return as-is if len(fake_str) == len(original): return fake_str - # Try to match the original format result = [] fake_digits = [c for c in fake_str if c.isdigit()] d_idx = 0 @@ -175,21 +120,7 @@ def anonymize( rows: list[dict[str, Any]], fields: dict[str, str], ) -> list[dict[str, Any]]: - """Anonymize a list of row dicts. - - Parameters - ---------- - rows : list[dict[str, Any]] - Input rows (not modified in place). - fields : dict[str, str] - Mapping of column name → DataForge field name. - Only specified columns are anonymized; others pass through. - - Returns - ------- - list[dict[str, Any]] - Anonymized rows. 
- """ + """Anonymize a list of row dicts.""" result: list[dict[str, Any]] = [] for row in rows: new_row = dict(row) @@ -210,28 +141,7 @@ def anonymize_csv( encoding: str = "utf-8", batch_size: int = 1000, ) -> int: - """Anonymize a CSV file in streaming fashion. - - Parameters - ---------- - input_path : str - Path to input CSV. - output_path : str - Path to output CSV. - fields : dict[str, str] - Column → DataForge field mappings. - delimiter : str - CSV delimiter. - encoding : str - File encoding. - batch_size : int - Rows to process per batch. - - Returns - ------- - int - Number of rows processed. - """ + """Anonymize a CSV file in streaming fashion.""" import csv total = 0 diff --git a/src/dataforge/backend.py b/src/dataforge/backend.py index 5269ce7..bc83a25 100644 --- a/src/dataforge/backend.py +++ b/src/dataforge/backend.py @@ -1,8 +1,4 @@ -"""RandomEngine — the speed engine behind dataforge. - -Provides a unified interface for random selection using stdlib -``random`` — optimised for both scalar picks and batch generation. -""" +"""RandomEngine — the speed engine behind dataforge.""" import math as _math import random as _random @@ -10,45 +6,25 @@ _T = TypeVar("_T") -# Pre-computed power-of-10 table for random_digits_str — eliminates -# per-call ``10**n`` computation for n=1..18. -_POW10: tuple[int, ...] = tuple(10**i for i in range(19)) # _POW10[0]=1 .. _POW10[18] +_POW10: tuple[int, ...] = tuple(10**i for i in range(19)) -# Pre-computed letter pool for letterify/bothify _LETTERS_UPPER: str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" _LETTERS_LOWER: str = "abcdefghijklmnopqrstuvwxyz" _LETTERS_ALL: str = _LETTERS_UPPER + _LETTERS_LOWER -# Module-level cache for cumulative weights — keyed on id(weights). -# Safe because weight tuples are stored as class attributes with stable -# identity and cumulative weights are deterministic from input weights. _CUM_WEIGHTS_CACHE: dict[int, list[float]] = {} class RandomEngine: - """Core randomness engine. 
- - Parameters - ---------- - seed : int | None - Optional seed for reproducibility. - """ + """Core randomness engine.""" __slots__ = ("_rng",) def __init__(self, seed: int | None = None) -> None: self._rng: _random.Random = _random.Random(seed) - # ------------------------------------------------------------------ - # Public API - # ------------------------------------------------------------------ - def choice(self, data: tuple[_T, ...]) -> _T: - """Return a single random element from *data*. - - Uses stdlib ``random.Random.choice`` which is the fastest path - for picking one item. - """ + """Return a single random element from *data*.""" return self._rng.choice(data) def choices(self, data: tuple[_T, ...], count: int) -> list[_T]: @@ -56,19 +32,7 @@ def choices(self, data: tuple[_T, ...], count: int) -> list[_T]: return self._rng.choices(data, k=count) def sample(self, data: tuple[_T, ...], count: int) -> list[_T]: - """Return *count* unique random elements from *data* (without replacement). - - Parameters - ---------- - data : tuple - The population to sample from. - count : int - Number of unique items to pick (must be <= len(data)). - - Returns - ------- - list - """ + """Return *count* unique random elements from *data* (without replacement).""" return self._rng.sample(data, count) def random_int(self, min_val: int = 0, max_val: int = 9999) -> int: @@ -81,43 +45,17 @@ def random_float( max_val: float = 1.0, precision: int = 2, ) -> float: - """Return a random float between *min_val* and *max_val*. - - Parameters - ---------- - min_val : float - Lower bound (inclusive). - max_val : float - Upper bound (inclusive). - precision : int - Number of decimal places to round to. - - Returns - ------- - float - """ + """Return a random float between *min_val* and *max_val*.""" val = self._rng.uniform(min_val, max_val) return round(val, precision) def numerify(self, pattern: str) -> str: - """Replace every ``#`` in *pattern* with a random digit. 
- - Example: ``"#####"`` → ``"38201"`` - - Optimized: if the pattern is all ``#`` characters, generates - all digits in a single call via :meth:`random_digits_str`. - For mixed patterns, pre-counts ``#`` and generates all digits - in one bulk call, then substitutes via iterator. - """ - # Fast path: pattern is entirely # characters (very common). - # Use length check instead of iterating all characters with all(). + """Replace every ``#`` in *pattern* with a random digit.""" hash_count = pattern.count("#") if hash_count == len(pattern): return self.random_digits_str(hash_count) if hash_count == 0: return pattern - # Slow path optimized: generate all digits in one call, then - # substitute in-place on a list — avoids iterator overhead. digits = self.random_digits_str(hash_count) chars = list(pattern) d = 0 @@ -128,17 +66,7 @@ def numerify(self, pattern: str) -> str: return "".join(chars) def letterify(self, pattern: str, upper: bool = False) -> str: - """Replace every ``?`` in *pattern* with a random letter. - - Example: ``"??-###"`` → ``"Ab-381"`` - - Parameters - ---------- - pattern : str - Pattern with ``?`` placeholders for letters. - upper : bool - If True, use uppercase letters only. - """ + """Replace every ``?`` in *pattern* with a random letter.""" q_count = pattern.count("?") if q_count == 0: return pattern @@ -148,10 +76,7 @@ def letterify(self, pattern: str, upper: bool = False) -> str: return "".join(next(it) if ch == "?" else ch for ch in pattern) def bothify(self, pattern: str) -> str: - """Replace ``#`` with digits and ``?`` with letters in *pattern*. - - Example: ``"??-####"`` → ``"Kz-3802"`` - """ + """Replace ``#`` with digits and ``?`` with letters in *pattern*.""" hash_count = pattern.count("#") q_count = pattern.count("?") if hash_count == 0 and q_count == 0: @@ -170,26 +95,15 @@ def bothify(self, pattern: str) -> str: return "".join(chars) def getrandbits(self, k: int) -> int: - """Return a random integer with *k* random bits. 
- - This is the fastest way to generate a large block of randomness - in a single call — used by providers that need to build strings - from many random hex/decimal digits (IPv6, MAC, barcodes, etc.). - """ + """Return a random integer with *k* random bits.""" return self._rng.getrandbits(k) def random_digits_str(self, n: int) -> str: - """Return a string of *n* random decimal digits. - - Uses a pre-computed ``_POW10`` lookup table to avoid per-call - ``10**n`` computation. For small n (≤ 18), a single - ``randint`` call is the fastest path. - """ + """Return a string of *n* random decimal digits.""" _pow10 = _POW10 if n <= 18: val = self._rng.randint(0, _pow10[n] - 1) return str(val).zfill(n) - # For larger n, concatenate chunks of 18 digits parts: list[str] = [] remaining = n _max18 = _pow10[18] - 1 @@ -211,26 +125,7 @@ def weighted_choices( weights: tuple[float, ...] | list[float], count: int, ) -> list[_T]: - """Return *count* random elements from *data* with *weights*. - - Each element in *data* is selected with probability proportional - to its corresponding weight. - - Parameters - ---------- - data : tuple - The items to choose from. - weights : tuple[float, ...] or list[float] - Non-negative weights (need not sum to 1). - count : int - Number of items to pick. - - Returns - ------- - list - """ - # Use module-level cumulative weights cache — avoids - # recomputing inside random.choices() on every call. + """Return *count* random elements from *data* with *weights*.""" w_id = id(weights) cum = _CUM_WEIGHTS_CACHE.get(w_id) if cum is None: @@ -245,10 +140,7 @@ def weighted_choice( data: tuple[_T, ...], weights: tuple[float, ...] | list[float], ) -> _T: - """Return a single random element from *data* with *weights*. - - Scalar version of :meth:`weighted_choices`. 
- """ + """Return a single random element from *data* with *weights*.""" cache = _CUM_WEIGHTS_CACHE w_id = id(weights) cum = cache.get(w_id) @@ -259,194 +151,56 @@ def weighted_choice( cache[w_id] = cum return self._rng.choices(data, cum_weights=cum, k=1)[0] - # ------------------------------------------------------------------ - # Statistical distributions - # ------------------------------------------------------------------ - def gauss(self, mu: float = 0.0, sigma: float = 1.0) -> float: - """Return a random value from a Gaussian (normal) distribution. - - Uses stdlib ``random.gauss`` which is faster than ``normalvariate`` - and thread-safe in CPython 3.12+. - - Parameters - ---------- - mu : float - Mean of the distribution. - sigma : float - Standard deviation of the distribution. - """ + """Return a random value from a Gaussian distribution.""" return self._rng.gauss(mu, sigma) def gauss_int(self, mu: float, sigma: float, min_val: int, max_val: int) -> int: - """Return a clamped integer from a Gaussian distribution. - - Generates a float from ``gauss(mu, sigma)`` and clamps to - ``[min_val, max_val]``. Useful for realistic age, score, - salary distributions. - - Parameters - ---------- - mu : float - Mean of the distribution. - sigma : float - Standard deviation. - min_val : int - Lower clamp bound. - max_val : int - Upper clamp bound. - - Returns - ------- - int - """ + """Return a clamped integer from a Gaussian distribution.""" val = self._rng.gauss(mu, sigma) return max(min_val, min(max_val, round(val))) def exponential(self, lambd: float = 1.0) -> float: - """Return a random value from an exponential distribution. - - Parameters - ---------- - lambd : float - Rate parameter (1/mean). Must be > 0. - """ + """Return a random value from an exponential distribution.""" return self._rng.expovariate(lambd) def log_normal(self, mu: float = 0.0, sigma: float = 1.0) -> float: - """Return a random value from a log-normal distribution. 
- - Parameters - ---------- - mu : float - Mean of the underlying normal distribution. - sigma : float - Standard deviation of the underlying normal distribution. - """ + """Return a random value from a log-normal distribution.""" return self._rng.lognormvariate(mu, sigma) def triangular( self, low: float = 0.0, high: float = 1.0, mode: float | None = None ) -> float: - """Return a random value from a triangular distribution. - - Parameters - ---------- - low : float - Lower bound. - high : float - Upper bound. - mode : float | None - Peak of the distribution. Defaults to midpoint. - """ + """Return a random value from a triangular distribution.""" if mode is None: mode = (low + high) / 2.0 return self._rng.triangular(low, high, mode) def pareto(self, alpha: float = 1.0) -> float: - """Return a random value from a Pareto distribution. - - Useful for power-law/Zipf-like distributions (wealth, popularity). - - Parameters - ---------- - alpha : float - Shape parameter (must be > 0). - """ + """Return a random value from a Pareto distribution.""" return self._rng.paretovariate(alpha) def vonmises(self, mu: float = 0.0, kappa: float = 0.0) -> float: - """Return a random value from a von Mises distribution. - - Circular analog of the normal distribution — useful for - directional data (angles, hours of day). - - Parameters - ---------- - mu : float - Mean angle in radians. - kappa : float - Concentration parameter (0 = uniform on circle). - """ + """Return a random value from a von Mises distribution.""" return self._rng.vonmisesvariate(mu, kappa) def beta(self, alpha: float = 2.0, beta_param: float = 5.0) -> float: - """Return a random value from a Beta distribution. - - Useful for probabilities, completion rates, conversion ratios. - - Parameters - ---------- - alpha : float - Alpha shape parameter (> 0). - beta_param : float - Beta shape parameter (> 0). 
- """ + """Return a random value from a Beta distribution.""" return self._rng.betavariate(alpha, beta_param) def gamma(self, alpha: float = 1.0, beta_param: float = 1.0) -> float: - """Return a random value from a Gamma distribution. - - Parameters - ---------- - alpha : float - Shape parameter (> 0). - beta_param : float - Scale parameter (> 0). - """ + """Return a random value from a Gamma distribution.""" return self._rng.gammavariate(alpha, beta_param) def zipf(self, alpha: float = 1.5, n: int = 100) -> int: - """Return a random integer from an approximate Zipf distribution. - - Uses rejection sampling from a Pareto distribution. - Values range from 1 to *n*. - - Parameters - ---------- - alpha : float - Exponent parameter (> 1.0). - n : int - Upper bound for the returned integer. - - Returns - ------- - int - A random integer in [1, n]. - """ - # Fast approximate Zipf via truncated Pareto + """Return a random integer from an approximate Zipf distribution.""" while True: val = int(_math.ceil(self._rng.paretovariate(alpha - 1))) if val <= n: return val - # ------------------------------------------------------------------ - # Regex-based string generation - # ------------------------------------------------------------------ - def regexify(self, pattern: str) -> str: - """Generate a random string matching a simplified regex *pattern*. - - Supports a practical subset of regex syntax: - - ``[a-z]``, ``[A-Z]``, ``[0-9]``, ``[abc]`` — character classes - - ``{n}``, ``{n,m}`` — exact and range repetition - - ``+`` — 1-3 repetitions - - ``*`` — 0-3 repetitions - - ``?`` — 0 or 1 - - ``(a|b|c)`` — alternation groups - - ``.`` — any printable ASCII character - - ``\\d``, ``\\w``, ``\\s`` — digit, word char, whitespace - - Literal characters - - Parameters - ---------- - pattern : str - Simplified regex pattern. 
- - Returns - ------- - str - """ + """Generate a random string matching a simplified regex *pattern*.""" _rng = self._rng result: list[str] = [] i = 0 @@ -456,7 +210,6 @@ def regexify(self, pattern: str) -> str: ch = pattern[i] if ch == "\\" and i + 1 < n: - # Escape sequences esc = pattern[i + 1] if esc == "d": chars = "0123456789" @@ -465,7 +218,6 @@ def regexify(self, pattern: str) -> str: elif esc == "s": chars = " \t" else: - # Literal escaped char result.append(esc) i += 2 i, _ = self._regexify_quantifier(pattern, i, _rng, result, esc) @@ -476,7 +228,6 @@ def regexify(self, pattern: str) -> str: continue if ch == "[": - # Character class close = pattern.find("]", i + 1) if close == -1: result.append(ch) @@ -489,7 +240,6 @@ def regexify(self, pattern: str) -> str: continue if ch == "(": - # Alternation group close = pattern.find(")", i + 1) if close == -1: result.append(ch) @@ -498,7 +248,6 @@ def regexify(self, pattern: str) -> str: options = pattern[i + 1 : close].split("|") chosen = _rng.choice(options) i = close + 1 - # Check for quantifier on group if i < n and pattern[i] in "{+*?": if pattern[i] == "{": end_brace = pattern.find("}", i + 1) @@ -536,13 +285,11 @@ def regexify(self, pattern: str) -> str: continue if ch == ".": - # Any printable ASCII base = chr(_rng.randint(33, 126)) i += 1 i, _ = self._regexify_quantifier(pattern, i, _rng, result, base) continue - # Literal character i += 1 i, _ = self._regexify_quantifier(pattern, i, _rng, result, ch) diff --git a/src/dataforge/chaos.py b/src/dataforge/chaos.py index 8430acc..2d54414 100644 --- a/src/dataforge/chaos.py +++ b/src/dataforge/chaos.py @@ -1,31 +1,4 @@ -"""Chaos mode — inject data quality issues for testing resilience. - -A post-processing transformer that injects realistic data quality -problems: nulls, type mismatches, boundary values, duplicates, -whitespace issues, encoding chaos, format inconsistencies, and -truncation. 
- -Usage:: - - from dataforge import DataForge - from dataforge.chaos import ChaosTransformer - - forge = DataForge(seed=42) - schema = forge.schema(["first_name", "email", "age"]) - rows = schema.generate(count=100) - - chaos = ChaosTransformer( - null_rate=0.05, - type_mismatch_rate=0.02, - boundary_rate=0.01, - duplicate_rate=0.03, - whitespace_rate=0.02, - encoding_rate=0.01, - format_rate=0.02, - truncation_rate=0.01, - ) - dirty_rows = chaos.transform(rows) -""" +"""Chaos mode — inject data quality issues for testing resilience.""" from __future__ import annotations @@ -33,10 +6,6 @@ from typing import Any -# ------------------------------------------------------------------ -# Boundary value catalogs -# ------------------------------------------------------------------ - _BOUNDARY_STR: tuple[str, ...] = ( "", " ", @@ -58,17 +27,17 @@ "Robert'); DROP TABLE students;--", "a" * 1000, "\x00", - "\ufeff", # BOM + "\ufeff", ) _BOUNDARY_INT: tuple[Any, ...] = ( 0, -1, 1, - -2147483648, # INT32_MIN - 2147483647, # INT32_MAX - -9223372036854775808, # INT64_MIN - 9223372036854775807, # INT64_MAX + -2147483648, + 2147483647, + -9223372036854775808, + 9223372036854775807, "not_a_number", "", None, @@ -82,8 +51,8 @@ float("inf"), float("-inf"), float("nan"), - 1e-308, # near MIN_FLOAT - 1e308, # near MAX_FLOAT + 1e-308, + 1e308, "not_a_number", "", None, @@ -96,66 +65,40 @@ "2038-01-19", "not-a-date", "", - "2024-02-30", # invalid day - "2024-13-01", # invalid month + "2024-02-30", + "2024-13-01", ) -# Unicode edge cases for encoding chaos _UNICODE_CHAOS: tuple[str, ...] 
= ( - "\u200b", # zero-width space - "\u200e", # left-to-right mark - "\u200f", # right-to-left mark - "\u00e9", # é - "\u00f1", # ñ - "\u00fc", # ü - "\u4e2d", # Chinese character - "\U0001f600", # emoji - "\u202e", # right-to-left override - "\ufeff", # BOM - "\u0000", # null - "\ud83d", # lone surrogate (may cause issues) + "\u200b", + "\u200e", + "\u200f", + "\u00e9", + "\u00f1", + "\u00fc", + "\u4e2d", + "\U0001f600", + "\u202e", + "\ufeff", + "\u0000", + "\ud83d", ) -# Whitespace variants _WHITESPACE_CHAOS: tuple[str, ...] = ( - " ", # extra leading space - " ", # double space - "\t", # tab - " \t", # mixed - "\n", # newline - "\r", # carriage return - "\u00a0", # non-breaking space - "\u2003", # em space - "\u200b", # zero-width space + " ", + " ", + "\t", + " \t", + "\n", + "\r", + "\u00a0", + "\u2003", + "\u200b", ) class ChaosTransformer: - """Inject data quality issues into generated data. - - All rates are probabilities (0.0–1.0) applied per-cell. - - Parameters - ---------- - null_rate : float - Probability of replacing a value with None. - type_mismatch_rate : float - Probability of injecting a type-mismatched value. - boundary_rate : float - Probability of injecting a boundary/edge-case value. - duplicate_rate : float - Probability of duplicating a random existing row. - whitespace_rate : float - Probability of adding whitespace chaos to string values. - encoding_rate : float - Probability of injecting unicode edge cases into strings. - format_rate : float - Probability of format inconsistency (case, separators). - truncation_rate : float - Probability of truncating string values. - seed : int | None - Optional seed for reproducibility. - """ + """Inject data quality issues into generated data.""" __slots__ = ( "_null_rate", @@ -196,28 +139,12 @@ def transform( rows: list[dict[str, Any]], columns: list[str] | None = None, ) -> list[dict[str, Any]]: - """Apply chaos transformations to rows. 
- - Parameters - ---------- - rows : list[dict[str, Any]] - Input rows (will NOT be modified in place — copies are made). - columns : list[str] | None - Specific columns to apply chaos to. If None, all columns - are eligible. - - Returns - ------- - list[dict[str, Any]] - Transformed rows with injected data quality issues. - """ + """Apply chaos transformations to rows.""" if not rows: return rows rng = self._rng - # Pre-check which cell-level transformations are active to avoid - # checking rates that are 0 in the inner loop. null_rate = self._null_rate type_mismatch_rate = self._type_mismatch_rate boundary_rate = self._boundary_rate @@ -238,7 +165,6 @@ def transform( target_cols = columns or list(rows[0].keys()) - # Only copy rows if we have cell-level transforms to apply if has_any_cell_transform: result: list[dict[str, Any]] = [dict(row) for row in rows] _random = rng.random @@ -248,22 +174,18 @@ def transform( continue val = row[col] - # Null injection if null_rate > 0 and _random() < null_rate: row[col] = None continue - # Type mismatch if type_mismatch_rate > 0 and _random() < type_mismatch_rate: row[col] = self._inject_type_mismatch(val, rng) continue - # Boundary values if boundary_rate > 0 and _random() < boundary_rate: row[col] = self._inject_boundary(val, rng) continue - # String-specific transformations if isinstance(val, str): if whitespace_rate > 0 and _random() < whitespace_rate: row[col] = self._inject_whitespace(val, rng) @@ -281,10 +203,8 @@ def transform( row[col] = self._inject_truncation(val, rng) continue else: - # No cell-level transforms — still need copies for duplicate injection result = [dict(row) for row in rows] - # Row-level: duplicate injection if self._duplicate_rate > 0 and len(result) > 1: n_dups = rng.binomialvariate(len(result), self._duplicate_rate) for _ in range(n_dups): @@ -309,7 +229,6 @@ def _inject_type_mismatch(val: Any, rng: _random_mod.Random) -> Any: def _inject_boundary(val: Any, rng: _random_mod.Random) -> Any: 
"""Replace value with a boundary/edge-case value.""" if isinstance(val, str): - # Detect if it looks like a date if len(val) == 10 and val[4:5] == "-" and val[7:8] == "-": return rng.choice(_BOUNDARY_DATE) return rng.choice(_BOUNDARY_STR) @@ -325,11 +244,10 @@ def _inject_whitespace(val: str, rng: _random_mod.Random) -> str: chaos = rng.choice(_WHITESPACE_CHAOS) action = rng.randint(0, 2) if action == 0: - return chaos + val # prepend + return chaos + val elif action == 1: - return val + chaos # append + return val + chaos else: - # Insert in middle if len(val) > 1: pos = rng.randint(1, len(val) - 1) return val[:pos] + chaos + val[pos:] @@ -361,10 +279,8 @@ def _inject_format_issue(val: str, rng: _random_mod.Random) -> str: elif action == 2: return val.title() elif action == 3: - # Random case return "".join(c.upper() if rng.random() > 0.5 else c.lower() for c in val) else: - # Replace separators for old, new in [("-", "/"), ("/", "-"), (" ", "_"), ("_", " ")]: if old in val: return val.replace(old, new) diff --git a/src/dataforge/cli.py b/src/dataforge/cli.py index 9a6b42f..598c359 100644 --- a/src/dataforge/cli.py +++ b/src/dataforge/cli.py @@ -1,34 +1,4 @@ -"""dataforge CLI — generate fake data from the command line. 
- -Usage:: - - dataforge --count 100 --format csv name email phone - dataforge --count 10 --format json first_name last_name city - dataforge --locale de_DE --count 5 full_name address - dataforge --list-fields - dataforge --list-providers - dataforge --version - - # SQL output - dataforge --format sql --table users first_name email city - - # TSV output - dataforge --format tsv first_name email - - # Custom delimiter - dataforge --format csv --delimiter "|" first_name email - - # Column renaming - dataforge "Name=full_name" "Email=email" "City=city" - - # Streaming (memory-efficient for large counts) - dataforge --stream --count 1000000 --format csv -o data.csv first_name email - - # Unique values - dataforge --unique --count 50 first_name - -Supported output formats: text, csv, tsv, json, jsonl, sql -""" +"""dataforge CLI — generate fake data from the command line.""" import argparse import csv @@ -41,10 +11,7 @@ def _parse_field_spec(spec: str) -> tuple[str, str]: - """Parse a field spec like ``"Name=full_name"`` or ``"email"``. - - Returns ``(column_name, field_name)``. 
- """ + """Parse a field spec like ``"Name=full_name"`` or ``"email"``.""" if "=" in spec: col_name, field_name = spec.split("=", 1) return col_name.strip(), field_name.strip() @@ -218,7 +185,6 @@ def main(argv: list[str] | None = None) -> int: field_map = get_field_map() - # --tui: launch interactive TUI if args.tui: try: from dataforge.tui import launch @@ -229,7 +195,6 @@ def main(argv: list[str] | None = None) -> int: return 1 return 0 - # --infer: infer schema from CSV and generate data if args.infer: forge = DataForge(locale=args.locale, seed=args.seed) try: @@ -241,7 +206,6 @@ def main(argv: list[str] | None = None) -> int: print(json.dumps(rows, indent=2, ensure_ascii=False, default=str)) return 0 - # --anonymize: anonymize a CSV file if args.anonymize: forge = DataForge(locale=args.locale, seed=args.seed) from dataforge.anonymizer import Anonymizer @@ -256,7 +220,6 @@ def main(argv: list[str] | None = None) -> int: print(f"Anonymized output written to {output}", file=sys.stderr) return 0 - # --list-providers if args.list_providers: from dataforge.registry import get_provider_info @@ -267,27 +230,22 @@ def main(argv: list[str] | None = None) -> int: print(f" {name:20s} ({len(fm)} fields)") return 0 - # --list-fields if args.list_fields: - # Group fields by provider for name in sorted(field_map.keys()): provider, method = field_map[name] print(f" {name:24s} ({provider}.{method})") return 0 if not args.fields: - # Default fields (only when no --schema) if not args.schema: args.fields = ["first_name", "last_name", "email"] else: args.fields = [] - # Parse field specs (handle column renaming "Name=full_name") field_specs = [_parse_field_spec(f) for f in args.fields] headers = [col_name for col_name, _ in field_specs] field_names = [field_name for _, field_name in field_specs] - # Validate fields before generating (skip when --schema provides fields) if not args.schema: for col_name, field_name in field_specs: if field_name not in field_map and "." 
not in field_name: @@ -297,7 +255,6 @@ def main(argv: list[str] | None = None) -> int: ) return 1 - # Validate --stream requires --output if args.stream and not args.output: print( "Error: --stream requires --output to specify a file path.", @@ -305,7 +262,6 @@ def main(argv: list[str] | None = None) -> int: ) return 1 - # Validate --format sql does not combine with --stream (not supported) if args.stream and args.format == "sql": print( "Error: --stream is not supported with --format sql.", @@ -315,16 +271,13 @@ def main(argv: list[str] | None = None) -> int: forge = DataForge(locale=args.locale, seed=args.seed) - # Build field dict for Schema (supports column renaming) if field_specs and any(col != field for col, field in field_specs): - # Column renaming in use — build dict fields_arg: list[str] | dict[str, str] = { col: field for col, field in field_specs } else: fields_arg = field_names - # Parse --null-fields null_fields: dict[str, float] | None = None if args.null_fields: null_fields = {} @@ -355,7 +308,6 @@ def main(argv: list[str] | None = None) -> int: return 1 null_fields[name.strip()] = prob - # --schema: load field definitions from a file if args.schema: from dataforge.schema_io import load_schema, dict_to_schema_args @@ -369,11 +321,9 @@ def main(argv: list[str] | None = None) -> int: schema_def ) - # CLI --count overrides schema count only when explicitly provided count = args.count if args.count != 10 else schema_count args.count = count - # Build fields_arg and headers from loaded schema if isinstance(schema_fields, dict): fields_arg = schema_fields headers = list(schema_fields.keys()) @@ -381,11 +331,9 @@ def main(argv: list[str] | None = None) -> int: fields_arg = schema_fields headers = list(schema_fields) - # Merge null_fields: CLI --null-fields wins over schema file if null_fields is None and schema_null: null_fields = schema_null - # --save-schema: save current schema definition to a file and exit if args.save_schema: from dataforge.schema_io 
import schema_to_dict, save_schema as _save_schema @@ -405,23 +353,19 @@ def main(argv: list[str] | None = None) -> int: ) return 0 - # Resolve delimiter delimiter = args.delimiter if delimiter is None: delimiter = "\t" if args.format == "tsv" else "," - # Resolve encoding and compression encoding = args.encoding compress: bool | None = True if args.compress else None - # Build chaos transformer if --chaos is set chaos_arg = None if args.chaos is not None: from dataforge.chaos import ChaosTransformer chaos_arg = ChaosTransformer(null_rate=args.chaos) - # --stream mode: write directly to file if args.stream: fmt = args.format path = args.output @@ -444,8 +388,6 @@ def main(argv: list[str] | None = None) -> int: ) _progress_done(written) elif fmt == "json": - # JSON array can't easily stream, but we can generate - # and write — still respects --output schema_j = forge.schema( fields_arg, null_fields=null_fields, chaos=chaos_arg ) @@ -457,7 +399,6 @@ def main(argv: list[str] | None = None) -> int: ) _progress_done(args.count) elif fmt == "text": - # Stream text rows to file from dataforge.schema import _open_file written = 0 @@ -469,9 +410,7 @@ def main(argv: list[str] | None = None) -> int: _progress_done(written) return 0 - # Non-streaming mode: generate all data in memory if args.unique: - # Generate with unique proxy — row at a time schema = forge.schema(fields_arg, null_fields=null_fields, chaos=chaos_arg) rows: list[dict[str, object]] = [] seen: dict[str, set[object]] = {h: set() for h in headers} @@ -499,7 +438,6 @@ def main(argv: list[str] | None = None) -> int: schema_gen = forge.schema(fields_arg, null_fields=null_fields, chaos=chaos_arg) rows = schema_gen.generate(count=args.count) - # Determine output destination out_file = None if args.output: from dataforge.schema import _open_file @@ -516,7 +454,6 @@ def main(argv: list[str] | None = None) -> int: fmt = args.format if fmt == "text": - # Aligned columns str_rows = [{h: _format_value(row[h]) for h in 
headers} for row in rows] col_widths = [len(h) for h in headers] for row in str_rows: @@ -541,13 +478,11 @@ def main(argv: list[str] | None = None) -> int: writer = csv.DictWriter(buf, fieldnames=headers, delimiter=delimiter) if not args.no_header: writer.writeheader() - # Convert all values to strings for CSV for row in rows: writer.writerow({h: _format_value(row[h]) for h in headers}) print(buf.getvalue(), end="", file=out) elif fmt == "json": - # Serialize with native types (int, bool stay as numbers/bools) print( json.dumps( [{h: row[h] for h in headers} for row in rows], diff --git a/src/dataforge/constraints.py b/src/dataforge/constraints.py index 9ea7568..bc4ab41 100644 --- a/src/dataforge/constraints.py +++ b/src/dataforge/constraints.py @@ -1,28 +1,4 @@ -"""Constraint engine — correlated and conditional field generation. - -Enables fields that depend on other fields via geographic correlation, -temporal ordering, statistical correlation, conditional value pools, -and range constraints. - -The engine builds a dependency DAG, performs topological ordering, and -uses a two-pass generation strategy: - 1. Independent fields are generated column-first (fast batch path). - 2. Dependent fields are generated row-by-row in topological order. 
- -Usage:: - - from dataforge import DataForge - - forge = DataForge(seed=42) - schema = forge.schema({ - "country": "country", - "state": {"field": "address.state", "depends_on": "country"}, - "city": {"field": "address.city", "depends_on": "state"}, - "start_date": "date", - "end_date": {"field": "date", "temporal": "after", "reference": "start_date"}, - }) - rows = schema.generate(count=1000) -""" +"""Constraint engine — correlated and conditional field generation.""" from __future__ import annotations @@ -36,11 +12,6 @@ from dataforge.backend import RandomEngine -# ------------------------------------------------------------------ -# Constraint types -# ------------------------------------------------------------------ - - class FieldConstraint: """Base class for field constraints.""" @@ -62,7 +33,6 @@ class DependsOnConstraint(FieldConstraint): __slots__ = ("field", "column_name", "depends_on", "dep_type", "_geo_loaded") - # Class-level cache for geo module references — avoids repeated imports _geo_get_cities: Any = None _geo_get_states: Any = None _geo_get_zip: Any = None @@ -78,7 +48,6 @@ def __init__( self.field = field self.column_name = column_name self.depends_on = depends_on - # Detect dependency type from field name self.dep_type = self._detect_dep_type(field, depends_on) self._geo_loaded = False @@ -149,7 +118,6 @@ def generate( if self.dep_type == "currency": return cls._geo_currency.get(parent_str, "USD") - # Generic: fall back to provider method provider_attr, method_name = forge._resolve_field(self.field) provider = getattr(forge, provider_attr) method = getattr(provider, method_name) @@ -171,7 +139,7 @@ def __init__( ) -> None: self.field = field self.column_name = column_name - self.temporal = temporal # "before" or "after" + self.temporal = temporal self.reference = reference self.offset_days = offset_days @@ -180,12 +148,10 @@ def generate( ) -> Any: ref_val = row.get(self.reference) if ref_val is None: - # Fall back to regular generation 
provider_attr, method_name = forge._resolve_field(self.field) provider = getattr(forge, provider_attr) return getattr(provider, method_name)() - # Parse reference date if isinstance(ref_val, str): ref_date = _datetime.date.fromisoformat(ref_val) elif isinstance(ref_val, _datetime.datetime): @@ -200,7 +166,7 @@ def generate( if self.temporal == "after": result_date = ref_date + _datetime.timedelta(days=offset) - else: # "before" + else: result_date = ref_date - _datetime.timedelta(days=offset) return result_date.isoformat() @@ -239,8 +205,6 @@ def generate( except (ValueError, TypeError): return engine.gauss(self.mean, self.std) - # Cholesky-based correlated generation: - # y = rho * x + sqrt(1 - rho^2) * z, where z ~ N(0, 1) rho = self.correlation z = engine.gauss(0.0, 1.0) y = rho * x + _math.sqrt(max(0.0, 1.0 - rho * rho)) * z @@ -307,8 +271,8 @@ def __init__( self.column_name = column_name self.min_val = min_val self.max_val = max_val - self.min_ref = min_ref # column name for dynamic min - self.max_ref = max_ref # column name for dynamic max + self.min_ref = min_ref + self.max_ref = max_ref self.precision = precision def generate( @@ -317,7 +281,6 @@ def generate( lo = self.min_val if self.min_val is not None else 0.0 hi = self.max_val if self.max_val is not None else 100.0 - # Dynamic bounds from other columns if self.min_ref and self.min_ref in row: try: lo = float(row[self.min_ref]) @@ -335,29 +298,11 @@ def generate( return engine.random_float(lo, hi, self.precision) -# ------------------------------------------------------------------ -# Constraint engine: parse specs and build dependency DAG -# ------------------------------------------------------------------ - - def parse_field_spec( column_name: str, spec: dict[str, Any], ) -> tuple[FieldConstraint | None, list[str]]: - """Parse a dict-based field spec into a constraint and its dependencies. - - Parameters - ---------- - column_name : str - The output column name. 
- spec : dict - The field specification dict. - - Returns - ------- - tuple[FieldConstraint | None, list[str]] - The constraint object and a list of dependency column names. - """ + """Parse a dict-based field spec into a constraint and its dependencies.""" field = spec.get("field", column_name) deps: list[str] = [] @@ -416,31 +361,19 @@ def parse_field_spec( precision=int(spec.get("precision", 2)), ), [x for x in [spec.get("min_ref"), spec.get("max_ref")] if x] - # No constraint, just a field override return None, [] def build_dependency_order( field_specs: dict[str, Any], ) -> tuple[ - list[str], # independent columns (batch-able) - list[tuple[str, FieldConstraint]], # dependent columns in topo order - dict[str, FieldConstraint], # constraint map + list[str], + list[tuple[str, FieldConstraint]], + dict[str, FieldConstraint], ]: - """Build dependency DAG and return generation order. - - Parameters - ---------- - field_specs : dict - Column name → field spec (str or dict). - - Returns - ------- - tuple - (independent_columns, ordered_dependent, constraint_map) - """ + """Build dependency DAG and return generation order.""" constraints: dict[str, FieldConstraint] = {} - dep_graph: dict[str, list[str]] = {} # column → [depends_on columns] + dep_graph: dict[str, list[str]] = {} all_columns = list(field_specs.keys()) for col_name, spec in field_specs.items(): @@ -454,11 +387,9 @@ def build_dependency_order( else: dep_graph[col_name] = [] - # Separate independent and dependent columns dependent_set = set(constraints.keys()) independent = [c for c in all_columns if c not in dependent_set] - # Topological sort of dependent columns in_degree: dict[str, int] = {c: 0 for c in dependent_set} adj: dict[str, list[str]] = {c: [] for c in dependent_set} diff --git a/src/dataforge/core.py b/src/dataforge/core.py index ca639c6..14fa9f8 100644 --- a/src/dataforge/core.py +++ b/src/dataforge/core.py @@ -10,22 +10,16 @@ forge.person.full_name(count=1000) # list of 1000 full names 
forge.address.full_address() # "4821 Oak Ave, Chicago, IL 60614" forge.internet.email() # "james.smith@gmail.com" - forge.company.company_name() # "Acme Inc" - forge.phone.phone_number() # "555-123-4567" - forge.lorem.sentence() # "Lorem ipsum dolor sit amet." - forge.dt.date() # "2024-03-15" """ import importlib -from typing import TYPE_CHECKING, Any +from typing import Any from types import ModuleType from dataforge.backend import RandomEngine from dataforge.providers.base import BaseProvider -# ------------------------------------------------------------------ # Heuristic field-name mappings for ORM / model introspection -# ------------------------------------------------------------------ # Maps common model field names to DataForge field shorthand names. # Used by schema_from_pydantic() and schema_from_sqlalchemy(). @@ -134,12 +128,7 @@ "passport_no": "passport_number", } -# ------------------------------------------------------------------ # Type-based fallback mappings for ORM / model introspection -# ------------------------------------------------------------------ -# When a field name cannot be resolved by name alone, these -# fallbacks map Python type annotations to sensible DataForge fields. -# Keys are (module, qualname) tuples for non-builtin types. _TYPE_FALLBACK_BUILTINS: dict[type, str] = { str: "misc.random_element", # sentinel — too ambiguous @@ -148,8 +137,6 @@ bool: "boolean", } -# String-based type name fallbacks for stdlib / common types. -# Keyed on the type's __qualname__ (works without importing the module). _TYPE_FALLBACK_NAMES: dict[str, str] = { "date": "date", "datetime": "datetime", @@ -160,11 +147,7 @@ def _resolve_type_annotation(annotation: Any) -> type | None: - """Extract the concrete type from a possibly-wrapped annotation. - - Handles ``Optional[X]``, ``X | None``, ``list[X]``, and plain types. - Returns the core type or ``None`` if it cannot be determined. 
- """ + """Extract the concrete type from a possibly-wrapped annotation.""" import typing import types as _types @@ -193,12 +176,7 @@ def _resolve_type_annotation(annotation: Any) -> type | None: def _type_fallback(annotation: Any) -> str | None: - """Map a Python type annotation to a DataForge field name. - - Returns ``None`` if no sensible fallback exists. For ``str``, - ``int``, and ``float``, returns ``None`` because bare types are - too ambiguous to guess meaningfully. - """ + """Map a Python type annotation to a DataForge field name, or None.""" concrete = _resolve_type_annotation(annotation) if concrete is None: return None @@ -228,16 +206,10 @@ def _pydantic_heuristic(field_name: str) -> str | None: def _sqlalchemy_heuristic(col_name: str, column: "Any") -> str | None: - """Map a SQLAlchemy column name to a DataForge field name (or None). - - Uses the column name first, then falls back to type-based - heuristics for common SQL column types. - """ + """Map a SQLAlchemy column name to a DataForge field name (or None).""" alias = _FIELD_ALIASES.get(col_name) if alias: return alias - # Type-based fallback: if the column is an Integer primary key - # we already skip it. Other type-based heuristics could go here. return None @@ -259,104 +231,14 @@ def _sqlalchemy_heuristic(col_name: str, column: "Any") -> str | None: def _sqlalchemy_type_fallback(column: "Any") -> str | None: - """Map a SQLAlchemy column type to a DataForge field name. - - Uses the column's type class name to identify common SQL types - and map them to appropriate DataForge generators. 
- """ + """Map a SQLAlchemy column type to a DataForge field name.""" col_type = type(column.type) type_name = col_type.__name__ return _SA_TYPE_MAP.get(type_name) -if TYPE_CHECKING: - from dataforge.providers.address import AddressProvider - from dataforge.providers.automotive import AutomotiveProvider - from dataforge.providers.barcode import BarcodeProvider - from dataforge.providers.color import ColorProvider - from dataforge.providers.company import CompanyProvider - from dataforge.providers.crypto import CryptoProvider - from dataforge.providers.datetime import DateTimeProvider - from dataforge.providers.ecommerce import EcommerceProvider - from dataforge.providers.education import EducationProvider - from dataforge.providers.file import FileProvider - from dataforge.providers.finance import FinanceProvider - from dataforge.providers.geo import GeoProvider - from dataforge.providers.government import GovernmentProvider - from dataforge.providers.internet import InternetProvider - from dataforge.providers.lorem import LoremProvider - from dataforge.providers.medical import MedicalProvider - from dataforge.providers.misc import MiscProvider - from dataforge.providers.network import NetworkProvider - from dataforge.providers.payment import PaymentProvider - from dataforge.providers.person import PersonProvider - from dataforge.providers.phone import PhoneProvider - from dataforge.providers.profile import ProfileProvider - from dataforge.providers.science import ScienceProvider - from dataforge.providers.text import TextProvider - from dataforge.providers.ai_prompt import AiPromptProvider - from dataforge.providers.llm import LlmProvider - from dataforge.providers.ai_chat import AiChatProvider - from dataforge.providers.social_media import SocialMediaProvider - from dataforge.providers.music import MusicProvider - from dataforge.providers.sports import SportsProvider - from dataforge.providers.food import FoodProvider - from dataforge.providers.legal import 
LegalProvider - from dataforge.providers.real_estate import RealEstateProvider - from dataforge.providers.weather import WeatherProvider - from dataforge.providers.hardware import HardwareProvider - from dataforge.providers.logistics import LogisticsProvider - - class DataForge: - """High-performance fake data generator. - - Providers are loaded **lazily** — nothing is imported until a - provider property is first accessed. The provider registry - (:mod:`dataforge.registry`) resolves field names and provider - classes automatically, so new providers can be added without - editing this file. - - Parameters - ---------- - locale : str - The locale to use for data generation (e.g. ``"en_US"``). - Locale data is loaded **lazily** — nothing is imported until - a provider property is first accessed. - seed : int | None - Optional seed for reproducible output. When set, the stdlib - ``random`` backend is seeded for deterministic generation. - - Examples - -------- - >>> forge = DataForge(seed=42) - >>> forge.person.first_name() - '...' - >>> forge.address.city() - '...' - >>> forge.internet.email() - '...' - >>> forge.company.company_name() - '...' - >>> forge.phone.phone_number() - '...' - >>> forge.lorem.sentence() - '...' - >>> forge.dt.date() - '...' - >>> forge.finance.credit_card_number() - '...' - >>> forge.color.hex_color() - '...' - >>> forge.file.file_name() - '...' - >>> forge.network.ipv6() - '...' - >>> forge.misc.uuid4() - '...' - >>> forge.barcode.ean13() - '...' 
- """ + """High-performance fake data generator with lazy provider loading.""" __slots__ = ( "_engine", @@ -373,17 +255,10 @@ def __init__(self, locale: str = "en_US", seed: int | None = None) -> None: self._locale_cache: dict[str, ModuleType] = {} self._unique_proxy: Any = None - # ------------------------------------------------------------------ # Dynamic provider access via registry - # ------------------------------------------------------------------ def _get_provider(self, name: str) -> BaseProvider: - """Lazily instantiate and cache a provider by registry name. - - Uses the provider registry to resolve the class and its - locale module requirements. Providers are instantiated once - and cached in ``_providers``. - """ + """Lazily instantiate and cache a provider by registry name.""" prov = self._providers.get(name) if prov is not None: return prov @@ -411,194 +286,7 @@ def _get_provider(self, name: str) -> BaseProvider: self._providers[name] = prov return prov - # ------------------------------------------------------------------ - # Explicit provider properties (for IDE autocomplete + type safety) - # These delegate to _get_provider() which uses the registry. 
- # ------------------------------------------------------------------ - - @property - def person(self) -> "PersonProvider": - """Access the person data provider (names, prefixes, suffixes).""" - return self._get_provider("person") # type: ignore[return-value] - - @property - def address(self) -> "AddressProvider": - """Access the address data provider (streets, cities, zip codes).""" - return self._get_provider("address") # type: ignore[return-value] - - @property - def internet(self) -> "InternetProvider": - """Access the internet data provider (emails, usernames, domains, IPs).""" - return self._get_provider("internet") # type: ignore[return-value] - - @property - def company(self) -> "CompanyProvider": - """Access the company data provider (names, catch phrases, job titles).""" - return self._get_provider("company") # type: ignore[return-value] - - @property - def phone(self) -> "PhoneProvider": - """Access the phone data provider (phone numbers, cell numbers).""" - return self._get_provider("phone") # type: ignore[return-value] - - @property - def lorem(self) -> "LoremProvider": - """Access the Lorem Ipsum text provider (words, sentences, paragraphs).""" - return self._get_provider("lorem") # type: ignore[return-value] - - @property - def dt(self) -> "DateTimeProvider": - """Access the datetime provider (dates, times, datetimes).""" - return self._get_provider("dt") # type: ignore[return-value] - - @property - def finance(self) -> "FinanceProvider": - """Access the finance provider (credit cards, IBANs, currencies).""" - return self._get_provider("finance") # type: ignore[return-value] - - @property - def color(self) -> "ColorProvider": - """Access the color provider (hex, RGB, HSL, color names).""" - return self._get_provider("color") # type: ignore[return-value] - - @property - def file(self) -> "FileProvider": - """Access the file provider (file names, extensions, MIME types, paths).""" - return self._get_provider("file") # type: ignore[return-value] - - 
@property - def network(self) -> "NetworkProvider": - """Access the network provider (IPv6, MAC, port, hostname, user agent).""" - return self._get_provider("network") # type: ignore[return-value] - - @property - def misc(self) -> "MiscProvider": - """Access the misc provider (UUID4, boolean, random_element, null_or).""" - return self._get_provider("misc") # type: ignore[return-value] - - @property - def barcode(self) -> "BarcodeProvider": - """Access the barcode provider (EAN-13, EAN-8, ISBN-13, ISBN-10).""" - return self._get_provider("barcode") # type: ignore[return-value] - - @property - def crypto(self) -> "CryptoProvider": - """Access the crypto provider (MD5, SHA-1, SHA-256 hex strings).""" - return self._get_provider("crypto") # type: ignore[return-value] - - @property - def automotive(self) -> "AutomotiveProvider": - """Access the automotive provider (plates, VINs, makes, models).""" - return self._get_provider("automotive") # type: ignore[return-value] - - @property - def education(self) -> "EducationProvider": - """Access the education provider (universities, degrees, fields).""" - return self._get_provider("education") # type: ignore[return-value] - - @property - def profile(self) -> "ProfileProvider": - """Access the profile provider (coherent user profiles).""" - return self._get_provider("profile") # type: ignore[return-value] - - @property - def government(self) -> "GovernmentProvider": - """Access the government provider (SSN, tax ID, passports).""" - return self._get_provider("government") # type: ignore[return-value] - - @property - def ecommerce(self) -> "EcommerceProvider": - """Access the e-commerce provider (products, SKUs, orders).""" - return self._get_provider("ecommerce") # type: ignore[return-value] - - @property - def medical(self) -> "MedicalProvider": - """Access the medical provider (ICD-10, drugs, blood types).""" - return self._get_provider("medical") # type: ignore[return-value] - - @property - def payment(self) -> 
"PaymentProvider": - """Access the payment provider (card types, processors, transactions).""" - return self._get_provider("payment") # type: ignore[return-value] - - @property - def text(self) -> "TextProvider": - """Access the text provider (quotes, headlines, paragraphs).""" - return self._get_provider("text") # type: ignore[return-value] - - @property - def geo(self) -> "GeoProvider": - """Access the geo provider (continents, oceans, rivers, coordinates).""" - return self._get_provider("geo") # type: ignore[return-value] - - @property - def science(self) -> "ScienceProvider": - """Access the science provider (elements, planets, units).""" - return self._get_provider("science") # type: ignore[return-value] - - @property - def ai_prompt(self) -> "AiPromptProvider": - """Access the AI prompt provider (user/system/creative prompts).""" - return self._get_provider("ai_prompt") # type: ignore[return-value] - - @property - def llm(self) -> "LlmProvider": - """Access the LLM provider (models, agents, RAG, moderation, billing).""" - return self._get_provider("llm") # type: ignore[return-value] - - @property - def ai_chat(self) -> "AiChatProvider": - """Access the AI chat provider (conversation turns, messages).""" - return self._get_provider("ai_chat") # type: ignore[return-value] - - @property - def social_media(self) -> "SocialMediaProvider": - """Access the social media provider (platforms, usernames, hashtags).""" - return self._get_provider("social_media") # type: ignore[return-value] - - @property - def music(self) -> "MusicProvider": - """Access the music provider (genres, artists, albums, songs).""" - return self._get_provider("music") # type: ignore[return-value] - - @property - def sports(self) -> "SportsProvider": - """Access the sports provider (sports, teams, leagues, venues).""" - return self._get_provider("sports") # type: ignore[return-value] - - @property - def food(self) -> "FoodProvider": - """Access the food provider (dishes, cuisines, ingredients, 
restaurants).""" - return self._get_provider("food") # type: ignore[return-value] - - @property - def legal(self) -> "LegalProvider": - """Access the legal provider (cases, courts, practice areas, firms).""" - return self._get_provider("legal") # type: ignore[return-value] - - @property - def real_estate(self) -> "RealEstateProvider": - """Access the real estate provider (properties, prices, neighborhoods).""" - return self._get_provider("real_estate") # type: ignore[return-value] - - @property - def weather(self) -> "WeatherProvider": - """Access the weather provider (conditions, temperature, wind, alerts).""" - return self._get_provider("weather") # type: ignore[return-value] - - @property - def hardware(self) -> "HardwareProvider": - """Access the hardware provider (CPUs, GPUs, RAM, storage, peripherals).""" - return self._get_provider("hardware") # type: ignore[return-value] - - @property - def logistics(self) -> "LogisticsProvider": - """Access the logistics provider (carriers, shipping, containers, tracking).""" - return self._get_provider("logistics") # type: ignore[return-value] - - # ------------------------------------------------------------------ # Unique value generation - # ------------------------------------------------------------------ @property def unique(self) -> "Any": @@ -622,9 +310,7 @@ def unique(self) -> "Any": self._unique_proxy = UniqueProxy(self) return self._unique_proxy - # ------------------------------------------------------------------ # Provider registration - # ------------------------------------------------------------------ def register_provider( self, @@ -633,18 +319,6 @@ def register_provider( ) -> None: """Register a custom provider class at runtime. - The provider is added to this ``DataForge`` instance's - internal registry and can be accessed via ``getattr``. - - Parameters - ---------- - provider_cls : type[BaseProvider] - The provider class to register. Must be a - ``BaseProvider`` subclass with ``_provider_name``. 
- name : str | None - Override the provider name. Defaults to the class's - ``_provider_name`` attribute. - Examples -------- >>> from dataforge.providers.base import BaseProvider @@ -680,12 +354,7 @@ def register_provider( register_runtime_provider(prov_name, provider_cls, locale_modules) def __getattr__(self, name: str) -> Any: - """Dynamic attribute access for registered providers. - - Allows ``forge.my_provider`` to work for providers - registered via :meth:`register_provider` at runtime, - without requiring a ``@property`` on the class. - """ + """Dynamic attribute access for registered providers.""" # Check if it's a cached provider providers = object.__getattribute__(self, "_providers") if name in providers: @@ -698,35 +367,17 @@ def __getattr__(self, name: str) -> Any: f"'{type(self).__name__}' object has no attribute '{name}'" ) from None - # ------------------------------------------------------------------ # Seed control - # ------------------------------------------------------------------ def seed(self, value: int) -> None: - """Re-seed the random engine for reproducible output. - - This resets the internal state of the stdlib ``random`` backend. - """ + """Re-seed the random engine for reproducible output.""" self._engine.seed(value) def copy(self, seed: int | None = None) -> "DataForge": - """Create a new ``DataForge`` instance with the same locale. - - Parameters - ---------- - seed : int | None - Optional seed for the new instance. If ``None``, the new - instance is unseeded (non-deterministic). 
- - Returns - ------- - DataForge - """ + """Create a new DataForge instance with the same locale.""" return DataForge(locale=self._locale, seed=seed) - # ------------------------------------------------------------------ # Schema API - # ------------------------------------------------------------------ def schema( self, @@ -735,67 +386,7 @@ def schema( unique_together: "list[tuple[str, ...]] | None" = None, chaos: "Any | None" = None, ) -> "Any": - """Create a pre-resolved :class:`Schema` for maximum throughput. - - Parameters - ---------- - fields : list[str] | dict[str, str | Callable | dict] - Fields to generate. String values are resolved to provider - methods. Callable values receive the current row dict and - can reference previously generated columns. Dict values - define constraints (``depends_on``, ``temporal``, ``correlate``, - ``conditional``, ``range``). - null_fields : dict[str, float] | None - Optional mapping of column names to null probabilities - (0.0–1.0). Example: ``{"email": 0.3}`` makes ~30% of - email values ``None``. - unique_together : list[tuple[str, ...]] | None - Optional list of column-name tuples whose combinations - must be unique. Example: ``[("first_name", "last_name")]`` - ensures no two rows share the same name pair. - chaos : ChaosTransformer | dict | None - Optional chaos/data-quality transformer. Pass a - :class:`~dataforge.chaos.ChaosTransformer` instance or a - config dict (e.g. ``{"null_rate": 0.1, "type_mismatch_rate": 0.05}``). - - Returns - ------- - Schema - - Examples - -------- - >>> forge = DataForge(seed=42) - >>> s = forge.schema(["first_name", "email"]) - >>> rows = s.generate(count=1000) - - Nullable fields: - - >>> s = forge.schema(["first_name", "email"], - ... null_fields={"email": 0.2}) - >>> rows = s.generate(count=100) - >>> none_count = sum(1 for r in rows if r["email"] is None) - - Unique combinations: - - >>> s = forge.schema(["first_name", "last_name", "email"], - ... 
unique_together=[("first_name", "last_name")]) - >>> rows = s.generate(count=50) - - Constrained/correlated fields: - - >>> s = forge.schema({ - ... "country": "country", - ... "state": {"field": "address.state", "depends_on": "country"}, - ... }) - >>> rows = s.generate(count=100) - - Chaos mode: - - >>> from dataforge.chaos import ChaosTransformer - >>> s = forge.schema(["first_name", "email"], - ... chaos=ChaosTransformer(null_rate=0.1)) - >>> rows = s.generate(count=100) - """ + """Create a pre-resolved Schema for maximum throughput.""" from dataforge.schema import Schema return Schema( @@ -810,46 +401,7 @@ def relational( self, tables: "dict[str, dict[str, Any]]", ) -> "Any": - """Create a :class:`RelationalSchema` for multi-table generation. - - Generates related tables with referential integrity. Parent - tables are generated first; child tables receive foreign keys - pointing to parent rows. - - Parameters - ---------- - tables : dict[str, dict] - Table specifications. Each spec can include: - - - ``fields`` — list or dict of field specs (same as Schema) - - ``count`` — number of rows (default: 10) - - ``parent`` — name of the parent table (creates a FK) - - ``parent_key`` — FK column name (default: ``{parent}_id``) - - ``children_per_parent`` — ``(min, max)`` cardinality bounds - - ``null_fields`` — per-field null probabilities - - Returns - ------- - RelationalSchema - - Examples - -------- - >>> forge = DataForge(seed=42) - >>> rel = forge.relational({ - ... "users": { - ... "fields": ["first_name", "last_name", "email"], - ... "count": 10, - ... }, - ... "orders": { - ... "fields": ["date", "city"], - ... "count": 30, - ... "parent": "users", - ... }, - ... 
}) - >>> data = rel.generate() - >>> len(data["users"]) - 10 - """ + """Create a RelationalSchema for multi-table generation with referential integrity.""" from dataforge.relational import RelationalSchema return RelationalSchema(self, tables) @@ -858,31 +410,7 @@ def schema_from_dict( self, d: dict[str, Any], ) -> "Any": - """Create a :class:`Schema` from a schema definition dict. - - The dict format matches what :meth:`Schema.to_schema_dict` - produces, and what :func:`dataforge.schema_io.load_schema` - reads from JSON/YAML/TOML files. - - Parameters - ---------- - d : dict[str, Any] - Schema definition with ``fields``, optional ``count``, - ``null_fields``, and ``unique_together`` keys. - - Returns - ------- - Schema - - Examples - -------- - >>> forge = DataForge(seed=42) - >>> s = forge.schema_from_dict({ - ... "fields": {"name": "full_name", "email": "email"}, - ... "count": 100, - ... }) - >>> rows = s.generate() # uses count from dict - """ + """Create a Schema from a schema definition dict.""" from dataforge.schema import Schema from dataforge.schema_io import dict_to_schema_args @@ -899,63 +427,23 @@ def schema_from_file( path: str, format: str | None = None, ) -> "Any": - """Create a :class:`Schema` by loading a schema definition file. - - Supports JSON, YAML, and TOML formats. The format is - auto-detected from the file extension when *format* is - ``None``. - - Parameters - ---------- - path : str - Path to the schema definition file. - format : str | None - Input format (``"json"``, ``"yaml"``, ``"toml"``). - Auto-detected from extension when ``None``. 
- - Returns - ------- - Schema - - Examples - -------- - >>> forge = DataForge(seed=42) - >>> s = forge.schema_from_file("my_schema.yaml") - >>> rows = s.generate(count=100) - """ + """Create a Schema by loading a JSON/YAML/TOML definition file.""" from dataforge.schema_io import load_schema d = load_schema(path, format=format) return self.schema_from_dict(d) - # ------------------------------------------------------------------ # Locale management - # ------------------------------------------------------------------ @property def locale(self) -> str: """The currently active locale string (e.g. ``"en_US"``).""" return self._locale - # ------------------------------------------------------------------ # Internal helpers - # ------------------------------------------------------------------ def _load_locale_module(self, module_name: str) -> ModuleType: - """Dynamically import a locale data module. - - Results are cached so that repeated access to the same provider - does not re-import the module. - - If the requested locale does not provide the specified module, - falls back to ``en_US`` and emits a warning. - - Parameters - ---------- - module_name : str - The name of the submodule inside the locale package - (e.g. ``"person"``, ``"address"``). - """ + """Dynamically import and cache a locale data module.""" key = f"{self._locale}.{module_name}" if key not in self._locale_cache: try: @@ -980,11 +468,7 @@ def _load_locale_module(self, module_name: str) -> ModuleType: return self._locale_cache[key] def _resolve_field(self, field: str) -> tuple[str, str]: - """Resolve a field name to (provider_attr, method_name). - - Supports both direct names (e.g. ``"first_name"``) and - dotted paths (e.g. ``"person.first_name"``). - """ + """Resolve a field name to (provider_attr, method_name).""" # Dotted path: "person.first_name" → ("person", "first_name") if "." 
in field: provider_attr, method_name = field.split(".", 1) @@ -1000,41 +484,14 @@ def _resolve_field(self, field: str) -> tuple[str, str]: f"(e.g. 'person.first_name') or a known shorthand." ) - # ------------------------------------------------------------------ # Bulk data generation - # ------------------------------------------------------------------ def to_dict( self, fields: list[str] | dict[str, str], count: int = 10, ) -> list[dict[str, Any]]: - """Generate *count* rows of fake data as a list of dicts. - - Uses :class:`Schema` internally for zero-duplication. - Values are preserved in their native Python types. - - Parameters - ---------- - fields : list[str] | dict[str, str] - Fields to generate. Can be a list of field names (e.g. - ``["first_name", "email"]``) or a dict mapping output column - names to field names (e.g. ``{"Name": "full_name"}``). - count : int - Number of rows to generate. - - Returns - ------- - list[dict[str, Any]] - Each dict maps column name → generated value (native type). - - Examples - -------- - >>> forge = DataForge(seed=42) - >>> rows = forge.to_dict(["first_name", "email"], count=3) - >>> len(rows) - 3 - """ + """Generate *count* rows of fake data as a list of dicts.""" return self.schema(fields).generate(count=count) def to_json( @@ -1046,31 +503,7 @@ def to_json( encoding: str = "utf-8", compress: bool | None = None, ) -> str: - """Generate fake data and return as a JSON array. - - Delegates to :meth:`Schema.to_json` for zero-duplication. - - Parameters - ---------- - fields : list[str] | dict[str, str] - Fields to generate (same format as :meth:`to_dict`). - count : int - Number of rows. - path : str | None - If provided, write JSON to this file path. - indent : int - JSON indentation level (default: 2). - encoding : str - Character encoding for file output (default: utf-8). - compress : bool | None - If ``True``, gzip the output file. ``None`` auto-detects - from a ``.gz`` file extension. 
- - Returns - ------- - str - The JSON content as a string. - """ + """Generate fake data as JSON. Delegates to Schema.to_json.""" return self.schema(fields).to_json( count=count, path=path, @@ -1088,32 +521,7 @@ def to_csv( encoding: str = "utf-8", compress: bool | None = None, ) -> str: - """Generate fake data and return (or write) as CSV. - - Delegates to :meth:`Schema.to_csv` for zero-duplication. - - Parameters - ---------- - fields : list[str] | dict[str, str] - Fields to generate (same format as :meth:`to_dict`). - count : int - Number of rows. - path : str | None - If provided, write CSV to this file path. Otherwise return - the CSV as a string. - delimiter : str - Field delimiter (default: comma). - encoding : str - Character encoding for file output (default: utf-8). - compress : bool | None - If ``True``, gzip the output file. ``None`` auto-detects - from a ``.gz`` file extension. - - Returns - ------- - str - The CSV content as a string. - """ + """Generate fake data as CSV. Delegates to Schema.to_csv.""" return self.schema(fields).to_csv( count=count, path=path, @@ -1130,29 +538,7 @@ def to_jsonl( encoding: str = "utf-8", compress: bool | None = None, ) -> str: - """Generate fake data and return (or write) as JSON Lines. - - Delegates to :meth:`Schema.to_jsonl` for zero-duplication. - - Parameters - ---------- - fields : list[str] | dict[str, str] - Fields to generate (same format as :meth:`to_dict`). - count : int - Number of rows. - path : str | None - If provided, write JSONL to this file path. - encoding : str - Character encoding for file output (default: utf-8). - compress : bool | None - If ``True``, gzip the output file. ``None`` auto-detects - from a ``.gz`` file extension. - - Returns - ------- - str - The JSONL content as a string. - """ + """Generate fake data as JSON Lines. 
Delegates to Schema.to_jsonl.""" return self.schema(fields).to_jsonl( count=count, path=path, @@ -1170,33 +556,7 @@ def to_sql( encoding: str = "utf-8", compress: bool | None = None, ) -> str: - """Generate fake data and return as SQL INSERT statements. - - Delegates to :meth:`Schema.to_sql` for zero-duplication. - - Parameters - ---------- - fields : list[str] | dict[str, str] - Fields to generate (same format as :meth:`to_dict`). - table : str - Target table name. - count : int - Number of rows. - dialect : str - SQL dialect: ``"sqlite"``, ``"mysql"``, or ``"postgresql"``. - path : str | None - If provided, write SQL to this file path. - encoding : str - Character encoding for file output (default: utf-8). - compress : bool | None - If ``True``, gzip the output file. ``None`` auto-detects - from a ``.gz`` file extension. - - Returns - ------- - str - SQL INSERT statements as a string. - """ + """Generate fake data as SQL INSERT statements. Delegates to Schema.to_sql.""" return self.schema(fields).to_sql( table=table, count=count, @@ -1211,23 +571,7 @@ def to_dataframe( fields: list[str] | dict[str, str], count: int = 10, ) -> "Any": - """Generate fake data as a pandas DataFrame. - - Delegates to :meth:`Schema.to_dataframe` for zero-duplication. - Requires ``pandas`` to be installed. - - Parameters - ---------- - fields : list[str] | dict[str, str] - Fields to generate (same format as :meth:`to_dict`). - count : int - Number of rows. - - Returns - ------- - pandas.DataFrame - A DataFrame with one column per field. - """ + """Generate fake data as a pandas DataFrame. Requires pandas.""" return self.schema(fields).to_dataframe(count=count) def stream_to_csv( @@ -1240,34 +584,7 @@ def stream_to_csv( encoding: str = "utf-8", compress: bool | None = None, ) -> int: - """Stream fake data directly to a CSV file. - - Memory-efficient: writes in batches without materializing - all rows in memory. 
- - Parameters - ---------- - fields : list[str] | dict[str, str] - Fields to generate. - path : str - File path to write. - count : int - Number of rows. - batch_size : int | None - Rows per batch. Auto-tuned when ``None``. - delimiter : str - Field delimiter (default: comma). - encoding : str - Character encoding (default: utf-8). - compress : bool | None - If ``True``, gzip the output. ``None`` auto-detects - from a ``.gz`` file extension. - - Returns - ------- - int - Number of rows written. - """ + """Stream fake data to a CSV file in batches. Returns rows written.""" return self.schema(fields).stream_to_csv( path=path, count=count, @@ -1286,32 +603,7 @@ def stream_to_jsonl( encoding: str = "utf-8", compress: bool | None = None, ) -> int: - """Stream fake data directly to a JSON Lines file. - - Memory-efficient: writes in batches without materializing - all rows in memory. - - Parameters - ---------- - fields : list[str] | dict[str, str] - Fields to generate. - path : str - File path to write. - count : int - Number of rows. - batch_size : int | None - Rows per batch. Auto-tuned when ``None``. - encoding : str - Character encoding (default: utf-8). - compress : bool | None - If ``True``, gzip the output. ``None`` auto-detects - from a ``.gz`` file extension. - - Returns - ------- - int - Number of rows written. - """ + """Stream fake data to a JSON Lines file in batches. Returns rows written.""" return self.schema(fields).stream_to_jsonl( path=path, count=count, @@ -1326,24 +618,7 @@ def to_arrow( count: int = 10, batch_size: int | None = None, ) -> "Any": - """Generate fake data as a PyArrow Table. - - Delegates to :meth:`Schema.to_arrow` for zero-duplication. - Requires ``pyarrow`` to be installed. - - Parameters - ---------- - fields : list[str] | dict[str, str] - Fields to generate (same format as :meth:`to_dict`). - count : int - Number of rows. - batch_size : int | None - Rows per internal batch. Auto-tuned when ``None``. 
- - Returns - ------- - pyarrow.Table - """ + """Generate fake data as a PyArrow Table. Requires pyarrow.""" return self.schema(fields).to_arrow(count=count, batch_size=batch_size) def to_polars( @@ -1352,24 +627,7 @@ def to_polars( count: int = 10, batch_size: int | None = None, ) -> "Any": - """Generate fake data as a Polars DataFrame. - - Delegates to :meth:`Schema.to_polars` for zero-duplication. - Requires ``polars`` to be installed. - - Parameters - ---------- - fields : list[str] | dict[str, str] - Fields to generate (same format as :meth:`to_dict`). - count : int - Number of rows. - batch_size : int | None - Rows per internal batch. Auto-tuned when ``None``. - - Returns - ------- - polars.DataFrame - """ + """Generate fake data as a Polars DataFrame. Requires polars.""" return self.schema(fields).to_polars(count=count, batch_size=batch_size) def to_parquet( @@ -1379,27 +637,7 @@ def to_parquet( count: int = 10, batch_size: int | None = None, ) -> int: - """Generate fake data and write as a Parquet file. - - Requires ``pyarrow`` to be installed. Data is written in - batched row-groups for bounded memory usage. - - Parameters - ---------- - fields : list[str] | dict[str, str] - Fields to generate. - path : str - File path to write. - count : int - Number of rows. - batch_size : int | None - Rows per row-group. Auto-tuned when ``None``. - - Returns - ------- - int - Number of rows written. - """ + """Generate fake data and write as Parquet. Requires pyarrow. Returns rows written.""" return self.schema(fields).to_parquet( path=path, count=count, batch_size=batch_size ) @@ -1407,106 +645,38 @@ def to_parquet( def __repr__(self) -> str: return f"DataForge(locale={self._locale!r})" - # ------------------------------------------------------------------ # Introspection API - # ------------------------------------------------------------------ @staticmethod def list_providers() -> list[str]: - """Return a sorted list of all available provider names. 
- - Returns - ------- - list[str] - Provider names (e.g. ``["address", "company", "person", ...]``). - """ + """Return a sorted list of all available provider names.""" from dataforge.registry import get_provider_info return sorted(get_provider_info()) @staticmethod def list_fields() -> dict[str, tuple[str, str]]: - """Return all available field names with their provider/method info. - - Returns - ------- - dict[str, tuple[str, str]] - Mapping of ``{field_name: (provider_name, method_name)}``, - sorted by field name. - - Examples - -------- - >>> fields = DataForge.list_fields() - >>> fields["first_name"] - ('person', 'first_name') - """ + """Return all available field names with their provider/method info.""" from dataforge.registry import get_field_map fm = get_field_map() return dict(sorted(fm.items())) - # ------------------------------------------------------------------ # Time-series generation - # ------------------------------------------------------------------ def timeseries(self, **kwargs: Any) -> "Any": - """Create a :class:`~dataforge.timeseries.TimeSeriesSchema`. - - Parameters - ---------- - **kwargs - All keyword arguments are forwarded to - :class:`~dataforge.timeseries.TimeSeriesSchema`. - Common options: ``start``, ``end``, ``interval``, - ``trend``, ``seasonality_amplitude``, ``noise_std``, - ``anomaly_rate``, ``spike_amplitude``. - - Returns - ------- - TimeSeriesSchema - - Examples - -------- - >>> forge = DataForge(seed=42) - >>> ts = forge.timeseries( - ... start="2024-01-01", end="2024-01-31", - ... interval="1h", trend=0.01, noise_std=0.5, - ... ) - >>> rows = ts.generate() - """ + """Create a TimeSeriesSchema. 
Kwargs forwarded to TimeSeriesSchema.""" from dataforge.timeseries import TimeSeriesSchema return TimeSeriesSchema(self, **kwargs) - # ------------------------------------------------------------------ # Schema inference - # ------------------------------------------------------------------ def infer_schema( self, data: "list[dict[str, Any]]", ) -> "Any": - """Infer a :class:`Schema` from sample data (list of dicts). - - Analyzes the data to detect types, semantic patterns, and - distributions, then builds a matching Schema. - - Parameters - ---------- - data : list[dict] - Sample rows to analyze. - - Returns - ------- - Schema - - Examples - -------- - >>> forge = DataForge(seed=42) - >>> sample = [{"name": "Alice", "email": "alice@example.com"}] - >>> s = forge.infer_schema(sample) - >>> rows = s.generate(count=100) - """ + """Infer a Schema from sample data (list of dicts).""" from dataforge.inference import SchemaInferrer inferrer = SchemaInferrer(self) @@ -1517,72 +687,16 @@ def infer_schema_from_csv( path: str, max_rows: int = 1000, ) -> "Any": - """Infer a :class:`Schema` from a CSV file. - - Parameters - ---------- - path : str - Path to the CSV file to analyze. - max_rows : int - Maximum rows to sample for inference. - - Returns - ------- - Schema - - Examples - -------- - >>> forge = DataForge(seed=42) - >>> s = forge.infer_schema_from_csv("users.csv") - >>> rows = s.generate(count=1000) - """ + """Infer a Schema from a CSV file.""" from dataforge.inference import SchemaInferrer inferrer = SchemaInferrer(self) return inferrer.from_csv(path, max_rows=max_rows) - # ------------------------------------------------------------------ # Schema factories from ORM / model introspection - # ------------------------------------------------------------------ def schema_from_pydantic(self, model: type) -> "Any": - """Create a :class:`Schema` by introspecting a Pydantic model. - - Maps model field names to DataForge fields using a three-tier - strategy: - - 1. 
**Exact name match** — if the field name matches a registered - DataForge field (e.g. ``first_name``, ``email``), use it. - 2. **Alias match** — common aliases like ``fname`` → ``first_name``. - 3. **Type-based fallback** — if the field type annotation is - ``bool``, ``datetime.date``, ``uuid.UUID``, etc., map to a - sensible generator automatically. - - Fields that cannot be mapped are silently skipped (a warning is - emitted). - - Requires ``pydantic`` to be installed. - - Parameters - ---------- - model : type - A Pydantic ``BaseModel`` subclass. - - Returns - ------- - Schema - - Examples - -------- - >>> from pydantic import BaseModel - >>> class User(BaseModel): - ... first_name: str - ... email: str - ... city: str - >>> forge = DataForge(seed=42) - >>> s = forge.schema_from_pydantic(User) - >>> rows = s.generate(count=5) - """ + """Create a Schema by introspecting a Pydantic BaseModel subclass.""" from dataforge.schema import Schema try: @@ -1652,47 +766,7 @@ def schema_from_pydantic(self, model: type) -> "Any": return Schema(self, mapped) def schema_from_sqlalchemy(self, model: type) -> "Any": - """Create a :class:`Schema` by introspecting a SQLAlchemy model. - - Maps column names to DataForge fields using a three-tier - strategy: - - 1. **Exact name match** — if the column name matches a registered - DataForge field, use it. - 2. **Alias match** — common aliases like ``fname`` → ``first_name``. - 3. **Column type fallback** — if the column type is ``Boolean``, - ``Date``, ``DateTime``, ``UUID``, ``Text``, etc., map to a - sensible generator automatically. - - Primary key columns named ``id`` are skipped automatically. - Columns that cannot be mapped are silently skipped (a warning - is emitted). - - Requires ``sqlalchemy`` to be installed. - - Parameters - ---------- - model : type - A SQLAlchemy declarative model class (must have - ``__table__`` attribute). 
- - Returns - ------- - Schema - - Examples - -------- - >>> from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column - >>> class Base(DeclarativeBase): pass - >>> class User(Base): - ... __tablename__ = "users" - ... id: Mapped[int] = mapped_column(primary_key=True) - ... first_name: Mapped[str] - ... email: Mapped[str] - >>> forge = DataForge(seed=42) - >>> s = forge.schema_from_sqlalchemy(User) - >>> rows = s.generate(count=5) - """ + """Create a Schema by introspecting a SQLAlchemy declarative model.""" from dataforge.schema import Schema try: diff --git a/src/dataforge/data/correlations/geo.py b/src/dataforge/data/correlations/geo.py index 15c2dcd..a286b92 100644 --- a/src/dataforge/data/correlations/geo.py +++ b/src/dataforge/data/correlations/geo.py @@ -10,9 +10,7 @@ from __future__ import annotations -# ------------------------------------------------------------------ # Country → States mapping -# ------------------------------------------------------------------ COUNTRY_STATES: dict[str, tuple[str, ...]] = { "United States": ( @@ -171,9 +169,7 @@ ), } -# ------------------------------------------------------------------ # State → Cities mapping (representative cities per state) -# ------------------------------------------------------------------ STATE_CITIES: dict[str, tuple[str, ...]] = { # United States @@ -256,9 +252,7 @@ "Jalisco": ("Guadalajara", "Zapopan", "Tlaquepaque"), } -# ------------------------------------------------------------------ # State → Zip code ranges (US-style prefix ranges) -# ------------------------------------------------------------------ STATE_ZIP_PREFIX: dict[str, tuple[str, ...]] = { "California": ( @@ -320,9 +314,7 @@ "Colorado": ("800", "801", "802", "803", "804", "805", "806", "808", "809", "810"), } -# ------------------------------------------------------------------ # Country → Phone format -# ------------------------------------------------------------------ COUNTRY_PHONE_FORMAT: dict[str, str] = { "United 
States": "+1-###-###-####", @@ -337,9 +329,7 @@ "Mexico": "+52-##-####-####", } -# ------------------------------------------------------------------ # Country → Currency -# ------------------------------------------------------------------ COUNTRY_CURRENCY: dict[str, str] = { "United States": "USD", diff --git a/src/dataforge/decorators.py b/src/dataforge/decorators.py index 96be933..7a90f12 100644 --- a/src/dataforge/decorators.py +++ b/src/dataforge/decorators.py @@ -1,48 +1,4 @@ -"""Provider decorators — simplified API for creating custom providers. - -The ``@provider`` decorator lets users create lightweight providers -with minimal boilerplate. Instead of subclassing ``BaseProvider`` and -defining ``_provider_name``, ``_field_map``, ``_locale_modules``, -``__slots__``, and ``@overload`` signatures manually, a single -decorator transforms a plain class into a fully registered provider. - -Usage:: - - from dataforge.decorators import provider - - @provider("greeting") - class GreetingProvider: - def hello(self) -> str: - return "Hello, world!" - - def goodbye(self) -> str: - return "Goodbye!" - - # Now usable: - forge = DataForge() - forge.register_provider(GreetingProvider) - forge.greeting.hello() # "Hello, world!" - forge.greeting.hello(count=5) # ["Hello, world!"] * 5 - -Each public method (no leading ``_``) is auto-wrapped to support -the ``count=1`` → scalar, ``count>1`` → list convention used by -all DataForge providers. A ``_field_map`` is auto-generated from -all public methods. - -Advanced usage:: - - @provider( - "weather_custom", - field_map={"temp": "temperature", "wind": "wind_speed"}, - locale_modules=("weather",), - ) - class WeatherCustom: - def temperature(self) -> str: - return "22°C" - - def wind_speed(self) -> str: - return "15 km/h" -""" +"""Provider decorators — simplified API for creating custom providers.""" from __future__ import annotations @@ -60,41 +16,9 @@ def provider( locale_modules: tuple[str, ...] 
= (), needs_forge: bool = False, ) -> Any: - """Class decorator that transforms a plain class into a DataForge provider. - - Parameters - ---------- - name : str - The provider name (used as ``forge.``). - field_map : dict[str, str] | None - Optional explicit field map. When ``None``, a field map is - auto-generated from all public methods (method_name → method_name). - locale_modules : tuple[str, ...] - Locale data modules required by this provider. - needs_forge : bool - If ``True``, the provider receives the ``DataForge`` instance - as a second constructor argument (for cross-provider access). - - Returns - ------- - type - A ``BaseProvider`` subclass ready for registration. - - Examples - -------- - >>> from dataforge.decorators import provider - >>> @provider("greet") - ... class GreetProvider: - ... def hello(self): - ... return "hi" - >>> GreetProvider._provider_name - 'greet' - >>> GreetProvider._field_map - {'hello': 'hello'} - """ + """Class decorator that transforms a plain class into a DataForge provider.""" def decorator(cls: type) -> type: - # Discover public methods from the user's class user_methods: dict[str, Any] = {} for attr_name in list(vars(cls)): if attr_name.startswith("_"): @@ -103,11 +27,8 @@ def decorator(cls: type) -> type: if callable(obj): user_methods[attr_name] = obj - # Build field_map if not explicitly provided fm = field_map if field_map is not None else {m: m for m in user_methods} - # Build the new class with BaseProvider as base - # We need to create wrapped methods that support count=1 / count>N namespace: dict[str, Any] = { "__slots__": (), "_provider_name": name, @@ -116,7 +37,6 @@ def decorator(cls: type) -> type: "_needs_forge": needs_forge, } - # Create __init__ that handles locale and forge arguments if needs_forge: def __init__(self: Any, engine: RandomEngine, forge: Any) -> None: @@ -125,7 +45,7 @@ def __init__(self: Any, engine: RandomEngine, forge: Any) -> None: namespace["__slots__"] = ("_forge",) elif 
locale_modules: - # Dynamic __init__ that accepts locale module arguments + def __init__(self: Any, engine: RandomEngine, *locale_args: Any) -> None: # type: ignore[misc] BaseProvider.__init__(self, engine) for i, mod_name in enumerate(locale_modules): @@ -135,15 +55,12 @@ def __init__(self: Any, engine: RandomEngine, *locale_args: Any) -> None: # typ slot_names = tuple(f"_locale_{m}" for m in locale_modules) namespace["__slots__"] = slot_names - # Wrap each user method to support count parameter for method_name, method_func in user_methods.items(): wrapped = _wrap_with_count(method_func) namespace[method_name] = wrapped - # Create the provider class new_cls = type(cls.__name__, (BaseProvider,), namespace) - # Preserve the original class's module and qualname for debugging new_cls.__module__ = cls.__module__ new_cls.__qualname__ = cls.__qualname__ @@ -153,15 +70,7 @@ def __init__(self: Any, engine: RandomEngine, *locale_args: Any) -> None: # typ def _wrap_with_count(func: Any) -> Any: - """Wrap a scalar-returning method to support the ``count`` parameter. - - The wrapped method calls the original function once when - ``count == 1`` (returning a scalar) and ``count`` times when - ``count > 1`` (returning a list). - - The original function should accept ``self`` as its first argument - and return a single value. - """ + """Wrap a scalar-returning method to support the ``count`` parameter.""" @functools.wraps(func) def wrapper(self: Any, count: int = 1) -> Any: diff --git a/src/dataforge/inference.py b/src/dataforge/inference.py index 87785a0..4c10ba5 100644 --- a/src/dataforge/inference.py +++ b/src/dataforge/inference.py @@ -1,29 +1,4 @@ -"""Schema inference — analyze data and auto-create matching Schemas. - -Analyzes CSV files, DataFrames, database tables, or lists of dicts -to detect types, semantic patterns, distributions, and null rates, -then builds a matching DataForge Schema. 
- -Usage:: - - from dataforge import DataForge - from dataforge.inference import SchemaInferrer - - forge = DataForge(seed=42) - inferrer = SchemaInferrer(forge) - - # From CSV - schema = inferrer.from_csv("data.csv") - - # From list of dicts - schema = inferrer.from_records([ - {"name": "Alice", "email": "alice@test.com", "age": 30}, - {"name": "Bob", "email": "bob@test.com", "age": 25}, - ]) - - # Inspect what was detected - print(inferrer.describe()) -""" +"""Schema inference — analyze data and auto-create matching Schemas.""" from __future__ import annotations @@ -33,10 +8,6 @@ if TYPE_CHECKING: from dataforge.core import DataForge -# ------------------------------------------------------------------ -# Semantic type detection patterns -# ------------------------------------------------------------------ - _SEMANTIC_PATTERNS: list[tuple[str, _re.Pattern[str], str]] = [ ("email", _re.compile(r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z]{2,}$"), "email"), ("phone", _re.compile(r"^[\+]?[\d\s\-\(\)]{7,20}$"), "phone_number"), @@ -66,12 +37,6 @@ ("isbn", _re.compile(r"^97[89]-?\d{1,5}-?\d{1,7}-?\d{1,7}-?\d$"), "isbn13"), ] -# ------------------------------------------------------------------ -# Column name heuristics (from core.py _FIELD_ALIASES) -# ------------------------------------------------------------------ - - -# Module-level cache for field aliases — populated on first use _CACHED_ALIASES: dict[str, str] | None = None @@ -85,22 +50,12 @@ def _get_field_aliases() -> dict[str, str]: return _CACHED_ALIASES -# ------------------------------------------------------------------ -# Type detection -# ------------------------------------------------------------------ - - -# Pre-compiled patterns for fast numeric string detection (avoids try/except overhead) _INT_PATTERN = _re.compile(r"^-?\d+$") _FLOAT_PATTERN = _re.compile(r"^-?\d+\.\d*$|^-?\d*\.\d+$|^-?\d+[eE][+-]?\d+$") def _detect_base_type(values: list[Any]) -> str: - """Detect the base type of a column's 
values. - - Returns one of: 'str', 'int', 'float', 'bool', 'date', - 'datetime', 'none', 'mixed'. - """ + """Detect the base type of a column's values.""" type_counts: dict[str, int] = {} _int_match = _INT_PATTERN.match _float_match = _FLOAT_PATTERN.match @@ -110,7 +65,6 @@ def _detect_base_type(values: list[Any]) -> str: continue t = type(val).__name__ if t == "str": - # Fast regex-based numeric detection (avoids try/except overhead) s = val.strip() if _int_match(s): type_counts["int"] = type_counts.get("int", 0) + 1 @@ -123,12 +77,10 @@ def _detect_base_type(values: list[Any]) -> str: continue type_counts[t] = type_counts.get(t, 0) + 1 - # Remove 'none' for type decision non_none = {k: v for k, v in type_counts.items() if k != "none"} if not non_none: return "none" dominant = max(non_none, key=lambda k: non_none[k]) - # If >80% of non-none values are the same type, use it total_non_none = sum(non_none.values()) if non_none[dominant] / total_non_none >= 0.8: return dominant @@ -140,26 +92,19 @@ def _detect_semantic_type( values: list[Any], base_type: str, ) -> str | None: - """Detect the semantic type of a column. - - Returns a DataForge field name or None. - """ - # 1. Try column name heuristic + """Detect the semantic type of a column.""" aliases = _get_field_aliases() name_lower = col_name.lower().strip().replace(" ", "_") if name_lower in aliases: return aliases[name_lower] - # Also try without common prefixes/suffixes for prefix in ("user_", "customer_", "order_", "item_"): if name_lower.startswith(prefix): stripped = name_lower[len(prefix) :] if stripped in aliases: return aliases[stripped] - # 2. Try regex patterns on string values if base_type == "str": - # Sample up to 100 non-null string values for pattern detection sample = [str(v) for v in values if v is not None and str(v).strip()][:100] if sample: for _name, pattern, field in _SEMANTIC_PATTERNS: @@ -167,11 +112,9 @@ def _detect_semantic_type( if matches / len(sample) >= 0.7: return field - # 3. 
Type-based fallback if base_type == "bool": return "boolean" if base_type == "int": - # Check if it looks like age, port, year, etc. if "age" in name_lower: return "misc.random_int" if "port" in name_lower: @@ -223,11 +166,6 @@ def _compute_stats(values: list[Any], base_type: str) -> dict[str, Any]: return stats -# ------------------------------------------------------------------ -# Column analysis result -# ------------------------------------------------------------------ - - class ColumnAnalysis: """Analysis result for a single column.""" @@ -263,21 +201,8 @@ def __repr__(self) -> str: ) -# ------------------------------------------------------------------ -# SchemaInferrer -# ------------------------------------------------------------------ - - class SchemaInferrer: - """Analyze data sources and build matching DataForge Schemas. - - Parameters - ---------- - forge : DataForge - The DataForge instance to create schemas with. - sample_size : int - Maximum number of rows to sample for analysis. - """ + """Analyze data sources and build matching DataForge Schemas.""" __slots__ = ("_forge", "_sample_size", "_analyses") @@ -290,25 +215,13 @@ def from_records( self, records: list[dict[str, Any]], ) -> Any: - """Infer a Schema from a list of dicts. - - Parameters - ---------- - records : list[dict[str, Any]] - Input data rows. - - Returns - ------- - Schema - """ + """Infer a Schema from a list of dicts.""" if not records: raise ValueError("Cannot infer schema from empty data.") - # Sample sample = records[: self._sample_size] columns = list(sample[0].keys()) - # Analyze each column self._analyses = [] field_map: dict[str, str] = {} null_fields: dict[str, float] = {} @@ -343,21 +256,7 @@ def from_csv( delimiter: str = ",", encoding: str = "utf-8", ) -> Any: - """Infer a Schema from a CSV file. - - Parameters - ---------- - path : str - Path to the CSV file. - delimiter : str - Field delimiter. - encoding : str - File encoding. 
- - Returns - ------- - Schema - """ + """Infer a Schema from a CSV file.""" import csv with open(path, "r", encoding=encoding, newline="") as f: @@ -371,17 +270,7 @@ def from_csv( return self.from_records(records) def from_dataframe(self, df: Any) -> Any: - """Infer a Schema from a pandas DataFrame. - - Parameters - ---------- - df : pandas.DataFrame - Input DataFrame. - - Returns - ------- - Schema - """ + """Infer a Schema from a pandas DataFrame.""" sample = df.head(self._sample_size) records = sample.to_dict("records") return self.from_records(records) @@ -397,17 +286,14 @@ def _analyze_column( null_rate = _compute_null_rate(values) stats = _compute_stats(values, base_type) - # Determine DataForge field dataforge_field: str | None = None if semantic_type: - # Verify it's a valid field try: self._forge._resolve_field(semantic_type) dataforge_field = semantic_type except ValueError: dataforge_field = None - # Fallback: try column name directly if dataforge_field is None: try: self._forge._resolve_field(col_name) @@ -415,7 +301,6 @@ def _analyze_column( except ValueError: pass - # Last resort: type-based fallback if dataforge_field is None: if base_type == "bool": dataforge_field = "boolean" @@ -434,12 +319,7 @@ def _analyze_column( ) def describe(self) -> str: - """Return a human-readable description of the inferred schema. - - Returns - ------- - str - """ + """Return a human-readable description of the inferred schema.""" if not self._analyses: return "No schema has been inferred yet." diff --git a/src/dataforge/locales/ar_SA/address.py b/src/dataforge/locales/ar_SA/address.py index 4d2bf6d..88e154a 100644 --- a/src/dataforge/locales/ar_SA/address.py +++ b/src/dataforge/locales/ar_SA/address.py @@ -45,16 +45,6 @@ "Southern Ring", "Western Ring", "Al-Kharj", - "Al-Suwaidi", - "Al-Naseem", - "Al-Malaz", - "Al-Rawdah", - "Al-Hamra", - "Al-Nuzha", - "Al-Wurud", - "Al-Sahafa", - "Al-Rayyan", - "Al-Andalus", ) street_suffixes: tuple[str, ...] 
= ( diff --git a/src/dataforge/locales/ar_SA/company.py b/src/dataforge/locales/ar_SA/company.py index d63155e..f9ac85f 100644 --- a/src/dataforge/locales/ar_SA/company.py +++ b/src/dataforge/locales/ar_SA/company.py @@ -74,26 +74,6 @@ "Front-line", "Fundamental", "Horizontal", - "Implemented", - "Innovative", - "Integrated", - "Intuitive", - "Managed", - "Multi-lateral", - "Networked", - "Open-source", - "Optimized", - "Organic", - "Persistent", - "Proactive", - "Programmable", - "Progressive", - "Reactive", - "Realigned", - "Reduced", - "Robust", - "Seamless", - "Secured", ) catch_phrase_nouns: tuple[str, ...] = ( @@ -117,26 +97,6 @@ "core", "database", "definition", - "emulation", - "encoding", - "encryption", - "extranet", - "firmware", - "flexibility", - "framework", - "function", - "groupware", - "hardware", - "hierarchy", - "implementation", - "infrastructure", - "initiative", - "interface", - "intranet", - "knowledge", - "leverage", - "matrix", - "methodology", ) job_titles: tuple[str, ...] 
= ( @@ -165,19 +125,4 @@ "Chief Financial Officer", "Chief Operating Officer", "Vice President of Engineering", - "Director of Engineering", - "Engineering Manager", - "Technical Lead", - "Senior Developer", - "Junior Developer", - "Consultant", - "Solutions Architect", - "Principal Engineer", - "Staff Engineer", - "Research Scientist", - "Data Engineer", - "Platform Engineer", - "Mobile Developer", - "Embedded Systems Engineer", - "Cybersecurity Specialist", ) diff --git a/src/dataforge/locales/ar_SA/person.py b/src/dataforge/locales/ar_SA/person.py index 21d6559..19c03f6 100644 --- a/src/dataforge/locales/ar_SA/person.py +++ b/src/dataforge/locales/ar_SA/person.py @@ -80,31 +80,6 @@ "Al-Asmari", "Al-Ghanim", "Al-Zaid", - "Al-Obaid", - "Al-Musallam", - "Al-Kathiri", - "Al-Mufarrij", - "Al-Saqr", - "Al-Fehaid", - "Al-Rashed", - "Al-Masoud", - "Al-Hamad", - "Al-Fadhel", - "Al-Barrak", - "Al-Subai", - "Al-Ajmi", - "Al-Bluwi", - "Al-Muwallad", - "Al-Thumairy", - "Al-Haddad", - "Al-Dosseri", - "Al-Jasser", - "Al-Shuraim", - "Al-Ghunaim", - "Al-Zamil", - "Al-Delaim", - "Al-Baqmi", - "Al-Mane", ) male_first_names: tuple[str, ...] = ( diff --git a/src/dataforge/locales/da_DK/address.py b/src/dataforge/locales/da_DK/address.py index f36df7f..fc98353 100644 --- a/src/dataforge/locales/da_DK/address.py +++ b/src/dataforge/locales/da_DK/address.py @@ -41,16 +41,6 @@ "Ring", "Rosen", "Ryg", - "Skov", - "Slots", - "Sol", - "Stations", - "Strand", - "Syd", - "Søndre", - "Torve", - "Vester", - "Øster", ) street_suffixes: tuple[str, ...] = ( @@ -122,21 +112,6 @@ "Roskilde", "Rønne", "Silkeborg", - "Skanderborg", - "Skive", - "Slagelse", - "Solrød", - "Svendborg", - "Sønderborg", - "Thisted", - "Tønder", - "Valby", - "Vanløse", - "Vejen", - "Vejle", - "Viborg", - "Virum", - "Vordingborg", ) states: tuple[str, ...] 
= ( diff --git a/src/dataforge/locales/da_DK/person.py b/src/dataforge/locales/da_DK/person.py index 22a670e..b899261 100644 --- a/src/dataforge/locales/da_DK/person.py +++ b/src/dataforge/locales/da_DK/person.py @@ -51,57 +51,6 @@ "Lars", "Lasse", "Laura", - "Leif", - "Lene", - "Line", - "Lisa", - "Lone", - "Louise", - "Maja", - "Maria", - "Marianne", - "Marie", - "Martin", - "Mathias", - "Mette", - "Michael", - "Mikkel", - "Morten", - "Niels", - "Nikolaj", - "Nina", - "Ole", - "Oscar", - "Peder", - "Per", - "Peter", - "Poul", - "Rasmus", - "Rikke", - "Robert", - "Rune", - "Sara", - "Simon", - "Sofie", - "Steen", - "Stine", - "Susanne", - "Søren", - "Thomas", - "Tina", - "Tobias", - "Tom", - "Trine", - "Troels", - "Ulla", - "Vibeke", - "Viktor", - "Villads", - "William", - "Aase", - "Åse", - "Astrid", - "Emil", ) last_names: tuple[str, ...] = ( @@ -180,31 +129,4 @@ "Søndergaard", "Sørensen", "Thomasen", - "Thomsen", - "Toft", - "Vestergaard", - "Villadsen", - "Winther", - "Østergaard", - "Aagaard", - "Aarup", - "Bach", - "Bonde", - "Buhl", - "Byg", - "Carstensen", - "Davidsen", - "Enevoldsen", - "Falk", - "Frost", - "Gade", - "Greve", - "Hald", - "Hammer", - "Hedegaard", - "Hjorth", - "Ipsen", - "Jakobsen", - "Juhl", - "Kjær", ) diff --git a/src/dataforge/locales/de_DE/address.py b/src/dataforge/locales/de_DE/address.py index 8c44cff..69e7778 100644 --- a/src/dataforge/locales/de_DE/address.py +++ b/src/dataforge/locales/de_DE/address.py @@ -41,16 +41,6 @@ "Mond", "Stern", "Morgen", - "Abend", - "Nord", - "Süd", - "Ost", - "West", - "Berg", - "Tal", - "See", - "Fluss", - "Park", ) street_suffixes: tuple[str, ...] 
= ( diff --git a/src/dataforge/locales/de_DE/person.py b/src/dataforge/locales/de_DE/person.py index f44dc71..7597b18 100644 --- a/src/dataforge/locales/de_DE/person.py +++ b/src/dataforge/locales/de_DE/person.py @@ -51,61 +51,6 @@ "Julia", "Julian", "Jürgen", - "Karin", - "Karl", - "Katharina", - "Klaus", - "Kurt", - "Lars", - "Laura", - "Lena", - "Leon", - "Lisa", - "Lukas", - "Manfred", - "Marco", - "Maria", - "Markus", - "Martin", - "Matthias", - "Max", - "Maximilian", - "Michael", - "Monika", - "Nicole", - "Nina", - "Norbert", - "Oliver", - "Otto", - "Patrick", - "Paul", - "Peter", - "Petra", - "Philipp", - "Ralf", - "Renate", - "Robert", - "Roland", - "Sabine", - "Sandra", - "Sarah", - "Sebastian", - "Silke", - "Simon", - "Stefan", - "Stefanie", - "Susanne", - "Sven", - "Thomas", - "Tim", - "Tobias", - "Ulrich", - "Ursula", - "Uwe", - "Volker", - "Walter", - "Werner", - "Wolfgang", ) last_names: tuple[str, ...] = ( @@ -184,29 +129,4 @@ "Vogt", "Stein", "Jäger", - "Otto", - "Sommer", - "Groß", - "Seidel", - "Heinrich", - "Brandt", - "Haas", - "Schreiber", - "Graf", - "Schultz", - "Dietrich", - "Kuhn", - "Ziegler", - "Kühn", - "Pohl", - "Engel", - "Horn", - "Busch", - "Bergmann", - "Thomas", - "Voigt", - "Sauer", - "Arnold", - "Wolff", - "Pfeiffer", ) diff --git a/src/dataforge/locales/en_AU/address.py b/src/dataforge/locales/en_AU/address.py index 4b77845..97505d7 100644 --- a/src/dataforge/locales/en_AU/address.py +++ b/src/dataforge/locales/en_AU/address.py @@ -45,44 +45,6 @@ "Arthur", "Alfred", "Frederick", - "Glenmore", - "Pacific", - "Canterbury", - "Oxford", - "Anzac", - "Burns", - "Campbell", - "Cook", - "Mitchell", - "Stuart", - "Stirling", - "Hay", - "Adelaide", - "Brisbane", - "Sydney", - "Melbourne", - "Perth", - "Hobart", - "Darwin", - "Canberra", - "Darling", - "Harbour", - "Parramatta", - "Richmond", - "Windsor", - "Banksia", - "Wattle", - "Eucalyptus", - "Waratah", - "Grevillea", - "Acacia", - "Bottlebrush", - "Kurrajong", - "Cedar", - 
"Rosemary", - "Lake", - "Creek", - "River", ) street_suffixes: tuple[str, ...] = ( @@ -154,61 +116,6 @@ "Broome", "Karratha", "Katherine", - "Devonport", - "Burnie", - "Armidale", - "Goulburn", - "Broken Hill", - "Griffith", - "Murray Bridge", - "Victor Harbor", - "Port Augusta", - "Port Lincoln", - "Esperance", - "Albany", - "Northam", - "Emerald", - "Yeppoon", - "Innisfail", - "Kingaroy", - "Warwick", - "Dalby", - "Roma", - "Gympie", - "Nambour", - "Caloundra", - "Caboolture", - "Ipswich", - "Logan", - "Redcliffe", - "Redlands", - "Beaudesert", - "Lithgow", - "Cessnock", - "Singleton", - "Muswellbrook", - "Parkes", - "Forbes", - "Cowra", - "Young", - "Queanbeyan", - "Yass", - "Cooma", - "Bega", - "Moruya", - "Ulladulla", - "Wangaratta", - "Benalla", - "Seymour", - "Sale", - "Traralgon", - "Morwell", - "Wonthaggi", - "Colac", - "Horsham", - "Hamilton", - "Portland", - "Maryborough", ) states: tuple[str, ...] = ( diff --git a/src/dataforge/locales/en_AU/company.py b/src/dataforge/locales/en_AU/company.py index be35a88..4caa8aa 100644 --- a/src/dataforge/locales/en_AU/company.py +++ b/src/dataforge/locales/en_AU/company.py @@ -84,26 +84,6 @@ "Front-line", "Fundamental", "Horizontal", - "Implemented", - "Innovative", - "Integrated", - "Intuitive", - "Managed", - "Multi-lateral", - "Networked", - "Open-source", - "Optimized", - "Organic", - "Persistent", - "Proactive", - "Programmable", - "Progressive", - "Reactive", - "Realigned", - "Reduced", - "Robust", - "Seamless", - "Secured", ) catch_phrase_nouns: tuple[str, ...] = ( @@ -127,26 +107,6 @@ "core", "database", "definition", - "emulation", - "encoding", - "encryption", - "extranet", - "firmware", - "flexibility", - "framework", - "function", - "groupware", - "hardware", - "hierarchy", - "implementation", - "infrastructure", - "initiative", - "interface", - "intranet", - "knowledge", - "leverage", - "matrix", - "methodology", ) job_titles: tuple[str, ...] 
= ( @@ -175,19 +135,4 @@ "Chief Financial Officer", "Chief Operating Officer", "Vice President of Engineering", - "Director of Engineering", - "Engineering Manager", - "Technical Lead", - "Senior Developer", - "Junior Developer", - "Intern", - "Consultant", - "Solutions Architect", - "Principal Engineer", - "Staff Engineer", - "Research Scientist", - "Data Engineer", - "Platform Engineer", - "Mobile Developer", - "Embedded Systems Engineer", ) diff --git a/src/dataforge/locales/en_AU/person.py b/src/dataforge/locales/en_AU/person.py index cd8f14d..8bb18dc 100644 --- a/src/dataforge/locales/en_AU/person.py +++ b/src/dataforge/locales/en_AU/person.py @@ -55,56 +55,6 @@ "Levi", "Tyler", "Austin", - "Xavier", - "Blake", - "Campbell", - "Hayden", - "Jasper", - "Nathan", - "Declan", - "Lincoln", - "Toby", - "Zachary", - "Aiden", - "Dylan", - "Ryder", - "Harry", - "Felix", - "Nate", - "Beau", - "Jayden", - "Jordan", - "Matthew", - "Ashton", - "Eli", - "Spencer", - "Bodhi", - "Marcus", - "Bailey", - "Callum", - "Luca", - "Aaron", - "Logan", - "Jake", - "Jaxon", - "Dominic", - "Banjo", - "Cody", - "Koby", - "Ned", - "Fraser", - "Taj", - "Wesley", - "Arlo", - "Sonny", - "Rowan", - "Darren", - "Shane", - "Brody", - "Zane", - "Rhys", - "Heath", - "Angus", ) female_first_names: tuple[str, ...] 
= ( @@ -158,56 +108,6 @@ "Holly", "Aurora", "Imogen", - "Indie", - "Jasmine", - "Daisy", - "Stella", - "Eva", - "Ayla", - "Summer", - "Penelope", - "Rose", - "Quinn", - "Annabelle", - "Harriet", - "Madeleine", - "Lara", - "Bonnie", - "Claire", - "Chelsea", - "Gemma", - "Tessa", - "Sarah", - "Kiara", - "Maeve", - "Millie", - "Thea", - "Freya", - "Jessica", - "Heidi", - "Paige", - "Wren", - "Nina", - "Skye", - "Tahlia", - "Eden", - "Lola", - "Peyton", - "Marley", - "Isabelle", - "Ellie", - "Alexis", - "Piper", - "Molly", - "Maisie", - "Darcy", - "Charli", - "Olive", - "Cleo", - "Sage", - "Tilly", - "Leah", - "Addison", ) # Computed from gendered tuples — single source of truth @@ -289,82 +189,4 @@ "Graham", "Ferguson", "Cameron", - "Chapman", - "Marshall", - "Barnes", - "Sullivan", - "Fisher", - "Lawrence", - "Cole", - "Walsh", - "Grant", - "Spencer", - "Webb", - "Reid", - "Gordon", - "Pearson", - "Bailey", - "Hart", - "Burns", - "Stevens", - "O'Brien", - "Lynch", - "Byrne", - "Barker", - "Gallagher", - "Griffin", - "O'Neill", - "Dunn", - "Burton", - "Ford", - "Hayes", - "Fox", - "Armstrong", - "Payne", - "Patel", - "Day", - "Stone", - "Dawson", - "Cross", - "Doyle", - "Chan", - "Lowe", - "Carr", - "Duncan", - "Hogan", - "Daly", - "Fitzgerald", - "Kaur", - "Tran", - "Le", - "Chen", - "Singh", - "Sharma", - "Li", - "Wang", - "Liu", - "Kim", - "Khan", - "Prasad", - "Tan", - "Reeves", - "Mackenzie", - "Fraser", - "McLean", - "Sutherland", - "Crawford", - "Kerr", - "O'Connor", - "Pearce", - "Long", - "Black", - "Ross", - "Boyd", - "Price", - "Robertson", - "Maguire", - "Lawson", - "Peters", - "Dixon", - "Hooper", ) diff --git a/src/dataforge/locales/en_CA/address.py b/src/dataforge/locales/en_CA/address.py index 22863d2..fd10fc1 100644 --- a/src/dataforge/locales/en_CA/address.py +++ b/src/dataforge/locales/en_CA/address.py @@ -45,41 +45,6 @@ "Bank", "Elgin", "Metcalfe", - "Barrington", - "Spring Garden", - "Water", - "George", - "Prince", - "Regent", - "Mountain", - 
"Lakeshore", - "Riverside", - "Park", - "Garden", - "Forest", - "Meadow", - "Valley", - "Highland", - "Birch", - "Spruce", - "Willow", - "Poplar", - "Ash", - "Walnut", - "Chestnut", - "Heritage", - "Pioneer", - "Railway", - "Mill", - "Bridge", - "Lake", - "River", - "Creek", - "Hillside", - "Orchard", - "Centre", - "Station", - "Country", ) street_suffixes: tuple[str, ...] = ( @@ -151,95 +116,6 @@ "Chatham-Kent", "Cornwall", "North Bay", - "Timmins", - "Brandon", - "Moose Jaw", - "Prince Albert", - "Yorkton", - "Swift Current", - "Grande Prairie", - "Fort McMurray", - "Airdrie", - "Spruce Grove", - "Cochrane", - "Leduc", - "Lloydminster", - "Brooks", - "Whitehorse", - "Yellowknife", - "Iqaluit", - "Brampton", - "Mississauga", - "Markham", - "Vaughan", - "Richmond Hill", - "Oakville", - "Burlington", - "Ajax", - "Pickering", - "Whitby", - "Clarington", - "Newmarket", - "Aurora", - "Caledon", - "Cambridge", - "Waterloo", - "Stratford", - "Woodstock", - "St. Thomas", - "Norfolk", - "Orangeville", - "Owen Sound", - "Collingwood", - "Orillia", - "Midland", - "Cobourg", - "Lindsay", - "Pembroke", - "Brockville", - "Smiths Falls", - "Kenora", - "Dryden", - "Terrace", - "Courtenay", - "Campbell River", - "Dawson Creek", - "Cranbrook", - "Vernon", - "Penticton", - "Salmon Arm", - "Trail", - "Summerside", - "Bathurst", - "Campbellton", - "Miramichi", - "Truro", - "New Glasgow", - "Antigonish", - "Kentville", - "Yarmouth", - "Bridgewater", - "Corner Brook", - "Gander", - "Happy Valley-Goose Bay", - "Steinbach", - "Portage la Prairie", - "Selkirk", - "Thompson", - "Dauphin", - "Winkler", - "Estevan", - "Weyburn", - "Humboldt", - "Melfort", - "North Battleford", - "Wetaskiwin", - "Camrose", - "Stettler", - "Canmore", - "Banff", - "Jasper", - "Lake Louise", ) states: tuple[str, ...] 
= ( diff --git a/src/dataforge/locales/en_CA/company.py b/src/dataforge/locales/en_CA/company.py index 933c54f..c0d4b8f 100644 --- a/src/dataforge/locales/en_CA/company.py +++ b/src/dataforge/locales/en_CA/company.py @@ -84,26 +84,6 @@ "Front-line", "Fundamental", "Horizontal", - "Implemented", - "Innovative", - "Integrated", - "Intuitive", - "Managed", - "Multi-lateral", - "Networked", - "Open-source", - "Optimized", - "Organic", - "Persistent", - "Proactive", - "Programmable", - "Progressive", - "Reactive", - "Realigned", - "Reduced", - "Robust", - "Seamless", - "Secured", ) catch_phrase_nouns: tuple[str, ...] = ( @@ -127,26 +107,6 @@ "core", "database", "definition", - "emulation", - "encoding", - "encryption", - "extranet", - "firmware", - "flexibility", - "framework", - "function", - "groupware", - "hardware", - "hierarchy", - "implementation", - "infrastructure", - "initiative", - "interface", - "intranet", - "knowledge", - "leverage", - "matrix", - "methodology", ) job_titles: tuple[str, ...] 
= ( @@ -175,19 +135,4 @@ "Chief Financial Officer", "Chief Operating Officer", "Vice President of Engineering", - "Director of Engineering", - "Engineering Manager", - "Technical Lead", - "Senior Developer", - "Junior Developer", - "Intern", - "Consultant", - "Solutions Architect", - "Principal Engineer", - "Staff Engineer", - "Research Scientist", - "Data Engineer", - "Platform Engineer", - "Mobile Developer", - "Embedded Systems Engineer", ) diff --git a/src/dataforge/locales/en_CA/person.py b/src/dataforge/locales/en_CA/person.py index db6c3c2..f5f39f2 100644 --- a/src/dataforge/locales/en_CA/person.py +++ b/src/dataforge/locales/en_CA/person.py @@ -55,56 +55,6 @@ "Adrian", "Colton", "Jordan", - "Carson", - "Robert", - "Angel", - "Maverick", - "Easton", - "Cooper", - "Nolan", - "Adam", - "Xavier", - "Hudson", - "Leo", - "Asher", - "Jaxon", - "Parker", - "Christian", - "Blake", - "Landon", - "Ezra", - "Mason", - "Zachary", - "Miles", - "Harrison", - "Evan", - "Charles", - "Jonathan", - "Patrick", - "Michael", - "Aaron", - "Tyler", - "Tristan", - "Declan", - "Emmett", - "Max", - "Felix", - "Bennett", - "Jasper", - "Maxwell", - "Finn", - "Elliott", - "Gavin", - "Elliot", - "Graham", - "Reid", - "Marcus", - "Colin", - "Callum", - "Brayden", - "Luca", - "Simon", - "Damian", ) female_first_names: tuple[str, ...] 
= ( @@ -158,56 +108,6 @@ "Kennedy", "Maya", "Willow", - "Kinsley", - "Naomi", - "Sarah", - "Aaliyah", - "Allison", - "Gabriella", - "Alice", - "Madelyn", - "Sadie", - "Hailey", - "Eva", - "Emery", - "Quinn", - "Piper", - "Ruby", - "Serenity", - "Sophie", - "Kate", - "Ivy", - "Isabelle", - "Madison", - "Abigail", - "Grace", - "Victoria", - "Emily", - "Lauren", - "Gianna", - "Camila", - "Eliana", - "Avery", - "Mila", - "Aria", - "Athena", - "Mackenzie", - "Taylor", - "Elena", - "Peyton", - "Maria", - "Clara", - "Lydia", - "Jade", - "Ayla", - "Rachel", - "Katherine", - "Samantha", - "Brielle", - "Maisie", - "Isla", - "Freya", - "Fiona", ) # Computed from gendered tuples — single source of truth @@ -289,89 +189,4 @@ "Bergeron", "Ouellet", "Girard", - "Poirier", - "Fournier", - "Cloutier", - "Caron", - "Beaulieu", - "Landry", - "Boucher", - "Richard", - "Thomas", - "Nguyen", - "Patel", - "Kim", - "Li", - "Wong", - "Chan", - "Ali", - "Ahmed", - "Khan", - "Wang", - "Liu", - "Zhang", - "Wu", - "Gill", - "Grewal", - "Dhillon", - "Kaur", - "Sharma", - "Adams", - "Allen", - "Baker", - "Barnes", - "Bennett", - "Black", - "Boyd", - "Burke", - "Burns", - "Carter", - "Cooper", - "Craig", - "Crawford", - "Cunningham", - "Davidson", - "Dixon", - "Douglas", - "Edwards", - "Elliott", - "Evans", - "Fisher", - "Fleming", - "Forbes", - "Gibson", - "Gray", - "Green", - "Henderson", - "Holmes", - "Hughes", - "Hunt", - "Hunter", - "Jackson", - "James", - "Lawson", - "MacPherson", - "Mason", - "Miller", - "Mills", - "Morgan", - "Murphy", - "Nelson", - "Olson", - "Palmer", - "Parker", - "Patterson", - "Perry", - "Peters", - "Phillips", - "Price", - "Richardson", - "Russell", - "Shaw", - "Spencer", - "Sullivan", - "Turner", - "Ward", - "Webster", - "Wells", ) diff --git a/src/dataforge/locales/en_GB/address.py b/src/dataforge/locales/en_GB/address.py index 10a4b9f..b328209 100644 --- a/src/dataforge/locales/en_GB/address.py +++ b/src/dataforge/locales/en_GB/address.py @@ -45,42 +45,6 @@ 
"Ivy", "Rose", "Hawthorn", - "Acacia", - "Chestnut", - "Orchard", - "Cherry", - "Woodlands", - "Brook", - "Water", - "Riverside", - "Field", - "Hillside", - "Valley", - "Cambridge", - "Oxford", - "Clarence", - "Wellington", - "Gloucester", - "Kensington", - "Chester", - "Regent", - "Salisbury", - "Cavendish", - "Portland", - "Warwick", - "Lancaster", - "Pembroke", - "Suffolk", - "Norfolk", - "Devon", - "Cornwall", - "Dorset", - "Sutton", - "Cromwell", - "Tudor", - "Stuart", - "Fairfield", - "Lakeside", ) street_suffixes: tuple[str, ...] = ( @@ -157,58 +121,6 @@ "Stockport", "Rotherham", "Wigan", - "Milton Keynes", - "Warrington", - "Wakefield", - "Huddersfield", - "Telford", - "Slough", - "Watford", - "Gloucester", - "Worcester", - "Cheltenham", - "Colchester", - "Crawley", - "Basildon", - "Eastbourne", - "Maidstone", - "Chelmsford", - "Salford", - "Birkenhead", - "Grimsby", - "Hastings", - "Carlisle", - "Lincoln", - "Inverness", - "Perth", - "Stirling", - "Newport", - "Bangor", - "Wrexham", - "Newry", - "Lisburn", - "Londonderry", - "Hereford", - "Salisbury", - "Winchester", - "Truro", - "Wells", - "Ely", - "Ripon", - "Lichfield", - "Chichester", - "St Albans", - "Durham", - "Lancaster", - "Preston", - "Blackburn", - "Burnley", - "Halifax", - "Rochdale", - "Oldham", - "Barnsley", - "Harrogate", - "Scarborough", ) states: tuple[str, ...] = ( diff --git a/src/dataforge/locales/en_GB/company.py b/src/dataforge/locales/en_GB/company.py index b5fa881..3d2a63c 100644 --- a/src/dataforge/locales/en_GB/company.py +++ b/src/dataforge/locales/en_GB/company.py @@ -87,26 +87,6 @@ "Front-line", "Fundamental", "Horizontal", - "Implemented", - "Innovative", - "Integrated", - "Intuitive", - "Managed", - "Multi-lateral", - "Networked", - "Open-source", - "Optimised", - "Organic", - "Persistent", - "Proactive", - "Programmable", - "Progressive", - "Reactive", - "Realigned", - "Reduced", - "Robust", - "Seamless", - "Secured", ) catch_phrase_nouns: tuple[str, ...] 
= ( @@ -130,26 +110,6 @@ "core", "database", "definition", - "emulation", - "encoding", - "encryption", - "extranet", - "firmware", - "flexibility", - "framework", - "function", - "groupware", - "hardware", - "hierarchy", - "implementation", - "infrastructure", - "initiative", - "interface", - "intranet", - "knowledge", - "leverage", - "matrix", - "methodology", ) job_titles: tuple[str, ...] = ( @@ -178,19 +138,4 @@ "Solicitor", "Chief Technology Officer", "Chief Executive Officer", - "Chief Financial Officer", - "Managing Director", - "Director of Engineering", - "Engineering Manager", - "Technical Lead", - "Senior Developer", - "Junior Developer", - "Graduate Developer", - "Consultant", - "Solutions Architect", - "Principal Engineer", - "Staff Engineer", - "Research Scientist", - "Data Engineer", - "Platform Engineer", ) diff --git a/src/dataforge/locales/en_GB/person.py b/src/dataforge/locales/en_GB/person.py index ae479c9..e9057c6 100644 --- a/src/dataforge/locales/en_GB/person.py +++ b/src/dataforge/locales/en_GB/person.py @@ -55,57 +55,6 @@ "Harvey", "Luca", "Matthew", - "Tommy", - "Lewis", - "Nathan", - "Aiden", - "Jake", - "Reuben", - "Zachary", - "Elliott", - "Teddy", - "Hugo", - "David", - "Robert", - "Louie", - "Jude", - "Stanley", - "Ollie", - "Frankie", - "Albert", - "Reggie", - "Felix", - "Dexter", - "Ronnie", - "Bobby", - "Jasper", - "Louis", - "Ryan", - "Alex", - "Kai", - "Tyler", - "Leon", - "Jamie", - "Aaron", - "Omar", - "Rory", - "Patrick", - "Caleb", - "Marcus", - "Cameron", - "Blake", - "Ellis", - "Rupert", - "Barnaby", - "Alistair", - "Edmund", - "Benedict", - "Hamish", - "Angus", - "Fraser", - "Fergus", - "Ewan", - "Rhys", ) female_first_names: tuple[str, ...] 
= ( @@ -159,56 +108,6 @@ "Maya", "Mila", "Bella", - "Thea", - "Harriet", - "Georgia", - "Molly", - "Abigail", - "Elizabeth", - "Orla", - "Darcy", - "Lottie", - "Martha", - "Heidi", - "Bonnie", - "Zara", - "Annabelle", - "Amber", - "Clara", - "Esme", - "Aisha", - "Robyn", - "Holly", - "Jasmine", - "Lydia", - "Ada", - "Beatrice", - "Margot", - "Arabella", - "Sara", - "Lola", - "Rose", - "Iris", - "Aurora", - "Niamh", - "Fiona", - "Gemma", - "Isobel", - "Bethany", - "Naomi", - "Victoria", - "Katherine", - "Megan", - "Lauren", - "Pippa", - "Daphne", - "Felicity", - "Tamsin", - "Philippa", - "Georgina", - "Saoirse", - "Cerys", - "Bronwen", ) # Computed from gendered tuples — single source of truth @@ -290,83 +189,4 @@ "Powell", "Webb", "Rogers", - "Ellis", - "Marshall", - "Mason", - "Richardson", - "Hunt", - "Kennedy", - "Day", - "Holmes", - "Stone", - "Mills", - "Barnes", - "Pearson", - "Dixon", - "Spencer", - "Knight", - "Graham", - "Jenkins", - "Atkinson", - "Griffiths", - "Booth", - "Walsh", - "Hart", - "Gardner", - "Watts", - "Brooks", - "Owen", - "Hussain", - "Ali", - "Khan", - "Ahmed", - "Begum", - "Andrews", - "Armstrong", - "Ball", - "Barlow", - "Barrett", - "Bates", - "Berry", - "Bird", - "Bishop", - "Black", - "Bolton", - "Bond", - "Bradshaw", - "Brennan", - "Briggs", - "Carr", - "Chambers", - "Cole", - "Cox", - "Cross", - "Cunningham", - "Dean", - "Doyle", - "Duncan", - "Dunn", - "Field", - "Fletcher", - "Flynn", - "Gallagher", - "George", - "Gibson", - "Goddard", - "Gordon", - "Grant", - "Gregory", - "Hamilton", - "Hancock", - "Hardy", - "Harding", - "Hayward", - "Henderson", - "Hewitt", - "Hodgson", - "Holland", - "Hooper", - "Houghton", - "Howard", - "Humphreys", ) diff --git a/src/dataforge/locales/en_US/address.py b/src/dataforge/locales/en_US/address.py index 59661bc..b6fa134 100644 --- a/src/dataforge/locales/en_US/address.py +++ b/src/dataforge/locales/en_US/address.py @@ -45,41 +45,6 @@ "Magnolia", "Cypress", "Ivy", - "Laurel", - "Holly", - "Hazel", 
- "Rose", - "Lily", - "Daisy", - "Violet", - "Orchid", - "Primrose", - "Sage", - "Broad", - "Market", - "Center", - "School", - "Mill", - "Bridge", - "Academy", - "Railroad", - "Union", - "Liberty", - "Summit", - "Prospect", - "Ridge", - "Grove", - "Court", - "Heritage", - "Colonial", - "Pioneer", - "Frontier", - "Prairie", - "Canyon", - "Creek", - "Brook", - "Pond", - "Harbor", ) street_suffixes: tuple[str, ...] = ( @@ -151,56 +116,6 @@ "Arlington", "New Orleans", "Wichita", - "Cleveland", - "Bakersfield", - "Aurora", - "Anaheim", - "Honolulu", - "Santa Ana", - "Riverside", - "Corpus Christi", - "Lexington", - "Pittsburgh", - "Anchorage", - "Stockton", - "Cincinnati", - "Saint Paul", - "Greensboro", - "Toledo", - "Newark", - "Plano", - "Henderson", - "Lincoln", - "Orlando", - "Jersey City", - "Chandler", - "St. Louis", - "Buffalo", - "Madison", - "Lubbock", - "Scottsdale", - "Reno", - "Glendale", - "Gilbert", - "Winston-Salem", - "North Las Vegas", - "Norfolk", - "Chesapeake", - "Garland", - "Irving", - "Hialeah", - "Fremont", - "Boise", - "Richmond", - "Baton Rouge", - "Spokane", - "Des Moines", - "Tacoma", - "San Bernardino", - "Modesto", - "Fontana", - "Moreno Valley", - "Santa Clarita", ) states: tuple[str, ...] = ( diff --git a/src/dataforge/locales/en_US/company.py b/src/dataforge/locales/en_US/company.py index 4890db3..f1c88ad 100644 --- a/src/dataforge/locales/en_US/company.py +++ b/src/dataforge/locales/en_US/company.py @@ -82,26 +82,6 @@ "Front-line", "Fundamental", "Horizontal", - "Implemented", - "Innovative", - "Integrated", - "Intuitive", - "Managed", - "Multi-lateral", - "Networked", - "Open-source", - "Optimized", - "Organic", - "Persistent", - "Proactive", - "Programmable", - "Progressive", - "Reactive", - "Realigned", - "Reduced", - "Robust", - "Seamless", - "Secured", ) catch_phrase_nouns: tuple[str, ...] 
= ( @@ -125,26 +105,6 @@ "core", "database", "definition", - "emulation", - "encoding", - "encryption", - "extranet", - "firmware", - "flexibility", - "framework", - "function", - "groupware", - "hardware", - "hierarchy", - "implementation", - "infrastructure", - "initiative", - "interface", - "intranet", - "knowledge", - "leverage", - "matrix", - "methodology", ) job_titles: tuple[str, ...] = ( @@ -173,19 +133,4 @@ "Chief Financial Officer", "Chief Operating Officer", "Vice President of Engineering", - "Director of Engineering", - "Engineering Manager", - "Technical Lead", - "Senior Developer", - "Junior Developer", - "Intern", - "Consultant", - "Solutions Architect", - "Principal Engineer", - "Staff Engineer", - "Research Scientist", - "Data Engineer", - "Platform Engineer", - "Mobile Developer", - "Embedded Systems Engineer", ) diff --git a/src/dataforge/locales/en_US/person.py b/src/dataforge/locales/en_US/person.py index c657076..bb04e57 100644 --- a/src/dataforge/locales/en_US/person.py +++ b/src/dataforge/locales/en_US/person.py @@ -55,55 +55,6 @@ "Jack", "Dennis", "Jerry", - "Tyler", - "Aaron", - "Jose", - "Adam", - "Nathan", - "Henry", - "Peter", - "Zachary", - "Douglas", - "Arthur", - "Carl", - "Gerald", - "Roger", - "Keith", - "Lawrence", - "Terry", - "Sean", - "Albert", - "Joe", - "Christian", - "Austin", - "Willie", - "Jesse", - "Ethan", - "Billy", - "Bruce", - "Bryan", - "Ralph", - "Roy", - "Jordan", - "Eugene", - "Wayne", - "Vincent", - "Dylan", - "Alan", - "Russell", - "Louis", - "Philip", - "Bobby", - "Johnny", - "Bradley", - "Harry", - "Walter", - "Kyle", - "Gabriel", - "Noah", - "Logan", - "Elijah", - "Liam", ) female_first_names: tuple[str, ...] 
= ( @@ -157,55 +108,6 @@ "Catherine", "Maria", "Heather", - "Diane", - "Ruth", - "Julie", - "Olivia", - "Joyce", - "Virginia", - "Victoria", - "Kelly", - "Lauren", - "Christina", - "Joan", - "Evelyn", - "Judith", - "Megan", - "Andrea", - "Cheryl", - "Hannah", - "Jacqueline", - "Martha", - "Gloria", - "Teresa", - "Ann", - "Sara", - "Madison", - "Frances", - "Kathryn", - "Janice", - "Jean", - "Abigail", - "Alice", - "Judy", - "Sophia", - "Grace", - "Denise", - "Amber", - "Doris", - "Marilyn", - "Danielle", - "Beverly", - "Isabella", - "Theresa", - "Diana", - "Natalie", - "Brittany", - "Charlotte", - "Marie", - "Kayla", - "Alexis", - "Lori", ) # Computed from gendered tuples — single source of truth @@ -287,84 +189,4 @@ "Reed", "Kelly", "Howard", - "Ramos", - "Kim", - "Cox", - "Ward", - "Richardson", - "Watson", - "Brooks", - "Chavez", - "Wood", - "James", - "Bennett", - "Gray", - "Mendoza", - "Ruiz", - "Hughes", - "Price", - "Alvarez", - "Castillo", - "Sanders", - "Patel", - "Myers", - "Long", - "Ross", - "Foster", - "Jimenez", - "Powell", - "Jenkins", - "Perry", - "Russell", - "Sullivan", - "Bell", - "Coleman", - "Butler", - "Henderson", - "Barnes", - "Gonzales", - "Fisher", - "Vasquez", - "Simmons", - "Griffin", - "Marshall", - "Owens", - "Harrison", - "Dean", - "Freeman", - "Stone", - "Medina", - "Webb", - "Tucker", - "Palmer", - "Hart", - "Walsh", - "Day", - "Burns", - "Dixon", - "Hunt", - "Gordon", - "Duncan", - "Reid", - "Mcdonald", - "Knight", - "Daniels", - "Payne", - "Grant", - "Carr", - "Franklin", - "Jordan", - "Lawrence", - "Gibson", - "Mendez", - "Mills", - "Murray", - "Hamilton", - "Graham", - "Wallace", - "Woods", - "Cole", - "West", - "Reynolds", - "Ellis", ) diff --git a/src/dataforge/locales/es_ES/address.py b/src/dataforge/locales/es_ES/address.py index d7e84b6..5c2239c 100644 --- a/src/dataforge/locales/es_ES/address.py +++ b/src/dataforge/locales/es_ES/address.py @@ -41,16 +41,6 @@ "Miró", "Zurbarán", "Murillo", - "Olivo", - "Naranjo", - 
"Almendro", - "Pino", - "Encina", - "Roble", - "Sauce", - "Palmera", - "Ciprés", - "Higuera", ) street_suffixes: tuple[str, ...] = ( diff --git a/src/dataforge/locales/es_ES/person.py b/src/dataforge/locales/es_ES/person.py index 6476222..c134542 100644 --- a/src/dataforge/locales/es_ES/person.py +++ b/src/dataforge/locales/es_ES/person.py @@ -51,56 +51,6 @@ "Natalia", "Nuria", "Óscar", - "Pablo", - "Patricia", - "Paula", - "Pedro", - "Pilar", - "Rafael", - "Raúl", - "Ricardo", - "Roberto", - "Rocío", - "Rodrigo", - "Rosa", - "Rubén", - "Sandra", - "Santiago", - "Sara", - "Sergio", - "Silvia", - "Sofía", - "Teresa", - "Tomás", - "Valentina", - "Vicente", - "Víctor", - "Alejandra", - "Almudena", - "Amparo", - "Blanca", - "Clara", - "Concha", - "Dolores", - "Esperanza", - "Estrella", - "Gemma", - "Gloria", - "Irene", - "Lidia", - "Lola", - "Lorena", - "Luisa", - "Manuela", - "Mercedes", - "Miriam", - "Montserrat", - "Olga", - "Paloma", - "Raquel", - "Soledad", - "Susana", - "Verónica", ) last_names: tuple[str, ...] = ( @@ -179,29 +129,4 @@ "Rivas", "Rojas", "Santiago", - "Soler", - "Vázquez", - "Velasco", - "Vera", - "Vicente", - "Arias", - "Benítez", - "Bravo", - "Campos", - "Carmona", - "Cuesta", - "Duran", - "Espinosa", - "Fernández", - "Franco", - "Gallardo", - "Hidalgo", - "Ibáñez", - "Izquierdo", - "Lara", - "Mateo", - "Montero", - "Mora", - "Núñez", - "Parra", ) diff --git a/src/dataforge/locales/fi_FI/address.py b/src/dataforge/locales/fi_FI/address.py index 744498f..9f4bb79 100644 --- a/src/dataforge/locales/fi_FI/address.py +++ b/src/dataforge/locales/fi_FI/address.py @@ -41,16 +41,6 @@ "Ranta", "Ratina", "Ruusu", - "Saarni", - "Satama", - "Savon", - "Sipoon", - "Suvanto", - "Tammi", - "Tehtaan", - "Toivo", - "Tuomi", - "Vaahtera", ) street_suffixes: tuple[str, ...] 
= ( diff --git a/src/dataforge/locales/fi_FI/person.py b/src/dataforge/locales/fi_FI/person.py index 67c869f..cf607e5 100644 --- a/src/dataforge/locales/fi_FI/person.py +++ b/src/dataforge/locales/fi_FI/person.py @@ -51,55 +51,6 @@ "Merja", "Mika", "Mikko", - "Minna", - "Niina", - "Olli", - "Olavi", - "Oskari", - "Paavo", - "Paula", - "Pekka", - "Pentti", - "Pertti", - "Petri", - "Petteri", - "Pirjo", - "Päivi", - "Raija", - "Raimo", - "Riitta", - "Risto", - "Ritva", - "Sami", - "Sanna", - "Sari", - "Seppo", - "Simo", - "Sirpa", - "Sofia", - "Susanna", - "Tapio", - "Tarja", - "Tero", - "Tiina", - "Timo", - "Toivo", - "Tommi", - "Tuija", - "Tuomas", - "Tuula", - "Tuure", - "Unto", - "Urho", - "Väinö", - "Veera", - "Veikko", - "Veli", - "Venla", - "Vesa", - "Ville", - "Vilma", - "Virpi", ) last_names: tuple[str, ...] = ( @@ -178,28 +129,4 @@ "Pulkkinen", "Pöyhönen", "Rantala", - "Rantanen", - "Repo", - "Rinne", - "Räsänen", - "Saarinen", - "Salminen", - "Salo", - "Salonen", - "Savolainen", - "Soini", - "Suominen", - "Tarkiainen", - "Tervo", - "Tiainen", - "Toivonen", - "Tuominen", - "Turunen", - "Valtonen", - "Virtanen", - "Voutilainen", - "Väisänen", - "Väänänen", - "Ylinen", - "Ylönen", ) diff --git a/src/dataforge/locales/fr_FR/address.py b/src/dataforge/locales/fr_FR/address.py index 3e68633..862884c 100644 --- a/src/dataforge/locales/fr_FR/address.py +++ b/src/dataforge/locales/fr_FR/address.py @@ -41,16 +41,6 @@ "Stendhal", "Maupassant", "Proust", - "Colette", - "George Sand", - "Simone de Beauvoir", - "Marie Curie", - "Vercingétorix", - "Charlemagne", - "Henri IV", - "Louis XIV", - "Napoléon", - "Jeanne d'Arc", ) street_suffixes: tuple[str, ...] 
= ( diff --git a/src/dataforge/locales/fr_FR/person.py b/src/dataforge/locales/fr_FR/person.py index b9298a1..cbd0db8 100644 --- a/src/dataforge/locales/fr_FR/person.py +++ b/src/dataforge/locales/fr_FR/person.py @@ -51,61 +51,6 @@ "Georges", "Gérard", "Guillaume", - "Guy", - "Hélène", - "Henri", - "Hugo", - "Isabelle", - "Jacques", - "Jean", - "Jeanne", - "Julien", - "Juliette", - "Laurent", - "Léa", - "Louis", - "Louise", - "Luc", - "Lucas", - "Lucie", - "Madeleine", - "Manon", - "Marc", - "Marcel", - "Marguerite", - "Marie", - "Martine", - "Mathieu", - "Michel", - "Monique", - "Nathalie", - "Nicolas", - "Nicole", - "Olivier", - "Pascal", - "Patrick", - "Paul", - "Philippe", - "Pierre", - "Raymond", - "René", - "Robert", - "Roger", - "Roland", - "Sandrine", - "Sarah", - "Sophie", - "Stéphane", - "Sylvie", - "Thierry", - "Thomas", - "Valérie", - "Vincent", - "Virginie", - "Xavier", - "Yves", - "Yvette", - "Zoé", ) last_names: tuple[str, ...] = ( @@ -184,29 +129,4 @@ "Marie", "Barbier", "Brun", - "Dumas", - "Brunet", - "Schmitt", - "Leroux", - "Colin", - "Fernandez", - "Pierre", - "Renard", - "Arnaud", - "Rolland", - "Caron", - "Aubert", - "Giraud", - "Leclerc", - "Vidal", - "Bourgeois", - "Renaud", - "Lemoine", - "Picard", - "Gaillard", - "Philippe", - "Leclercq", - "Lacroix", - "Fabre", - "Dupuis", ) diff --git a/src/dataforge/locales/hi_IN/address.py b/src/dataforge/locales/hi_IN/address.py index adfde20..62e4bd5 100644 --- a/src/dataforge/locales/hi_IN/address.py +++ b/src/dataforge/locales/hi_IN/address.py @@ -45,16 +45,6 @@ "Old City Road", "New Market Road", "Bazaar Road", - "Temple Road", - "Lake Road", - "Park Road", - "Sector Road", - "Phase Road", - "Industrial Area Road", - "University Road", - "Hospital Road", - "Court Road", - "Bypass Road", ) street_suffixes: tuple[str, ...] 
= ( diff --git a/src/dataforge/locales/hi_IN/company.py b/src/dataforge/locales/hi_IN/company.py index 67e5225..bd736c5 100644 --- a/src/dataforge/locales/hi_IN/company.py +++ b/src/dataforge/locales/hi_IN/company.py @@ -82,26 +82,6 @@ "Front-line", "Fundamental", "Horizontal", - "Implemented", - "Innovative", - "Integrated", - "Intuitive", - "Managed", - "Multi-lateral", - "Networked", - "Open-source", - "Optimized", - "Organic", - "Persistent", - "Proactive", - "Programmable", - "Progressive", - "Reactive", - "Realigned", - "Reduced", - "Robust", - "Seamless", - "Secured", ) catch_phrase_nouns: tuple[str, ...] = ( @@ -125,26 +105,6 @@ "core", "database", "definition", - "emulation", - "encoding", - "encryption", - "extranet", - "firmware", - "flexibility", - "framework", - "function", - "groupware", - "hardware", - "hierarchy", - "implementation", - "infrastructure", - "initiative", - "interface", - "intranet", - "knowledge", - "leverage", - "matrix", - "methodology", ) job_titles: tuple[str, ...] 
= ( @@ -173,19 +133,4 @@ "Chief Financial Officer", "Chief Operating Officer", "Vice President of Engineering", - "Director of Engineering", - "Engineering Manager", - "Technical Lead", - "Senior Developer", - "Junior Developer", - "Chartered Accountant", - "Company Secretary", - "Solutions Architect", - "Principal Engineer", - "Staff Engineer", - "Research Scientist", - "Data Engineer", - "Platform Engineer", - "Mobile Developer", - "Embedded Systems Engineer", ) diff --git a/src/dataforge/locales/hi_IN/person.py b/src/dataforge/locales/hi_IN/person.py index 8d75496..50a87ab 100644 --- a/src/dataforge/locales/hi_IN/person.py +++ b/src/dataforge/locales/hi_IN/person.py @@ -80,31 +80,6 @@ "Pawar", "More", "Ghosh", - "Mukherjee", - "Chatterjee", - "Banerjee", - "Das", - "Bose", - "Sen", - "Roy", - "Dutta", - "Nath", - "Prasad", - "Sinha", - "Jha", - "Thakkar", - "Raghavan", - "Krishnamurthy", - "Subramaniam", - "Naidu", - "Rajan", - "Hegde", - "Shetty", - "Rao", - "Pillai", - "Choudhury", - "Dixit", - "Mathur", ) male_first_names: tuple[str, ...] 
= ( @@ -211,7 +186,6 @@ "Preeti", "Garima", "Simran", - "Aparna", ) # Computed from gendered tuples — single source of truth diff --git a/src/dataforge/locales/it_IT/company.py b/src/dataforge/locales/it_IT/company.py index 13b0557..fdf3945 100644 --- a/src/dataforge/locales/it_IT/company.py +++ b/src/dataforge/locales/it_IT/company.py @@ -133,19 +133,4 @@ "Direttore Operativo", "Responsabile Ingegneria", "Capo Progetto", - "Sviluppatore Senior", - "Sviluppatore Junior", - "Stagista", - "Consulente", - "Architetto Soluzioni", - "Ingegnere Principale", - "Ricercatore", - "Ingegnere Dati", - "Ingegnere Piattaforma", - "Sviluppatore Mobile", - "Ingegnere Sistemi Embedded", - "Analista di Sistemi", - "Coordinatore IT", - "Tecnico Supporto", - "Responsabile IT", ) diff --git a/src/dataforge/locales/it_IT/person.py b/src/dataforge/locales/it_IT/person.py index 53e3025..560024b 100644 --- a/src/dataforge/locales/it_IT/person.py +++ b/src/dataforge/locales/it_IT/person.py @@ -51,56 +51,6 @@ "Serena", "Fabio", "Manuela", - "Carlo", - "Michela", - "Vincenzo", - "Patrizia", - "Enrico", - "Teresa", - "Filippo", - "Raffaella", - "Pietro", - "Veronica", - "Salvatore", - "Sabrina", - "Tommaso", - "Lucia", - "Giacomo", - "Angela", - "Leonardo", - "Rosa", - "Michele", - "Emanuela", - "Diego", - "Giovanna", - "Sergio", - "Beatrice", - "Maurizio", - "Irene", - "Mario", - "Sofia", - "Gianluca", - "Alice", - "Edoardo", - "Aurora", - "Cristiano", - "Greta", - "Bruno", - "Arianna", - "Angelo", - "Noemi", - "Fabrizio", - "Eleonora", - "Enzo", - "Giorgia", - "Giorgio", - "Marta", - "Luigi", - "Caterina", - "Raffaele", - "Margherita", - "Piero", - "Carlotta", ) last_names: tuple[str, ...] 
= ( @@ -179,9 +129,4 @@ "Damico", "Pagano", "Benedetti", - "Rossetti", - "Neri", - "Fiore", - "Donati", - "Sartori", ) diff --git a/src/dataforge/locales/ja_JP/address.py b/src/dataforge/locales/ja_JP/address.py index 36bc068..545451d 100644 --- a/src/dataforge/locales/ja_JP/address.py +++ b/src/dataforge/locales/ja_JP/address.py @@ -41,16 +41,6 @@ "鶯谷", "入谷", "三ノ輪", - "南千住", - "北千住", - "綾瀬", - "亀有", - "金町", - "松戸", - "柏", - "船橋", - "千葉", - "大宮", ) street_suffixes: tuple[str, ...] = ( diff --git a/src/dataforge/locales/ja_JP/person.py b/src/dataforge/locales/ja_JP/person.py index f2fe433..b0fe6b6 100644 --- a/src/dataforge/locales/ja_JP/person.py +++ b/src/dataforge/locales/ja_JP/person.py @@ -51,56 +51,6 @@ "美月", "大和", "心春", - "翔", - "七海", - "颯真", - "彩花", - "律", - "美羽", - "陸", - "結菜", - "新", - "詩織", - "蒼", - "楓", - "瑛太", - "美優", - "奏太", - "花", - "悠斗", - "桃花", - "陽向", - "琴音", - "駿", - "千尋", - "健", - "亜美", - "亮", - "舞", - "渉", - "麻衣", - "勇気", - "理恵", - "光", - "沙織", - "海斗", - "瞳", - "涼太", - "香織", - "優太", - "奈々", - "慶太", - "友美", - "龍之介", - "千夏", - "晴", - "美穂", - "聡", - "真理", - "岳", - "綾", - "純", - "菜々子", ) last_names: tuple[str, ...] 
= ( @@ -179,29 +129,4 @@ "大野", "高田", "丸山", - "今井", - "河野", - "藤本", - "村田", - "武田", - "上野", - "杉山", - "増田", - "平野", - "大塚", - "千葉", - "久保", - "松井", - "岩崎", - "木下", - "野口", - "菊地", - "佐野", - "野村", - "新井", - "渡部", - "桜井", - "市川", - "望月", - "小松", ) diff --git a/src/dataforge/locales/ko_KR/company.py b/src/dataforge/locales/ko_KR/company.py index 2880f1b..fc21d7a 100644 --- a/src/dataforge/locales/ko_KR/company.py +++ b/src/dataforge/locales/ko_KR/company.py @@ -133,19 +133,4 @@ "최고운영책임자", "엔지니어링 부사장", "엔지니어링 매니저", - "기술 리드", - "시니어 개발자", - "주니어 개발자", - "인턴", - "컨설턴트", - "솔루션 아키텍트", - "수석 엔지니어", - "연구원", - "데이터 엔지니어", - "플랫폼 엔지니어", - "모바일 개발자", - "임베디드 시스템 엔지니어", - "시스템 분석가", - "IT 코디네이터", - "기술 지원", ) diff --git a/src/dataforge/locales/ko_KR/person.py b/src/dataforge/locales/ko_KR/person.py index ba81780..ac1864a 100644 --- a/src/dataforge/locales/ko_KR/person.py +++ b/src/dataforge/locales/ko_KR/person.py @@ -51,56 +51,6 @@ "나윤", "연우", "채은", - "준우", - "하영", - "지훈", - "서현", - "승민", - "지원", - "윤호", - "예나", - "태민", - "수연", - "동현", - "민지", - "성현", - "소희", - "재민", - "예지", - "수현", - "은지", - "영호", - "혜진", - "준혁", - "미영", - "상우", - "정은", - "민호", - "수진", - "진우", - "지혜", - "태현", - "영은", - "기현", - "미선", - "정민", - "은미", - "재현", - "경미", - "현준", - "은정", - "병준", - "선영", - "성민", - "지영", - "대현", - "진영", - "상현", - "미경", - "경환", - "정희", - "재훈", - "미숙", ) last_names: tuple[str, ...] = ( diff --git a/src/dataforge/locales/nb_NO/address.py b/src/dataforge/locales/nb_NO/address.py index 936b11c..c042cbb 100644 --- a/src/dataforge/locales/nb_NO/address.py +++ b/src/dataforge/locales/nb_NO/address.py @@ -45,16 +45,6 @@ "Edvard Griegs", "Haakon VIIs", "Olav Vs", - "Parkveien", - "Bogstad", - "Smestad", - "Majorstu", - "Frogner", - "Grünerløkka", - "Tøyen", - "Sagene", - "Nydalen", - "Ullevål", ) street_suffixes: tuple[str, ...] 
= ( diff --git a/src/dataforge/locales/nb_NO/company.py b/src/dataforge/locales/nb_NO/company.py index b43e4b7..8936ab4 100644 --- a/src/dataforge/locales/nb_NO/company.py +++ b/src/dataforge/locales/nb_NO/company.py @@ -129,9 +129,4 @@ "Kvalitetsleder", "Forsker", "Analytiker", - "Kommunikasjonsrådgiver", - "HR-sjef", - "Driftsingeniør", - "Byggeleder", - "Sivilingeniør", ) diff --git a/src/dataforge/locales/nb_NO/person.py b/src/dataforge/locales/nb_NO/person.py index 57ed516..3422923 100644 --- a/src/dataforge/locales/nb_NO/person.py +++ b/src/dataforge/locales/nb_NO/person.py @@ -189,29 +189,4 @@ "Fjeld", "Thorsen", "Stenberg", - "Myklebust", - "Torgersen", - "Hovland", - "Sæther", - "Engen", - "Wold", - "Sveen", - "Aasen", - "Brenden", - "Dale", - "Røe", - "Haug", - "Lindstad", - "Vestby", - "Tvedt", - "Nesse", - "Askim", - "Foss", - "Evensen", - "Bråten", - "Holmen", - "Solli", - "Grønli", - "Berger", - "Ødegaard", ) diff --git a/src/dataforge/locales/nl_NL/address.py b/src/dataforge/locales/nl_NL/address.py index 7896999..c0d655a 100644 --- a/src/dataforge/locales/nl_NL/address.py +++ b/src/dataforge/locales/nl_NL/address.py @@ -45,16 +45,6 @@ "Zand", "Water", "Vijver", - "Polder", - "Moor", - "Duinen", - "Breedte", - "Lange", - "Korte", - "Noord", - "Zuid", - "Oost", - "West", ) street_suffixes: tuple[str, ...] = ( diff --git a/src/dataforge/locales/nl_NL/company.py b/src/dataforge/locales/nl_NL/company.py index 795ecc2..dd07379 100644 --- a/src/dataforge/locales/nl_NL/company.py +++ b/src/dataforge/locales/nl_NL/company.py @@ -77,22 +77,6 @@ "Horizontaal", "Geïmplementeerd", "Innovatief", - "Geïntegreerd", - "Intuïtief", - "Beheerd", - "Multilateraal", - "Genetwerkt", - "Opensource", - "Geoptimaliseerd", - "Organisch", - "Persistent", - "Proactief", - "Programmeerbaar", - "Progressief", - "Reactief", - "Robuust", - "Naadloos", - "Beveiligd", ) catch_phrase_nouns: tuple[str, ...] 
= ( @@ -116,26 +100,6 @@ "kern", "database", "definitie", - "emulatie", - "codering", - "encryptie", - "extranet", - "firmware", - "flexibiliteit", - "raamwerk", - "functie", - "groepssoftware", - "hardware", - "hiërarchie", - "implementatie", - "infrastructuur", - "initiatief", - "interface", - "intranet", - "kennis", - "hefboom", - "matrix", - "methodologie", ) job_titles: tuple[str, ...] = ( @@ -164,19 +128,4 @@ "Chief Financial Officer", "Chief Operating Officer", "Directeur", - "Afdelingshoofd", - "Teamleider", - "Technisch Lead", - "Senior Ontwikkelaar", - "Junior Ontwikkelaar", - "Stagiair", - "Consultant", - "Solutions Architect", - "Principal Engineer", - "Onderzoeker", - "Data Engineer", - "Platform Engineer", - "Mobiele Ontwikkelaar", - "Embedded Systems Engineer", - "Accountmanager", ) diff --git a/src/dataforge/locales/nl_NL/person.py b/src/dataforge/locales/nl_NL/person.py index fa961fd..b3075d6 100644 --- a/src/dataforge/locales/nl_NL/person.py +++ b/src/dataforge/locales/nl_NL/person.py @@ -80,32 +80,6 @@ "Van der Steen", "Wolters", "Sanders", - "Van der Velden", - "Timmermans", - "Gerritsen", - "Biemans", - "Van Dongen", - "Bosch", - "Van Rijn", - "Van der Laan", - "Mol", - "Schipper", - "Klomp", - "Evers", - "Van Houten", - "Jonker", - "Van Kempen", - "Kuijper", - "Berends", - "Hofman", - "Klein", - "Vogels", - "Bergman", - "Van der Horst", - "Claassen", - "Tak", - "Aarts", - "Lam", ) male_first_names: tuple[str, ...] = ( diff --git a/src/dataforge/locales/pl_PL/address.py b/src/dataforge/locales/pl_PL/address.py index bdf9505..a1fab9e 100644 --- a/src/dataforge/locales/pl_PL/address.py +++ b/src/dataforge/locales/pl_PL/address.py @@ -45,16 +45,6 @@ "Dębowa", "Jodłowa", "Akacjowa", - "Wrzosowa", - "Konopnickiej", - "Prusa", - "Orzeszkowej", - "Reymonta", - "Matejki", - "Bema", - "Dąbrowskiego", - "Sobieskiego", - "Głowackiego", ) street_suffixes: tuple[str, ...] 
= ("",) diff --git a/src/dataforge/locales/pl_PL/company.py b/src/dataforge/locales/pl_PL/company.py index 9141e6a..c4e6d33 100644 --- a/src/dataforge/locales/pl_PL/company.py +++ b/src/dataforge/locales/pl_PL/company.py @@ -79,21 +79,6 @@ "Horyzontalny", "Innowacyjny", "Zintegrowany", - "Intuicyjny", - "Zarządzany", - "Wielostronny", - "Sieciowy", - "Otwarty", - "Zoptymalizowany", - "Organiczny", - "Trwały", - "Proaktywny", - "Programowalny", - "Progresywny", - "Reaktywny", - "Solidny", - "Bezproblemowy", - "Zabezpieczony", ) catch_phrase_nouns: tuple[str, ...] = ( @@ -117,26 +102,6 @@ "rdzeń", "baza danych", "definicja", - "emulacja", - "kodowanie", - "szyfrowanie", - "ekstranet", - "oprogramowanie", - "elastyczność", - "framework", - "funkcja", - "sprzęt", - "hierarchia", - "implementacja", - "infrastruktura", - "inicjatywa", - "interfejs", - "intranet", - "wiedza", - "matryca", - "metodologia", - "platforma", - "protokół", ) job_titles: tuple[str, ...] = ( @@ -165,19 +130,4 @@ "Dyrektor operacyjny", "Wiceprezes ds. inżynierii", "Kierownik działu IT", - "Starszy programista", - "Młodszy programista", - "Stażysta", - "Konsultant", - "Architekt rozwiązań", - "Główny inżynier", - "Naukowiec ds. badań", - "Inżynier danych", - "Inżynier platformy", - "Programista mobilny", - "Specjalista ds. cyberbezpieczeństwa", - "Tester oprogramowania", - "Lider techniczny", - "Koordynator IT", - "Specjalista ds. 
wsparcia", ) diff --git a/src/dataforge/locales/pl_PL/person.py b/src/dataforge/locales/pl_PL/person.py index 06e775e..f5827de 100644 --- a/src/dataforge/locales/pl_PL/person.py +++ b/src/dataforge/locales/pl_PL/person.py @@ -80,31 +80,6 @@ "Wasilewski", "Sobczak", "Czerwiński", - "Andrzejewski", - "Cieślak", - "Głowacki", - "Zakrzewski", - "Kołodziej", - "Sikorski", - "Krajewski", - "Gajewski", - "Szymczak", - "Kozak", - "Laskowski", - "Brzeziński", - "Wawrzyniak", - "Markiewicz", - "Kaczmarek", - "Baranowski", - "Przybylski", - "Kurek", - "Filipiak", - "Kowalewski", - "Domański", - "Krupa", - "Kacprzak", - "Mróz", - "Tomczak", ) male_first_names: tuple[str, ...] = ( diff --git a/src/dataforge/locales/pt_BR/company.py b/src/dataforge/locales/pt_BR/company.py index 2f0d9db..d0c7b01 100644 --- a/src/dataforge/locales/pt_BR/company.py +++ b/src/dataforge/locales/pt_BR/company.py @@ -133,19 +133,4 @@ "Diretor de Operações", "Vice-Presidente de Engenharia", "Gerente de Engenharia", - "Líder Técnico", - "Desenvolvedor Sênior", - "Desenvolvedor Júnior", - "Estagiário", - "Consultor", - "Arquiteto de Soluções", - "Engenheiro Principal", - "Cientista de Pesquisa", - "Engenheiro de Dados", - "Engenheiro de Plataforma", - "Desenvolvedor Mobile", - "Engenheiro de Sistemas Embarcados", - "Analista de Sistemas", - "Coordenador de TI", - "Técnico de Suporte", ) diff --git a/src/dataforge/locales/pt_BR/person.py b/src/dataforge/locales/pt_BR/person.py index 6f9d3b2..3cbdd4e 100644 --- a/src/dataforge/locales/pt_BR/person.py +++ b/src/dataforge/locales/pt_BR/person.py @@ -51,56 +51,6 @@ "Renata", "Guilherme", "Adriana", - "Henrique", - "Luciana", - "Leandro", - "Cristiane", - "Marcelo", - "Fabiana", - "Tiago", - "Michele", - "Hugo", - "Raquel", - "Renan", - "Érica", - "Igor", - "Bianca", - "Fábio", - "Débora", - "Luiz", - "Sandra", - "Antônio", - "Simone", - "Francisco", - "Cláudia", - "Raimundo", - "Márcia", - "Sérgio", - "Rita", - "Vicente", - "Lúcia", - "Rogério", - "Vera", - 
"Alexandre", - "Elaine", - "Márcio", - "Sônia", - "Jorge", - "Rosa", - "Nilson", - "Célia", - "Valdir", - "Terezinha", - "Edson", - "Francisca", - "Claudio", - "Aparecida", - "Roberto", - "Marta", - "Manoel", - "Sueli", - "Geraldo", - "Ivone", ) last_names: tuple[str, ...] = ( @@ -179,9 +129,4 @@ "Leite", "Passos", "Resende", - "Porto", - "Vasconcelos", - "Guerra", - "Magalhães", - "Assis", ) diff --git a/src/dataforge/locales/ru_RU/address.py b/src/dataforge/locales/ru_RU/address.py index 65cfdca..5bc9106 100644 --- a/src/dataforge/locales/ru_RU/address.py +++ b/src/dataforge/locales/ru_RU/address.py @@ -45,16 +45,6 @@ "Vostochnaya", "Zapadnaya", "Severnaya", - "Yuzhnaya", - "Parkovaya", - "Stroiteley", - "Moskovskaya", - "Peterburgskaya", - "Kommunisticheskaya", - "Internatsionalnaya", - "Revolyutsii", - "Truda", - "Mironova", ) street_suffixes: tuple[str, ...] = ( diff --git a/src/dataforge/locales/ru_RU/company.py b/src/dataforge/locales/ru_RU/company.py index 3c79aa6..bab4cc7 100644 --- a/src/dataforge/locales/ru_RU/company.py +++ b/src/dataforge/locales/ru_RU/company.py @@ -78,20 +78,6 @@ "Krossplatformennyy", "Masshtabiruyemyy", "Mnogourovnevyy", - "Nepreryvnyy", - "Optimizirovannyy", - "Organicheskiy", - "Proaktivnyy", - "Programmiruemyy", - "Progressivnyy", - "Reaktivnyy", - "Robustnyy", - "Setevoy", - "Sinkhronizirovannyy", - "Universalnyy", - "Ustoychivy", - "Virtualizirovannyy", - "Vysokoproizvoditelnyy", ) catch_phrase_nouns: tuple[str, ...] = ( @@ -115,18 +101,6 @@ "produktivnost", "protokol", "resheniye", - "resurs", - "set", - "sistema", - "strategiya", - "struktura", - "tekhnologiya", - "upravleniye", - "uslugi", - "vzaimodeystviye", - "funktsionalnost", - "effektivnost", - "ekosistema", ) job_titles: tuple[str, ...] 
= ( @@ -155,14 +129,4 @@ "Tekhnicheskiy direktor", "Operatsionnyy direktor", "Direktor po razrabotke", - "Rukovoditel otdela", - "Tekhnicheskiy lead", - "Starshiy razrabotchik", - "Stazhyor", - "Konsultant", - "Arkhitektor resheniy", - "Glavnyy inzhener", - "Inzhener dannykh", - "Mobil'nyy razrabotchik", - "Issledovatel", ) diff --git a/src/dataforge/locales/ru_RU/person.py b/src/dataforge/locales/ru_RU/person.py index 28580ef..4e213cd 100644 --- a/src/dataforge/locales/ru_RU/person.py +++ b/src/dataforge/locales/ru_RU/person.py @@ -80,32 +80,6 @@ "Kazakov", "Afanasyev", "Danilov", - "Kononov", - "Lobanov", - "Lukin", - "Suslov", - "Kalinin", - "Lazarev", - "Naumov", - "Yefimov", - "Ershov", - "Shubin", - "Krasilnikov", - "Zimin", - "Ponomarev", - "Voronov", - "Savelev", - "Konstantinov", - "Shcherbakov", - "Kudryavtsev", - "Kalashnikov", - "Loginov", - "Sukhorukov", - "Golubev", - "Kolesnikov", - "Gorbunov", - "Burov", - "Tsvetkov", ) male_first_names: tuple[str, ...] = ( diff --git a/src/dataforge/locales/sv_SE/address.py b/src/dataforge/locales/sv_SE/address.py index 644f75d..0b3ebcb 100644 --- a/src/dataforge/locales/sv_SE/address.py +++ b/src/dataforge/locales/sv_SE/address.py @@ -41,16 +41,6 @@ "Tall", "Torg", "Trädgårds", - "Tull", - "Upp", - "Vallmo", - "Vatten", - "Vinter", - "Väder", - "Västra", - "Ängs", - "Östra", - "Övre", ) street_suffixes: tuple[str, ...] 
= ( diff --git a/src/dataforge/locales/sv_SE/person.py b/src/dataforge/locales/sv_SE/person.py index 502e752..aa356e5 100644 --- a/src/dataforge/locales/sv_SE/person.py +++ b/src/dataforge/locales/sv_SE/person.py @@ -51,57 +51,6 @@ "Jessica", "Johan", "Johanna", - "Jonas", - "Julia", - "Karin", - "Karl", - "Katarina", - "Kristina", - "Lars", - "Lena", - "Lennart", - "Linda", - "Linus", - "Lisa", - "Louise", - "Lovisa", - "Lucas", - "Malin", - "Marcus", - "Margareta", - "Maria", - "Marie", - "Martin", - "Matilda", - "Mattias", - "Mikael", - "Moa", - "Monica", - "Nils", - "Olivia", - "Oscar", - "Patrik", - "Per", - "Peter", - "Robert", - "Roland", - "Rolf", - "Sandra", - "Sara", - "Sebastian", - "Sigrid", - "Sofia", - "Stefan", - "Stig", - "Susanne", - "Sven", - "Thomas", - "Tobias", - "Ulla", - "Ulrika", - "Viktor", - "Wilma", - "Åsa", ) last_names: tuple[str, ...] = ( @@ -180,16 +129,4 @@ "Strömberg", "Sundberg", "Sundqvist", - "Sundström", - "Svensson", - "Söderberg", - "Söderström", - "Viklund", - "Vikström", - "Wallin", - "Wikström", - "Åberg", - "Åkesson", - "Åström", - "Öberg", ) diff --git a/src/dataforge/locales/tr_TR/address.py b/src/dataforge/locales/tr_TR/address.py index cac1e98..1ae5000 100644 --- a/src/dataforge/locales/tr_TR/address.py +++ b/src/dataforge/locales/tr_TR/address.py @@ -41,16 +41,6 @@ "Nergis", "Karanfil", "Yasemin", - "Barış", - "Özgürlük", - "Demokrasi", - "Adalet", - "Birlik", - "Dostluk", - "Kardeşlik", - "Anadolu", - "Trakya", - "Kıbrıs", ) street_suffixes: tuple[str, ...] 
= ( diff --git a/src/dataforge/locales/tr_TR/person.py b/src/dataforge/locales/tr_TR/person.py index 368eb53..2ab17cf 100644 --- a/src/dataforge/locales/tr_TR/person.py +++ b/src/dataforge/locales/tr_TR/person.py @@ -51,56 +51,6 @@ "Metin", "Murat", "Mustafa", - "Nalan", - "Nazlı", - "Neslihan", - "Nihal", - "Nur", - "Nurgül", - "Nurhan", - "Oğuz", - "Onur", - "Osman", - "Ömer", - "Özge", - "Özlem", - "Pelin", - "Pınar", - "Ramazan", - "Recep", - "Rüya", - "Seda", - "Seher", - "Selin", - "Selma", - "Semra", - "Sercan", - "Serhat", - "Serpil", - "Sevgi", - "Sevim", - "Sibel", - "Soner", - "Suat", - "Sultan", - "Şerife", - "Şükrü", - "Taner", - "Tolga", - "Tuba", - "Tuncay", - "Tuğba", - "Ufuk", - "Uğur", - "Ümit", - "Vildan", - "Volkan", - "Yasemin", - "Yasin", - "Yıldız", - "Yusuf", - "Zehra", - "Zeynep", ) last_names: tuple[str, ...] = ( @@ -179,29 +129,4 @@ "Bayram", "Elmas", "Erdem", - "Altın", - "Uysal", - "Karadağ", - "Güngör", - "Yavuz", - "Özçelik", - "Ulusoy", - "Soytürk", - "Parlak", - "Çağlar", - "Yüksel", - "Işık", - "Dağ", - "Bingöl", - "Tuncer", - "Soylu", - "Karataş", - "Bilgin", - "Gürbüz", - "Aydoğan", - "Canpolat", - "Doğru", - "İnan", - "Bakır", - "Kahraman", ) diff --git a/src/dataforge/locales/zh_CN/company.py b/src/dataforge/locales/zh_CN/company.py index 29454c3..1b58675 100644 --- a/src/dataforge/locales/zh_CN/company.py +++ b/src/dataforge/locales/zh_CN/company.py @@ -133,19 +133,4 @@ "首席运营官", "工程副总裁", "工程总监", - "技术负责人", - "高级开发工程师", - "初级开发工程师", - "实习生", - "顾问", - "解决方案架构师", - "首席工程师", - "研究员", - "数据工程师", - "平台工程师", - "移动开发工程师", - "嵌入式系统工程师", - "系统分析师", - "IT协调员", - "技术支持", ) diff --git a/src/dataforge/locales/zh_CN/person.py b/src/dataforge/locales/zh_CN/person.py index 801d267..d2e7f8f 100644 --- a/src/dataforge/locales/zh_CN/person.py +++ b/src/dataforge/locales/zh_CN/person.py @@ -51,56 +51,6 @@ "俊", "倩", "健", - "秀荣", - "新华", - "丹", - "雪", - "玲", - "瑞", - "畅", - "宁", - "欣", - "辉", - "佳", - "嘉", - "博", - "雅", - "思", - "晨", - "宇", - "泽", 
- "子涵", - "欣怡", - "梓涵", - "子轩", - "浩宇", - "浩然", - "诗涵", - "雨涵", - "梦琪", - "雨萱", - "一诺", - "子墨", - "可馨", - "语嫣", - "若曦", - "紫萱", - "梦洁", - "雅琴", - "静怡", - "雪儿", - "思琪", - "美琳", - "瑶", - "琳", - "雯", - "睿", - "晗", - "昊", - "煜", - "皓", - "天佑", - "梓豪", ) last_names: tuple[str, ...] = ( diff --git a/src/dataforge/openapi.py b/src/dataforge/openapi.py index 4fe79e6..0e1a2b6 100644 --- a/src/dataforge/openapi.py +++ b/src/dataforge/openapi.py @@ -1,34 +1,4 @@ -"""OpenAPI / JSON Schema import — generate fake data from API specs. - -Parses OpenAPI 3.x and JSON Schema documents, resolves ``$ref`` -references, maps types and formats to DataForge providers, and -creates Schema objects that generate conforming data. - -Usage:: - - from dataforge import DataForge - from dataforge.openapi import OpenAPIParser - - forge = DataForge(seed=42) - parser = OpenAPIParser(forge) - - # From an OpenAPI spec file - schemas = parser.from_file("openapi.yaml") - - # Generate data for a specific schema - rows = schemas["User"].generate(count=100) - - # From a JSON Schema - schema = parser.from_json_schema({ - "type": "object", - "properties": { - "name": {"type": "string"}, - "email": {"type": "string", "format": "email"}, - "age": {"type": "integer", "minimum": 18, "maximum": 99}, - } - }) - rows = schema.generate(count=50) -""" +"""OpenAPI / JSON Schema import — generate fake data from API specs.""" from __future__ import annotations @@ -37,12 +7,7 @@ if TYPE_CHECKING: from dataforge.core import DataForge -# ------------------------------------------------------------------ -# Type mapping: (JSON Schema type, format) → DataForge field -# ------------------------------------------------------------------ - _TYPE_FORMAT_MAP: dict[tuple[str, str | None], str] = { - # String formats ("string", "email"): "email", ("string", "uri"): "url", ("string", "url"): "url", @@ -57,20 +22,16 @@ ("string", "password"): "crypto.sha256", ("string", "byte"): "misc.uuid4", ("string", "binary"): "misc.uuid4", - # 
String without format → contextual - ("string", None): None, # resolved by property name - # Numbers + ("string", None): None, ("integer", None): None, ("integer", "int32"): None, ("integer", "int64"): None, ("number", None): None, ("number", "float"): None, ("number", "double"): None, - # Boolean ("boolean", None): "boolean", } -# Property name → DataForge field (for unformatted strings/integers) _PROPERTY_NAME_MAP: dict[str, str] = { "name": "full_name", "first_name": "first_name", @@ -98,13 +59,7 @@ class OpenAPIParser: - """Parse OpenAPI and JSON Schema documents into DataForge Schemas. - - Parameters - ---------- - forge : DataForge - The DataForge instance for creating schemas. - """ + """Parse OpenAPI and JSON Schema documents into DataForge Schemas.""" __slots__ = ("_forge", "_ref_cache") @@ -113,18 +68,7 @@ def __init__(self, forge: DataForge) -> None: self._ref_cache: dict[str, Any] = {} def from_file(self, path: str) -> dict[str, Any]: - """Parse an OpenAPI spec file and return schemas. - - Parameters - ---------- - path : str - Path to the OpenAPI spec (JSON or YAML). - - Returns - ------- - dict[str, Schema] - Mapping of schema name → Schema object. - """ + """Parse an OpenAPI spec file and return schemas.""" from dataforge.schema_io import _detect_format fmt = _detect_format(path) @@ -146,21 +90,10 @@ def from_file(self, path: str) -> dict[str, Any]: return self.from_openapi(doc) def from_openapi(self, doc: dict[str, Any]) -> dict[str, Any]: - """Parse an OpenAPI document dict. - - Parameters - ---------- - doc : dict - The parsed OpenAPI document. 
- - Returns - ------- - dict[str, Schema] - """ - self._ref_cache = doc # store full doc for $ref resolution + """Parse an OpenAPI document dict.""" + self._ref_cache = doc schemas: dict[str, Any] = {} - # OpenAPI 3.x: components.schemas components = doc.get("components", {}) schema_defs = components.get("schemas", {}) @@ -171,7 +104,7 @@ def from_openapi(self, doc: dict[str, Any]) -> dict[str, Any]: schema = self._build_schema(resolved, name) schemas[name] = schema except (ValueError, KeyError): - pass # Skip schemas we can't map + pass return schemas @@ -180,19 +113,7 @@ def from_json_schema( schema_def: dict[str, Any], name: str = "root", ) -> Any: - """Create a Schema from a JSON Schema definition. - - Parameters - ---------- - schema_def : dict - JSON Schema object definition. - name : str - Schema name for error messages. - - Returns - ------- - Schema - """ + """Create a Schema from a JSON Schema definition.""" resolved = self._resolve_refs(schema_def) return self._build_schema(resolved, name) @@ -209,7 +130,7 @@ def _resolve_refs(self, obj: Any) -> Any: def _resolve_ref(self, ref: str) -> Any: """Resolve a single $ref path like '#/components/schemas/User'.""" if not ref.startswith("#/"): - return {} # External refs not supported + return {} parts = ref[2:].split("/") obj: Any = self._ref_cache for part in parts: @@ -254,62 +175,47 @@ def _map_property( schema_type = prop_def.get("type", "string") schema_format = prop_def.get("format") - # Handle enum if "enum" in prop_def: - # For enums, we'll use a lambda with the enum values - # For simplicity, return None and handle in the caller - return None # TODO: enum support via lambda + return None - # Handle arrays if schema_type == "array": - return None # Skip arrays for now + return None - # Handle nested objects if schema_type == "object": - return None # Skip nested objects for now + return None - # Check type+format mapping key = (schema_type, schema_format) mapped = _TYPE_FORMAT_MAP.get(key) if mapped is 
not None: return mapped - # Check without format key_nofmt = (schema_type, None) mapped = _TYPE_FORMAT_MAP.get(key_nofmt) if mapped is not None: return mapped - # Property name heuristic name_lower = prop_name.lower().replace("-", "_") name_mapped = _PROPERTY_NAME_MAP.get(name_lower) if name_mapped: return name_mapped - # Try to resolve via registry try: self._forge._resolve_field(prop_name) return prop_name except ValueError: pass - # Numeric type fallback with range if schema_type in ("integer", "number"): minimum = prop_def.get("minimum") maximum = prop_def.get("maximum") if minimum is not None or maximum is not None: - # Use a lambda for range-constrained numbers - return None # TODO: range constraint support + return None return None - # Fallback for strings if schema_type == "string": - # Check minLength/maxLength, pattern pattern = prop_def.get("pattern") if pattern: - return None # TODO: regexify support - - # Generic string fallback + return None return "lorem.word" return None diff --git a/src/dataforge/providers/address.py b/src/dataforge/providers/address.py index 2b56cf6..75726ea 100644 --- a/src/dataforge/providers/address.py +++ b/src/dataforge/providers/address.py @@ -1,7 +1,6 @@ """Address provider — generates fake addresses.""" from types import ModuleType -from typing import Literal, overload from dataforge.backend import RandomEngine from dataforge.providers.base import BaseProvider @@ -38,26 +37,6 @@ "South Africa", "Argentina", "Colombia", - "Chile", - "Turkey", - "Thailand", - "Indonesia", - "Malaysia", - "Singapore", - "Philippines", - "Vietnam", - "Egypt", - "Nigeria", - "Saudi Arabia", - "UAE", - "Israel", - "Czech Republic", - "Romania", - "Hungary", - "Greece", - "Ukraine", - "Peru", - "Pakistan", ) _COUNTRY_CODES: tuple[str, ...] 
= ( @@ -91,26 +70,6 @@ "ZA", "AR", "CO", - "CL", - "TR", - "TH", - "ID", - "MY", - "SG", - "PH", - "VN", - "EG", - "NG", - "SA", - "AE", - "IL", - "CZ", - "RO", - "HU", - "GR", - "UA", - "PE", - "PK", ) @@ -152,6 +111,11 @@ class AddressProvider(BaseProvider): "coordinate": "coordinate", } + _choice_fields: dict[str, tuple[str, ...]] = { + "country": _COUNTRY_NAMES, + "country_code": _COUNTRY_CODES, + } + def __init__(self, engine: RandomEngine, locale_data: ModuleType) -> None: super().__init__(engine) self._street_names: tuple[str, ...] = locale_data.street_names @@ -163,9 +127,7 @@ def __init__(self, engine: RandomEngine, locale_data: ModuleType) -> None: locale_data.building_number_formats ) - # ------------------------------------------------------------------ # Scalar helpers (always return a single str) - # ------------------------------------------------------------------ def _one_building_number(self) -> str: fmt = self._engine.choice(self._building_number_formats) @@ -188,136 +150,48 @@ def _one_full_address(self) -> str: zip_code = self._one_zip_code() return f"{building} {street}, {city}, {state} {zip_code}" - # ------------------------------------------------------------------ # Public API - # ------------------------------------------------------------------ - @overload - def building_number(self) -> str: ... - @overload - def building_number(self, count: Literal[1]) -> str: ... - @overload - def building_number(self, count: int) -> str | list[str]: ... def building_number(self, count: int = 1) -> str | list[str]: - """Generate a random building number. - - Parameters - ---------- - count : int - Number of building numbers to generate. - """ + """Generate a random building number.""" if count == 1: return self._one_building_number() return [self._one_building_number() for _ in range(count)] - @overload - def street_name(self) -> str: ... - @overload - def street_name(self, count: Literal[1]) -> str: ... 
- @overload - def street_name(self, count: int) -> str | list[str]: ... def street_name(self, count: int = 1) -> str | list[str]: - """Generate a random street name (e.g. ``"Oak Ave"``). - - Parameters - ---------- - count : int - Number of street names to generate. - """ + """Generate a random street name.""" if count == 1: return self._one_street() return [self._one_street() for _ in range(count)] - @overload - def street_address(self) -> str: ... - @overload - def street_address(self, count: Literal[1]) -> str: ... - @overload - def street_address(self, count: int) -> str | list[str]: ... def street_address(self, count: int = 1) -> str | list[str]: - """Generate a random street address (e.g. ``"4821 Oak Ave"``). - - Parameters - ---------- - count : int - Number of street addresses to generate. - """ + """Generate a random street address.""" if count == 1: return f"{self._one_building_number()} {self._one_street()}" return [ f"{self._one_building_number()} {self._one_street()}" for _ in range(count) ] - @overload - def city(self) -> str: ... - @overload - def city(self, count: Literal[1]) -> str: ... - @overload - def city(self, count: int) -> str | list[str]: ... def city(self, count: int = 1) -> str | list[str]: - """Generate a random city name. - - Parameters - ---------- - count : int - Number of city names to generate. - """ + """Generate a random city name.""" if count == 1: return self._engine.choice(self._cities) return self._engine.choices(self._cities, count) - @overload - def state(self) -> str: ... - @overload - def state(self, count: Literal[1]) -> str: ... - @overload - def state(self, count: int) -> str | list[str]: ... def state(self, count: int = 1) -> str | list[str]: - """Generate a random US state abbreviation. - - Parameters - ---------- - count : int - Number of state abbreviations to generate. 
- """ + """Generate a random US state abbreviation.""" if count == 1: return self._engine.choice(self._states) return self._engine.choices(self._states, count) - @overload - def zip_code(self) -> str: ... - @overload - def zip_code(self, count: Literal[1]) -> str: ... - @overload - def zip_code(self, count: int) -> str | list[str]: ... def zip_code(self, count: int = 1) -> str | list[str]: - """Generate a random zip code (e.g. ``"90210"`` or ``"90210-1234"``). - - Parameters - ---------- - count : int - Number of zip codes to generate. - """ + """Generate a random zip code.""" if count == 1: return self._one_zip_code() return [self._one_zip_code() for _ in range(count)] - @overload - def full_address(self) -> str: ... - @overload - def full_address(self, count: Literal[1]) -> str: ... - @overload - def full_address(self, count: int) -> str | list[str]: ... def full_address(self, count: int = 1) -> str | list[str]: - """Generate a complete fake address. - - Example: ``"4821 Oak Ave, Chicago, IL 60614"`` - - Parameters - ---------- - count : int - Number of full addresses to generate. - """ + """Generate a complete fake address.""" if count == 1: return self._one_full_address() # Vectorized batch: bulk choices() for cities/states (2 calls @@ -341,85 +215,15 @@ def full_address(self, count: int = 1) -> str | list[str]: result.append(f"{bldg} {street}, {cities[i]}, {states[i]} {zipcode}") return result - @overload - def country(self) -> str: ... - @overload - def country(self, count: Literal[1]) -> str: ... - @overload - def country(self, count: int) -> str | list[str]: ... - def country(self, count: int = 1) -> str | list[str]: - """Generate a random country name (e.g. ``"Germany"``). - - Parameters - ---------- - count : int - Number of country names to generate. - """ - if count == 1: - return self._engine.choice(_COUNTRY_NAMES) - return self._engine.choices(_COUNTRY_NAMES, count) - - @overload - def country_code(self) -> str: ... 
- @overload - def country_code(self, count: Literal[1]) -> str: ... - @overload - def country_code(self, count: int) -> str | list[str]: ... - def country_code(self, count: int = 1) -> str | list[str]: - """Generate a random ISO 3166-1 alpha-2 country code (e.g. ``"DE"``). - - Parameters - ---------- - count : int - Number of country codes to generate. - """ - if count == 1: - return self._engine.choice(_COUNTRY_CODES) - return self._engine.choices(_COUNTRY_CODES, count) - - @overload - def latitude(self) -> str: ... - @overload - def latitude(self, count: Literal[1]) -> str: ... - @overload - def latitude(self, count: int) -> str | list[str]: ... def latitude(self, count: int = 1) -> str | list[str]: - """Generate a random latitude (``-90.0`` to ``90.0``). - - Parameters - ---------- - count : int - Number of latitudes to generate. - - Returns - ------- - str or list[str] - Latitude as a decimal string with 6 decimal places. - """ + """Generate a random latitude (-90.0 to 90.0).""" ri = self._engine.random_int if count == 1: return f"{ri(-90_000_000, 90_000_000) / 1_000_000:.6f}" return [f"{ri(-90_000_000, 90_000_000) / 1_000_000:.6f}" for _ in range(count)] - @overload - def longitude(self) -> str: ... - @overload - def longitude(self, count: Literal[1]) -> str: ... - @overload - def longitude(self, count: int) -> str | list[str]: ... def longitude(self, count: int = 1) -> str | list[str]: - """Generate a random longitude (``-180.0`` to ``180.0``). - - Parameters - ---------- - count : int - Number of longitudes to generate. - - Returns - ------- - str or list[str] - Longitude as a decimal string with 6 decimal places. 
- """ + """Generate a random longitude (-180.0 to 180.0).""" ri = self._engine.random_int if count == 1: return f"{ri(-180_000_000, 180_000_000) / 1_000_000:.6f}" @@ -427,24 +231,8 @@ def longitude(self, count: int = 1) -> str | list[str]: f"{ri(-180_000_000, 180_000_000) / 1_000_000:.6f}" for _ in range(count) ] - @overload - def coordinate(self) -> tuple[str, str]: ... - @overload - def coordinate(self, count: Literal[1]) -> tuple[str, str]: ... - @overload - def coordinate(self, count: int) -> tuple[str, str] | list[tuple[str, str]]: ... def coordinate(self, count: int = 1) -> tuple[str, str] | list[tuple[str, str]]: - """Generate a random (latitude, longitude) pair. - - Parameters - ---------- - count : int - Number of coordinate pairs to generate. - - Returns - ------- - tuple[str, str] or list[tuple[str, str]] - """ + """Generate a random (latitude, longitude) pair.""" ri = self._engine.random_int if count == 1: return ( diff --git a/src/dataforge/providers/ai_chat.py b/src/dataforge/providers/ai_chat.py deleted file mode 100644 index 69cd48a..0000000 --- a/src/dataforge/providers/ai_chat.py +++ /dev/null @@ -1,170 +0,0 @@ -"""AI Chat provider — assembles realistic conversation turns. - -This is a **compound** provider (``_needs_forge = True``) that -delegates to ``ai_prompt`` and ``llm`` providers to assemble -realistic chat messages with role, model, content, and token usage. - -Individual string fields are exposed in ``_field_map`` for Schema -compatibility. The compound ``chat_message()`` method returns a -``dict`` and is available only via direct API use. -""" - -from typing import TYPE_CHECKING, Literal, overload - -from dataforge.backend import RandomEngine -from dataforge.providers.base import BaseProvider - -if TYPE_CHECKING: - from dataforge.core import DataForge - -# Module-level constants — zero per-call allocation -_ROLES: tuple[str, ...] = ( - "system", - "user", - "assistant", - "tool", -) - -_CHAT_ROLES_WEIGHTED: tuple[tuple[str, int], ...] 
= ( - ("user", 40), - ("assistant", 40), - ("system", 15), - ("tool", 5), -) -_CHAT_ROLE_VALUES: tuple[str, ...] = tuple(r for r, _ in _CHAT_ROLES_WEIGHTED) -_CHAT_ROLE_WEIGHTS: tuple[int, ...] = tuple(w for _, w in _CHAT_ROLES_WEIGHTED) - - -class AiChatProvider(BaseProvider): - """Generates fake AI chat data — messages, roles, conversations. - - Delegates to ``ai_prompt`` and ``llm`` providers for content - and metadata generation. - - Parameters - ---------- - engine : RandomEngine - The shared random engine instance. - forge : DataForge - The parent DataForge instance for cross-provider access. - """ - - __slots__ = ("_forge",) - - _provider_name = "ai_chat" - _locale_modules: tuple[str, ...] = () - _needs_forge: bool = True - _field_map: dict[str, str] = { - "chat_role": "chat_role", - "chat_model": "chat_model", - "chat_content": "chat_content", - "chat_tokens": "chat_tokens", - "chat_finish_reason": "chat_finish_reason", - } - - def __init__(self, engine: RandomEngine, forge: "DataForge") -> None: - super().__init__(engine) - self._forge = forge - - # ------------------------------------------------------------------ - # Individual field methods (for _field_map / Schema compatibility) - # ------------------------------------------------------------------ - - @overload - def chat_role(self) -> str: ... - @overload - def chat_role(self, count: Literal[1]) -> str: ... - @overload - def chat_role(self, count: int) -> str | list[str]: ... - def chat_role(self, count: int = 1) -> str | list[str]: - """Generate a chat message role (user, assistant, system, tool).""" - if count == 1: - return self._engine.weighted_choice(_CHAT_ROLE_VALUES, _CHAT_ROLE_WEIGHTS) - return self._engine.weighted_choices( - _CHAT_ROLE_VALUES, _CHAT_ROLE_WEIGHTS, count - ) - - @overload - def chat_model(self) -> str: ... - @overload - def chat_model(self, count: Literal[1]) -> str: ... - @overload - def chat_model(self, count: int) -> str | list[str]: ... 
- def chat_model(self, count: int = 1) -> str | list[str]: - """Generate a model name for the chat (delegates to llm.model_name).""" - return self._forge.llm.model_name(count) - - @overload - def chat_content(self) -> str: ... - @overload - def chat_content(self, count: Literal[1]) -> str: ... - @overload - def chat_content(self, count: int) -> str | list[str]: ... - def chat_content(self, count: int = 1) -> str | list[str]: - """Generate chat message content (delegates to ai_prompt.user_prompt).""" - return self._forge.ai_prompt.user_prompt(count) - - @overload - def chat_tokens(self) -> str: ... - @overload - def chat_tokens(self, count: Literal[1]) -> str: ... - @overload - def chat_tokens(self, count: int) -> str | list[str]: ... - def chat_tokens(self, count: int = 1) -> str | list[str]: - """Generate a token count for the message (delegates to llm.token_count).""" - return self._forge.llm.token_count(count) - - @overload - def chat_finish_reason(self) -> str: ... - @overload - def chat_finish_reason(self, count: Literal[1]) -> str: ... - @overload - def chat_finish_reason(self, count: int) -> str | list[str]: ... - def chat_finish_reason(self, count: int = 1) -> str | list[str]: - """Generate a finish reason (delegates to llm.finish_reason).""" - return self._forge.llm.finish_reason(count) - - # ------------------------------------------------------------------ - # Compound message method (direct API only, not in _field_map) - # ------------------------------------------------------------------ - - def chat_message(self, count: int = 1) -> dict[str, str] | list[dict[str, str]]: - """Generate a realistic chat message with role, model, content, tokens. - - Returns a dict with keys: ``role``, ``model``, ``content``, - ``tokens``, ``finish_reason``. - - Parameters - ---------- - count : int - Number of messages to generate. 
- - Returns - ------- - dict[str, str] or list[dict[str, str]] - """ - - def _one_message() -> dict[str, str]: - role = self._engine.weighted_choice(_CHAT_ROLE_VALUES, _CHAT_ROLE_WEIGHTS) - model = self._forge.llm.model_name() - # Pick content based on role for realism - if role == "system": - content = self._forge.ai_prompt.system_prompt() - elif role == "user": - content = self._forge.ai_prompt.user_prompt() - else: - # assistant or tool — use a user prompt as stand-in - content = self._forge.ai_prompt.user_prompt() - tokens = self._forge.llm.token_count() - finish = self._forge.llm.finish_reason() - return { - "role": role, - "model": model, - "content": content, - "tokens": tokens, - "finish_reason": finish, - } - - if count == 1: - return _one_message() - return [_one_message() for _ in range(count)] diff --git a/src/dataforge/providers/ai_prompt.py b/src/dataforge/providers/ai_prompt.py index 64d5bfd..10041ab 100644 --- a/src/dataforge/providers/ai_prompt.py +++ b/src/dataforge/providers/ai_prompt.py @@ -1,12 +1,8 @@ """AI Prompt provider — user prompts, system prompts, prompt templates.""" -from typing import Literal, overload - from dataforge.providers.base import BaseProvider -# --------------------------------------------------------------------------- # Module-level immutable tuples — zero per-call allocation -# --------------------------------------------------------------------------- _USER_PROMPTS: tuple[str, ...] 
= ( "Summarize this article in 3 bullet points", @@ -29,16 +25,6 @@ "Summarize the key takeaways from this meeting", "What are the main arguments for and against this?", "Help me create an outline for this essay", - "Can you fact-check this statement?", - "Suggest a catchy title for this blog post", - "Help me write a thank you note", - "What are best practices for this workflow?", - "Rewrite this in a more formal tone", - "Break down this complex topic into simple parts", - "What should I consider before making this decision?", - "Help me prepare talking points for this meeting", - "Compare and contrast these two options", - "Give me feedback on this draft", ) _CODING_PROMPTS: tuple[str, ...] = ( @@ -62,16 +48,6 @@ "Write a migration script for this database change", "How do I implement caching for this endpoint?", "Help me write a recursive algorithm for this problem", - "Create a REST API client with proper error handling", - "Write a function to sanitize user input", - "Implement rate limiting for this API", - "How do I handle concurrent requests in this service?", - "Write a custom middleware for this web framework", - "Create a connection pool for this database", - "Help me implement WebSocket support", - "Write a function to parse and validate CSV data", - "Implement a producer-consumer pattern for this queue", - "How do I properly handle database transactions here?", ) _CREATIVE_PROMPTS: tuple[str, ...] = ( @@ -90,11 +66,6 @@ "Write a letter from a time traveler to their past self", "Create a backstory for a video game character", "Write a flash fiction piece about a last sunset", - "Compose a children's story about a brave little cloud", - "Write a comedic sketch about office life", - "Create a recipe told as a love story", - "Write a news article from the year 2150", - "Describe a painting that doesn't exist yet", ) _ANALYSIS_PROMPTS: tuple[str, ...] 
= ( @@ -113,11 +84,6 @@ "Evaluate the ROI of this marketing campaign", "Analyze the demographic data for this region", "What trends emerge from this time series data?", - "Assess the technical debt in this codebase", - "Analyze the customer churn data for patterns", - "Evaluate the scalability of this architecture", - "What anomalies do you detect in this log data?", - "Analyze the A/B test results for statistical significance", ) _SYSTEM_PROMPTS: tuple[str, ...] = ( @@ -136,11 +102,6 @@ "You are a legal assistant. Provide general information, not legal advice.", "You are a language tutor. Help users practice and improve their skills.", "You are a project manager. Help organize tasks and track progress.", - "You are a marketing strategist. Focus on growth and engagement.", - "You are a database administrator. Optimize queries and schema design.", - "You are a machine learning engineer. Focus on model performance.", - "You are a technical interviewer. Ask relevant and fair questions.", - "You are a documentation specialist. Keep things clear and organized.", ) _PERSONA_ROLES: tuple[str, ...] = ( @@ -159,11 +120,6 @@ "expert systems architect", "professional data engineer", "senior site reliability engineer", - "experienced mobile developer", - "expert network engineer", - "professional QA engineer", - "senior platform engineer", - "experienced AI researcher", ) _PERSONA_TRAITS: tuple[str, ...] = ( @@ -289,93 +245,19 @@ class AiPromptProvider(BaseProvider): "few_shot_prompt": "few_shot_prompt", } - # --- Public API --- - - @overload - def user_prompt(self) -> str: ... - @overload - def user_prompt(self, count: Literal[1]) -> str: ... - @overload - def user_prompt(self, count: int) -> str | list[str]: ... - def user_prompt(self, count: int = 1) -> str | list[str]: - """Generate a realistic user prompt (e.g. 
ChatGPT-style request).""" - if count == 1: - return self._engine.choice(_USER_PROMPTS) - return self._engine.choices(_USER_PROMPTS, count) - - @overload - def coding_prompt(self) -> str: ... - @overload - def coding_prompt(self, count: Literal[1]) -> str: ... - @overload - def coding_prompt(self, count: int) -> str | list[str]: ... - def coding_prompt(self, count: int = 1) -> str | list[str]: - """Generate a coding-related prompt.""" - if count == 1: - return self._engine.choice(_CODING_PROMPTS) - return self._engine.choices(_CODING_PROMPTS, count) - - @overload - def creative_prompt(self) -> str: ... - @overload - def creative_prompt(self, count: Literal[1]) -> str: ... - @overload - def creative_prompt(self, count: int) -> str | list[str]: ... - def creative_prompt(self, count: int = 1) -> str | list[str]: - """Generate a creative writing prompt.""" - if count == 1: - return self._engine.choice(_CREATIVE_PROMPTS) - return self._engine.choices(_CREATIVE_PROMPTS, count) - - @overload - def analysis_prompt(self) -> str: ... - @overload - def analysis_prompt(self, count: Literal[1]) -> str: ... - @overload - def analysis_prompt(self, count: int) -> str | list[str]: ... - def analysis_prompt(self, count: int = 1) -> str | list[str]: - """Generate a data analysis prompt.""" - if count == 1: - return self._engine.choice(_ANALYSIS_PROMPTS) - return self._engine.choices(_ANALYSIS_PROMPTS, count) - - @overload - def system_prompt(self) -> str: ... - @overload - def system_prompt(self, count: Literal[1]) -> str: ... - @overload - def system_prompt(self, count: int) -> str | list[str]: ... 
- def system_prompt(self, count: int = 1) -> str | list[str]: - """Generate a system prompt for configuring an AI assistant.""" - if count == 1: - return self._engine.choice(_SYSTEM_PROMPTS) - return self._engine.choices(_SYSTEM_PROMPTS, count) - - # --- Computed string methods --- + _choice_fields: dict[str, tuple[str, ...]] = { + "user_prompt": _USER_PROMPTS, + "coding_prompt": _CODING_PROMPTS, + "creative_prompt": _CREATIVE_PROMPTS, + "analysis_prompt": _ANALYSIS_PROMPTS, + "system_prompt": _SYSTEM_PROMPTS, + } def _one_persona_prompt(self) -> str: role = self._engine.choice(_PERSONA_ROLES) trait = self._engine.choice(_PERSONA_TRAITS) return f"You are an {role} {trait}." - @overload - def persona_prompt(self) -> str: ... - @overload - def persona_prompt(self, count: Literal[1]) -> str: ... - @overload - def persona_prompt(self, count: int) -> str | list[str]: ... - def persona_prompt(self, count: int = 1) -> str | list[str]: - """Generate a persona-based system prompt.""" - if count == 1: - return self._one_persona_prompt() - # Inlined batch with local binding - _choice = self._engine.choice - _roles = _PERSONA_ROLES - _traits = _PERSONA_TRAITS - return [ - f"You are an {_choice(_roles)} {_choice(_traits)}." for _ in range(count) - ] - def _one_prompt_template(self) -> str: action = self._engine.choice(_TEMPLATE_ACTIONS) _tone = self._engine.choice(_TEMPLATE_TONES) @@ -383,25 +265,6 @@ def _one_prompt_template(self) -> str: _topic = self._engine.choice(_TEMPLATE_TOPICS) return f"{action} a {{tone}} {fmt} about {{topic}} for {{audience}}" - @overload - def prompt_template(self) -> str: ... - @overload - def prompt_template(self, count: Literal[1]) -> str: ... - @overload - def prompt_template(self, count: int) -> str | list[str]: ... 
- def prompt_template(self, count: int = 1) -> str | list[str]: - """Generate a parameterized prompt template with placeholders.""" - if count == 1: - return self._one_prompt_template() - # Inlined batch with local binding - _choice = self._engine.choice - _actions = _TEMPLATE_ACTIONS - _formats = _TEMPLATE_FORMATS - return [ - f"{_choice(_actions)} a {{tone}} {_choice(_formats)} about {{topic}} for {{audience}}" - for _ in range(count) - ] - def _one_few_shot_prompt(self) -> str: task = self._engine.choice(_FEW_SHOT_TASKS) # Build 2 examples @@ -416,32 +279,20 @@ def _one_few_shot_prompt(self) -> str: f'Now classify:\nInput: "{{input}}"\nOutput:' ) - @overload - def few_shot_prompt(self) -> str: ... - @overload - def few_shot_prompt(self, count: Literal[1]) -> str: ... - @overload - def few_shot_prompt(self, count: int) -> str | list[str]: ... + def persona_prompt(self, count: int = 1) -> str | list[str]: + """Generate a persona-based system prompt.""" + if count == 1: + return self._one_persona_prompt() + return [self._one_persona_prompt() for _ in range(count)] + + def prompt_template(self, count: int = 1) -> str | list[str]: + """Generate a parameterized prompt template with placeholders.""" + if count == 1: + return self._one_prompt_template() + return [self._one_prompt_template() for _ in range(count)] + def few_shot_prompt(self, count: int = 1) -> str | list[str]: """Generate a few-shot prompt with example pairs.""" if count == 1: return self._one_few_shot_prompt() - # Inlined batch with local binding - _choice = self._engine.choice - _tasks = _FEW_SHOT_TASKS - _examples = _FEW_SHOT_EXAMPLES - _labels = _FEW_SHOT_LABELS - result: list[str] = [] - for _ in range(count): - task = _choice(_tasks) - ex1 = _choice(_examples) - lb1 = _choice(_labels) - ex2 = _choice(_examples) - lb2 = _choice(_labels) - result.append( - f"{task}.\n\n" - f'Example 1:\nInput: "{ex1}"\nOutput: {lb1}\n\n' - f'Example 2:\nInput: "{ex2}"\nOutput: {lb2}\n\n' - f'Now classify:\nInput: 
"{{input}}"\nOutput:' - ) - return result + return [self._one_few_shot_prompt() for _ in range(count)] diff --git a/src/dataforge/providers/automotive.py b/src/dataforge/providers/automotive.py index 94d576b..ff0565b 100644 --- a/src/dataforge/providers/automotive.py +++ b/src/dataforge/providers/automotive.py @@ -4,13 +4,9 @@ All data is stored as immutable ``tuple[str, ...]`` for cache friendliness. """ -from typing import Literal, overload - from dataforge.providers.base import BaseProvider -# ------------------------------------------------------------------ # Data tuples (immutable, module-level for zero per-call overhead) -# ------------------------------------------------------------------ _VEHICLE_MAKES: tuple[str, ...] = ( "Toyota", @@ -33,26 +29,6 @@ "GMC", "Dodge", "Buick", - "Cadillac", - "Lincoln", - "Acura", - "Infiniti", - "Volvo", - "Porsche", - "Land Rover", - "Jaguar", - "Mitsubishi", - "Chrysler", - "Fiat", - "Alfa Romeo", - "Genesis", - "Rivian", - "Lucid", - "Polestar", - "Mini", - "Maserati", - "Ferrari", - "Lamborghini", ) _VEHICLE_MODELS: tuple[str, ...] = ( @@ -86,46 +62,6 @@ "F-Type", "Outlander", "Pacifica", - "500", - "Giulia", - "G70", - "R1T", - "Air", - "Polestar 2", - "Cooper", - "Ghibli", - "488", - "Urus", - "Corolla", - "Accord", - "Mustang", - "Malibu", - "X5", - "E-Class", - "Q7", - "Passat", - "Tucson", - "Sportage", - "Sentra", - "Forester", - "Mazda3", - "ES", - "Model Y", - "Grand Cherokee", - "2500", - "Yukon", - "Challenger", - "Enclave", - "CT5", - "Corsair", - "TLX", - "QX60", - "XC60", - "Cayenne", - "Defender", - "XE", - "Eclipse Cross", - "300", ) _VEHICLE_COLORS: tuple[str, ...] 
= ( @@ -144,11 +80,6 @@ "Purple", "Burgundy", "Navy", - "Charcoal", - "Pearl White", - "Midnight Blue", - "Racing Red", - "Arctic Silver", ) _PLATE_LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" @@ -218,9 +149,13 @@ class AutomotiveProvider(BaseProvider): "vehicle_color": "vehicle_color", } - # ------------------------------------------------------------------ + _choice_fields: dict[str, tuple[str, ...]] = { + "vehicle_make": _VEHICLE_MAKES, + "vehicle_model": _VEHICLE_MODELS, + "vehicle_color": _VEHICLE_COLORS, + } + # Scalar helpers - # ------------------------------------------------------------------ def _one_plate(self) -> str: """Generate a single US-style license plate (ABC-1234).""" @@ -249,168 +184,30 @@ def _one_vin(self) -> str: chars[8] = "X" if remainder == 10 else str(remainder) return "".join(chars) - # ------------------------------------------------------------------ # Public API - # ------------------------------------------------------------------ - @overload - def license_plate(self) -> str: ... - @overload - def license_plate(self, count: Literal[1]) -> str: ... - @overload - def license_plate(self, count: int) -> str | list[str]: ... def license_plate(self, count: int = 1) -> str | list[str]: - """Generate a US-style license plate (e.g. ``"ABC-1234"``). - - Parameters - ---------- - count : int - Number of plates to generate. - - Returns - ------- - str or list[str] - """ + """Generate a US-style license plate (e.g. ``"ABC-1234"``).""" if count == 1: return self._one_plate() return [self._one_plate() for _ in range(count)] - @overload - def vin(self) -> str: ... - @overload - def vin(self, count: Literal[1]) -> str: ... - @overload - def vin(self, count: int) -> str | list[str]: ... def vin(self, count: int = 1) -> str | list[str]: - """Generate a 17-character Vehicle Identification Number. - - The check digit (position 9) is computed correctly per the - ISO 3779 / FMVSS 115 algorithm. 
- - Parameters - ---------- - count : int - Number of VINs to generate. - - Returns - ------- - str or list[str] - """ + """Generate a 17-character Vehicle Identification Number.""" if count == 1: return self._one_vin() return [self._one_vin() for _ in range(count)] - @overload - def vehicle_make(self) -> str: ... - @overload - def vehicle_make(self, count: Literal[1]) -> str: ... - @overload - def vehicle_make(self, count: int) -> str | list[str]: ... - def vehicle_make(self, count: int = 1) -> str | list[str]: - """Generate a vehicle manufacturer name (e.g. ``"Toyota"``). - - Parameters - ---------- - count : int - Number of makes to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_VEHICLE_MAKES) - return self._engine.choices(_VEHICLE_MAKES, count) - - @overload - def vehicle_model(self) -> str: ... - @overload - def vehicle_model(self, count: Literal[1]) -> str: ... - @overload - def vehicle_model(self, count: int) -> str | list[str]: ... - def vehicle_model(self, count: int = 1) -> str | list[str]: - """Generate a vehicle model name (e.g. ``"Camry"``). - - Parameters - ---------- - count : int - Number of models to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_VEHICLE_MODELS) - return self._engine.choices(_VEHICLE_MODELS, count) - - @overload - def vehicle_year(self) -> int: ... - @overload - def vehicle_year(self, count: Literal[1]) -> int: ... - @overload - def vehicle_year(self, count: int) -> int | list[int]: ... def vehicle_year(self, count: int = 1) -> int | list[int]: - """Generate a vehicle model year (1990–2026). - - Parameters - ---------- - count : int - Number of years to generate. 
- - Returns - ------- - int or list[int] - """ + """Generate a vehicle model year (1990–2026).""" ri = self._engine.random_int if count == 1: return ri(1990, 2026) return [ri(1990, 2026) for _ in range(count)] - @overload - def vehicle_year_str(self) -> str: ... - @overload - def vehicle_year_str(self, count: Literal[1]) -> str: ... - @overload - def vehicle_year_str(self, count: int) -> str | list[str]: ... def vehicle_year_str(self, count: int = 1) -> str | list[str]: - """Generate a vehicle model year as a string (``"1990"``–``"2026"``). - - This variant is used by the ``_field_map`` for Schema - compatibility (all Schema fields must produce strings). - - Parameters - ---------- - count : int - Number of years to generate. - - Returns - ------- - str or list[str] - """ + """Generate a vehicle model year as a string (``"1990"``–``"2026"``).""" ri = self._engine.random_int if count == 1: return str(ri(1990, 2026)) return [str(ri(1990, 2026)) for _ in range(count)] - - @overload - def vehicle_color(self) -> str: ... - @overload - def vehicle_color(self, count: Literal[1]) -> str: ... - @overload - def vehicle_color(self, count: int) -> str | list[str]: ... - def vehicle_color(self, count: int = 1) -> str | list[str]: - """Generate a vehicle color (e.g. ``"Midnight Blue"``). - - Parameters - ---------- - count : int - Number of colors to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_VEHICLE_COLORS) - return self._engine.choices(_VEHICLE_COLORS, count) diff --git a/src/dataforge/providers/barcode.py b/src/dataforge/providers/barcode.py index 03473a5..934adb9 100644 --- a/src/dataforge/providers/barcode.py +++ b/src/dataforge/providers/barcode.py @@ -4,8 +4,6 @@ standards. This provider is locale-independent. 
""" -from typing import Literal, overload - from dataforge.providers.base import BaseProvider @@ -50,9 +48,7 @@ class BarcodeProvider(BaseProvider): "isbn10": "isbn10", } - # ------------------------------------------------------------------ # Scalar helpers - # ------------------------------------------------------------------ def _one_ean13(self) -> str: body = self._engine.random_digits_str(12) @@ -72,78 +68,28 @@ def _one_isbn10(self) -> str: body = self._engine.random_digits_str(9) return body + _isbn10_check_digit(body) - # ------------------------------------------------------------------ # Public API - # ------------------------------------------------------------------ - - @overload - def ean13(self) -> str: ... - @overload - def ean13(self, count: Literal[1]) -> str: ... - @overload - def ean13(self, count: int) -> str | list[str]: ... - def ean13(self, count: int = 1) -> str | list[str]: - """Generate a random EAN-13 barcode (13 digits, valid check digit). - Parameters - ---------- - count : int - Number of barcodes to generate. - """ + def ean13(self, count: int = 1) -> str | list[str]: + """Generate a random EAN-13 barcode (13 digits, valid check digit).""" if count == 1: return self._one_ean13() return [self._one_ean13() for _ in range(count)] - @overload - def ean8(self) -> str: ... - @overload - def ean8(self, count: Literal[1]) -> str: ... - @overload - def ean8(self, count: int) -> str | list[str]: ... def ean8(self, count: int = 1) -> str | list[str]: - """Generate a random EAN-8 barcode (8 digits, valid check digit). - - Parameters - ---------- - count : int - Number of barcodes to generate. - """ + """Generate a random EAN-8 barcode (8 digits, valid check digit).""" if count == 1: return self._one_ean8() return [self._one_ean8() for _ in range(count)] - @overload - def isbn13(self) -> str: ... - @overload - def isbn13(self, count: Literal[1]) -> str: ... - @overload - def isbn13(self, count: int) -> str | list[str]: ... 
def isbn13(self, count: int = 1) -> str | list[str]: - """Generate a random ISBN-13 (starts with 978/979, valid check digit). - - Parameters - ---------- - count : int - Number of ISBNs to generate. - """ + """Generate a random ISBN-13 (starts with 978/979, valid check digit).""" if count == 1: return self._one_isbn13() return [self._one_isbn13() for _ in range(count)] - @overload - def isbn10(self) -> str: ... - @overload - def isbn10(self, count: Literal[1]) -> str: ... - @overload - def isbn10(self, count: int) -> str | list[str]: ... def isbn10(self, count: int = 1) -> str | list[str]: - """Generate a random ISBN-10 (9 digits + check character). - - Parameters - ---------- - count : int - Number of ISBNs to generate. - """ + """Generate a random ISBN-10 (9 digits + check character).""" if count == 1: return self._one_isbn10() return [self._one_isbn10() for _ in range(count)] diff --git a/src/dataforge/providers/base.py b/src/dataforge/providers/base.py index 0491535..9cb327f 100644 --- a/src/dataforge/providers/base.py +++ b/src/dataforge/providers/base.py @@ -3,13 +3,25 @@ from dataforge.backend import RandomEngine +def _make_choice_method(data: tuple[str, ...]): + """Create a choice method that picks from *data*. + + Returns a function with the standard ``(self, count=1)`` signature. + Uses the same ``engine.choice`` / ``engine.choices`` hot-path as + hand-written methods so performance is identical. + """ + + def method(self, count: int = 1): + if count == 1: + return self._engine.choice(data) + return self._engine.choices(data, count) + + return method + + class BaseProvider: """Abstract base for all dataforge providers. - Holds a reference to the shared :class:`RandomEngine` so every - provider can generate random values without owning its own RNG - state. - Subclasses should define class-level metadata for the provider registry: @@ -20,6 +32,9 @@ class BaseProvider: ``()`` for locale-independent providers. 
- ``_field_map``: dict mapping shorthand field names to method names (e.g. ``{"first_name": "first_name", "name": "full_name"}``). + - ``_choice_fields``: dict mapping method names to data tuples. + Methods listed here are **auto-generated** at class-creation + time — no hand-written boilerplate required. """ __slots__ = ("_engine",) @@ -30,5 +45,15 @@ class BaseProvider: _field_map: dict[str, str] = {} _needs_forge: bool = False + # Declarative choice fields — auto-generated as methods + _choice_fields: dict[str, tuple[str, ...]] = {} + + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + # Auto-generate simple choice methods from _choice_fields + for name, data in cls._choice_fields.items(): + if name not in cls.__dict__: # don't override explicit methods + setattr(cls, name, _make_choice_method(data)) + def __init__(self, engine: RandomEngine) -> None: self._engine = engine diff --git a/src/dataforge/providers/color.py b/src/dataforge/providers/color.py index c8c4796..80a2823 100644 --- a/src/dataforge/providers/color.py +++ b/src/dataforge/providers/color.py @@ -1,7 +1,5 @@ """Color provider — generates fake colors in various formats.""" -from typing import Literal, overload - from dataforge.providers.base import BaseProvider _COLOR_NAMES: tuple[str, ...] 
= ( @@ -25,36 +23,6 @@ "Teal", "Aqua", "Silver", - "Gold", - "Coral", - "Salmon", - "Turquoise", - "Indigo", - "Violet", - "Crimson", - "Khaki", - "Ivory", - "Lavender", - "Beige", - "Mint", - "Plum", - "Orchid", - "Sienna", - "Tan", - "Azure", - "Peach", - "Chartreuse", - "Fuchsia", - "Tomato", - "SteelBlue", - "SlateGray", - "RoyalBlue", - "DarkGreen", - "DarkRed", - "DodgerBlue", - "ForestGreen", - "Chocolate", - "Firebrick", ) @@ -79,9 +47,11 @@ class ColorProvider(BaseProvider): "hsl_color": "hsl_string", } - # ------------------------------------------------------------------ + _choice_fields: dict[str, tuple[str, ...]] = { + "color_name": _COLOR_NAMES, + } + # Scalar helpers - # ------------------------------------------------------------------ def _one_hex(self) -> str: return f"#{self._engine._rng.getrandbits(24):06x}" @@ -102,100 +72,30 @@ def _one_hsl(self) -> tuple[int, int, int]: self._engine.random_int(0, 100), ) - # ------------------------------------------------------------------ # Public API - # ------------------------------------------------------------------ - - @overload - def color_name(self) -> str: ... - @overload - def color_name(self, count: Literal[1]) -> str: ... - @overload - def color_name(self, count: int) -> str | list[str]: ... - def color_name(self, count: int = 1) -> str | list[str]: - """Generate a random color name (e.g. ``"Red"``, ``"Teal"``). - - Parameters - ---------- - count : int - Number of color names to generate. - """ - if count == 1: - return self._engine.choice(_COLOR_NAMES) - return self._engine.choices(_COLOR_NAMES, count) - - @overload - def hex_color(self) -> str: ... - @overload - def hex_color(self, count: Literal[1]) -> str: ... - @overload - def hex_color(self, count: int) -> str | list[str]: ... - def hex_color(self, count: int = 1) -> str | list[str]: - """Generate a random hex color (e.g. ``"#a3f2c1"``). - Parameters - ---------- - count : int - Number of hex colors to generate. 
- """ + def hex_color(self, count: int = 1) -> str | list[str]: + """Generate a random hex color (e.g. ``"#a3f2c1"``).""" if count == 1: return self._one_hex() return [self._one_hex() for _ in range(count)] - @overload - def rgb(self) -> tuple[int, int, int]: ... - @overload - def rgb(self, count: Literal[1]) -> tuple[int, int, int]: ... - @overload - def rgb(self, count: int) -> tuple[int, int, int] | list[tuple[int, int, int]]: ... def rgb(self, count: int = 1) -> tuple[int, int, int] | list[tuple[int, int, int]]: - """Generate a random RGB tuple (e.g. ``(123, 45, 200)``). - - Parameters - ---------- - count : int - Number of RGB tuples to generate. - """ + """Generate a random RGB tuple (e.g. ``(123, 45, 200)``).""" if count == 1: return self._one_rgb() return [self._one_rgb() for _ in range(count)] - @overload - def rgba(self) -> tuple[int, int, int, float]: ... - @overload - def rgba(self, count: Literal[1]) -> tuple[int, int, int, float]: ... - @overload - def rgba( - self, count: int - ) -> tuple[int, int, int, float] | list[tuple[int, int, int, float]]: ... def rgba( self, count: int = 1 ) -> tuple[int, int, int, float] | list[tuple[int, int, int, float]]: - """Generate a random RGBA tuple (e.g. ``(123, 45, 200, 0.75)``). - - Parameters - ---------- - count : int - Number of RGBA tuples to generate. - """ + """Generate a random RGBA tuple (e.g. ``(123, 45, 200, 0.75)``).""" if count == 1: return self._one_rgba() return [self._one_rgba() for _ in range(count)] - @overload - def rgb_string(self) -> str: ... - @overload - def rgb_string(self, count: Literal[1]) -> str: ... - @overload - def rgb_string(self, count: int) -> str | list[str]: ... def rgb_string(self, count: int = 1) -> str | list[str]: - """Generate a random RGB CSS string (e.g. ``"rgb(123, 45, 200)"``). - - Parameters - ---------- - count : int - Number of RGB strings to generate. - """ + """Generate a random RGB CSS string (e.g. 
``"rgb(123, 45, 200)"``).""" if count == 1: r, g, b = self._one_rgb() return f"rgb({r}, {g}, {b})" @@ -205,38 +105,14 @@ def rgb_string(self, count: int = 1) -> str | list[str]: result.append(f"rgb({r}, {g}, {b})") return result - @overload - def hsl(self) -> tuple[int, int, int]: ... - @overload - def hsl(self, count: Literal[1]) -> tuple[int, int, int]: ... - @overload - def hsl(self, count: int) -> tuple[int, int, int] | list[tuple[int, int, int]]: ... def hsl(self, count: int = 1) -> tuple[int, int, int] | list[tuple[int, int, int]]: - """Generate a random HSL tuple (e.g. ``(210, 65, 50)``). - - Parameters - ---------- - count : int - Number of HSL tuples to generate. - """ + """Generate a random HSL tuple (e.g. ``(210, 65, 50)``).""" if count == 1: return self._one_hsl() return [self._one_hsl() for _ in range(count)] - @overload - def hsl_string(self) -> str: ... - @overload - def hsl_string(self, count: Literal[1]) -> str: ... - @overload - def hsl_string(self, count: int) -> str | list[str]: ... def hsl_string(self, count: int = 1) -> str | list[str]: - """Generate a random HSL CSS string (e.g. ``"hsl(210, 65%, 50%)"``). - - Parameters - ---------- - count : int - Number of HSL strings to generate. - """ + """Generate a random HSL CSS string (e.g. ``"hsl(210, 65%, 50%)"``).""" if count == 1: h, s, lt = self._one_hsl() return f"hsl({h}, {s}%, {lt}%)" diff --git a/src/dataforge/providers/company.py b/src/dataforge/providers/company.py index fc545ba..8a5e56e 100644 --- a/src/dataforge/providers/company.py +++ b/src/dataforge/providers/company.py @@ -1,7 +1,6 @@ """Company provider — generates fake company names, catch phrases, job titles.""" from types import ModuleType -from typing import Literal, overload from dataforge.backend import RandomEngine from dataforge.providers.base import BaseProvider @@ -47,9 +46,7 @@ def __init__(self, engine: RandomEngine, locale_data: ModuleType) -> None: self._catch_phrase_nouns: tuple[str, ...] 
= locale_data.catch_phrase_nouns self._job_titles: tuple[str, ...] = locale_data.job_titles - # ------------------------------------------------------------------ # Scalar helpers - # ------------------------------------------------------------------ def _one_company_name(self) -> str: name = self._engine.choice(self._company_names) @@ -61,24 +58,10 @@ def _one_catch_phrase(self) -> str: noun = self._engine.choice(self._catch_phrase_nouns) return f"{adj} {noun}" - # ------------------------------------------------------------------ # Public API - # ------------------------------------------------------------------ - - @overload - def company_name(self) -> str: ... - @overload - def company_name(self, count: Literal[1]) -> str: ... - @overload - def company_name(self, count: int) -> str | list[str]: ... - def company_name(self, count: int = 1) -> str | list[str]: - """Generate a random company name (e.g. ``"Acme Inc"``). - Parameters - ---------- - count : int - Number of company names to generate. - """ + def company_name(self, count: int = 1) -> str | list[str]: + """Generate a random company name.""" if count == 1: return self._one_company_name() # Vectorized: 2 bulk choices() + zip (avoids N scalar calls) @@ -86,38 +69,14 @@ def company_name(self, count: int = 1) -> str | list[str]: suffixes = self._engine.choices(self._company_suffixes, count) return [f"{n} {s}" for n, s in zip(names, suffixes)] - @overload - def company_suffix(self) -> str: ... - @overload - def company_suffix(self, count: Literal[1]) -> str: ... - @overload - def company_suffix(self, count: int) -> str | list[str]: ... def company_suffix(self, count: int = 1) -> str | list[str]: - """Generate a random company suffix (e.g. ``"LLC"``, ``"GmbH"``). - - Parameters - ---------- - count : int - Number of suffixes to generate. 
- """ + """Generate a random company suffix.""" if count == 1: return self._engine.choice(self._company_suffixes) return self._engine.choices(self._company_suffixes, count) - @overload - def catch_phrase(self) -> str: ... - @overload - def catch_phrase(self, count: Literal[1]) -> str: ... - @overload - def catch_phrase(self, count: int) -> str | list[str]: ... def catch_phrase(self, count: int = 1) -> str | list[str]: - """Generate a random catch phrase (e.g. ``"Innovative framework"``). - - Parameters - ---------- - count : int - Number of catch phrases to generate. - """ + """Generate a random catch phrase.""" if count == 1: return self._one_catch_phrase() # Vectorized: 2 bulk choices() + zip (avoids N scalar calls) @@ -125,20 +84,8 @@ def catch_phrase(self, count: int = 1) -> str | list[str]: nouns = self._engine.choices(self._catch_phrase_nouns, count) return [f"{a} {n}" for a, n in zip(adjs, nouns)] - @overload - def job_title(self) -> str: ... - @overload - def job_title(self, count: Literal[1]) -> str: ... - @overload - def job_title(self, count: int) -> str | list[str]: ... def job_title(self, count: int = 1) -> str | list[str]: - """Generate a random job title. - - Parameters - ---------- - count : int - Number of job titles to generate. - """ + """Generate a random job title.""" if count == 1: return self._engine.choice(self._job_titles) return self._engine.choices(self._job_titles, count) diff --git a/src/dataforge/providers/crypto.py b/src/dataforge/providers/crypto.py index 1327d8f..f41c314 100644 --- a/src/dataforge/providers/crypto.py +++ b/src/dataforge/providers/crypto.py @@ -5,21 +5,11 @@ correct lengths, not actual digests. """ -from typing import Literal, overload - from dataforge.providers.base import BaseProvider class CryptoProvider(BaseProvider): - """Generates random strings matching common hash digest formats. - - This provider is locale-independent. - - Parameters - ---------- - engine : RandomEngine - The shared random engine instance. 
- """ + """Generates random strings matching common hash digest formats.""" __slots__ = () @@ -31,74 +21,24 @@ class CryptoProvider(BaseProvider): "sha256": "sha256", } - # ------------------------------------------------------------------ # Public API - # ------------------------------------------------------------------ - @overload - def md5(self) -> str: ... - @overload - def md5(self, count: Literal[1]) -> str: ... - @overload - def md5(self, count: int) -> str | list[str]: ... def md5(self, count: int = 1) -> str | list[str]: - """Generate a random MD5-style hex string (32 hex chars). - - Parameters - ---------- - count : int - Number of hashes to generate. - - Returns - ------- - str or list[str] - """ + """Generate a random MD5-style hex string (32 hex chars).""" bits = self._engine._rng.getrandbits if count == 1: return f"{bits(128):032x}" return [f"{bits(128):032x}" for _ in range(count)] - @overload - def sha1(self) -> str: ... - @overload - def sha1(self, count: Literal[1]) -> str: ... - @overload - def sha1(self, count: int) -> str | list[str]: ... def sha1(self, count: int = 1) -> str | list[str]: - """Generate a random SHA-1-style hex string (40 hex chars). - - Parameters - ---------- - count : int - Number of hashes to generate. - - Returns - ------- - str or list[str] - """ + """Generate a random SHA-1-style hex string (40 hex chars).""" bits = self._engine._rng.getrandbits if count == 1: return f"{bits(160):040x}" return [f"{bits(160):040x}" for _ in range(count)] - @overload - def sha256(self) -> str: ... - @overload - def sha256(self, count: Literal[1]) -> str: ... - @overload - def sha256(self, count: int) -> str | list[str]: ... def sha256(self, count: int = 1) -> str | list[str]: - """Generate a random SHA-256-style hex string (64 hex chars). - - Parameters - ---------- - count : int - Number of hashes to generate. 
- - Returns - ------- - str or list[str] - """ + """Generate a random SHA-256-style hex string (64 hex chars).""" bits = self._engine._rng.getrandbits if count == 1: return f"{bits(256):064x}" diff --git a/src/dataforge/providers/datetime.py b/src/dataforge/providers/datetime.py index d73331d..1b9ffcd 100644 --- a/src/dataforge/providers/datetime.py +++ b/src/dataforge/providers/datetime.py @@ -5,7 +5,6 @@ """ import datetime as _dt -from typing import Literal, overload from dataforge.providers.base import BaseProvider @@ -53,27 +52,6 @@ "Asia/Shanghai", "Asia/Hong_Kong", "Asia/Seoul", - "Asia/Singapore", - "Asia/Dubai", - "Asia/Kolkata", - "Asia/Bangkok", - "Asia/Jakarta", - "Asia/Karachi", - "Asia/Riyadh", - "Asia/Taipei", - "Australia/Sydney", - "Australia/Melbourne", - "Australia/Perth", - "Pacific/Auckland", - "America/Sao_Paulo", - "America/Mexico_City", - "America/Buenos_Aires", - "America/Bogota", - "America/Lima", - "Africa/Cairo", - "Africa/Lagos", - "Africa/Johannesburg", - "Africa/Nairobi", ) @@ -103,9 +81,11 @@ class DateTimeProvider(BaseProvider): "unix_timestamp": "unix_timestamp", } - # ------------------------------------------------------------------ + _choice_fields: dict[str, tuple[str, ...]] = { + "timezone": _TIMEZONES, + } + # Scalar helpers - # ------------------------------------------------------------------ def _one_date( self, @@ -138,11 +118,7 @@ def _time_to_hms(t: _dt.time) -> str: return f"{t.hour:02d}:{t.minute:02d}:{t.second:02d}" def _one_time_str(self) -> str: - """Generate a random time as ``HH:MM:SS`` string — fast path. - - Bypasses ``_dt.time`` object creation entirely by using - ``divmod`` arithmetic directly on the random seconds value. 
- """ + """Generate a random time as ``HH:MM:SS`` string — fast path.""" total = self._engine.random_int(0, _SECONDS_IN_DAY - 1) h, rem = divmod(total, 3600) m, s = divmod(rem, 60) @@ -163,16 +139,8 @@ def _one_date_of_birth(self, min_age: int = 18, max_age: int = 80) -> _dt.date: end = today.replace(year=today.year - min_age) return self._one_date(start, end) - # ------------------------------------------------------------------ # Public API - # ------------------------------------------------------------------ - @overload - def date(self) -> str: ... - @overload - def date(self, count: Literal[1]) -> str: ... - @overload - def date(self, count: int) -> str | list[str]: ... def date( self, count: int = 1, @@ -180,19 +148,7 @@ def date( start: _dt.date | None = None, end: _dt.date | None = None, ) -> str | list[str]: - """Generate a random date string. - - Parameters - ---------- - count : int - Number of dates to generate. - fmt : str - strftime format string. - start : datetime.date | None - Earliest date (default: 1970-01-01). - end : datetime.date | None - Latest date (default: 2030-12-31). - """ + """Generate a random date string.""" s = start or _MIN_DATE e = end or _MAX_DATE # Fast path: default ISO format avoids expensive strftime @@ -213,22 +169,8 @@ def date( return self._one_date(s, e).strftime(fmt) return [self._one_date(s, e).strftime(fmt) for _ in range(count)] - @overload - def time(self) -> str: ... - @overload - def time(self, count: Literal[1]) -> str: ... - @overload - def time(self, count: int) -> str | list[str]: ... def time(self, count: int = 1, fmt: str = "%H:%M:%S") -> str | list[str]: - """Generate a random time string. - - Parameters - ---------- - count : int - Number of times to generate. - fmt : str - strftime format string. 
- """ + """Generate a random time string.""" # Fast path: default HH:MM:SS format — skip _dt.time object if fmt == "%H:%M:%S": if count == 1: @@ -246,12 +188,6 @@ def time(self, count: int = 1, fmt: str = "%H:%M:%S") -> str | list[str]: return self._one_time().strftime(fmt) return [self._one_time().strftime(fmt) for _ in range(count)] - @overload - def datetime(self) -> str: ... - @overload - def datetime(self, count: Literal[1]) -> str: ... - @overload - def datetime(self, count: int) -> str | list[str]: ... def datetime( self, count: int = 1, @@ -259,19 +195,7 @@ def datetime( start: _dt.date | None = None, end: _dt.date | None = None, ) -> str | list[str]: - """Generate a random datetime string. - - Parameters - ---------- - count : int - Number of datetimes to generate. - fmt : str - strftime format string. - start : datetime.date | None - Earliest date (default: 1970-01-01). - end : datetime.date | None - Latest date (default: 2030-12-31). - """ + """Generate a random datetime string.""" s = start or _MIN_DATE e = end or _MAX_DATE # Fast path: default ISO-like format — avoid strftime + datetime objects @@ -305,12 +229,6 @@ def datetime( return self._one_datetime(s, e).strftime(fmt) return [self._one_datetime(s, e).strftime(fmt) for _ in range(count)] - @overload - def date_of_birth(self) -> str: ... - @overload - def date_of_birth(self, count: Literal[1]) -> str: ... - @overload - def date_of_birth(self, count: int) -> str | list[str]: ... def date_of_birth( self, count: int = 1, @@ -318,19 +236,7 @@ def date_of_birth( max_age: int = 80, fmt: str = "%Y-%m-%d", ) -> str | list[str]: - """Generate a random date of birth. - - Parameters - ---------- - count : int - Number of dates to generate. - min_age : int - Minimum age in years. - max_age : int - Maximum age in years. - fmt : str - strftime format string. 
- """ + """Generate a random date of birth.""" # Compute today() once for the entire batch today = _dt.date.today() start = today.replace(year=today.year - max_age) @@ -349,64 +255,24 @@ def date_of_birth( return [self._one_date(start, end).strftime(fmt) for _ in range(count)] def date_object(self, count: int = 1) -> _dt.date | list[_dt.date]: - """Generate a random ``datetime.date`` object. - - Parameters - ---------- - count : int - Number of date objects to generate. - """ + """Generate a random ``datetime.date`` object.""" if count == 1: return self._one_date() return [self._one_date() for _ in range(count)] def datetime_object(self, count: int = 1) -> _dt.datetime | list[_dt.datetime]: - """Generate a random ``datetime.datetime`` object. - - Parameters - ---------- - count : int - Number of datetime objects to generate. - """ + """Generate a random ``datetime.datetime`` object.""" if count == 1: return self._one_datetime() return [self._one_datetime() for _ in range(count)] - @overload - def timezone(self) -> str: ... - @overload - def timezone(self, count: Literal[1]) -> str: ... - @overload - def timezone(self, count: int) -> str | list[str]: ... - def timezone(self, count: int = 1) -> str | list[str]: - """Generate a random IANA timezone string (e.g. ``"Europe/Berlin"``). - - Parameters - ---------- - count : int - Number of timezone strings to generate. - """ - if count == 1: - return self._engine.choice(_TIMEZONES) - return self._engine.choices(_TIMEZONES, count) - def unix_timestamp( self, count: int = 1, start: _dt.date | None = None, end: _dt.date | None = None, ) -> int | list[int]: - """Generate a random Unix timestamp (seconds since epoch). - - Parameters - ---------- - count : int - Number of timestamps to generate. - start : datetime.date | None - Earliest date (default: 1970-01-01). - end : datetime.date | None - Latest date (default: 2030-12-31). 
- """ + """Generate a random Unix timestamp (seconds since epoch).""" # Use pre-computed constants for default range to avoid # .toordinal() per call. if start is None and end is None: diff --git a/src/dataforge/providers/ecommerce.py b/src/dataforge/providers/ecommerce.py index 504044d..4877686 100644 --- a/src/dataforge/providers/ecommerce.py +++ b/src/dataforge/providers/ecommerce.py @@ -1,7 +1,5 @@ """E-commerce provider — products, SKUs, tracking, reviews.""" -from typing import Literal, overload - from dataforge.providers.base import BaseProvider _PRODUCT_ADJECTIVES: tuple[str, ...] = ( @@ -20,17 +18,6 @@ "Compact", "Portable", "Wireless", - "Digital", - "Organic", - "Vintage", - "Artisan", - "Custom", - "Heavy-Duty", - "Lightweight", - "Industrial", - "Professional", - "Commercial", - "Residential", ) _PRODUCT_MATERIALS: tuple[str, ...] = ( @@ -49,14 +36,6 @@ "Marble", "Carbon Fiber", "Titanium", - "Bronze", - "Copper", - "Linen", - "Wool", - "Concrete", - "Paper", - "Foam", - "Nylon", ) _PRODUCT_ITEMS: tuple[str, ...] = ( @@ -80,31 +59,6 @@ "Pan", "Pillow", "Blanket", - "Towel", - "Mirror", - "Clock", - "Frame", - "Shelf", - "Desk", - "Sofa", - "Bench", - "Stool", - "Rack", - "Cabinet", - "Drawer", - "Basket", - "Box", - "Case", - "Cover", - "Mat", - "Rug", - "Curtain", - "Vase", - "Candle", - "Planter", - "Tray", - "Hook", - "Stand", ) _PRODUCT_CATEGORIES: tuple[str, ...] = ( @@ -155,11 +109,6 @@ "Love it!", "Just okay", "Works as expected", - "Would buy again", - "Five stars", - "Better than expected", - "Fantastic purchase", - "Very satisfied", ) @@ -184,6 +133,11 @@ class EcommerceProvider(BaseProvider): "order_id": "order_id", } + _choice_fields: dict[str, tuple[str, ...]] = { + "product_category": _PRODUCT_CATEGORIES, + "review_title": _REVIEW_TITLES, + } + _CURRENCIES: tuple[tuple[str, str], ...] 
= ( ("$", "USD"), ("€", "EUR"), @@ -193,8 +147,6 @@ class EcommerceProvider(BaseProvider): ("$", "AUD"), ) - # --- Scalar helpers --- - def _one_product_name(self) -> str: _c = self._engine.choice return ( @@ -212,50 +164,18 @@ def _one_tracking(self) -> str: def _one_order_id(self) -> str: return f"ORD-{self._engine.random_digits_str(10)}" - # --- Public API --- - - @overload - def product_name(self) -> str: ... - @overload - def product_name(self, count: Literal[1]) -> str: ... - @overload - def product_name(self, count: int) -> str | list[str]: ... def product_name(self, count: int = 1) -> str | list[str]: """Generate a fake product name.""" if count == 1: return self._one_product_name() return [self._one_product_name() for _ in range(count)] - @overload - def product_category(self) -> str: ... - @overload - def product_category(self, count: Literal[1]) -> str: ... - @overload - def product_category(self, count: int) -> str | list[str]: ... - def product_category(self, count: int = 1) -> str | list[str]: - """Generate a product category.""" - if count == 1: - return self._engine.choice(_PRODUCT_CATEGORIES) - return self._engine.choices(_PRODUCT_CATEGORIES, count) - - @overload - def sku(self) -> str: ... - @overload - def sku(self, count: Literal[1]) -> str: ... - @overload - def sku(self, count: int) -> str | list[str]: ... def sku(self, count: int = 1) -> str | list[str]: """Generate a product SKU (e.g., ABC-123456).""" if count == 1: return self._one_sku() return [self._one_sku() for _ in range(count)] - @overload - def price_with_currency(self) -> str: ... - @overload - def price_with_currency(self, count: Literal[1]) -> str: ... - @overload - def price_with_currency(self, count: int) -> str | list[str]: ... 
def price_with_currency(self, count: int = 1) -> str | list[str]: """Generate a price with currency symbol (e.g., $49.99).""" if count == 1: @@ -267,48 +187,18 @@ def price_with_currency(self, count: int = 1) -> str | list[str]: f"{_c(self._CURRENCIES)[0]}{_ri(1, 99999) / 100:.2f}" for _ in range(count) ] - @overload - def review_rating(self) -> int: ... - @overload - def review_rating(self, count: Literal[1]) -> int: ... - @overload - def review_rating(self, count: int) -> int | list[int]: ... def review_rating(self, count: int = 1) -> int | list[int]: """Generate a review rating (1-5).""" if count == 1: return self._engine.random_int(1, 5) return [self._engine.random_int(1, 5) for _ in range(count)] - @overload - def review_title(self) -> str: ... - @overload - def review_title(self, count: Literal[1]) -> str: ... - @overload - def review_title(self, count: int) -> str | list[str]: ... - def review_title(self, count: int = 1) -> str | list[str]: - """Generate a product review title.""" - if count == 1: - return self._engine.choice(_REVIEW_TITLES) - return self._engine.choices(_REVIEW_TITLES, count) - - @overload - def tracking_number(self) -> str: ... - @overload - def tracking_number(self, count: Literal[1]) -> str: ... - @overload - def tracking_number(self, count: int) -> str | list[str]: ... def tracking_number(self, count: int = 1) -> str | list[str]: """Generate a shipping tracking number.""" if count == 1: return self._one_tracking() return [self._one_tracking() for _ in range(count)] - @overload - def order_id(self) -> str: ... - @overload - def order_id(self, count: Literal[1]) -> str: ... - @overload - def order_id(self, count: int) -> str | list[str]: ... 
def order_id(self, count: int = 1) -> str | list[str]: """Generate an order ID (e.g., ORD-1234567890).""" if count == 1: diff --git a/src/dataforge/providers/education.py b/src/dataforge/providers/education.py index 2428c24..21ea6d9 100644 --- a/src/dataforge/providers/education.py +++ b/src/dataforge/providers/education.py @@ -4,13 +4,9 @@ All data is stored as immutable ``tuple[str, ...]``. """ -from typing import Literal, overload - from dataforge.providers.base import BaseProvider -# ------------------------------------------------------------------ # Data tuples (immutable, module-level) -# ------------------------------------------------------------------ _UNIVERSITIES: tuple[str, ...] = ( "Harvard University", @@ -33,36 +29,6 @@ "UCLA", "University of Virginia", "Georgetown University", - "Carnegie Mellon University", - "New York University", - "University of Southern California", - "Boston University", - "University of Wisconsin-Madison", - "University of Texas at Austin", - "Georgia Institute of Technology", - "University of Washington", - "University of Illinois Urbana-Champaign", - "Purdue University", - "University of Minnesota", - "Ohio State University", - "Penn State University", - "University of Florida", - "University of North Carolina at Chapel Hill", - "University of Maryland", - "Indiana University", - "Arizona State University", - "University of Colorado Boulder", - "Michigan State University", - "University of Oregon", - "Vanderbilt University", - "Rice University", - "Emory University", - "Washington University in St. Louis", - "Tufts University", - "Brown University", - "Dartmouth College", - "University of Notre Dame", - "University of Rochester", ) _DEGREES: tuple[str, ...] = ( @@ -81,11 +47,6 @@ "Master of Public Health", "Master of Social Work", "Doctor of Philosophy", - "Doctor of Medicine", - "Doctor of Education", - "Juris Doctor", - "Doctor of Dental Surgery", - "Doctor of Veterinary Medicine", ) _FIELDS_OF_STUDY: tuple[str, ...] 
= ( @@ -109,36 +70,6 @@ "Psychology", "Sociology", "Political Science", - "History", - "English Literature", - "Philosophy", - "Art History", - "Music", - "Theater", - "Nursing", - "Public Health", - "Medicine", - "Law", - "Education", - "Architecture", - "Environmental Science", - "Geology", - "Astronomy", - "Linguistics", - "Anthropology", - "Communications", - "Journalism", - "Information Technology", - "Data Science", - "Statistics", - "Neuroscience", - "International Relations", - "Criminal Justice", - "Social Work", - "Graphic Design", - "Film Studies", - "Pharmacology", - "Biochemistry", ) @@ -163,72 +94,8 @@ class EducationProvider(BaseProvider): "field_of_study": "field_of_study", } - # ------------------------------------------------------------------ - # Public API - # ------------------------------------------------------------------ - - @overload - def university(self) -> str: ... - @overload - def university(self, count: Literal[1]) -> str: ... - @overload - def university(self, count: int) -> str | list[str]: ... - def university(self, count: int = 1) -> str | list[str]: - """Generate a university name (e.g. ``"Stanford University"``). - - Parameters - ---------- - count : int - Number of university names to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_UNIVERSITIES) - return self._engine.choices(_UNIVERSITIES, count) - - @overload - def degree(self) -> str: ... - @overload - def degree(self, count: Literal[1]) -> str: ... - @overload - def degree(self, count: int) -> str | list[str]: ... - def degree(self, count: int = 1) -> str | list[str]: - """Generate a degree type (e.g. ``"Bachelor of Science"``). - - Parameters - ---------- - count : int - Number of degrees to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_DEGREES) - return self._engine.choices(_DEGREES, count) - - @overload - def field_of_study(self) -> str: ... 
- @overload - def field_of_study(self, count: Literal[1]) -> str: ... - @overload - def field_of_study(self, count: int) -> str | list[str]: ... - def field_of_study(self, count: int = 1) -> str | list[str]: - """Generate a field of study (e.g. ``"Computer Science"``). - - Parameters - ---------- - count : int - Number of fields to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_FIELDS_OF_STUDY) - return self._engine.choices(_FIELDS_OF_STUDY, count) + _choice_fields: dict[str, tuple[str, ...]] = { + "university": _UNIVERSITIES, + "degree": _DEGREES, + "field_of_study": _FIELDS_OF_STUDY, + } diff --git a/src/dataforge/providers/file.py b/src/dataforge/providers/file.py index fddb23a..842733b 100644 --- a/src/dataforge/providers/file.py +++ b/src/dataforge/providers/file.py @@ -1,7 +1,5 @@ """File provider — generates fake file names, extensions, MIME types, paths.""" -from typing import Literal, overload - from dataforge.providers.base import BaseProvider _FILE_EXTENSIONS: tuple[tuple[str, str, str], ...] = ( @@ -80,26 +78,6 @@ "test", "sample", "example", - "demo", - "project", - "image", - "photo", - "video", - "audio", - "music", - "export", - "import", - "invoice", - "receipt", - "budget", - "plan", - "schedule", - "index", - "main", - "app", - "module", - "utils", - "core", ) _DIR_PARTS: tuple[str, ...] 
= ( @@ -118,11 +96,6 @@ "etc", "config", "logs", - "backup", - "media", - "images", - "videos", - "music", ) @@ -157,9 +130,11 @@ class FileProvider(BaseProvider): "file_category": "file_category", } - # ------------------------------------------------------------------ + _choice_fields: dict[str, tuple[str, ...]] = { + "file_category": _FILE_CATEGORIES, + } + # Scalar helpers - # ------------------------------------------------------------------ def _one_ext_record(self) -> tuple[str, str, str]: return self._engine.choice(_FILE_EXTENSIONS) @@ -175,97 +150,34 @@ def _one_file_path(self) -> str: name = self._one_file_name() return "/" + "/".join(parts) + "/" + name - # ------------------------------------------------------------------ + def _one_file_extension(self) -> str: + return self._one_ext_record()[0] + + def _one_mime_type(self) -> str: + return self._one_ext_record()[1] + # Public API - # ------------------------------------------------------------------ - @overload - def file_name(self) -> str: ... - @overload - def file_name(self, count: Literal[1]) -> str: ... - @overload - def file_name(self, count: int) -> str | list[str]: ... def file_name(self, count: int = 1) -> str | list[str]: - """Generate a random file name (e.g. ``"report.pdf"``). - - Parameters - ---------- - count : int - Number of file names to generate. - """ + """Generate a random file name (e.g. ``"report.pdf"``).""" if count == 1: return self._one_file_name() return [self._one_file_name() for _ in range(count)] - @overload - def file_extension(self) -> str: ... - @overload - def file_extension(self, count: Literal[1]) -> str: ... - @overload - def file_extension(self, count: int) -> str | list[str]: ... def file_extension(self, count: int = 1) -> str | list[str]: - """Generate a random file extension (e.g. ``"pdf"``, ``"jpg"``). - - Parameters - ---------- - count : int - Number of extensions to generate. - """ + """Generate a random file extension (e.g. 
``"pdf"``, ``"jpg"``).""" if count == 1: - return self._one_ext_record()[0] - return [self._one_ext_record()[0] for _ in range(count)] + return self._one_file_extension() + return [self._one_file_extension() for _ in range(count)] - @overload - def mime_type(self) -> str: ... - @overload - def mime_type(self, count: Literal[1]) -> str: ... - @overload - def mime_type(self, count: int) -> str | list[str]: ... def mime_type(self, count: int = 1) -> str | list[str]: - """Generate a random MIME type (e.g. ``"application/pdf"``). - - Parameters - ---------- - count : int - Number of MIME types to generate. - """ + """Generate a random MIME type (e.g. ``"application/pdf"``).""" if count == 1: - return self._one_ext_record()[1] - return [self._one_ext_record()[1] for _ in range(count)] + return self._one_mime_type() + return [self._one_mime_type() for _ in range(count)] - @overload - def file_path(self) -> str: ... - @overload - def file_path(self, count: Literal[1]) -> str: ... - @overload - def file_path(self, count: int) -> str | list[str]: ... def file_path(self, count: int = 1) -> str | list[str]: - """Generate a random Unix file path (e.g. ``"/home/user/report.pdf"``). - - Parameters - ---------- - count : int - Number of file paths to generate. - """ + """Generate a random Unix file path (e.g. ``"/home/user/report.pdf"``).""" if count == 1: return self._one_file_path() return [self._one_file_path() for _ in range(count)] - - @overload - def file_category(self) -> str: ... - @overload - def file_category(self, count: Literal[1]) -> str: ... - @overload - def file_category(self, count: int) -> str | list[str]: ... - def file_category(self, count: int = 1) -> str | list[str]: - """Generate a random file category (e.g. ``"image"``, ``"document"``). - - Parameters - ---------- - count : int - Number of categories to generate. 
- """ - categories = _FILE_CATEGORIES - if count == 1: - return self._engine.choice(categories) - return self._engine.choices(categories, count) diff --git a/src/dataforge/providers/finance.py b/src/dataforge/providers/finance.py index 86c0a15..8f9a0f3 100644 --- a/src/dataforge/providers/finance.py +++ b/src/dataforge/providers/finance.py @@ -1,7 +1,5 @@ """Finance provider — generates fake credit card numbers, IBANs, currencies.""" -from typing import Literal, overload - from dataforge.providers.base import BaseProvider # Credit card prefixes by network (BIN ranges) @@ -192,9 +190,14 @@ class FinanceProvider(BaseProvider): "bitcoin_address": "bitcoin_address", } - # ------------------------------------------------------------------ + _choice_fields: dict[str, tuple[str, ...]] = { + "currency_code": _CURRENCY_CODES, + "currency_name": _CURRENCY_NAMES, + "currency_symbol": _CURRENCY_SYMBOLS, + "card_type": _CARD_TYPE_NAMES, + } + # Scalar helpers - # ------------------------------------------------------------------ def _one_credit_card_number(self) -> str: _, prefix, length = self._engine.choice(_CARD_TYPES) @@ -274,157 +277,50 @@ def _one_bitcoin_address(self) -> str: chars = self._engine._rng.choices(_BASE58_STR, k=length) return "1" + "".join(chars) - # ------------------------------------------------------------------ # Public API - # ------------------------------------------------------------------ - - @overload - def credit_card_number(self) -> str: ... - @overload - def credit_card_number(self, count: Literal[1]) -> str: ... - @overload - def credit_card_number(self, count: int) -> str | list[str]: ... - def credit_card_number(self, count: int = 1) -> str | list[str]: - """Generate a random credit card number (Luhn-valid). - Parameters - ---------- - count : int - Number of card numbers to generate. 
- """ + def credit_card_number(self, count: int = 1) -> str | list[str]: + """Generate a random credit card number (Luhn-valid).""" if count == 1: return self._one_credit_card_number() return [self._one_credit_card_number() for _ in range(count)] - @overload - def credit_card(self) -> dict[str, str]: ... - @overload - def credit_card(self, count: Literal[1]) -> dict[str, str]: ... - @overload - def credit_card(self, count: int) -> dict[str, str] | list[dict[str, str]]: ... def credit_card(self, count: int = 1) -> dict[str, str] | list[dict[str, str]]: - """Generate a full credit card (number, type, expiry, CVV). - - Parameters - ---------- - count : int - Number of cards to generate. - """ + """Generate a full credit card (number, type, expiry, CVV).""" if count == 1: return self._one_credit_card() return [self._one_credit_card() for _ in range(count)] - @overload - def card_type(self) -> str: ... - @overload - def card_type(self, count: Literal[1]) -> str: ... - @overload - def card_type(self, count: int) -> str | list[str]: ... - def card_type(self, count: int = 1) -> str | list[str]: - """Generate a random credit card network name. - - Parameters - ---------- - count : int - Number of card types to generate. - """ - types = _CARD_TYPE_NAMES + def cvv(self, count: int = 1) -> str | list[str]: + """Generate a random CVV (3 digits).""" if count == 1: - return self._engine.choice(types) - return self._engine.choices(types, count) - - @overload - def iban(self) -> str: ... - @overload - def iban(self, count: Literal[1]) -> str: ... - @overload - def iban(self, count: int) -> str | list[str]: ... - def iban(self, count: int = 1) -> str | list[str]: - """Generate a random IBAN. 
+ return self._engine.random_digits_str(3) + return [self._engine.random_digits_str(3) for _ in range(count)] + + def expiry_date(self, count: int = 1) -> str | list[str]: + """Generate a random credit card expiry date (MM/YY).""" + if count == 1: + m = str(self._engine.random_int(1, 12)).zfill(2) + y = str(self._engine.random_int(25, 30)) + return f"{m}/{y}" + result: list[str] = [] + _ri = self._engine.random_int + for _ in range(count): + m = str(_ri(1, 12)).zfill(2) + y = str(_ri(25, 30)) + result.append(f"{m}/{y}") + return result - Parameters - ---------- - count : int - Number of IBANs to generate. - """ + def iban(self, count: int = 1) -> str | list[str]: + """Generate a random IBAN.""" if count == 1: return self._one_iban() return [self._one_iban() for _ in range(count)] - @overload - def currency_code(self) -> str: ... - @overload - def currency_code(self, count: Literal[1]) -> str: ... - @overload - def currency_code(self, count: int) -> str | list[str]: ... - def currency_code(self, count: int = 1) -> str | list[str]: - """Generate a random ISO 4217 currency code (e.g. ``"USD"``). - - Parameters - ---------- - count : int - Number of codes to generate. - """ - if count == 1: - return self._engine.choice(_CURRENCY_CODES) - return self._engine.choices(_CURRENCY_CODES, count) - - @overload - def currency_name(self) -> str: ... - @overload - def currency_name(self, count: Literal[1]) -> str: ... - @overload - def currency_name(self, count: int) -> str | list[str]: ... - def currency_name(self, count: int = 1) -> str | list[str]: - """Generate a random currency name (e.g. ``"US Dollar"``). - - Parameters - ---------- - count : int - Number of names to generate. - """ - if count == 1: - return self._engine.choice(_CURRENCY_NAMES) - return self._engine.choices(_CURRENCY_NAMES, count) - - @overload - def currency_symbol(self) -> str: ... - @overload - def currency_symbol(self, count: Literal[1]) -> str: ... 
- @overload - def currency_symbol(self, count: int) -> str | list[str]: ... - def currency_symbol(self, count: int = 1) -> str | list[str]: - """Generate a random currency symbol (e.g. ``"$"``). - - Parameters - ---------- - count : int - Number of symbols to generate. - """ - if count == 1: - return self._engine.choice(_CURRENCY_SYMBOLS) - return self._engine.choices(_CURRENCY_SYMBOLS, count) - - @overload - def price(self) -> str: ... - @overload - def price(self, count: Literal[1]) -> str: ... - @overload - def price(self, count: int) -> str | list[str]: ... def price( self, count: int = 1, min_val: float = 0.99, max_val: float = 9999.99 ) -> str | list[str]: - """Generate a random price string (e.g. ``"49.99"``). - - Parameters - ---------- - count : int - Number of prices to generate. - min_val : float - Minimum price value. - max_val : float - Maximum price value. - """ + """Generate a random price string (e.g. ``"49.99"``).""" min_cents = int(min_val * 100) max_cents = int(max_val * 100) if count == 1: @@ -433,20 +329,8 @@ def price( _ri = self._engine.random_int return [f"{_ri(min_cents, max_cents) / 100:.2f}" for _ in range(count)] - @overload - def bic(self) -> str: ... - @overload - def bic(self, count: Literal[1]) -> str: ... - @overload - def bic(self, count: int) -> str | list[str]: ... def bic(self, count: int = 1) -> str | list[str]: - """Generate a random BIC/SWIFT code (e.g. ``"DEUTDEFFXXX"``). - - Parameters - ---------- - count : int - Number of BIC codes to generate. - """ + """Generate a random BIC/SWIFT code (e.g. ``"DEUTDEFFXXX"``).""" if count == 1: return self._one_bic() # Inlined batch loop with local-bound choices @@ -456,20 +340,8 @@ def bic(self, count: int = 1) -> str | list[str]: for _ in range(count) ] - @overload - def routing_number(self) -> str: ... - @overload - def routing_number(self, count: Literal[1]) -> str: ... - @overload - def routing_number(self, count: int) -> str | list[str]: ... 
def routing_number(self, count: int = 1) -> str | list[str]: - """Generate a random US ABA routing number with valid checksum. - - Parameters - ---------- - count : int - Number of routing numbers to generate. - """ + """Generate a random US ABA routing number with valid checksum.""" if count == 1: return self._one_routing_number() # Inlined batch with local-bound helpers @@ -505,20 +377,8 @@ def routing_number(self, count: int = 1) -> str | list[str]: result.append(f"{d1}{d2}{mid}{check}") return result - @overload - def bitcoin_address(self) -> str: ... - @overload - def bitcoin_address(self, count: Literal[1]) -> str: ... - @overload - def bitcoin_address(self, count: int) -> str | list[str]: ... def bitcoin_address(self, count: int = 1) -> str | list[str]: - """Generate a random Bitcoin address (P2PKH format, starts with ``1``). - - Parameters - ---------- - count : int - Number of addresses to generate. - """ + """Generate a random Bitcoin address (P2PKH format, starts with ``1``).""" if count == 1: return self._one_bitcoin_address() # Inlined batch loop — use modular indexing into BASE58 alphabet diff --git a/src/dataforge/providers/food.py b/src/dataforge/providers/food.py index c4887cd..9737aa1 100644 --- a/src/dataforge/providers/food.py +++ b/src/dataforge/providers/food.py @@ -5,13 +5,9 @@ All data is stored as immutable ``tuple[str, ...]`` for cache friendliness. """ -from typing import Literal, overload - from dataforge.providers.base import BaseProvider -# ------------------------------------------------------------------ # Data tuples (immutable, module-level for zero per-call overhead) -# ------------------------------------------------------------------ _DISHES: tuple[str, ...] 
= ( "Spaghetti Carbonara", @@ -34,26 +30,6 @@ "Ceviche", "Beef Wellington", "Tom Yum Soup", - "Croissant", - "Shakshuka", - "Butter Chicken", - "Risotto", - "Lasagna", - "Gyoza", - "Empanada", - "Poutine", - "Borscht", - "Naan Bread", - "Kimchi Fried Rice", - "Churros", - "Baklava", - "Tiramisu", - "Creme Brulee", - "Banoffee Pie", - "Eggs Benedict", - "Club Sandwich", - "Lobster Bisque", - "Beef Stroganoff", ) _CUISINES: tuple[str, ...] = ( @@ -77,16 +53,6 @@ "Indonesian", "Caribbean", "American", - "British", - "German", - "Russian", - "Australian", - "Filipino", - "Malaysian", - "Portuguese", - "Argentine", - "Egyptian", - "Scandinavian", ) _INGREDIENTS: tuple[str, ...] = ( @@ -110,26 +76,6 @@ "Cheese", "Egg", "Flour", - "Sugar", - "Salt", - "Pepper", - "Lemon", - "Lime", - "Avocado", - "Mushroom", - "Bell Pepper", - "Spinach", - "Broccoli", - "Coconut Milk", - "Soy Sauce", - "Cumin", - "Paprika", - "Cinnamon", - "Oregano", - "Thyme", - "Rosemary", - "Parsley", - "Chili Flakes", ) _RESTAURANT_ADJECTIVES: tuple[str, ...] = ( @@ -148,11 +94,6 @@ "Secret", "Hidden", "Rustic", - "Urban", - "Coastal", - "Mountain", - "Garden", - "Sunset", ) _RESTAURANT_NOUNS: tuple[str, ...] = ( @@ -171,11 +112,6 @@ "Garden", "Market", "Corner", - "Terrace", - "Harbor", - "Cellar", - "Palace", - "Pantry", ) _DIETARY_TAGS: tuple[str, ...] = ( @@ -194,11 +130,6 @@ "Sugar-Free", "Whole30", "Mediterranean", - "Raw", - "Plant-Based", - "Lactose-Free", - "Soy-Free", - "Low-Sodium", ) _BEVERAGES: tuple[str, ...] = ( @@ -217,11 +148,6 @@ "Sparkling Water", "Kombucha", "Matcha", - "Chai Latte", - "Fresh Juice", - "Agua Fresca", - "Mojito", - "Sangria", ) _COOKING_METHODS: tuple[str, ...] 
= ( @@ -240,11 +166,6 @@ "Deep-Fried", "Pan-Seared", "Slow-Cooked", - "Sous Vide", - "Gratin", - "Flambeed", - "Pickled", - "Fermented", ) @@ -279,198 +200,37 @@ class FoodProvider(BaseProvider): "meal_price": "meal_price", } - # ------------------------------------------------------------------ + _choice_fields: dict[str, tuple[str, ...]] = { + "dish": _DISHES, + "cuisine": _CUISINES, + "ingredient": _INGREDIENTS, + "dietary_tag": _DIETARY_TAGS, + "beverage": _BEVERAGES, + "cooking_method": _COOKING_METHODS, + } + # Scalar helpers - # ------------------------------------------------------------------ def _one_restaurant(self) -> str: - """Generate a single restaurant name.""" choice = self._engine._rng.choice return f"The {choice(_RESTAURANT_ADJECTIVES)} {choice(_RESTAURANT_NOUNS)}" def _one_meal_price(self) -> str: - """Generate a single meal price string.""" ri = self._engine.random_int dollars = ri(5, 75) cents = ri(0, 99) return f"${dollars}.{cents:02d}" - # ------------------------------------------------------------------ - # Public API - # ------------------------------------------------------------------ - - @overload - def dish(self) -> str: ... - @overload - def dish(self, count: Literal[1]) -> str: ... - @overload - def dish(self, count: int) -> str | list[str]: ... - def dish(self, count: int = 1) -> str | list[str]: - """Generate a dish name (e.g. ``"Pad Thai"``). - - Parameters - ---------- - count : int - Number of dish names to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_DISHES) - return self._engine.choices(_DISHES, count) - - @overload - def cuisine(self) -> str: ... - @overload - def cuisine(self, count: Literal[1]) -> str: ... - @overload - def cuisine(self, count: int) -> str | list[str]: ... - def cuisine(self, count: int = 1) -> str | list[str]: - """Generate a cuisine type (e.g. ``"Italian"``). - - Parameters - ---------- - count : int - Number of cuisine types to generate. 
- - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_CUISINES) - return self._engine.choices(_CUISINES, count) - - @overload - def ingredient(self) -> str: ... - @overload - def ingredient(self, count: Literal[1]) -> str: ... - @overload - def ingredient(self, count: int) -> str | list[str]: ... - def ingredient(self, count: int = 1) -> str | list[str]: - """Generate an ingredient name (e.g. ``"Garlic"``). - - Parameters - ---------- - count : int - Number of ingredients to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_INGREDIENTS) - return self._engine.choices(_INGREDIENTS, count) + # Public API — custom methods - @overload - def restaurant(self) -> str: ... - @overload - def restaurant(self, count: Literal[1]) -> str: ... - @overload - def restaurant(self, count: int) -> str | list[str]: ... def restaurant(self, count: int = 1) -> str | list[str]: - """Generate a restaurant name (e.g. ``"The Golden Kitchen"``). - - Parameters - ---------- - count : int - Number of restaurant names to generate. - - Returns - ------- - str or list[str] - """ + """Generate a restaurant name (e.g. ``"The Golden Kitchen"``).""" if count == 1: return self._one_restaurant() return [self._one_restaurant() for _ in range(count)] - @overload - def dietary_tag(self) -> str: ... - @overload - def dietary_tag(self, count: Literal[1]) -> str: ... - @overload - def dietary_tag(self, count: int) -> str | list[str]: ... - def dietary_tag(self, count: int = 1) -> str | list[str]: - """Generate a dietary tag (e.g. ``"Vegan"``). - - Parameters - ---------- - count : int - Number of dietary tags to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_DIETARY_TAGS) - return self._engine.choices(_DIETARY_TAGS, count) - - @overload - def beverage(self) -> str: ... - @overload - def beverage(self, count: Literal[1]) -> str: ... 
- @overload - def beverage(self, count: int) -> str | list[str]: ... - def beverage(self, count: int = 1) -> str | list[str]: - """Generate a beverage name (e.g. ``"Cappuccino"``). - - Parameters - ---------- - count : int - Number of beverages to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_BEVERAGES) - return self._engine.choices(_BEVERAGES, count) - - @overload - def cooking_method(self) -> str: ... - @overload - def cooking_method(self, count: Literal[1]) -> str: ... - @overload - def cooking_method(self, count: int) -> str | list[str]: ... - def cooking_method(self, count: int = 1) -> str | list[str]: - """Generate a cooking method (e.g. ``"Grilled"``). - - Parameters - ---------- - count : int - Number of cooking methods to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_COOKING_METHODS) - return self._engine.choices(_COOKING_METHODS, count) - - @overload - def meal_price(self) -> str: ... - @overload - def meal_price(self, count: Literal[1]) -> str: ... - @overload - def meal_price(self, count: int) -> str | list[str]: ... def meal_price(self, count: int = 1) -> str | list[str]: - """Generate a meal price (e.g. ``"$24.99"``). - - Parameters - ---------- - count : int - Number of meal prices to generate. - - Returns - ------- - str or list[str] - """ + """Generate a meal price (e.g. ``"$24.99"``).""" if count == 1: return self._one_meal_price() return [self._one_meal_price() for _ in range(count)] diff --git a/src/dataforge/providers/geo.py b/src/dataforge/providers/geo.py index f1fc809..5ef08c8 100644 --- a/src/dataforge/providers/geo.py +++ b/src/dataforge/providers/geo.py @@ -1,7 +1,5 @@ """Geo provider — coordinates, countries, continents, places, distances.""" -from typing import Literal, overload - from dataforge.providers.base import BaseProvider _CONTINENTS: tuple[str, ...] 
= ( @@ -38,11 +36,6 @@ "North Sea", "Baltic Sea", "Caspian Sea", - "Arabian Sea", - "Coral Sea", - "Tasman Sea", - "Banda Sea", - "Timor Sea", ) _MOUNTAIN_RANGES: tuple[str, ...] = ( @@ -61,11 +54,6 @@ "Karakoram", "Hindu Kush", "Tian Shan", - "Kunlun Mountains", - "Altai Mountains", - "Drakensberg", - "Great Dividing Range", - "Brooks Range", ) _RIVERS: tuple[str, ...] = ( @@ -84,16 +72,6 @@ "Niger", "Murray", "Tocantins", - "Volga", - "Danube", - "Ganges", - "Rhine", - "Euphrates", - "Indus", - "Tigris", - "Colorado", - "Columbia", - "Thames", ) _COMPASS_DIRECTIONS: tuple[str, ...] = ( @@ -144,7 +122,14 @@ class GeoProvider(BaseProvider): "geo_hash": "geo_hash", } - # --- Scalar helpers --- + _choice_fields: dict[str, tuple[str, ...]] = { + "continent": _CONTINENTS, + "ocean": _OCEANS, + "sea": _SEAS, + "mountain_range": _MOUNTAIN_RANGES, + "river": _RIVERS, + "compass_direction": _COMPASS_DIRECTIONS, + } def _one_geo_coordinate(self) -> str: lat = self._engine.random_int(-9000, 9000) / 100.0 @@ -176,157 +161,26 @@ def _one_geo_hash(self) -> str: bits >>= 5 return "".join(chars) - # --- Public API --- - - @overload - def continent(self) -> str: ... - @overload - def continent(self, count: Literal[1]) -> str: ... - @overload - def continent(self, count: int) -> str | list[str]: ... - def continent(self, count: int = 1) -> str | list[str]: - """Generate a continent name.""" - if count == 1: - return self._engine.choice(_CONTINENTS) - return self._engine.choices(_CONTINENTS, count) - - @overload - def ocean(self) -> str: ... - @overload - def ocean(self, count: Literal[1]) -> str: ... - @overload - def ocean(self, count: int) -> str | list[str]: ... - def ocean(self, count: int = 1) -> str | list[str]: - """Generate an ocean name.""" - if count == 1: - return self._engine.choice(_OCEANS) - return self._engine.choices(_OCEANS, count) - - @overload - def sea(self) -> str: ... - @overload - def sea(self, count: Literal[1]) -> str: ... 
- @overload - def sea(self, count: int) -> str | list[str]: ... - def sea(self, count: int = 1) -> str | list[str]: - """Generate a sea name.""" - if count == 1: - return self._engine.choice(_SEAS) - return self._engine.choices(_SEAS, count) - - @overload - def mountain_range(self) -> str: ... - @overload - def mountain_range(self, count: Literal[1]) -> str: ... - @overload - def mountain_range(self, count: int) -> str | list[str]: ... - def mountain_range(self, count: int = 1) -> str | list[str]: - """Generate a mountain range name.""" - if count == 1: - return self._engine.choice(_MOUNTAIN_RANGES) - return self._engine.choices(_MOUNTAIN_RANGES, count) - - @overload - def river(self) -> str: ... - @overload - def river(self, count: Literal[1]) -> str: ... - @overload - def river(self, count: int) -> str | list[str]: ... - def river(self, count: int = 1) -> str | list[str]: - """Generate a river name.""" - if count == 1: - return self._engine.choice(_RIVERS) - return self._engine.choices(_RIVERS, count) - - @overload - def compass_direction(self) -> str: ... - @overload - def compass_direction(self, count: Literal[1]) -> str: ... - @overload - def compass_direction(self, count: int) -> str | list[str]: ... - def compass_direction(self, count: int = 1) -> str | list[str]: - """Generate a compass direction (e.g., N, NE, SSW).""" - if count == 1: - return self._engine.choice(_COMPASS_DIRECTIONS) - return self._engine.choices(_COMPASS_DIRECTIONS, count) - - @overload - def geo_coordinate(self) -> str: ... - @overload - def geo_coordinate(self, count: Literal[1]) -> str: ... - @overload - def geo_coordinate(self, count: int) -> str | list[str]: ... 
def geo_coordinate(self, count: int = 1) -> str | list[str]: """Generate a geographic coordinate pair (lat, lon).""" if count == 1: return self._one_geo_coordinate() - # Inlined batch with local binding — avoids method call overhead - _ri = self._engine.random_int - return [ - f"{_ri(-9000, 9000) / 100.0:.4f}, {_ri(-18000, 18000) / 100.0:.4f}" - for _ in range(count) - ] + return [self._one_geo_coordinate() for _ in range(count)] - @overload - def dms_latitude(self) -> str: ... - @overload - def dms_latitude(self, count: Literal[1]) -> str: ... - @overload - def dms_latitude(self, count: int) -> str | list[str]: ... def dms_latitude(self, count: int = 1) -> str | list[str]: """Generate a latitude in degrees-minutes-seconds format.""" if count == 1: return self._one_dms_lat() - # Inlined batch with local binding - _ri = self._engine.random_int - _choice = self._engine.choice - _dirs = _COORDINATE_DMS_DIRS_LAT - return [ - f"{_ri(0, 90)}°{_ri(0, 59):02d}'{_ri(0, 59):02d}\"{_choice(_dirs)}" - for _ in range(count) - ] + return [self._one_dms_lat() for _ in range(count)] - @overload - def dms_longitude(self) -> str: ... - @overload - def dms_longitude(self, count: Literal[1]) -> str: ... - @overload - def dms_longitude(self, count: int) -> str | list[str]: ... def dms_longitude(self, count: int = 1) -> str | list[str]: """Generate a longitude in degrees-minutes-seconds format.""" if count == 1: return self._one_dms_lon() - # Inlined batch with local binding - _ri = self._engine.random_int - _choice = self._engine.choice - _dirs = _COORDINATE_DMS_DIRS_LON - return [ - f"{_ri(0, 180)}°{_ri(0, 59):02d}'{_ri(0, 59):02d}\"{_choice(_dirs)}" - for _ in range(count) - ] + return [self._one_dms_lon() for _ in range(count)] - @overload - def geo_hash(self) -> str: ... - @overload - def geo_hash(self, count: Literal[1]) -> str: ... - @overload - def geo_hash(self, count: int) -> str | list[str]: ... 
def geo_hash(self, count: int = 1) -> str | list[str]: """Generate a geohash string (base32, 6-12 chars).""" if count == 1: return self._one_geo_hash() - # Inlined batch with local binding — avoids per-item method - # call overhead and re-binding _BASE32 inside the loop. - _ri = self._engine.random_int - _getrandbits = self._engine.getrandbits - b32 = _BASE32 - result: list[str] = [] - for _ in range(count): - length = _ri(6, 12) - bits = _getrandbits(length * 5) - chars: list[str] = [] - for _j in range(length): - chars.append(b32[bits & 0x1F]) - bits >>= 5 - result.append("".join(chars)) - return result + return [self._one_geo_hash() for _ in range(count)] diff --git a/src/dataforge/providers/government.py b/src/dataforge/providers/government.py index 4167531..6b71488 100644 --- a/src/dataforge/providers/government.py +++ b/src/dataforge/providers/government.py @@ -1,7 +1,5 @@ """Government provider — SSN, tax IDs, passport numbers, driver's licenses.""" -from typing import Literal, overload - from dataforge.providers.base import BaseProvider @@ -21,8 +19,6 @@ class GovernmentProvider(BaseProvider): "national_id": "national_id", } - # --- Scalar helpers --- - def _one_ssn(self) -> str: """Generate US-format SSN: ###-##-####.""" _ri = self._engine.random_int @@ -51,62 +47,30 @@ def _one_national_id(self) -> str: """Generate a national ID number: 10-digit numeric string.""" return self._engine.random_digits_str(10) - # --- Public API --- - - @overload - def ssn(self) -> str: ... - @overload - def ssn(self, count: Literal[1]) -> str: ... - @overload - def ssn(self, count: int) -> str | list[str]: ... def ssn(self, count: int = 1) -> str | list[str]: """Generate a US Social Security Number (###-##-####).""" if count == 1: return self._one_ssn() return [self._one_ssn() for _ in range(count)] - @overload - def tax_id(self) -> str: ... - @overload - def tax_id(self, count: Literal[1]) -> str: ... - @overload - def tax_id(self, count: int) -> str | list[str]: ... 
def tax_id(self, count: int = 1) -> str | list[str]: """Generate a US Employer Identification Number (##-#######).""" if count == 1: return self._one_tax_id() return [self._one_tax_id() for _ in range(count)] - @overload - def passport_number(self) -> str: ... - @overload - def passport_number(self, count: Literal[1]) -> str: ... - @overload - def passport_number(self, count: int) -> str | list[str]: ... def passport_number(self, count: int = 1) -> str | list[str]: """Generate a US passport number (letter + 8 digits).""" if count == 1: return self._one_passport() return [self._one_passport() for _ in range(count)] - @overload - def drivers_license(self) -> str: ... - @overload - def drivers_license(self, count: Literal[1]) -> str: ... - @overload - def drivers_license(self, count: int) -> str | list[str]: ... def drivers_license(self, count: int = 1) -> str | list[str]: """Generate a US-style driver's license number.""" if count == 1: return self._one_drivers_license() return [self._one_drivers_license() for _ in range(count)] - @overload - def national_id(self) -> str: ... - @overload - def national_id(self, count: Literal[1]) -> str: ... - @overload - def national_id(self, count: int) -> str | list[str]: ... def national_id(self, count: int = 1) -> str | list[str]: """Generate a 10-digit national ID number.""" if count == 1: diff --git a/src/dataforge/providers/hardware.py b/src/dataforge/providers/hardware.py index 9c3348f..55dfb33 100644 --- a/src/dataforge/providers/hardware.py +++ b/src/dataforge/providers/hardware.py @@ -5,26 +5,9 @@ All data is stored as immutable ``tuple[str, ...]`` for cache friendliness. """ -from typing import Literal, overload - from dataforge.providers.base import BaseProvider -# ------------------------------------------------------------------ # Data tuples (immutable, module-level for zero per-call overhead) -# ------------------------------------------------------------------ - -_CPU_BRANDS: tuple[str, ...] 
= ( - "Intel", - "AMD", - "Apple", - "Qualcomm", - "ARM", - "NVIDIA", - "Samsung", - "MediaTek", - "IBM", - "RISC-V", -) _CPU_MODELS: tuple[str, ...] = ( "Core i9-14900K", @@ -47,16 +30,6 @@ "Snapdragon 8 Gen 3", "Xeon W-3400", "EPYC 9654", - "Core Ultra 9 285K", - "Ryzen AI 9 HX 370", - "Dimensity 9300", - "Snapdragon X Elite", - "A17 Pro", - "Core i9-12900K", - "Ryzen 9 5950X", - "Xeon Platinum 8480+", - "EPYC 7763", - "Core i7-12700K", ) _GPU_MODELS: tuple[str, ...] = ( @@ -80,16 +53,6 @@ "Intel Arc A580", "NVIDIA A100", "NVIDIA H100", - "NVIDIA L40S", - "AMD Instinct MI300X", - "Apple M3 Max GPU", - "NVIDIA RTX 5090", - "NVIDIA RTX 5080", - "AMD RX 9070 XT", - "NVIDIA Quadro RTX 6000", - "AMD Radeon Pro W7900", - "NVIDIA T4", - "NVIDIA V100", ) _RAM_SIZES: tuple[str, ...] = ( @@ -121,11 +84,6 @@ "LPDDR5-6400", "LPDDR5X-7500", "GDDR6", - "GDDR6X", - "HBM2e", - "HBM3", - "HBM3e", - "ECC DDR5-4800", ) _STORAGE_TYPES: tuple[str, ...] = ( @@ -144,11 +102,6 @@ "12 TB HDD", "16 TB HDD", "1 TB PCIe 5.0 SSD", - "2 TB PCIe 5.0 SSD", - "4 TB PCIe 5.0 SSD", - "256 GB eMMC", - "128 GB UFS 4.0", - "512 GB UFS 4.0", ) _FORM_FACTORS: tuple[str, ...] = ( @@ -208,16 +161,6 @@ "Thermaltake", "Razer", "Logitech", - "SteelSeries", - "HyperX", - "Sabrent", - "Noctua", - "EKWB", - "Fractal Design", - "Phanteks", - "DeepCool", - "Arctic", - "Silverstone", ) _PORTS: tuple[str, ...] = ( @@ -236,11 +179,6 @@ "PCIe x16", "PCIe x4", "M.2 NVMe", - "SATA III", - "Mini DisplayPort", - "DVI-D", - "VGA", - "Wi-Fi 7", ) @@ -275,204 +213,14 @@ class HardwareProvider(BaseProvider): "port_type": "port", } - # ------------------------------------------------------------------ - # Public API - # ------------------------------------------------------------------ - - @overload - def cpu(self) -> str: ... - @overload - def cpu(self, count: Literal[1]) -> str: ... - @overload - def cpu(self, count: int) -> str | list[str]: ... 
- def cpu(self, count: int = 1) -> str | list[str]: - """Generate a CPU model (e.g. ``"Ryzen 9 7950X"``). - - Parameters - ---------- - count : int - Number of CPU models to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_CPU_MODELS) - return self._engine.choices(_CPU_MODELS, count) - - @overload - def gpu(self) -> str: ... - @overload - def gpu(self, count: Literal[1]) -> str: ... - @overload - def gpu(self, count: int) -> str | list[str]: ... - def gpu(self, count: int = 1) -> str | list[str]: - """Generate a GPU model (e.g. ``"NVIDIA RTX 4090"``). - - Parameters - ---------- - count : int - Number of GPU models to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_GPU_MODELS) - return self._engine.choices(_GPU_MODELS, count) - - @overload - def ram_size(self) -> str: ... - @overload - def ram_size(self, count: Literal[1]) -> str: ... - @overload - def ram_size(self, count: int) -> str | list[str]: ... - def ram_size(self, count: int = 1) -> str | list[str]: - """Generate a RAM size (e.g. ``"32 GB"``). - - Parameters - ---------- - count : int - Number of RAM sizes to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_RAM_SIZES) - return self._engine.choices(_RAM_SIZES, count) - - @overload - def ram_type(self) -> str: ... - @overload - def ram_type(self, count: Literal[1]) -> str: ... - @overload - def ram_type(self, count: int) -> str | list[str]: ... - def ram_type(self, count: int = 1) -> str | list[str]: - """Generate a RAM type (e.g. ``"DDR5-5600"``). - - Parameters - ---------- - count : int - Number of RAM types to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_RAM_TYPES) - return self._engine.choices(_RAM_TYPES, count) - - @overload - def storage(self) -> str: ... - @overload - def storage(self, count: Literal[1]) -> str: ... 
- @overload - def storage(self, count: int) -> str | list[str]: ... - def storage(self, count: int = 1) -> str | list[str]: - """Generate a storage specification (e.g. ``"1 TB NVMe SSD"``). - - Parameters - ---------- - count : int - Number of storage specs to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_STORAGE_TYPES) - return self._engine.choices(_STORAGE_TYPES, count) - - @overload - def form_factor(self) -> str: ... - @overload - def form_factor(self, count: Literal[1]) -> str: ... - @overload - def form_factor(self, count: int) -> str | list[str]: ... - def form_factor(self, count: int = 1) -> str | list[str]: - """Generate a motherboard form factor (e.g. ``"ATX"``). - - Parameters - ---------- - count : int - Number of form factors to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_FORM_FACTORS) - return self._engine.choices(_FORM_FACTORS, count) - - @overload - def peripheral(self) -> str: ... - @overload - def peripheral(self, count: Literal[1]) -> str: ... - @overload - def peripheral(self, count: int) -> str | list[str]: ... - def peripheral(self, count: int = 1) -> str | list[str]: - """Generate a peripheral device (e.g. ``"Mechanical Keyboard"``). - - Parameters - ---------- - count : int - Number of peripherals to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_PERIPHERALS) - return self._engine.choices(_PERIPHERALS, count) - - @overload - def manufacturer(self) -> str: ... - @overload - def manufacturer(self, count: Literal[1]) -> str: ... - @overload - def manufacturer(self, count: int) -> str | list[str]: ... - def manufacturer(self, count: int = 1) -> str | list[str]: - """Generate a hardware manufacturer (e.g. ``"ASUS"``). - - Parameters - ---------- - count : int - Number of manufacturer names to generate. 
- - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_MANUFACTURERS) - return self._engine.choices(_MANUFACTURERS, count) - - @overload - def port(self) -> str: ... - @overload - def port(self, count: Literal[1]) -> str: ... - @overload - def port(self, count: int) -> str | list[str]: ... - def port(self, count: int = 1) -> str | list[str]: - """Generate a port/connector type (e.g. ``"USB-C 4.0"``). - - Parameters - ---------- - count : int - Number of port types to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_PORTS) - return self._engine.choices(_PORTS, count) + _choice_fields: dict[str, tuple[str, ...]] = { + "cpu": _CPU_MODELS, + "gpu": _GPU_MODELS, + "ram_size": _RAM_SIZES, + "ram_type": _RAM_TYPES, + "storage": _STORAGE_TYPES, + "form_factor": _FORM_FACTORS, + "peripheral": _PERIPHERALS, + "manufacturer": _MANUFACTURERS, + "port": _PORTS, + } diff --git a/src/dataforge/providers/internet.py b/src/dataforge/providers/internet.py index 3e47976..2340c0f 100644 --- a/src/dataforge/providers/internet.py +++ b/src/dataforge/providers/internet.py @@ -3,7 +3,6 @@ import re import unicodedata from types import ModuleType -from typing import Literal, overload from dataforge.backend import RandomEngine from dataforge.providers.base import BaseProvider @@ -25,15 +24,6 @@ "green", "dark", "light", - "quick", - "easy", - "safe", - "bold", - "true", - "deep", - "high", - "low", - "wide", ) _URL_PROTOCOLS: tuple[str, ...] 
= ("https", "http") @@ -140,9 +130,7 @@ def __init__( person_data.last_names ) - # ------------------------------------------------------------------ # Scalar helpers - # ------------------------------------------------------------------ def _one_username(self) -> str: first = self._engine.choice(self._ascii_first_names) @@ -192,24 +180,10 @@ def _one_slug(self) -> str: def _one_safe_email(self) -> str: return self._one_email_from(_SAFE_DOMAINS) - # ------------------------------------------------------------------ # Public API - # ------------------------------------------------------------------ - - @overload - def username(self) -> str: ... - @overload - def username(self, count: Literal[1]) -> str: ... - @overload - def username(self, count: int) -> str | list[str]: ... - def username(self, count: int = 1) -> str | list[str]: - """Generate a random username. - Parameters - ---------- - count : int - Number of usernames to generate. - """ + def username(self, count: int = 1) -> str | list[str]: + """Generate a random username.""" if count == 1: return self._one_username() # Vectorized batch: bulk random selections avoid per-item overhead @@ -223,38 +197,14 @@ def username(self, count: int = 1) -> str | list[str]: for fmt, f, ln in zip(fmts, firsts, lasts) ] - @overload - def email(self) -> str: ... - @overload - def email(self, count: Literal[1]) -> str: ... - @overload - def email(self, count: int) -> str | list[str]: ... def email(self, count: int = 1) -> str | list[str]: - """Generate a random email address. - - Parameters - ---------- - count : int - Number of emails to generate. - """ + """Generate a random email address.""" if count == 1: return self._one_email() return self._batch_emails(self._free_email_domains, count) - @overload - def domain(self) -> str: ... - @overload - def domain(self, count: Literal[1]) -> str: ... - @overload - def domain(self, count: int) -> str | list[str]: ... 
def domain(self, count: int = 1) -> str | list[str]: - """Generate a random domain name. - - Parameters - ---------- - count : int - Number of domains to generate. - """ + """Generate a random domain name.""" if count == 1: return self._one_domain() # Vectorized batch: bulk random selections @@ -263,20 +213,8 @@ def domain(self, count: int = 1) -> str | list[str]: suffixes = _choices(self._domain_suffixes, count) return [f"{w}.{s}" for w, s in zip(words, suffixes)] - @overload - def url(self) -> str: ... - @overload - def url(self, count: Literal[1]) -> str: ... - @overload - def url(self, count: int) -> str | list[str]: ... def url(self, count: int = 1) -> str | list[str]: - """Generate a random URL. - - Parameters - ---------- - count : int - Number of URLs to generate. - """ + """Generate a random URL.""" if count == 1: return self._one_url() # Vectorized batch: bulk random selections @@ -286,46 +224,20 @@ def url(self, count: int = 1) -> str | list[str]: suffixes = _choices(self._domain_suffixes, count) return [f"{p}://{w}.{s}" for p, w, s in zip(protocols, words, suffixes)] - @overload - def ipv4(self) -> str: ... - @overload - def ipv4(self, count: Literal[1]) -> str: ... - @overload - def ipv4(self, count: int) -> str | list[str]: ... def ipv4(self, count: int = 1) -> str | list[str]: - """Generate a random IPv4 address. - - Parameters - ---------- - count : int - Number of IPs to generate. - """ + """Generate a random IPv4 address.""" if count == 1: return self._one_ipv4() - # Inlined batch with local-bound getrandbits _getrandbits = self._engine.getrandbits return [ f"{(b := _getrandbits(32)) >> 24}.{(b >> 16) & 0xFF}.{(b >> 8) & 0xFF}.{b & 0xFF}" for _ in range(count) ] - @overload - def slug(self) -> str: ... - @overload - def slug(self, count: Literal[1]) -> str: ... - @overload - def slug(self, count: int) -> str | list[str]: ... def slug(self, count: int = 1) -> str | list[str]: - """Generate a random URL-safe slug (e.g. ``"fast-cool-open"``). 
- - Parameters - ---------- - count : int - Number of slugs to generate. - """ + """Generate a random URL-safe slug.""" if count == 1: return self._one_slug() - # Vectorized: pick all words in one bulk call, then split _ri = self._engine.random_int _choices = self._engine.choices result: list[str] = [] @@ -335,41 +247,14 @@ def slug(self, count: int = 1) -> str | list[str]: result.append("-".join(words)) return result - @overload - def tld(self) -> str: ... - @overload - def tld(self, count: Literal[1]) -> str: ... - @overload - def tld(self, count: int) -> str | list[str]: ... def tld(self, count: int = 1) -> str | list[str]: - """Generate a random top-level domain (e.g. ``"io"``, ``"dev"``). - - Parameters - ---------- - count : int - Number of TLDs to generate. - """ + """Generate a random top-level domain.""" if count == 1: return self._engine.choice(self._domain_suffixes) return self._engine.choices(self._domain_suffixes, count) - @overload - def safe_email(self) -> str: ... - @overload - def safe_email(self, count: Literal[1]) -> str: ... - @overload - def safe_email(self, count: int) -> str | list[str]: ... def safe_email(self, count: int = 1) -> str | list[str]: - """Generate a random email using ``example.com``/``example.org``. - - These addresses are safe for testing — they will never reach - a real mailbox. - - Parameters - ---------- - count : int - Number of emails to generate. - """ + """Generate a random email using example.com/example.org.""" if count == 1: return self._one_safe_email() return self._batch_emails(_SAFE_DOMAINS, count) diff --git a/src/dataforge/providers/legal.py b/src/dataforge/providers/legal.py index 7b4dbd9..ee30f00 100644 --- a/src/dataforge/providers/legal.py +++ b/src/dataforge/providers/legal.py @@ -5,13 +5,9 @@ All data is stored as immutable ``tuple[str, ...]`` for cache friendliness. 
""" -from typing import Literal, overload - from dataforge.providers.base import BaseProvider -# ------------------------------------------------------------------ # Data tuples (immutable, module-level for zero per-call overhead) -# ------------------------------------------------------------------ _COURTS: tuple[str, ...] = ( "Supreme Court", @@ -29,11 +25,6 @@ "Civil Court", "Federal Court", "Superior Court", - "Magistrate Court", - "Court of Claims", - "Appellate Court", - "High Court", - "Crown Court", ) _PRACTICE_AREAS: tuple[str, ...] = ( @@ -57,16 +48,6 @@ "Antitrust Law", "Securities Law", "Cybersecurity Law", - "Entertainment Law", - "Sports Law", - "Elder Law", - "Human Rights Law", - "Military Law", - "Patent Law", - "Trademark Law", - "Copyright Law", - "Insurance Law", - "Health Care Law", ) _LEGAL_TERMS: tuple[str, ...] = ( @@ -90,16 +71,6 @@ "indictment", "arraignment", "acquittal", - "plea bargain", - "mistrial", - "statute of limitations", - "due process", - "double jeopardy", - "tort", - "liability", - "negligence", - "fiduciary duty", - "arbitration", ) _DOCUMENT_TYPES: tuple[str, ...] = ( @@ -118,11 +89,6 @@ "Power of Attorney", "Will", "Trust Agreement", - "Non-Disclosure Agreement", - "Lease Agreement", - "Employment Agreement", - "Patent Application", - "Court Order", ) _FIRM_PREFIXES: tuple[str, ...] = ( @@ -141,11 +107,6 @@ "White", "Harris", "Martin", - "Clark", - "Lewis", - "Walker", - "Hall", - "Young", ) _FIRM_SUFFIXES: tuple[str, ...] = ( @@ -177,11 +138,6 @@ "Sonia", "Sandra", "Amy", - "Katherine", - "Margaret", - "Patricia", - "Linda", - "Susan", ) _JUDGE_LAST: tuple[str, ...] = ( @@ -200,11 +156,6 @@ "Kavanaugh", "Barrett", "Sotomayor", - "Alito", - "Thomas", - "Jackson", - "Holmes", - "Cardozo", ) _CASE_PREFIXES: tuple[str, ...] 
= ( @@ -251,9 +202,14 @@ class LegalProvider(BaseProvider): "verdict": "verdict", } - # ------------------------------------------------------------------ + _choice_fields: dict[str, tuple[str, ...]] = { + "court": _COURTS, + "practice_area": _PRACTICE_AREAS, + "legal_term": _LEGAL_TERMS, + "document_type": _DOCUMENT_TYPES, + } + # Scalar helpers - # ------------------------------------------------------------------ def _one_case_number(self) -> str: """Generate a single case number (e.g. ``"CV-2024-003847"``).""" @@ -274,182 +230,28 @@ def _one_judge(self) -> str: choice = self._engine._rng.choice return f"Hon. {choice(_JUDGE_FIRST)} {choice(_JUDGE_LAST)}" - # ------------------------------------------------------------------ - # Public API - # ------------------------------------------------------------------ + # Public API — custom methods - @overload - def case_number(self) -> str: ... - @overload - def case_number(self, count: Literal[1]) -> str: ... - @overload - def case_number(self, count: int) -> str | list[str]: ... def case_number(self, count: int = 1) -> str | list[str]: - """Generate a case number (e.g. ``"CV-2024-003847"``). - - Parameters - ---------- - count : int - Number of case numbers to generate. - - Returns - ------- - str or list[str] - """ + """Generate a case number (e.g. ``"CV-2024-003847"``).""" if count == 1: return self._one_case_number() return [self._one_case_number() for _ in range(count)] - @overload - def court(self) -> str: ... - @overload - def court(self, count: Literal[1]) -> str: ... - @overload - def court(self, count: int) -> str | list[str]: ... - def court(self, count: int = 1) -> str | list[str]: - """Generate a court name (e.g. ``"Supreme Court"``). - - Parameters - ---------- - count : int - Number of court names to generate. 
- - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_COURTS) - return self._engine.choices(_COURTS, count) - - @overload - def practice_area(self) -> str: ... - @overload - def practice_area(self, count: Literal[1]) -> str: ... - @overload - def practice_area(self, count: int) -> str | list[str]: ... - def practice_area(self, count: int = 1) -> str | list[str]: - """Generate a legal practice area (e.g. ``"Corporate Law"``). - - Parameters - ---------- - count : int - Number of practice areas to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_PRACTICE_AREAS) - return self._engine.choices(_PRACTICE_AREAS, count) - - @overload - def legal_term(self) -> str: ... - @overload - def legal_term(self, count: Literal[1]) -> str: ... - @overload - def legal_term(self, count: int) -> str | list[str]: ... - def legal_term(self, count: int = 1) -> str | list[str]: - """Generate a legal term (e.g. ``"habeas corpus"``). - - Parameters - ---------- - count : int - Number of legal terms to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_LEGAL_TERMS) - return self._engine.choices(_LEGAL_TERMS, count) - - @overload - def document_type(self) -> str: ... - @overload - def document_type(self, count: Literal[1]) -> str: ... - @overload - def document_type(self, count: int) -> str | list[str]: ... - def document_type(self, count: int = 1) -> str | list[str]: - """Generate a legal document type (e.g. ``"Affidavit"``). - - Parameters - ---------- - count : int - Number of document types to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_DOCUMENT_TYPES) - return self._engine.choices(_DOCUMENT_TYPES, count) - - @overload - def law_firm(self) -> str: ... - @overload - def law_firm(self, count: Literal[1]) -> str: ... - @overload - def law_firm(self, count: int) -> str | list[str]: ... 
def law_firm(self, count: int = 1) -> str | list[str]: - """Generate a law firm name (e.g. ``"Smith & Associates"``). - - Parameters - ---------- - count : int - Number of law firm names to generate. - - Returns - ------- - str or list[str] - """ + """Generate a law firm name (e.g. ``"Smith & Associates"``).""" if count == 1: return self._one_law_firm() return [self._one_law_firm() for _ in range(count)] - @overload - def judge(self) -> str: ... - @overload - def judge(self, count: Literal[1]) -> str: ... - @overload - def judge(self, count: int) -> str | list[str]: ... def judge(self, count: int = 1) -> str | list[str]: - """Generate a judge name (e.g. ``"Hon. Robert Marshall"``). - - Parameters - ---------- - count : int - Number of judge names to generate. - - Returns - ------- - str or list[str] - """ + """Generate a judge name (e.g. ``"Hon. Robert Marshall"``).""" if count == 1: return self._one_judge() return [self._one_judge() for _ in range(count)] - @overload - def verdict(self) -> str: ... - @overload - def verdict(self, count: Literal[1]) -> str: ... - @overload - def verdict(self, count: int) -> str | list[str]: ... def verdict(self, count: int = 1) -> str | list[str]: - """Generate a verdict (e.g. ``"Guilty"``). - - Parameters - ---------- - count : int - Number of verdicts to generate. - - Returns - ------- - str or list[str] - """ + """Generate a verdict (e.g. 
``"Guilty"``).""" verdicts = ("Guilty", "Not Guilty", "Dismissed", "Settled", "Mistrial") if count == 1: return self._engine.choice(verdicts) diff --git a/src/dataforge/providers/llm.py b/src/dataforge/providers/llm.py index cf92ba1..732d801 100644 --- a/src/dataforge/providers/llm.py +++ b/src/dataforge/providers/llm.py @@ -1,14 +1,15 @@ -"""LLM provider — model metadata, agents, RAG, moderation, usage/billing.""" +"""LLM provider — model metadata, agents, RAG, moderation, usage/billing, chat.""" -from typing import Literal, overload +from typing import TYPE_CHECKING +from dataforge.backend import RandomEngine from dataforge.providers.base import BaseProvider -# --------------------------------------------------------------------------- +if TYPE_CHECKING: + from dataforge.core import DataForge + # Module-level immutable tuples — zero per-call allocation -# --------------------------------------------------------------------------- -# --- LLM metadata --- _MODEL_NAMES: tuple[str, ...] = ( "gpt-4o", @@ -31,16 +32,6 @@ "mistral-medium", "mistral-small", "mixtral-8x22b", - "mixtral-8x7b", - "command-r-plus", - "command-r", - "deepseek-v3", - "deepseek-r1", - "qwen-2.5-72b", - "phi-4", - "yi-large", - "dbrx-instruct", - "jamba-1.5-large", ) _PROVIDER_NAMES: tuple[str, ...] = ( @@ -59,11 +50,6 @@ "Azure OpenAI", "Hugging Face", "Replicate", - "Together AI", - "Groq", - "Fireworks AI", - "Perplexity", - "Anyscale", ) _FINISH_REASONS: tuple[str, ...] = ( @@ -104,7 +90,6 @@ "co-", ) -# --- AI Agent / Tool use --- _TOOL_NAMES: tuple[str, ...] = ( "web_search", @@ -122,11 +107,6 @@ "weather_lookup", "stock_price", "url_fetcher", - "json_validator", - "csv_parser", - "pdf_extractor", - "screenshot_tool", - "clipboard_manager", ) _AGENT_NAMES: tuple[str, ...] = ( @@ -145,11 +125,6 @@ "DocumentAgent", "SchedulerBot", "SecurityScanner", - "PerformanceAgent", - "MigrationHelper", - "ComplianceChecker", - "IncidentResponder", - "OnboardingBot", ) _MCP_SERVER_NAMES: tuple[str, ...] 
= ( @@ -168,11 +143,6 @@ "jira", "confluence", "aws", - "kubernetes", - "docker", - "redis", - "elasticsearch", - "mongodb", ) _CAPABILITIES: tuple[str, ...] = ( @@ -191,14 +161,8 @@ "text-to-speech", "retrieval-augmented-generation", "multi-turn-conversation", - "json-mode", - "system-prompts", - "parallel-tool-calls", - "citation-generation", - "web-browsing", ) -# --- RAG / Embeddings --- _EMBEDDING_MODELS: tuple[str, ...] = ( "text-embedding-3-small", @@ -216,11 +180,6 @@ "e5-large-v2", "jina-embeddings-v3", "mxbai-embed-large-v1", - "all-MiniLM-L6-v2", - "instructor-xl", - "cohere-embed-english-v3", - "titan-embed-text-v2", - "gecko-embedding", ) _VECTOR_DB_NAMES: tuple[str, ...] = ( @@ -239,11 +198,6 @@ "MongoDB Atlas Vector", "Azure AI Search", "Google Vertex AI", - "OpenSearch", - "Turbopuffer", - "Marqo", - "Deep Lake", - "Vald", ) _NAMESPACES: tuple[str, ...] = ( @@ -264,7 +218,6 @@ "customer-data", ) -# --- Content moderation --- _MODERATION_CATEGORIES: tuple[str, ...] = ( "hate", @@ -295,7 +248,6 @@ "filtered", ) -# --- Usage / Billing --- _RATE_LIMIT_NAMES: tuple[str, ...] = ( "x-ratelimit-limit-requests", @@ -312,14 +264,18 @@ _HEX_CHARS: str = "0123456789abcdef" _ALPHANUM: str = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" +_CHAT_ROLE_VALUES: tuple[str, ...] = ("user", "assistant", "system", "tool") +_CHAT_ROLE_WEIGHTS: tuple[int, ...] = (40, 40, 15, 5) + class LlmProvider(BaseProvider): - """Generates fake LLM ecosystem data — models, agents, RAG, moderation, billing.""" + """Generates fake LLM ecosystem data — models, agents, RAG, moderation, billing, chat.""" - __slots__ = () + __slots__ = ("_forge",) _provider_name = "llm" _locale_modules: tuple[str, ...] 
= () + _needs_forge: bool = True _field_map: dict[str, str] = { # LLM metadata "model_name": "model_name", @@ -350,35 +306,35 @@ class LlmProvider(BaseProvider): "completion_tokens": "completion_tokens", "cost_estimate": "cost_estimate", "rate_limit_header": "rate_limit_header", + # AI Chat fields + "chat_role": "chat_role", + "chat_model": "chat_model", + "chat_content": "chat_content", + "chat_tokens": "chat_tokens", + "chat_finish_reason": "chat_finish_reason", } - # =================================================================== - # LLM metadata - # =================================================================== - - @overload - def model_name(self) -> str: ... - @overload - def model_name(self, count: Literal[1]) -> str: ... - @overload - def model_name(self, count: int) -> str | list[str]: ... - def model_name(self, count: int = 1) -> str | list[str]: - """Generate an LLM model name (e.g. gpt-4o, claude-3.5-sonnet).""" - if count == 1: - return self._engine.choice(_MODEL_NAMES) - return self._engine.choices(_MODEL_NAMES, count) - - @overload - def provider_name(self) -> str: ... - @overload - def provider_name(self, count: Literal[1]) -> str: ... - @overload - def provider_name(self, count: int) -> str | list[str]: ... - def provider_name(self, count: int = 1) -> str | list[str]: - """Generate an LLM provider name (e.g. 
OpenAI, Anthropic).""" - if count == 1: - return self._engine.choice(_PROVIDER_NAMES) - return self._engine.choices(_PROVIDER_NAMES, count) + def __init__(self, engine: RandomEngine, forge: "DataForge") -> None: + super().__init__(engine) + self._forge = forge + + _choice_fields: dict[str, tuple[str, ...]] = { + "model_name": _MODEL_NAMES, + "provider_name": _PROVIDER_NAMES, + "finish_reason": _FINISH_REASONS, + "stop_sequence": _STOP_SEQUENCES, + "tool_name": _TOOL_NAMES, + "mcp_server_name": _MCP_SERVER_NAMES, + "agent_name": _AGENT_NAMES, + "capability": _CAPABILITIES, + "embedding_model": _EMBEDDING_MODELS, + "vector_db_name": _VECTOR_DB_NAMES, + "namespace": _NAMESPACES, + "moderation_category": _MODERATION_CATEGORIES, + "harm_label": _HARM_LABELS, + } + + # Scalar helpers def _one_api_key(self) -> str: prefix = self._engine.choice(_API_KEY_PREFIXES) @@ -388,68 +344,6 @@ def _one_api_key(self) -> str: an_len = len(an) return prefix + "".join(an[_ri(0, an_len - 1)] for _ in range(40)) - @overload - def api_key(self) -> str: ... - @overload - def api_key(self, count: Literal[1]) -> str: ... - @overload - def api_key(self, count: int) -> str | list[str]: ... - def api_key(self, count: int = 1) -> str | list[str]: - """Generate a realistic-looking API key.""" - if count == 1: - return self._one_api_key() - # Inlined batch with local binding - _choice = self._engine.choice - _ri = self._engine.random_int - _prefixes = _API_KEY_PREFIXES - an = _ALPHANUM - an_len = len(an) - result: list[str] = [] - for _ in range(count): - prefix = _choice(_prefixes) - result.append(prefix + "".join(an[_ri(0, an_len - 1)] for _j in range(40))) - return result - - @overload - def finish_reason(self) -> str: ... - @overload - def finish_reason(self, count: Literal[1]) -> str: ... - @overload - def finish_reason(self, count: int) -> str | list[str]: ... - def finish_reason(self, count: int = 1) -> str | list[str]: - """Generate an LLM finish reason (e.g. 
stop, length, tool_calls).""" - if count == 1: - return self._engine.choice(_FINISH_REASONS) - return self._engine.choices(_FINISH_REASONS, count) - - @overload - def stop_sequence(self) -> str: ... - @overload - def stop_sequence(self, count: Literal[1]) -> str: ... - @overload - def stop_sequence(self, count: int) -> str | list[str]: ... - def stop_sequence(self, count: int = 1) -> str | list[str]: - """Generate a stop sequence token.""" - if count == 1: - return self._engine.choice(_STOP_SEQUENCES) - return self._engine.choices(_STOP_SEQUENCES, count) - - # =================================================================== - # AI Agent / Tool use - # =================================================================== - - @overload - def tool_name(self) -> str: ... - @overload - def tool_name(self, count: Literal[1]) -> str: ... - @overload - def tool_name(self, count: int) -> str | list[str]: ... - def tool_name(self, count: int = 1) -> str | list[str]: - """Generate a tool/function name for AI agents.""" - if count == 1: - return self._engine.choice(_TOOL_NAMES) - return self._engine.choices(_TOOL_NAMES, count) - def _one_tool_call_id(self) -> str: # Format: call_XXXX... (24 alphanumeric chars) — matches OpenAI format _ri = self._engine.random_int @@ -457,270 +351,145 @@ def _one_tool_call_id(self) -> str: an_len = len(an) return "call_" + "".join(an[_ri(0, an_len - 1)] for _ in range(24)) - @overload - def tool_call_id(self) -> str: ... - @overload - def tool_call_id(self, count: Literal[1]) -> str: ... - @overload - def tool_call_id(self, count: int) -> str | list[str]: ... - def tool_call_id(self, count: int = 1) -> str | list[str]: - """Generate a tool call ID (e.g. 
call_abc123...).""" - if count == 1: - return self._one_tool_call_id() - # Inlined batch with local binding - _ri = self._engine.random_int - an = _ALPHANUM - an_len = len(an) - return [ - "call_" + "".join(an[_ri(0, an_len - 1)] for _j in range(24)) - for _ in range(count) - ] - - @overload - def mcp_server_name(self) -> str: ... - @overload - def mcp_server_name(self, count: Literal[1]) -> str: ... - @overload - def mcp_server_name(self, count: int) -> str | list[str]: ... - def mcp_server_name(self, count: int = 1) -> str | list[str]: - """Generate an MCP server name (e.g. filesystem, github).""" - if count == 1: - return self._engine.choice(_MCP_SERVER_NAMES) - return self._engine.choices(_MCP_SERVER_NAMES, count) - - @overload - def agent_name(self) -> str: ... - @overload - def agent_name(self, count: Literal[1]) -> str: ... - @overload - def agent_name(self, count: int) -> str | list[str]: ... - def agent_name(self, count: int = 1) -> str | list[str]: - """Generate an AI agent name (e.g. ResearchAgent, CodingAssistant).""" - if count == 1: - return self._engine.choice(_AGENT_NAMES) - return self._engine.choices(_AGENT_NAMES, count) - - @overload - def capability(self) -> str: ... - @overload - def capability(self, count: Literal[1]) -> str: ... - @overload - def capability(self, count: int) -> str | list[str]: ... - def capability(self, count: int = 1) -> str | list[str]: - """Generate an LLM capability (e.g. tool-use, streaming, vision).""" - if count == 1: - return self._engine.choice(_CAPABILITIES) - return self._engine.choices(_CAPABILITIES, count) - - # =================================================================== - # RAG / Embeddings - # =================================================================== - - @overload - def embedding_model(self) -> str: ... - @overload - def embedding_model(self, count: Literal[1]) -> str: ... - @overload - def embedding_model(self, count: int) -> str | list[str]: ... 
- def embedding_model(self, count: int = 1) -> str | list[str]: - """Generate an embedding model name.""" - if count == 1: - return self._engine.choice(_EMBEDDING_MODELS) - return self._engine.choices(_EMBEDDING_MODELS, count) - - @overload - def vector_db_name(self) -> str: ... - @overload - def vector_db_name(self, count: Literal[1]) -> str: ... - @overload - def vector_db_name(self, count: int) -> str | list[str]: ... - def vector_db_name(self, count: int = 1) -> str | list[str]: - """Generate a vector database name (e.g. Pinecone, ChromaDB).""" - if count == 1: - return self._engine.choice(_VECTOR_DB_NAMES) - return self._engine.choices(_VECTOR_DB_NAMES, count) - def _one_chunk_id(self) -> str: # Format: chunk_XXXXXXXX (8 hex chars) bits = self._engine.getrandbits(32) return f"chunk_{bits:08x}" - @overload - def chunk_id(self) -> str: ... - @overload - def chunk_id(self, count: Literal[1]) -> str: ... - @overload - def chunk_id(self, count: int) -> str | list[str]: ... + def _one_similarity_score(self) -> str: + # Score between 0.0 and 1.0 with 4 decimal places + return f"{self._engine.random_int(0, 10000) / 10000.0:.4f}" + + def _one_moderation_score(self) -> str: + # Score between 0.0000 and 1.0000 + return f"{self._engine.random_int(0, 10000) / 10000.0:.4f}" + + def _one_token_count(self) -> str: + return str(self._engine.random_int(1, 16384)) + + def _one_prompt_tokens(self) -> str: + return str(self._engine.random_int(10, 8192)) + + def _one_completion_tokens(self) -> str: + return str(self._engine.random_int(1, 4096)) + + def _one_cost_estimate(self) -> str: + # Cost in USD: $0.0001 to $9.9999 + cents = self._engine.random_int(1, 99999) + return f"${cents / 10000.0:.4f}" + + def _one_rate_limit_header(self) -> str: + name = self._engine.choice(_RATE_LIMIT_NAMES) + value = str(self._engine.random_int(0, 100000)) + return f"{name}: {value}" + + # Public API — custom methods + + def api_key(self, count: int = 1) -> str | list[str]: + """Generate a 
realistic-looking API key.""" + if count == 1: + return self._one_api_key() + return [self._one_api_key() for _ in range(count)] + + def tool_call_id(self, count: int = 1) -> str | list[str]: + """Generate a tool call ID (e.g. call_abc123...).""" + if count == 1: + return self._one_tool_call_id() + return [self._one_tool_call_id() for _ in range(count)] + def chunk_id(self, count: int = 1) -> str | list[str]: """Generate a document chunk ID (e.g. chunk_a1b2c3d4).""" if count == 1: return self._one_chunk_id() - # Inlined batch with local binding - _getrandbits = self._engine.getrandbits - return [f"chunk_{_getrandbits(32):08x}" for _ in range(count)] - - def _one_similarity_score(self) -> str: - # Score between 0.0 and 1.0 with 4 decimal places - return f"{self._engine.random_int(0, 10000) / 10000.0:.4f}" + return [self._one_chunk_id() for _ in range(count)] - @overload - def similarity_score(self) -> str: ... - @overload - def similarity_score(self, count: Literal[1]) -> str: ... - @overload - def similarity_score(self, count: int) -> str | list[str]: ... def similarity_score(self, count: int = 1) -> str | list[str]: """Generate a similarity/relevance score (0.0000-1.0000).""" if count == 1: return self._one_similarity_score() - # Inlined batch with local binding - _ri = self._engine.random_int - return [f"{_ri(0, 10000) / 10000.0:.4f}" for _ in range(count)] - - @overload - def namespace(self) -> str: ... - @overload - def namespace(self, count: Literal[1]) -> str: ... - @overload - def namespace(self, count: int) -> str | list[str]: ... 
- def namespace(self, count: int = 1) -> str | list[str]: - """Generate a vector DB namespace name.""" - if count == 1: - return self._engine.choice(_NAMESPACES) - return self._engine.choices(_NAMESPACES, count) - - # =================================================================== - # Content moderation - # =================================================================== - - @overload - def moderation_category(self) -> str: ... - @overload - def moderation_category(self, count: Literal[1]) -> str: ... - @overload - def moderation_category(self, count: int) -> str | list[str]: ... - def moderation_category(self, count: int = 1) -> str | list[str]: - """Generate a content moderation category.""" - if count == 1: - return self._engine.choice(_MODERATION_CATEGORIES) - return self._engine.choices(_MODERATION_CATEGORIES, count) - - def _one_moderation_score(self) -> str: - # Score between 0.0000 and 1.0000 - return f"{self._engine.random_int(0, 10000) / 10000.0:.4f}" + return [self._one_similarity_score() for _ in range(count)] - @overload - def moderation_score(self) -> str: ... - @overload - def moderation_score(self, count: Literal[1]) -> str: ... - @overload - def moderation_score(self, count: int) -> str | list[str]: ... def moderation_score(self, count: int = 1) -> str | list[str]: """Generate a moderation score (0.0000-1.0000).""" if count == 1: return self._one_moderation_score() - _ri = self._engine.random_int - return [f"{_ri(0, 10000) / 10000.0:.4f}" for _ in range(count)] - - @overload - def harm_label(self) -> str: ... - @overload - def harm_label(self, count: Literal[1]) -> str: ... - @overload - def harm_label(self, count: int) -> str | list[str]: ... - def harm_label(self, count: int = 1) -> str | list[str]: - """Generate a harm/safety label (e.g. 
safe, blocked, flagged).""" - if count == 1: - return self._engine.choice(_HARM_LABELS) - return self._engine.choices(_HARM_LABELS, count) + return [self._one_moderation_score() for _ in range(count)] - # =================================================================== - # Usage / Billing - # =================================================================== - - def _one_token_count(self) -> str: - return str(self._engine.random_int(1, 16384)) - - @overload - def token_count(self) -> str: ... - @overload - def token_count(self, count: Literal[1]) -> str: ... - @overload - def token_count(self, count: int) -> str | list[str]: ... def token_count(self, count: int = 1) -> str | list[str]: """Generate a token count (1-16384).""" if count == 1: return self._one_token_count() - _ri = self._engine.random_int - return [str(_ri(1, 16384)) for _ in range(count)] + return [self._one_token_count() for _ in range(count)] - def _one_prompt_tokens(self) -> str: - return str(self._engine.random_int(10, 8192)) - - @overload - def prompt_tokens(self) -> str: ... - @overload - def prompt_tokens(self, count: Literal[1]) -> str: ... - @overload - def prompt_tokens(self, count: int) -> str | list[str]: ... def prompt_tokens(self, count: int = 1) -> str | list[str]: """Generate a prompt token count (10-8192).""" if count == 1: return self._one_prompt_tokens() - _ri = self._engine.random_int - return [str(_ri(10, 8192)) for _ in range(count)] - - def _one_completion_tokens(self) -> str: - return str(self._engine.random_int(1, 4096)) + return [self._one_prompt_tokens() for _ in range(count)] - @overload - def completion_tokens(self) -> str: ... - @overload - def completion_tokens(self, count: Literal[1]) -> str: ... - @overload - def completion_tokens(self, count: int) -> str | list[str]: ... 
def completion_tokens(self, count: int = 1) -> str | list[str]: """Generate a completion token count (1-4096).""" if count == 1: return self._one_completion_tokens() - _ri = self._engine.random_int - return [str(_ri(1, 4096)) for _ in range(count)] + return [self._one_completion_tokens() for _ in range(count)] - def _one_cost_estimate(self) -> str: - # Cost in USD: $0.0001 to $9.9999 - cents = self._engine.random_int(1, 99999) - return f"${cents / 10000.0:.4f}" - - @overload - def cost_estimate(self) -> str: ... - @overload - def cost_estimate(self, count: Literal[1]) -> str: ... - @overload - def cost_estimate(self, count: int) -> str | list[str]: ... def cost_estimate(self, count: int = 1) -> str | list[str]: """Generate a cost estimate in USD (e.g. $0.0234).""" if count == 1: return self._one_cost_estimate() - _ri = self._engine.random_int - return [f"${_ri(1, 99999) / 10000.0:.4f}" for _ in range(count)] - - def _one_rate_limit_header(self) -> str: - name = self._engine.choice(_RATE_LIMIT_NAMES) - value = str(self._engine.random_int(0, 100000)) - return f"{name}: {value}" + return [self._one_cost_estimate() for _ in range(count)] - @overload - def rate_limit_header(self) -> str: ... - @overload - def rate_limit_header(self, count: Literal[1]) -> str: ... - @overload - def rate_limit_header(self, count: int) -> str | list[str]: ... def rate_limit_header(self, count: int = 1) -> str | list[str]: - """Generate a rate limit HTTP header (e.g. 
x-ratelimit-remaining-tokens: 4500).""" + """Generate a rate limit HTTP header.""" if count == 1: return self._one_rate_limit_header() - # Inlined batch with local binding - _choice = self._engine.choice - _ri = self._engine.random_int - _names = _RATE_LIMIT_NAMES - return [f"{_choice(_names)}: {_ri(0, 100000)}" for _ in range(count)] + return [self._one_rate_limit_header() for _ in range(count)] + + # AI Chat methods (merged from ai_chat provider) + + def chat_role(self, count: int = 1) -> str | list[str]: + """Generate a chat message role (user, assistant, system, tool).""" + if count == 1: + return self._engine.weighted_choice(_CHAT_ROLE_VALUES, _CHAT_ROLE_WEIGHTS) + return self._engine.weighted_choices( + _CHAT_ROLE_VALUES, _CHAT_ROLE_WEIGHTS, count + ) + + def chat_model(self, count: int = 1) -> str | list[str]: + """Generate a model name for the chat.""" + return self.model_name(count) + + def chat_content(self, count: int = 1) -> str | list[str]: + """Generate chat message content.""" + return self._forge.ai_prompt.user_prompt(count) + + def chat_tokens(self, count: int = 1) -> str | list[str]: + """Generate a token count for a chat message.""" + return self.token_count(count) + + def chat_finish_reason(self, count: int = 1) -> str | list[str]: + """Generate a finish reason for a chat message.""" + return self.finish_reason(count) + + def chat_message(self, count: int = 1) -> dict[str, str] | list[dict[str, str]]: + """Generate a realistic chat message dict with role, model, content, tokens.""" + + def _one() -> dict[str, str]: + role = self._engine.weighted_choice(_CHAT_ROLE_VALUES, _CHAT_ROLE_WEIGHTS) + model = self.model_name() + content = ( + self._forge.ai_prompt.system_prompt() + if role == "system" + else self._forge.ai_prompt.user_prompt() + ) + return { + "role": role, + "model": model, + "content": content, + "tokens": self.token_count(), + "finish_reason": self.finish_reason(), + } + + if count == 1: + return _one() + return [_one() for _ in 
range(count)] diff --git a/src/dataforge/providers/logistics.py b/src/dataforge/providers/logistics.py index 52a8f5a..5b87ebc 100644 --- a/src/dataforge/providers/logistics.py +++ b/src/dataforge/providers/logistics.py @@ -5,13 +5,9 @@ All data is stored as immutable ``tuple[str, ...]`` for cache friendliness. """ -from typing import Literal, overload - from dataforge.providers.base import BaseProvider -# ------------------------------------------------------------------ # Data tuples (immutable, module-level for zero per-call overhead) -# ------------------------------------------------------------------ _CARRIERS: tuple[str, ...] = ( "FedEx", @@ -34,16 +30,6 @@ "XPO Logistics", "C.H. Robinson", "J.B. Hunt", - "Schneider National", - "Old Dominion", - "Saia", - "Estes Express", - "R+L Carriers", - "YRC Freight", - "ABF Freight", - "Holland", - "FedEx Freight", - "UPS Freight", ) _SHIPPING_METHODS: tuple[str, ...] = ( @@ -62,11 +48,6 @@ "Rail Freight", "Same Day", "White Glove", - "Parcel Post", - "Registered Mail", - "Certified Mail", - "International Economy", - "International Priority", ) _CONTAINER_TYPES: tuple[str, ...] = ( @@ -85,11 +66,6 @@ "40ft Double Door", "ISO Tank", "Bulk Container", - "Platform Container", - "Half Height Container", - "Pallet Wide Container", - "Swap Body", - "Flexi Tank", ) _TRACKING_STATUSES: tuple[str, ...] = ( @@ -108,11 +84,6 @@ "At Regional Hub", "Departed Origin", "Arrived at Destination", - "Exception", - "Lost", - "Damaged", - "Awaiting Pickup", - "Picked Up", ) _INCOTERMS: tuple[str, ...] = ( @@ -145,11 +116,6 @@ "Coastal", "Highland", "Lakeside", - "Riverside", - "Summit", - "Valley", - "Gateway", - "Crossroads", ) _WAREHOUSE_TYPES: tuple[str, ...] = ( @@ -181,11 +147,6 @@ "Sack", "Tote", "Skid", - "Padded Mailer", - "Poly Bag", - "Corrugated Box", - "Wooden Crate", - "Shrink Wrap Pallet", ) _HS_PREFIXES: tuple[str, ...] 
= ( @@ -209,32 +170,6 @@ "28", "29", "30", - "39", - "40", - "42", - "44", - "48", - "49", - "50", - "52", - "54", - "55", - "61", - "62", - "63", - "64", - "70", - "71", - "72", - "73", - "76", - "84", - "85", - "87", - "90", - "94", - "95", - "96", ) @@ -271,17 +206,22 @@ class LogisticsProvider(BaseProvider): "freight_class": "freight_class", } - # ------------------------------------------------------------------ + _choice_fields: dict[str, tuple[str, ...]] = { + "carrier": _CARRIERS, + "shipping_method": _SHIPPING_METHODS, + "container_type": _CONTAINER_TYPES, + "tracking_status": _TRACKING_STATUSES, + "incoterm": _INCOTERMS, + "package_type": _PACKAGE_TYPES, + } + # Scalar helpers - # ------------------------------------------------------------------ def _one_warehouse(self) -> str: - """Generate a single warehouse name.""" choice = self._engine._rng.choice return f"{choice(_WAREHOUSE_ADJECTIVES)} {choice(_WAREHOUSE_TYPES)}" def _one_hs_code(self) -> str: - """Generate a single HS (Harmonized System) code.""" choice = self._engine._rng.choice ri = self._engine.random_int prefix = choice(_HS_PREFIXES) @@ -290,14 +230,12 @@ def _one_hs_code(self) -> str: return f"{prefix}{suffix}.{sub}" def _one_shipping_weight(self) -> str: - """Generate a single shipping weight string.""" ri = self._engine.random_int lbs = ri(1, 2000) oz = ri(0, 15) return f"{lbs}.{oz} lbs" def _one_freight_class(self) -> str: - """Generate a single NMFC freight class.""" classes = ( "50", "55", @@ -320,226 +258,28 @@ def _one_freight_class(self) -> str: ) return self._engine._rng.choice(classes) - # ------------------------------------------------------------------ - # Public API - # ------------------------------------------------------------------ - - @overload - def carrier(self) -> str: ... - @overload - def carrier(self, count: Literal[1]) -> str: ... - @overload - def carrier(self, count: int) -> str | list[str]: ... 
- def carrier(self, count: int = 1) -> str | list[str]: - """Generate a shipping carrier (e.g. ``"FedEx"``). - - Parameters - ---------- - count : int - Number of carrier names to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_CARRIERS) - return self._engine.choices(_CARRIERS, count) - - @overload - def shipping_method(self) -> str: ... - @overload - def shipping_method(self, count: Literal[1]) -> str: ... - @overload - def shipping_method(self, count: int) -> str | list[str]: ... - def shipping_method(self, count: int = 1) -> str | list[str]: - """Generate a shipping method (e.g. ``"Express"``). - - Parameters - ---------- - count : int - Number of shipping methods to generate. + # Public API — custom methods - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_SHIPPING_METHODS) - return self._engine.choices(_SHIPPING_METHODS, count) - - @overload - def container_type(self) -> str: ... - @overload - def container_type(self, count: Literal[1]) -> str: ... - @overload - def container_type(self, count: int) -> str | list[str]: ... - def container_type(self, count: int = 1) -> str | list[str]: - """Generate a container type (e.g. ``"40ft High Cube"``). - - Parameters - ---------- - count : int - Number of container types to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_CONTAINER_TYPES) - return self._engine.choices(_CONTAINER_TYPES, count) - - @overload - def tracking_status(self) -> str: ... - @overload - def tracking_status(self, count: Literal[1]) -> str: ... - @overload - def tracking_status(self, count: int) -> str | list[str]: ... - def tracking_status(self, count: int = 1) -> str | list[str]: - """Generate a tracking status (e.g. ``"In Transit"``). - - Parameters - ---------- - count : int - Number of tracking statuses to generate. 
- - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_TRACKING_STATUSES) - return self._engine.choices(_TRACKING_STATUSES, count) - - @overload - def incoterm(self) -> str: ... - @overload - def incoterm(self, count: Literal[1]) -> str: ... - @overload - def incoterm(self, count: int) -> str | list[str]: ... - def incoterm(self, count: int = 1) -> str | list[str]: - """Generate an Incoterm (e.g. ``"FOB"``). - - Parameters - ---------- - count : int - Number of Incoterms to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_INCOTERMS) - return self._engine.choices(_INCOTERMS, count) - - @overload - def warehouse(self) -> str: ... - @overload - def warehouse(self, count: Literal[1]) -> str: ... - @overload - def warehouse(self, count: int) -> str | list[str]: ... def warehouse(self, count: int = 1) -> str | list[str]: - """Generate a warehouse name (e.g. ``"Pacific Fulfillment Center"``). - - Parameters - ---------- - count : int - Number of warehouse names to generate. - - Returns - ------- - str or list[str] - """ + """Generate a warehouse name (e.g. ``"Pacific Fulfillment Center"``).""" if count == 1: return self._one_warehouse() return [self._one_warehouse() for _ in range(count)] - @overload - def package_type(self) -> str: ... - @overload - def package_type(self, count: Literal[1]) -> str: ... - @overload - def package_type(self, count: int) -> str | list[str]: ... - def package_type(self, count: int = 1) -> str | list[str]: - """Generate a package type (e.g. ``"Box"``). - - Parameters - ---------- - count : int - Number of package types to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_PACKAGE_TYPES) - return self._engine.choices(_PACKAGE_TYPES, count) - - @overload - def hs_code(self) -> str: ... - @overload - def hs_code(self, count: Literal[1]) -> str: ... 
- @overload - def hs_code(self, count: int) -> str | list[str]: ... def hs_code(self, count: int = 1) -> str | list[str]: - """Generate an HS (Harmonized System) code (e.g. ``"8471.30"``). - - Parameters - ---------- - count : int - Number of HS codes to generate. - - Returns - ------- - str or list[str] - """ + """Generate an HS (Harmonized System) code (e.g. ``"8471.30"``).""" if count == 1: return self._one_hs_code() return [self._one_hs_code() for _ in range(count)] - @overload - def shipping_weight(self) -> str: ... - @overload - def shipping_weight(self, count: Literal[1]) -> str: ... - @overload - def shipping_weight(self, count: int) -> str | list[str]: ... def shipping_weight(self, count: int = 1) -> str | list[str]: - """Generate a shipping weight (e.g. ``"45.8 lbs"``). - - Parameters - ---------- - count : int - Number of weights to generate. - - Returns - ------- - str or list[str] - """ + """Generate a shipping weight (e.g. ``"45.8 lbs"``).""" if count == 1: return self._one_shipping_weight() return [self._one_shipping_weight() for _ in range(count)] - @overload - def freight_class(self) -> str: ... - @overload - def freight_class(self, count: Literal[1]) -> str: ... - @overload - def freight_class(self, count: int) -> str | list[str]: ... def freight_class(self, count: int = 1) -> str | list[str]: - """Generate an NMFC freight class (e.g. ``"85"``). - - Parameters - ---------- - count : int - Number of freight classes to generate. - - Returns - ------- - str or list[str] - """ + """Generate an NMFC freight class (e.g. ``"85"``).""" if count == 1: return self._one_freight_class() return [self._one_freight_class() for _ in range(count)] diff --git a/src/dataforge/providers/lorem.py b/src/dataforge/providers/lorem.py index 63a770e..fbfe35d 100644 --- a/src/dataforge/providers/lorem.py +++ b/src/dataforge/providers/lorem.py @@ -4,8 +4,6 @@ word list which is universally used for placeholder text. 
""" -from typing import Literal, overload - from dataforge.providers.base import BaseProvider # Standard Lorem Ipsum word pool — immutable tuple for speed @@ -30,86 +28,6 @@ "magna", "aliqua", "enim", - "ad", - "minim", - "veniam", - "quis", - "nostrud", - "exercitation", - "ullamco", - "laboris", - "nisi", - "aliquip", - "ex", - "ea", - "commodo", - "consequat", - "duis", - "aute", - "irure", - "in", - "reprehenderit", - "voluptate", - "velit", - "esse", - "cillum", - "fugiat", - "nulla", - "pariatur", - "excepteur", - "sint", - "occaecat", - "cupidatat", - "non", - "proident", - "sunt", - "culpa", - "qui", - "officia", - "deserunt", - "mollit", - "anim", - "id", - "est", - "laborum", - "at", - "vero", - "eos", - "accusamus", - "iusto", - "odio", - "dignissimos", - "ducimus", - "blanditiis", - "praesentium", - "voluptatum", - "deleniti", - "atque", - "corrupti", - "quos", - "dolores", - "quas", - "molestias", - "excepturi", - "obcaecati", - "cupiditate", - "provident", - "similique", - "architecto", - "beatae", - "vitae", - "dicta", - "explicabo", - "nemo", - "ipsam", - "voluptatem", - "quia", - "voluptas", - "aspernatur", - "aut", - "fugit", - "consequuntur", - "magni", ) @@ -135,9 +53,11 @@ class LoremProvider(BaseProvider): "paragraph": "paragraph", } - # ------------------------------------------------------------------ + _choice_fields: dict[str, tuple[str, ...]] = { + "word": _LOREM_WORDS, + } + # Scalar helpers - # ------------------------------------------------------------------ def _one_sentence(self, word_count: int) -> str: words = self._engine.choices(_LOREM_WORDS, word_count) @@ -151,44 +71,10 @@ def _one_paragraph(self, sentence_count: int) -> str: ] return " ".join(sentences) - # ------------------------------------------------------------------ # Public API - # ------------------------------------------------------------------ - - @overload - def word(self) -> str: ... - @overload - def word(self, count: Literal[1]) -> str: ... 
- @overload - def word(self, count: int) -> str | list[str]: ... - def word(self, count: int = 1) -> str | list[str]: - """Generate random Lorem Ipsum word(s). - - Parameters - ---------- - count : int - Number of words to generate. - """ - if count == 1: - return self._engine.choice(_LOREM_WORDS) - return self._engine.choices(_LOREM_WORDS, count) - - @overload - def sentence(self) -> str: ... - @overload - def sentence(self, count: Literal[1]) -> str: ... - @overload - def sentence(self, count: int) -> str | list[str]: ... + def sentence(self, count: int = 1, word_count: int = 10) -> str | list[str]: - """Generate random Lorem Ipsum sentence(s). - - Parameters - ---------- - count : int - Number of sentences to generate. - word_count : int - Approximate number of words per sentence. - """ + """Generate random Lorem Ipsum sentence(s).""" if count == 1: return self._one_sentence(word_count) # Batch: generate all words at once, then slice into sentences @@ -202,34 +88,14 @@ def sentence(self, count: int = 1, word_count: int = 10) -> str | list[str]: result.append(_join(chunk) + ".") return result - @overload - def paragraph(self) -> str: ... - @overload - def paragraph(self, count: Literal[1]) -> str: ... - @overload - def paragraph(self, count: int) -> str | list[str]: ... def paragraph(self, count: int = 1, sentence_count: int = 5) -> str | list[str]: - """Generate random Lorem Ipsum paragraph(s). - - Parameters - ---------- - count : int - Number of paragraphs to generate. - sentence_count : int - Number of sentences per paragraph. - """ + """Generate random Lorem Ipsum paragraph(s).""" if count == 1: return self._one_paragraph(sentence_count) return [self._one_paragraph(sentence_count) for _ in range(count)] def text(self, max_chars: int = 200) -> str: - """Generate Lorem Ipsum text up to *max_chars* characters. - - Parameters - ---------- - max_chars : int - Maximum number of characters in the output. 
- """ + """Generate Lorem Ipsum text up to *max_chars* characters.""" parts: list[str] = [] current_len = 0 while current_len < max_chars: diff --git a/src/dataforge/providers/medical.py b/src/dataforge/providers/medical.py index 3c49066..c37b6c6 100644 --- a/src/dataforge/providers/medical.py +++ b/src/dataforge/providers/medical.py @@ -1,7 +1,5 @@ """Medical / healthcare provider — ICD-10 codes, drugs, blood types, etc.""" -from typing import Literal, overload - from dataforge.providers.base import BaseProvider _BLOOD_TYPES: tuple[str, ...] = ( @@ -31,11 +29,6 @@ "M", "N", "O", - "P", - "Q", - "R", - "S", - "T", ) _DRUG_NAMES: tuple[str, ...] = ( @@ -59,36 +52,6 @@ "Prednisone", "Furosemide", "Ciprofloxacin", - "Pantoprazole", - "Escitalopram", - "Montelukast", - "Cephalexin", - "Tramadol", - "Fluoxetine", - "Trazodone", - "Clonazepam", - "Alprazolam", - "Duloxetine", - "Venlafaxine", - "Bupropion", - "Warfarin", - "Clopidogrel", - "Rosuvastatin", - "Doxycycline", - "Meloxicam", - "Carvedilol", - "Tamsulosin", - "Finasteride", - "Propranolol", - "Ranitidine", - "Cetirizine", - "Loratadine", - "Diphenhydramine", - "Methylprednisolone", - "Naproxen", - "Cyclobenzaprine", - "Diazepam", - "Oxycodone", ) _DRUG_FORMS: tuple[str, ...] = ( @@ -130,27 +93,6 @@ "Anemia", "Migraine", "Epilepsy", - "Rheumatoid Arthritis", - "Gout", - "Psoriasis", - "Eczema", - "Sleep Apnea", - "Celiac Disease", - "Irritable Bowel Syndrome", - "Crohn's Disease", - "Fibromyalgia", - "Osteoporosis", - "Glaucoma", - "Cataracts", - "Vertigo", - "Sinusitis", - "Tonsillitis", - "Influenza", - "COVID-19", - "Hepatitis B", - "Hepatitis C", - "HIV/AIDS", - "Tuberculosis", ) _PROCEDURES: tuple[str, ...] 
= ( @@ -174,16 +116,6 @@ "Radiation Therapy", "Cardiac Catheterization", "Angioplasty", - "Appendectomy", - "Cholecystectomy", - "Knee Replacement", - "Hip Replacement", - "Cataract Surgery", - "Tonsillectomy", - "Cesarean Section", - "Hernia Repair", - "Coronary Bypass", - "Pacemaker Implantation", ) _DOSAGE_UNITS: tuple[str, ...] = ( @@ -231,7 +163,13 @@ class MedicalProvider(BaseProvider): "mrn": "medical_record_number", } - # --- Scalar helpers --- + _choice_fields: dict[str, tuple[str, ...]] = { + "blood_type": _BLOOD_TYPES, + "drug_name": _DRUG_NAMES, + "drug_form": _DRUG_FORMS, + "diagnosis": _DIAGNOSES, + "procedure": _PROCEDURES, + } def _one_icd10(self) -> str: """Generate ICD-10 code format: A##.# or A##.##""" @@ -248,115 +186,24 @@ def _one_dosage(self) -> str: def _one_mrn(self) -> str: return f"MRN-{self._engine.random_digits_str(8)}" - # --- Public API --- - - @overload - def blood_type(self) -> str: ... - @overload - def blood_type(self, count: Literal[1]) -> str: ... - @overload - def blood_type(self, count: int) -> str | list[str]: ... - def blood_type(self, count: int = 1) -> str | list[str]: - """Generate a blood type (e.g., A+, O-, AB+).""" - if count == 1: - return self._engine.choice(_BLOOD_TYPES) - return self._engine.choices(_BLOOD_TYPES, count) - - @overload - def realistic_blood_type(self) -> str: ... - @overload - def realistic_blood_type(self, count: Literal[1]) -> str: ... - @overload - def realistic_blood_type(self, count: int) -> str | list[str]: ... def realistic_blood_type(self, count: int = 1) -> str | list[str]: - """Generate a blood type with real-world frequency distribution. - - Uses American Red Cross population statistics: - O+ 37.4%, A+ 35.7%, B+ 8.5%, O- 6.6%, A- 6.3%, AB+ 3.4%, - B- 1.5%, AB- 0.6%. 
- """ + """Generate a blood type with real-world frequency distribution.""" if count == 1: return self._engine.weighted_choice(_BLOOD_TYPES, _BLOOD_TYPE_WEIGHTS) return self._engine.weighted_choices(_BLOOD_TYPES, _BLOOD_TYPE_WEIGHTS, count) - @overload - def icd10_code(self) -> str: ... - @overload - def icd10_code(self, count: Literal[1]) -> str: ... - @overload - def icd10_code(self, count: int) -> str | list[str]: ... def icd10_code(self, count: int = 1) -> str | list[str]: """Generate an ICD-10 diagnostic code (e.g., A01.0).""" if count == 1: return self._one_icd10() return [self._one_icd10() for _ in range(count)] - @overload - def drug_name(self) -> str: ... - @overload - def drug_name(self, count: Literal[1]) -> str: ... - @overload - def drug_name(self, count: int) -> str | list[str]: ... - def drug_name(self, count: int = 1) -> str | list[str]: - """Generate a drug/medication name.""" - if count == 1: - return self._engine.choice(_DRUG_NAMES) - return self._engine.choices(_DRUG_NAMES, count) - - @overload - def drug_form(self) -> str: ... - @overload - def drug_form(self, count: Literal[1]) -> str: ... - @overload - def drug_form(self, count: int) -> str | list[str]: ... - def drug_form(self, count: int = 1) -> str | list[str]: - """Generate a drug dosage form (e.g., Tablet, Capsule).""" - if count == 1: - return self._engine.choice(_DRUG_FORMS) - return self._engine.choices(_DRUG_FORMS, count) - - @overload - def dosage(self) -> str: ... - @overload - def dosage(self, count: Literal[1]) -> str: ... - @overload - def dosage(self, count: int) -> str | list[str]: ... def dosage(self, count: int = 1) -> str | list[str]: """Generate a drug dosage (e.g., 500 mg).""" if count == 1: return self._one_dosage() return [self._one_dosage() for _ in range(count)] - @overload - def diagnosis(self) -> str: ... - @overload - def diagnosis(self, count: Literal[1]) -> str: ... - @overload - def diagnosis(self, count: int) -> str | list[str]: ... 
- def diagnosis(self, count: int = 1) -> str | list[str]: - """Generate a medical diagnosis.""" - if count == 1: - return self._engine.choice(_DIAGNOSES) - return self._engine.choices(_DIAGNOSES, count) - - @overload - def procedure(self) -> str: ... - @overload - def procedure(self, count: Literal[1]) -> str: ... - @overload - def procedure(self, count: int) -> str | list[str]: ... - def procedure(self, count: int = 1) -> str | list[str]: - """Generate a medical procedure name.""" - if count == 1: - return self._engine.choice(_PROCEDURES) - return self._engine.choices(_PROCEDURES, count) - - @overload - def medical_record_number(self) -> str: ... - @overload - def medical_record_number(self, count: Literal[1]) -> str: ... - @overload - def medical_record_number(self, count: int) -> str | list[str]: ... def medical_record_number(self, count: int = 1) -> str | list[str]: """Generate a medical record number (MRN-########).""" if count == 1: diff --git a/src/dataforge/providers/misc.py b/src/dataforge/providers/misc.py index d2a2ba8..a44357a 100644 --- a/src/dataforge/providers/misc.py +++ b/src/dataforge/providers/misc.py @@ -1,10 +1,7 @@ -"""MiscProvider — utility generators for common testing needs. - -All methods are locale-independent and optimized for speed. -""" +"""MiscProvider — utility generators for common testing needs.""" import time as _time -from typing import Any, Literal, overload +from typing import Any from dataforge.providers.base import BaseProvider @@ -15,15 +12,7 @@ class MiscProvider(BaseProvider): - """Generates UUIDs, booleans, and utility random selections. - - This provider is locale-independent. - - Parameters - ---------- - engine : RandomEngine - The shared random engine instance. 
- """ + """Generates UUIDs, booleans, and utility random selections.""" __slots__ = () @@ -36,9 +25,7 @@ class MiscProvider(BaseProvider): "boolean": "boolean", } - # ------------------------------------------------------------------ # Scalar helpers - # ------------------------------------------------------------------ def _one_uuid4(self) -> str: # 128 random bits → set version 4 and variant 1 with 2 ops @@ -58,28 +45,10 @@ def _one_uuid7(self) -> str: h = f"{n:032x}" return f"{h[:8]}-{h[8:12]}-{h[12:16]}-{h[16:20]}-{h[20:]}" - # ------------------------------------------------------------------ # Public API - # ------------------------------------------------------------------ - - @overload - def uuid4(self) -> str: ... - @overload - def uuid4(self, count: Literal[1]) -> str: ... - @overload - def uuid4(self, count: int) -> str | list[str]: ... - def uuid4(self, count: int = 1) -> str | list[str]: - """Generate a random UUID4 string. - - Uses direct hex formatting from ``getrandbits(128)`` with - version/variant bits set arithmetically — avoids ``bytearray``, - ``bytes``, and ``uuid.UUID()`` constructor overhead entirely. - Parameters - ---------- - count : int - Number of UUIDs to generate. - """ + def uuid4(self, count: int = 1) -> str | list[str]: + """Generate a random UUID4 string.""" if count == 1: return self._one_uuid4() rng_bits = self._engine._rng.getrandbits @@ -92,40 +61,8 @@ def uuid4(self, count: int = 1) -> str | list[str]: result.append(f"{h[:8]}-{h[8:12]}-{h[12:16]}-{h[16:20]}-{h[20:]}") return result - @overload - def uuid7(self) -> str: ... - @overload - def uuid7(self, count: Literal[1]) -> str: ... - @overload - def uuid7(self, count: int) -> str | list[str]: ... def uuid7(self, count: int = 1) -> str | list[str]: - """Generate a random UUID7 string (time-ordered, monotonic). 
- - UUID7 (RFC 9562) embeds a millisecond-precision Unix timestamp - in the first 48 bits, making the values naturally sortable by - creation time — ideal for database primary keys. - - The timestamp uses real wall-clock time for time-ordering. - The random portion uses the shared engine RNG, so output is - deterministic when a seed is set (only the random bits are - reproducible; the timestamp reflects actual generation time). - - Uses direct hex formatting — avoids ``to_bytes``, ``bytearray``, - and ``uuid.UUID()`` constructor overhead entirely. - - Parameters - ---------- - count : int - Number of UUIDs to generate. - - Returns - ------- - str or list[str] - - .. versionadded:: 1.1.0 - Custom RFC 9562 implementation — no stdlib ``uuid.uuid7()`` - dependency. Works on Python >= 3.12. - """ + """Generate a random UUID7 string (time-ordered, monotonic).""" if count == 1: return self._one_uuid7() rng_bits = self._engine._rng.getrandbits @@ -142,15 +79,7 @@ def uuid7(self, count: int = 1) -> str | list[str]: return result def boolean(self, count: int = 1, probability: float = 0.5) -> bool | list[bool]: - """Generate a random boolean. - - Parameters - ---------- - count : int - Number of booleans to generate. - probability : float - Probability of ``True`` (0.0–1.0). Default 0.5. - """ + """Generate a random boolean.""" rng = self._engine._rng if count == 1: return rng.random() < probability @@ -159,38 +88,14 @@ def boolean(self, count: int = 1, probability: float = 0.5) -> bool | list[bool] def random_element( self, elements: tuple[Any, ...] | list[Any], count: int = 1 ) -> Any: - """Pick random element(s) from a user-provided collection. - - Parameters - ---------- - elements : tuple or list - The items to choose from. - count : int - Number of items to pick. 
- - Returns - ------- - Any or list[Any] - """ + """Pick random element(s) from a user-provided collection.""" data = tuple(elements) if isinstance(elements, list) else elements if count == 1: return self._engine.choice(data) return self._engine.choices(data, count) def null_or(self, value: Any, probability: float = 0.1) -> Any: - """Return ``None`` with *probability*, otherwise return *value*. - - Parameters - ---------- - value : Any - The value to return when not null. - probability : float - Probability of returning ``None`` (0.0–1.0). - - Returns - ------- - Any - """ + """Return ``None`` with *probability*, otherwise return *value*.""" if self._engine._rng.random() < probability: return None return value diff --git a/src/dataforge/providers/music.py b/src/dataforge/providers/music.py index 2cf1949..a977d1b 100644 --- a/src/dataforge/providers/music.py +++ b/src/dataforge/providers/music.py @@ -5,13 +5,9 @@ All data is stored as immutable ``tuple[str, ...]`` for cache friendliness. """ -from typing import Literal, overload - from dataforge.providers.base import BaseProvider -# ------------------------------------------------------------------ # Data tuples (immutable, module-level for zero per-call overhead) -# ------------------------------------------------------------------ _GENRES: tuple[str, ...] = ( "Rock", @@ -34,16 +30,6 @@ "Alternative", "K-Pop", "Afrobeats", - "Dancehall", - "Techno", - "House", - "Drum and Bass", - "Ambient", - "Gospel", - "Ska", - "Grunge", - "Trap", - "Lo-fi", ) _ARTISTS: tuple[str, ...] 
= ( @@ -67,26 +53,6 @@ "Thunderbolt", "Phoenix Rising", "Midnight Sun", - "Blue Velvet", - "Golden Hour", - "Platinum Waves", - "Diamond Cut", - "Ruby Red", - "Emerald City", - "Sapphire Sky", - "Amber Glow", - "Ivory Tower", - "Obsidian Edge", - "Coral Reef", - "Jade Garden", - "Opal Dreams", - "Topaz Fire", - "Onyx Knight", - "Pearl Harbor", - "Quartz Crystal", - "Garnet Stone", - "Turquoise Bay", - "Cobalt Blue", ) _ALBUM_ADJECTIVES: tuple[str, ...] = ( @@ -105,11 +71,6 @@ "Neon", "Velvet", "Crystal", - "Frozen", - "Burning", - "Rising", - "Falling", - "Hidden", ) _ALBUM_NOUNS: tuple[str, ...] = ( @@ -128,11 +89,6 @@ "Mountains", "Stars", "Flames", - "Waves", - "Gardens", - "Kingdoms", - "Journeys", - "Nights", ) _SONG_STARTERS: tuple[str, ...] = ( @@ -192,16 +148,6 @@ "Accordion", "Synthesizer", "Organ", - "Tabla", - "Sitar", - "Didgeridoo", - "Bagpipes", - "Timpani", - "Xylophone", - "Marimba", - "Vibraphone", - "French Horn", - "Tuba", ) _RECORD_LABELS: tuple[str, ...] = ( @@ -220,11 +166,6 @@ "4AD", "Rough Trade", "Domino Recording", - "XL Recordings", - "Warp Records", - "Merge Records", - "Matador Records", - "Beggars Banquet", ) _STREAMING_SERVICES: tuple[str, ...] 
= ( @@ -277,229 +218,53 @@ class MusicProvider(BaseProvider): "bpm": "bpm", } - # ------------------------------------------------------------------ + _choice_fields: dict[str, tuple[str, ...]] = { + "genre": _GENRES, + "artist": _ARTISTS, + "instrument": _INSTRUMENTS, + "record_label": _RECORD_LABELS, + "streaming_service": _STREAMING_SERVICES, + } + # Scalar helpers - # ------------------------------------------------------------------ def _one_album(self) -> str: - """Generate a single album name.""" choice = self._engine._rng.choice return f"{choice(_ALBUM_ADJECTIVES)} {choice(_ALBUM_NOUNS)}" def _one_song(self) -> str: - """Generate a single song title.""" choice = self._engine._rng.choice return f"{choice(_SONG_STARTERS)} {choice(_SONG_ENDINGS)}" def _one_duration(self) -> str: - """Generate a single track duration string (M:SS).""" ri = self._engine.random_int - minutes = ri(1, 8) - seconds = ri(0, 59) - return f"{minutes}:{seconds:02d}" + return f"{ri(1, 8)}:{ri(0, 59):02d}" def _one_bpm(self) -> str: - """Generate a single BPM value as string.""" return str(self._engine.random_int(60, 200)) - # ------------------------------------------------------------------ - # Public API - # ------------------------------------------------------------------ - - @overload - def genre(self) -> str: ... - @overload - def genre(self, count: Literal[1]) -> str: ... - @overload - def genre(self, count: int) -> str | list[str]: ... - def genre(self, count: int = 1) -> str | list[str]: - """Generate a music genre (e.g. ``"Jazz"``). - - Parameters - ---------- - count : int - Number of genres to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_GENRES) - return self._engine.choices(_GENRES, count) - - @overload - def artist(self) -> str: ... - @overload - def artist(self, count: Literal[1]) -> str: ... - @overload - def artist(self, count: int) -> str | list[str]: ... 
- def artist(self, count: int = 1) -> str | list[str]: - """Generate an artist name (e.g. ``"Silver Horizon"``). + # Public API — custom methods - Parameters - ---------- - count : int - Number of artist names to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_ARTISTS) - return self._engine.choices(_ARTISTS, count) - - @overload - def album(self) -> str: ... - @overload - def album(self, count: Literal[1]) -> str: ... - @overload - def album(self, count: int) -> str | list[str]: ... def album(self, count: int = 1) -> str | list[str]: - """Generate an album name (e.g. ``"Eternal Dreams"``). - - Parameters - ---------- - count : int - Number of album names to generate. - - Returns - ------- - str or list[str] - """ + """Generate an album name (e.g. ``"Eternal Dreams"``).""" if count == 1: return self._one_album() return [self._one_album() for _ in range(count)] - @overload - def song(self) -> str: ... - @overload - def song(self, count: Literal[1]) -> str: ... - @overload - def song(self, count: int) -> str | list[str]: ... def song(self, count: int = 1) -> str | list[str]: - """Generate a song title (e.g. ``"Dancing in the Moonlight"``). - - Parameters - ---------- - count : int - Number of song titles to generate. - - Returns - ------- - str or list[str] - """ + """Generate a song title (e.g. ``"Dancing in the Moonlight"``).""" if count == 1: return self._one_song() return [self._one_song() for _ in range(count)] - @overload - def instrument(self) -> str: ... - @overload - def instrument(self, count: Literal[1]) -> str: ... - @overload - def instrument(self, count: int) -> str | list[str]: ... - def instrument(self, count: int = 1) -> str | list[str]: - """Generate an instrument name (e.g. ``"Guitar"``). - - Parameters - ---------- - count : int - Number of instruments to generate. 
- - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_INSTRUMENTS) - return self._engine.choices(_INSTRUMENTS, count) - - @overload - def record_label(self) -> str: ... - @overload - def record_label(self, count: Literal[1]) -> str: ... - @overload - def record_label(self, count: int) -> str | list[str]: ... - def record_label(self, count: int = 1) -> str | list[str]: - """Generate a record label name (e.g. ``"Atlantic Records"``). - - Parameters - ---------- - count : int - Number of record label names to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_RECORD_LABELS) - return self._engine.choices(_RECORD_LABELS, count) - - @overload - def streaming_service(self) -> str: ... - @overload - def streaming_service(self, count: Literal[1]) -> str: ... - @overload - def streaming_service(self, count: int) -> str | list[str]: ... - def streaming_service(self, count: int = 1) -> str | list[str]: - """Generate a music streaming service name (e.g. ``"Spotify"``). - - Parameters - ---------- - count : int - Number of streaming service names to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_STREAMING_SERVICES) - return self._engine.choices(_STREAMING_SERVICES, count) - - @overload - def duration(self) -> str: ... - @overload - def duration(self, count: Literal[1]) -> str: ... - @overload - def duration(self, count: int) -> str | list[str]: ... def duration(self, count: int = 1) -> str | list[str]: - """Generate a track duration (e.g. ``"3:42"``). - - Parameters - ---------- - count : int - Number of durations to generate. - - Returns - ------- - str or list[str] - """ + """Generate a track duration (e.g. ``"3:42"``).""" if count == 1: return self._one_duration() return [self._one_duration() for _ in range(count)] - @overload - def bpm(self) -> str: ... - @overload - def bpm(self, count: Literal[1]) -> str: ... 
- @overload - def bpm(self, count: int) -> str | list[str]: ... def bpm(self, count: int = 1) -> str | list[str]: - """Generate a BPM value as string (e.g. ``"128"``). - - Parameters - ---------- - count : int - Number of BPM values to generate. - - Returns - ------- - str or list[str] - """ + """Generate a BPM value as string (e.g. ``"128"``).""" if count == 1: return self._one_bpm() return [self._one_bpm() for _ in range(count)] diff --git a/src/dataforge/providers/network.py b/src/dataforge/providers/network.py index 4b78f8b..4a192b0 100644 --- a/src/dataforge/providers/network.py +++ b/src/dataforge/providers/network.py @@ -4,8 +4,6 @@ for maximum performance (bytecode constants, zero import overhead). """ -from typing import Literal, overload - from dataforge.providers.base import BaseProvider # User-Agent templates — realistic browser strings @@ -103,9 +101,13 @@ class NetworkProvider(BaseProvider): "http_method": "http_method", "http_status_code": "http_status_code", } - # ------------------------------------------------------------------ + + _choice_fields: dict[str, tuple[str, ...]] = { + "user_agent": _USER_AGENTS, + "http_method": _HTTP_METHODS, + } + # Scalar helpers - # ------------------------------------------------------------------ def _one_ipv6(self) -> str: # Single getrandbits(128) call instead of 32 choice() calls. @@ -127,24 +129,10 @@ def _one_hostname(self) -> str: suffix = self._engine.choice(_HOST_SUFFIXES) return f"{prefix}-{num}{suffix}" - # ------------------------------------------------------------------ # Public API - # ------------------------------------------------------------------ - @overload - def ipv6(self) -> str: ... - @overload - def ipv6(self, count: Literal[1]) -> str: ... - @overload - def ipv6(self, count: int) -> str | list[str]: ... def ipv6(self, count: int = 1) -> str | list[str]: - """Generate a random IPv6 address. - - Parameters - ---------- - count : int - Number of addresses to generate. 
- """ + """Generate a random IPv6 address.""" if count == 1: return self._one_ipv6() # Inlined batch with local-bound getrandbits — avoids per-item @@ -159,20 +147,8 @@ def ipv6(self, count: int = 1) -> str | list[str]: ) return result - @overload - def mac_address(self) -> str: ... - @overload - def mac_address(self, count: Literal[1]) -> str: ... - @overload - def mac_address(self, count: int) -> str | list[str]: ... def mac_address(self, count: int = 1) -> str | list[str]: - """Generate a random MAC address (e.g. ``"a1:b2:c3:d4:e5:f6"``). - - Parameters - ---------- - count : int - Number of addresses to generate. - """ + """Generate a random MAC address (e.g. ``"a1:b2:c3:d4:e5:f6"``).""" if count == 1: return self._one_mac_address() # Inlined batch with local-bound getrandbits @@ -184,38 +160,14 @@ def mac_address(self, count: int = 1) -> str | list[str]: result.append(f"{h[0:2]}:{h[2:4]}:{h[4:6]}:{h[6:8]}:{h[8:10]}:{h[10:12]}") return result - @overload - def port(self) -> int: ... - @overload - def port(self, count: Literal[1]) -> int: ... - @overload - def port(self, count: int) -> int | list[int]: ... def port(self, count: int = 1) -> int | list[int]: - """Generate a random port number (1–65535). - - Parameters - ---------- - count : int - Number of ports to generate. - """ + """Generate a random port number (1–65535).""" if count == 1: return self._engine.random_int(1, 65535) return [self._engine.random_int(1, 65535) for _ in range(count)] - @overload - def hostname(self) -> str: ... - @overload - def hostname(self, count: Literal[1]) -> str: ... - @overload - def hostname(self, count: int) -> str | list[str]: ... def hostname(self, count: int = 1) -> str | list[str]: - """Generate a random hostname (e.g. ``"srv-48201.local"``). - - Parameters - ---------- - count : int - Number of hostnames to generate. - """ + """Generate a random hostname (e.g. 
``"srv-48201.local"``).""" if count == 1: return self._one_hostname() # Inlined batch loop with local-bound choices @@ -226,56 +178,8 @@ def hostname(self, count: int = 1) -> str | list[str]: for _ in range(count) ] - @overload - def user_agent(self) -> str: ... - @overload - def user_agent(self, count: Literal[1]) -> str: ... - @overload - def user_agent(self, count: int) -> str | list[str]: ... - def user_agent(self, count: int = 1) -> str | list[str]: - """Generate a random User-Agent string. - - Parameters - ---------- - count : int - Number of user agent strings to generate. - """ - if count == 1: - return self._engine.choice(_USER_AGENTS) - return self._engine.choices(_USER_AGENTS, count) - - @overload - def http_method(self) -> str: ... - @overload - def http_method(self, count: Literal[1]) -> str: ... - @overload - def http_method(self, count: int) -> str | list[str]: ... - def http_method(self, count: int = 1) -> str | list[str]: - """Generate a random HTTP method (GET, POST, PUT, etc.). - - Parameters - ---------- - count : int - Number of methods to generate. - """ - if count == 1: - return self._engine.choice(_HTTP_METHODS) - return self._engine.choices(_HTTP_METHODS, count) - - @overload - def http_status_code(self) -> str: ... - @overload - def http_status_code(self, count: Literal[1]) -> str: ... - @overload - def http_status_code(self, count: int) -> str | list[str]: ... def http_status_code(self, count: int = 1) -> str | list[str]: - """Generate a random HTTP status code with reason (e.g. ``"404 Not Found"``). - - Parameters - ---------- - count : int - Number of status codes to generate. - """ + """Generate a random HTTP status code with reason (e.g. 
``"404 Not Found"``).""" if count == 1: code, reason = self._engine.choice(_HTTP_STATUS_CODES) return f"{code} {reason}" diff --git a/src/dataforge/providers/payment.py b/src/dataforge/providers/payment.py index eb27d2b..91687bb 100644 --- a/src/dataforge/providers/payment.py +++ b/src/dataforge/providers/payment.py @@ -1,7 +1,5 @@ """Payment provider — credit card types, payment methods, processors, etc.""" -from typing import Literal, overload - from dataforge.providers.base import BaseProvider _CARD_TYPES: tuple[str, ...] = ( @@ -33,11 +31,6 @@ "Zelle", "Alipay", "WeChat Pay", - "Klarna", - "Afterpay", - "Cash App", - "Money Order", - "ACH Transfer", ) _PROCESSORS: tuple[str, ...] = ( @@ -151,7 +144,14 @@ class PaymentProvider(BaseProvider): "card_expiry": "expiry_date", } - # --- Scalar helpers --- + _choice_fields: dict[str, tuple[str, ...]] = { + "card_type": _CARD_TYPES, + "payment_method": _PAYMENT_METHODS, + "payment_processor": _PROCESSORS, + "transaction_status": _TRANSACTION_STATUSES, + "currency_code": _CURRENCIES, + "currency_symbol": _CURRENCY_SYMBOLS, + } def _one_transaction_id(self) -> str: return f"TXN-{self._engine.random_digits_str(12)}" @@ -171,130 +171,26 @@ def _one_expiry_date(self) -> str: year = self._engine.random_int(25, 32) return f"{month:02d}/{year:02d}" - # --- Public API --- - - @overload - def card_type(self) -> str: ... - @overload - def card_type(self, count: Literal[1]) -> str: ... - @overload - def card_type(self, count: int) -> str | list[str]: ... - def card_type(self, count: int = 1) -> str | list[str]: - """Generate a credit/debit card type (e.g., Visa, Mastercard).""" - if count == 1: - return self._engine.choice(_CARD_TYPES) - return self._engine.choices(_CARD_TYPES, count) - - @overload - def payment_method(self) -> str: ... - @overload - def payment_method(self, count: Literal[1]) -> str: ... - @overload - def payment_method(self, count: int) -> str | list[str]: ... 
- def payment_method(self, count: int = 1) -> str | list[str]: - """Generate a payment method (e.g., Credit Card, PayPal).""" - if count == 1: - return self._engine.choice(_PAYMENT_METHODS) - return self._engine.choices(_PAYMENT_METHODS, count) - - @overload - def payment_processor(self) -> str: ... - @overload - def payment_processor(self, count: Literal[1]) -> str: ... - @overload - def payment_processor(self, count: int) -> str | list[str]: ... - def payment_processor(self, count: int = 1) -> str | list[str]: - """Generate a payment processor name (e.g., Stripe, Square).""" - if count == 1: - return self._engine.choice(_PROCESSORS) - return self._engine.choices(_PROCESSORS, count) - - @overload - def transaction_status(self) -> str: ... - @overload - def transaction_status(self, count: Literal[1]) -> str: ... - @overload - def transaction_status(self, count: int) -> str | list[str]: ... - def transaction_status(self, count: int = 1) -> str | list[str]: - """Generate a transaction status (e.g., pending, completed).""" - if count == 1: - return self._engine.choice(_TRANSACTION_STATUSES) - return self._engine.choices(_TRANSACTION_STATUSES, count) - - @overload - def transaction_id(self) -> str: ... - @overload - def transaction_id(self, count: Literal[1]) -> str: ... - @overload - def transaction_id(self, count: int) -> str | list[str]: ... def transaction_id(self, count: int = 1) -> str | list[str]: """Generate a transaction ID (TXN-############).""" if count == 1: return self._one_transaction_id() - # Inlined batch with local-bound random_digits_str - _rds = self._engine.random_digits_str - return [f"TXN-{_rds(12)}" for _ in range(count)] - - @overload - def currency_code(self) -> str: ... - @overload - def currency_code(self, count: Literal[1]) -> str: ... - @overload - def currency_code(self, count: int) -> str | list[str]: ... 
- def currency_code(self, count: int = 1) -> str | list[str]: - """Generate an ISO 4217 currency code (e.g., USD, EUR).""" - if count == 1: - return self._engine.choice(_CURRENCIES) - return self._engine.choices(_CURRENCIES, count) - - @overload - def currency_symbol(self) -> str: ... - @overload - def currency_symbol(self, count: Literal[1]) -> str: ... - @overload - def currency_symbol(self, count: int) -> str | list[str]: ... - def currency_symbol(self, count: int = 1) -> str | list[str]: - """Generate a currency symbol (e.g., $, EUR, GBP).""" - if count == 1: - return self._engine.choice(_CURRENCY_SYMBOLS) - return self._engine.choices(_CURRENCY_SYMBOLS, count) + return [self._one_transaction_id() for _ in range(count)] - @overload - def payment_amount(self) -> str: ... - @overload - def payment_amount(self, count: Literal[1]) -> str: ... - @overload - def payment_amount(self, count: int) -> str | list[str]: ... def payment_amount(self, count: int = 1) -> str | list[str]: """Generate a payment amount (e.g., 49.99).""" if count == 1: return self._one_payment_amount() - _ri = self._engine.random_int - return [f"{_ri(1, 9999)}.{_ri(0, 99):02d}" for _ in range(count)] + return [self._one_payment_amount() for _ in range(count)] - @overload - def cvv(self) -> str: ... - @overload - def cvv(self, count: Literal[1]) -> str: ... - @overload - def cvv(self, count: int) -> str | list[str]: ... def cvv(self, count: int = 1) -> str | list[str]: """Generate a CVV code (3 or 4 digits).""" if count == 1: return self._one_cvv() - _rds = self._engine.random_digits_str - _ri = self._engine.random_int - return [_rds(4 if _ri(0, 1) == 0 else 3) for _ in range(count)] + return [self._one_cvv() for _ in range(count)] - @overload - def expiry_date(self) -> str: ... - @overload - def expiry_date(self, count: Literal[1]) -> str: ... - @overload - def expiry_date(self, count: int) -> str | list[str]: ... 
def expiry_date(self, count: int = 1) -> str | list[str]: """Generate a card expiry date (MM/YY).""" if count == 1: return self._one_expiry_date() - _ri = self._engine.random_int - return [f"{_ri(1, 12):02d}/{_ri(25, 32):02d}" for _ in range(count)] + return [self._one_expiry_date() for _ in range(count)] diff --git a/src/dataforge/providers/person.py b/src/dataforge/providers/person.py index 7b1f4f1..4aa2ab0 100644 --- a/src/dataforge/providers/person.py +++ b/src/dataforge/providers/person.py @@ -1,7 +1,6 @@ """Person provider — generates fake personal names.""" from types import ModuleType -from typing import Literal, overload from dataforge.backend import RandomEngine from dataforge.providers.base import BaseProvider @@ -42,6 +41,11 @@ class PersonProvider(BaseProvider): "female_first_name": "female_first_name", } + _choice_fields: dict[str, tuple[str, ...]] = { + "prefix": _PREFIXES, + "suffix": _SUFFIXES, + } + def __init__(self, engine: RandomEngine, locale_data: ModuleType) -> None: super().__init__(engine) self._first_names: tuple[str, ...] = locale_data.first_names @@ -54,63 +58,22 @@ def __init__(self, engine: RandomEngine, locale_data: ModuleType) -> None: locale_data, "female_first_names", locale_data.first_names ) - # ------------------------------------------------------------------ # Public API - # ------------------------------------------------------------------ - - @overload - def first_name(self) -> str: ... - @overload - def first_name(self, count: Literal[1]) -> str: ... - @overload - def first_name(self, count: int) -> str | list[str]: ... + def first_name(self, count: int = 1) -> str | list[str]: - """Generate a random first name. - - Parameters - ---------- - count : int - Number of names to generate. ``1`` returns a single ``str``; - any value > 1 returns a ``list[str]``. 
- """ + """Generate a random first name.""" if count == 1: return self._engine.choice(self._first_names) return self._engine.choices(self._first_names, count) - @overload - def last_name(self) -> str: ... - @overload - def last_name(self, count: Literal[1]) -> str: ... - @overload - def last_name(self, count: int) -> str | list[str]: ... def last_name(self, count: int = 1) -> str | list[str]: - """Generate a random last name. - - Parameters - ---------- - count : int - Number of names to generate. ``1`` returns a single ``str``; - any value > 1 returns a ``list[str]``. - """ + """Generate a random last name.""" if count == 1: return self._engine.choice(self._last_names) return self._engine.choices(self._last_names, count) - @overload - def full_name(self) -> str: ... - @overload - def full_name(self, count: Literal[1]) -> str: ... - @overload - def full_name(self, count: int) -> str | list[str]: ... def full_name(self, count: int = 1) -> str | list[str]: - """Generate a random full name (first + last). - - Parameters - ---------- - count : int - Number of names to generate. ``1`` returns a single ``str``; - any value > 1 returns a ``list[str]``. - """ + """Generate a random full name (first + last).""" if count == 1: first = self._engine.choice(self._first_names) last = self._engine.choice(self._last_names) @@ -120,76 +83,14 @@ def full_name(self, count: int = 1) -> str | list[str]: lasts = self._engine.choices(self._last_names, count) return [f"{f} {ln}" for f, ln in zip(firsts, lasts)] - @overload - def prefix(self) -> str: ... - @overload - def prefix(self, count: Literal[1]) -> str: ... - @overload - def prefix(self, count: int) -> str | list[str]: ... - def prefix(self, count: int = 1) -> str | list[str]: - """Generate a name prefix (Mr., Mrs., Ms., Dr.). - - Parameters - ---------- - count : int - Number of prefixes to generate. 
- """ - prefixes = _PREFIXES - if count == 1: - return self._engine.choice(prefixes) - return self._engine.choices(prefixes, count) - - @overload - def suffix(self) -> str: ... - @overload - def suffix(self, count: Literal[1]) -> str: ... - @overload - def suffix(self, count: int) -> str | list[str]: ... - def suffix(self, count: int = 1) -> str | list[str]: - """Generate a name suffix (Jr., Sr., III, IV, V). - - Parameters - ---------- - count : int - Number of suffixes to generate. - """ - suffixes = _SUFFIXES - if count == 1: - return self._engine.choice(suffixes) - return self._engine.choices(suffixes, count) - - @overload - def male_first_name(self) -> str: ... - @overload - def male_first_name(self, count: Literal[1]) -> str: ... - @overload - def male_first_name(self, count: int) -> str | list[str]: ... def male_first_name(self, count: int = 1) -> str | list[str]: - """Generate a random male first name. - - Parameters - ---------- - count : int - Number of names to generate. - """ + """Generate a random male first name.""" if count == 1: return self._engine.choice(self._male_first_names) return self._engine.choices(self._male_first_names, count) - @overload - def female_first_name(self) -> str: ... - @overload - def female_first_name(self, count: Literal[1]) -> str: ... - @overload - def female_first_name(self, count: int) -> str | list[str]: ... def female_first_name(self, count: int = 1) -> str | list[str]: - """Generate a random female first name. - - Parameters - ---------- - count : int - Number of names to generate. 
- """ + """Generate a random female first name.""" if count == 1: return self._engine.choice(self._female_first_names) return self._engine.choices(self._female_first_names, count) diff --git a/src/dataforge/providers/phone.py b/src/dataforge/providers/phone.py index 62670b5..dc69eca 100644 --- a/src/dataforge/providers/phone.py +++ b/src/dataforge/providers/phone.py @@ -1,7 +1,6 @@ """Phone provider — generates fake phone and cell numbers.""" from types import ModuleType -from typing import Literal, overload from dataforge.backend import RandomEngine from dataforge.providers.base import BaseProvider @@ -34,9 +33,7 @@ def __init__(self, engine: RandomEngine, locale_data: ModuleType) -> None: self._phone_formats: tuple[str, ...] = locale_data.phone_formats self._cell_formats: tuple[str, ...] = locale_data.cell_formats - # ------------------------------------------------------------------ # Scalar helpers - # ------------------------------------------------------------------ def _one_phone(self) -> str: fmt = self._engine.choice(self._phone_formats) @@ -46,42 +43,16 @@ def _one_cell(self) -> str: fmt = self._engine.choice(self._cell_formats) return self._engine.numerify(fmt) - # ------------------------------------------------------------------ # Public API - # ------------------------------------------------------------------ - @overload - def phone_number(self) -> str: ... - @overload - def phone_number(self, count: Literal[1]) -> str: ... - @overload - def phone_number(self, count: int) -> str | list[str]: ... def phone_number(self, count: int = 1) -> str | list[str]: - """Generate a random phone number. - - Parameters - ---------- - count : int - Number of phone numbers to generate. - """ + """Generate a random phone number.""" if count == 1: return self._one_phone() return [self._one_phone() for _ in range(count)] - @overload - def cell_phone(self) -> str: ... - @overload - def cell_phone(self, count: Literal[1]) -> str: ... 
- @overload - def cell_phone(self, count: int) -> str | list[str]: ... def cell_phone(self, count: int = 1) -> str | list[str]: - """Generate a random cell phone number. - - Parameters - ---------- - count : int - Number of cell phone numbers to generate. - """ + """Generate a random cell phone number.""" if count == 1: return self._one_cell() return [self._one_cell() for _ in range(count)] diff --git a/src/dataforge/providers/profile.py b/src/dataforge/providers/profile.py index 08ed67a..fe47de6 100644 --- a/src/dataforge/providers/profile.py +++ b/src/dataforge/providers/profile.py @@ -9,7 +9,7 @@ and is available only via direct API use. """ -from typing import TYPE_CHECKING, Literal, overload +from typing import TYPE_CHECKING from dataforge.backend import RandomEngine from dataforge.providers.base import BaseProvider @@ -19,19 +19,7 @@ class ProfileProvider(BaseProvider): - """Generates coherent fake user profiles. - - Unlike other providers, ``ProfileProvider`` needs a reference to - the parent :class:`DataForge` instance so it can delegate to - ``person``, ``internet``, ``address``, and ``phone`` providers. - - Parameters - ---------- - engine : RandomEngine - The shared random engine instance. - forge : DataForge - The parent DataForge instance for cross-provider access. - """ + """Generates coherent fake user profiles.""" __slots__ = ("_forge",) @@ -53,195 +41,44 @@ def __init__(self, engine: RandomEngine, forge: "DataForge") -> None: super().__init__(engine) self._forge = forge - # ------------------------------------------------------------------ # Individual field methods (for _field_map / Schema compatibility) - # These delegate to sub-providers — values are independent per call. - # For coherent profiles, use profile() instead. - # ------------------------------------------------------------------ - @overload - def profile_first_name(self) -> str: ... - @overload - def profile_first_name(self, count: Literal[1]) -> str: ... 
- @overload - def profile_first_name(self, count: int) -> str | list[str]: ... def profile_first_name(self, count: int = 1) -> str | list[str]: - """Generate a first name (delegates to PersonProvider). - - Parameters - ---------- - count : int - Number of names to generate. - - Returns - ------- - str or list[str] - """ + """Generate a first name (delegates to PersonProvider).""" return self._forge.person.first_name(count) - @overload - def profile_last_name(self) -> str: ... - @overload - def profile_last_name(self, count: Literal[1]) -> str: ... - @overload - def profile_last_name(self, count: int) -> str | list[str]: ... def profile_last_name(self, count: int = 1) -> str | list[str]: - """Generate a last name (delegates to PersonProvider). - - Parameters - ---------- - count : int - Number of names to generate. - - Returns - ------- - str or list[str] - """ + """Generate a last name (delegates to PersonProvider).""" return self._forge.person.last_name(count) - @overload - def profile_email(self) -> str: ... - @overload - def profile_email(self, count: Literal[1]) -> str: ... - @overload - def profile_email(self, count: int) -> str | list[str]: ... def profile_email(self, count: int = 1) -> str | list[str]: - """Generate an email address (delegates to InternetProvider). - - Parameters - ---------- - count : int - Number of emails to generate. - - Returns - ------- - str or list[str] - """ + """Generate an email address (delegates to InternetProvider).""" return self._forge.internet.email(count) - @overload - def profile_phone(self) -> str: ... - @overload - def profile_phone(self, count: Literal[1]) -> str: ... - @overload - def profile_phone(self, count: int) -> str | list[str]: ... def profile_phone(self, count: int = 1) -> str | list[str]: - """Generate a phone number (delegates to PhoneProvider). - - Parameters - ---------- - count : int - Number of phone numbers to generate. 
- - Returns - ------- - str or list[str] - """ + """Generate a phone number (delegates to PhoneProvider).""" return self._forge.phone.phone_number(count) - @overload - def profile_city(self) -> str: ... - @overload - def profile_city(self, count: Literal[1]) -> str: ... - @overload - def profile_city(self, count: int) -> str | list[str]: ... def profile_city(self, count: int = 1) -> str | list[str]: - """Generate a city name (delegates to AddressProvider). - - Parameters - ---------- - count : int - Number of cities to generate. - - Returns - ------- - str or list[str] - """ + """Generate a city name (delegates to AddressProvider).""" return self._forge.address.city(count) - @overload - def profile_state(self) -> str: ... - @overload - def profile_state(self, count: Literal[1]) -> str: ... - @overload - def profile_state(self, count: int) -> str | list[str]: ... def profile_state(self, count: int = 1) -> str | list[str]: - """Generate a state name (delegates to AddressProvider). - - Parameters - ---------- - count : int - Number of states to generate. - - Returns - ------- - str or list[str] - """ + """Generate a state name (delegates to AddressProvider).""" return self._forge.address.state(count) - @overload - def profile_zip_code(self) -> str: ... - @overload - def profile_zip_code(self, count: Literal[1]) -> str: ... - @overload - def profile_zip_code(self, count: int) -> str | list[str]: ... def profile_zip_code(self, count: int = 1) -> str | list[str]: - """Generate a zip code (delegates to AddressProvider). - - Parameters - ---------- - count : int - Number of zip codes to generate. - - Returns - ------- - str or list[str] - """ + """Generate a zip code (delegates to AddressProvider).""" return self._forge.address.zip_code(count) - @overload - def profile_job_title(self) -> str: ... - @overload - def profile_job_title(self, count: Literal[1]) -> str: ... - @overload - def profile_job_title(self, count: int) -> str | list[str]: ... 
def profile_job_title(self, count: int = 1) -> str | list[str]: - """Generate a job title (delegates to CompanyProvider). - - Parameters - ---------- - count : int - Number of job titles to generate. - - Returns - ------- - str or list[str] - """ + """Generate a job title (delegates to CompanyProvider).""" return self._forge.company.job_title(count) - # ------------------------------------------------------------------ # Compound profile method (direct API only, not in _field_map) - # ------------------------------------------------------------------ def profile(self, count: int = 1) -> dict[str, str] | list[dict[str, str]]: - """Generate a coherent user profile. - - Each profile is a ``dict`` with keys: ``first_name``, - ``last_name``, ``email``, ``phone``, ``city``, ``state``, - ``zip_code``, ``job_title``. - - The ``email`` is derived from the same first/last name for - coherence within each profile. - - Parameters - ---------- - count : int - Number of profiles to generate. - - Returns - ------- - dict[str, str] or list[dict[str, str]] - """ + """Generate a coherent user profile dict.""" def _one_profile() -> dict[str, str]: first = self._forge.person.first_name() diff --git a/src/dataforge/providers/real_estate.py b/src/dataforge/providers/real_estate.py index 64e7f45..44eafa5 100644 --- a/src/dataforge/providers/real_estate.py +++ b/src/dataforge/providers/real_estate.py @@ -5,13 +5,9 @@ All data is stored as immutable ``tuple[str, ...]`` for cache friendliness. """ -from typing import Literal, overload - from dataforge.providers.base import BaseProvider -# ------------------------------------------------------------------ # Data tuples (immutable, module-level for zero per-call overhead) -# ------------------------------------------------------------------ _PROPERTY_TYPES: tuple[str, ...] 
= ( "Single Family Home", @@ -34,16 +30,6 @@ "Mediterranean", "Tudor", "Cape Cod", - "Farmhouse", - "Cabin", - "Mansion", - "Mobile Home", - "Co-op", - "Land", - "Commercial", - "Industrial", - "Mixed Use", - "Warehouse", ) _NEIGHBORHOODS: tuple[str, ...] = ( @@ -67,16 +53,6 @@ "Fairview", "Heritage Park", "Eagle Ridge", - "Harbor Point", - "Bay View", - "Forest Glen", - "Meadow Creek", - "Stonebridge", - "Willow Springs", - "Coral Gables", - "Silver Lake", - "Mission Hills", - "Pacific Heights", ) _BUILDING_MATERIALS: tuple[str, ...] = ( @@ -95,11 +71,6 @@ "Timber Frame", "Precast Concrete", "Insulated Concrete Form", - "Structural Insulated Panel", - "Cross-Laminated Timber", - "Rammed Earth", - "Cob", - "Hempcrete", ) _LISTING_STATUSES: tuple[str, ...] = ( @@ -138,16 +109,6 @@ "Solar Panels", "EV Charger", "Security System", - "Laundry Room", - "Basement", - "Attic", - "Rooftop Terrace", - "Elevator", - "Concierge", - "Doorman", - "Pet-Friendly", - "In-Unit Washer/Dryer", - "Storage Unit", ) _HEATING_TYPES: tuple[str, ...] 
= ( @@ -194,9 +155,16 @@ class RealEstateProvider(BaseProvider): "year_built": "year_built", } - # ------------------------------------------------------------------ + _choice_fields: dict[str, tuple[str, ...]] = { + "property_type": _PROPERTY_TYPES, + "neighborhood": _NEIGHBORHOODS, + "building_material": _BUILDING_MATERIALS, + "listing_status": _LISTING_STATUSES, + "amenity": _AMENITIES, + "heating_type": _HEATING_TYPES, + } + # Scalar helpers - # ------------------------------------------------------------------ def _one_listing_price(self) -> str: """Generate a single listing price string.""" @@ -223,248 +191,34 @@ def _one_year_built(self) -> str: """Generate a single year built string.""" return str(self._engine.random_int(1900, 2026)) - # ------------------------------------------------------------------ - # Public API - # ------------------------------------------------------------------ - - @overload - def property_type(self) -> str: ... - @overload - def property_type(self, count: Literal[1]) -> str: ... - @overload - def property_type(self, count: int) -> str | list[str]: ... - def property_type(self, count: int = 1) -> str | list[str]: - """Generate a property type (e.g. ``"Condo"``). - - Parameters - ---------- - count : int - Number of property types to generate. + # Public API — custom methods - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_PROPERTY_TYPES) - return self._engine.choices(_PROPERTY_TYPES, count) - - @overload - def listing_price(self) -> str: ... - @overload - def listing_price(self, count: Literal[1]) -> str: ... - @overload - def listing_price(self, count: int) -> str | list[str]: ... def listing_price(self, count: int = 1) -> str | list[str]: - """Generate a listing price (e.g. ``"$450,000"``). - - Parameters - ---------- - count : int - Number of listing prices to generate. - - Returns - ------- - str or list[str] - """ + """Generate a listing price (e.g. 
``"$450,000"``).""" if count == 1: return self._one_listing_price() return [self._one_listing_price() for _ in range(count)] - @overload - def square_footage(self) -> str: ... - @overload - def square_footage(self, count: Literal[1]) -> str: ... - @overload - def square_footage(self, count: int) -> str | list[str]: ... def square_footage(self, count: int = 1) -> str | list[str]: - """Generate square footage (e.g. ``"2,400 sqft"``). - - Parameters - ---------- - count : int - Number of square footages to generate. - - Returns - ------- - str or list[str] - """ + """Generate square footage (e.g. ``"2,400 sqft"``).""" if count == 1: return self._one_sqft() return [self._one_sqft() for _ in range(count)] - @overload - def bedrooms(self) -> str: ... - @overload - def bedrooms(self, count: Literal[1]) -> str: ... - @overload - def bedrooms(self, count: int) -> str | list[str]: ... def bedrooms(self, count: int = 1) -> str | list[str]: - """Generate a bedroom count (e.g. ``"3"``). - - Parameters - ---------- - count : int - Number of bedroom counts to generate. - - Returns - ------- - str or list[str] - """ + """Generate a bedroom count (e.g. ``"3"``).""" if count == 1: return self._one_bedrooms() return [self._one_bedrooms() for _ in range(count)] - @overload - def bathrooms(self) -> str: ... - @overload - def bathrooms(self, count: Literal[1]) -> str: ... - @overload - def bathrooms(self, count: int) -> str | list[str]: ... def bathrooms(self, count: int = 1) -> str | list[str]: - """Generate a bathroom count (e.g. ``"2.5"``). - - Parameters - ---------- - count : int - Number of bathroom counts to generate. - - Returns - ------- - str or list[str] - """ + """Generate a bathroom count (e.g. ``"2.5"``).""" if count == 1: return self._one_bathrooms() return [self._one_bathrooms() for _ in range(count)] - @overload - def neighborhood(self) -> str: ... - @overload - def neighborhood(self, count: Literal[1]) -> str: ... 
- @overload - def neighborhood(self, count: int) -> str | list[str]: ... - def neighborhood(self, count: int = 1) -> str | list[str]: - """Generate a neighborhood name (e.g. ``"Hillcrest"``). - - Parameters - ---------- - count : int - Number of neighborhood names to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_NEIGHBORHOODS) - return self._engine.choices(_NEIGHBORHOODS, count) - - @overload - def building_material(self) -> str: ... - @overload - def building_material(self, count: Literal[1]) -> str: ... - @overload - def building_material(self, count: int) -> str | list[str]: ... - def building_material(self, count: int = 1) -> str | list[str]: - """Generate a building material (e.g. ``"Brick"``). - - Parameters - ---------- - count : int - Number of building materials to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_BUILDING_MATERIALS) - return self._engine.choices(_BUILDING_MATERIALS, count) - - @overload - def listing_status(self) -> str: ... - @overload - def listing_status(self, count: Literal[1]) -> str: ... - @overload - def listing_status(self, count: int) -> str | list[str]: ... - def listing_status(self, count: int = 1) -> str | list[str]: - """Generate a listing status (e.g. ``"For Sale"``). - - Parameters - ---------- - count : int - Number of listing statuses to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_LISTING_STATUSES) - return self._engine.choices(_LISTING_STATUSES, count) - - @overload - def amenity(self) -> str: ... - @overload - def amenity(self, count: Literal[1]) -> str: ... - @overload - def amenity(self, count: int) -> str | list[str]: ... - def amenity(self, count: int = 1) -> str | list[str]: - """Generate a property amenity (e.g. ``"Swimming Pool"``). - - Parameters - ---------- - count : int - Number of amenities to generate. 
- - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_AMENITIES) - return self._engine.choices(_AMENITIES, count) - - @overload - def heating_type(self) -> str: ... - @overload - def heating_type(self, count: Literal[1]) -> str: ... - @overload - def heating_type(self, count: int) -> str | list[str]: ... - def heating_type(self, count: int = 1) -> str | list[str]: - """Generate a heating type (e.g. ``"Forced Air"``). - - Parameters - ---------- - count : int - Number of heating types to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_HEATING_TYPES) - return self._engine.choices(_HEATING_TYPES, count) - - @overload - def year_built(self) -> str: ... - @overload - def year_built(self, count: Literal[1]) -> str: ... - @overload - def year_built(self, count: int) -> str | list[str]: ... def year_built(self, count: int = 1) -> str | list[str]: - """Generate a year built (e.g. ``"1985"``). - - Parameters - ---------- - count : int - Number of year-built values to generate. - - Returns - ------- - str or list[str] - """ + """Generate a year built (e.g. ``"1985"``).""" if count == 1: return self._one_year_built() return [self._one_year_built() for _ in range(count)] diff --git a/src/dataforge/providers/science.py b/src/dataforge/providers/science.py index 744935e..138f7eb 100644 --- a/src/dataforge/providers/science.py +++ b/src/dataforge/providers/science.py @@ -1,7 +1,5 @@ """Science provider — elements, units, formulas, planets, etc.""" -from typing import Literal, overload - from dataforge.providers.base import BaseProvider _CHEMICAL_ELEMENTS: tuple[str, ...] 
= ( @@ -35,26 +33,6 @@ "Nickel", "Copper", "Zinc", - "Gallium", - "Germanium", - "Arsenic", - "Selenium", - "Bromine", - "Krypton", - "Rubidium", - "Strontium", - "Yttrium", - "Zirconium", - "Niobium", - "Molybdenum", - "Technetium", - "Ruthenium", - "Rhodium", - "Palladium", - "Silver", - "Gold", - "Platinum", - "Iridium", ) _ELEMENT_SYMBOLS: tuple[str, ...] = ( @@ -88,26 +66,6 @@ "Ni", "Cu", "Zn", - "Ga", - "Ge", - "As", - "Se", - "Br", - "Kr", - "Rb", - "Sr", - "Y", - "Zr", - "Nb", - "Mo", - "Tc", - "Ru", - "Rh", - "Pd", - "Ag", - "Au", - "Pt", - "Ir", ) _SI_UNITS: tuple[str, ...] = ( @@ -183,16 +141,6 @@ "Cancer", "Capricornus", "Libra", - "Pisces", - "Virgo", - "Cygnus", - "Lyra", - "Pegasus", - "Perseus", - "Draco", - "Centaurus", - "Canis Major", - "Canis Minor", ) _SCIENTIFIC_DISCIPLINES: tuple[str, ...] = ( @@ -211,16 +159,6 @@ "Electrodynamics", "Organic Chemistry", "Inorganic Chemistry", - "Molecular Biology", - "Astrophysics", - "Cosmology", - "Paleontology", - "Oceanography", - "Meteorology", - "Seismology", - "Virology", - "Immunology", - "Epidemiology", ) _METRIC_PREFIXES: tuple[str, ...] = ( @@ -266,100 +204,13 @@ class ScienceProvider(BaseProvider): "metric_prefix": "metric_prefix", } - # --- Public API --- - - @overload - def chemical_element(self) -> str: ... - @overload - def chemical_element(self, count: Literal[1]) -> str: ... - @overload - def chemical_element(self, count: int) -> str | list[str]: ... - def chemical_element(self, count: int = 1) -> str | list[str]: - """Generate a chemical element name (e.g., Hydrogen, Carbon).""" - if count == 1: - return self._engine.choice(_CHEMICAL_ELEMENTS) - return self._engine.choices(_CHEMICAL_ELEMENTS, count) - - @overload - def element_symbol(self) -> str: ... - @overload - def element_symbol(self, count: Literal[1]) -> str: ... - @overload - def element_symbol(self, count: int) -> str | list[str]: ... 
- def element_symbol(self, count: int = 1) -> str | list[str]: - """Generate a chemical element symbol (e.g., H, C, Fe).""" - if count == 1: - return self._engine.choice(_ELEMENT_SYMBOLS) - return self._engine.choices(_ELEMENT_SYMBOLS, count) - - @overload - def si_unit(self) -> str: ... - @overload - def si_unit(self, count: Literal[1]) -> str: ... - @overload - def si_unit(self, count: int) -> str | list[str]: ... - def si_unit(self, count: int = 1) -> str | list[str]: - """Generate an SI unit (e.g., meter (m), joule (J)).""" - if count == 1: - return self._engine.choice(_SI_UNITS) - return self._engine.choices(_SI_UNITS, count) - - @overload - def planet(self) -> str: ... - @overload - def planet(self, count: Literal[1]) -> str: ... - @overload - def planet(self, count: int) -> str | list[str]: ... - def planet(self, count: int = 1) -> str | list[str]: - """Generate a planet name from our solar system.""" - if count == 1: - return self._engine.choice(_PLANETS) - return self._engine.choices(_PLANETS, count) - - @overload - def galaxy(self) -> str: ... - @overload - def galaxy(self, count: Literal[1]) -> str: ... - @overload - def galaxy(self, count: int) -> str | list[str]: ... - def galaxy(self, count: int = 1) -> str | list[str]: - """Generate a galaxy name (e.g., Milky Way, Andromeda).""" - if count == 1: - return self._engine.choice(_GALAXIES) - return self._engine.choices(_GALAXIES, count) - - @overload - def constellation(self) -> str: ... - @overload - def constellation(self, count: Literal[1]) -> str: ... - @overload - def constellation(self, count: int) -> str | list[str]: ... - def constellation(self, count: int = 1) -> str | list[str]: - """Generate a constellation name (e.g., Orion, Leo).""" - if count == 1: - return self._engine.choice(_CONSTELLATIONS) - return self._engine.choices(_CONSTELLATIONS, count) - - @overload - def scientific_discipline(self) -> str: ... - @overload - def scientific_discipline(self, count: Literal[1]) -> str: ... 
- @overload - def scientific_discipline(self, count: int) -> str | list[str]: ... - def scientific_discipline(self, count: int = 1) -> str | list[str]: - """Generate a scientific discipline (e.g., Physics, Genetics).""" - if count == 1: - return self._engine.choice(_SCIENTIFIC_DISCIPLINES) - return self._engine.choices(_SCIENTIFIC_DISCIPLINES, count) - - @overload - def metric_prefix(self) -> str: ... - @overload - def metric_prefix(self, count: Literal[1]) -> str: ... - @overload - def metric_prefix(self, count: int) -> str | list[str]: ... - def metric_prefix(self, count: int = 1) -> str | list[str]: - """Generate a metric prefix (e.g., kilo (k), nano (n)).""" - if count == 1: - return self._engine.choice(_METRIC_PREFIXES) - return self._engine.choices(_METRIC_PREFIXES, count) + _choice_fields: dict[str, tuple[str, ...]] = { + "chemical_element": _CHEMICAL_ELEMENTS, + "element_symbol": _ELEMENT_SYMBOLS, + "si_unit": _SI_UNITS, + "planet": _PLANETS, + "galaxy": _GALAXIES, + "constellation": _CONSTELLATIONS, + "scientific_discipline": _SCIENTIFIC_DISCIPLINES, + "metric_prefix": _METRIC_PREFIXES, + } diff --git a/src/dataforge/providers/social_media.py b/src/dataforge/providers/social_media.py index 9225285..b003682 100644 --- a/src/dataforge/providers/social_media.py +++ b/src/dataforge/providers/social_media.py @@ -5,13 +5,9 @@ All data is stored as immutable ``tuple[str, ...]`` for cache friendliness. """ -from typing import Literal, overload - from dataforge.providers.base import BaseProvider -# ------------------------------------------------------------------ # Data tuples (immutable, module-level for zero per-call overhead) -# ------------------------------------------------------------------ _PLATFORMS: tuple[str, ...] = ( "Twitter", @@ -29,11 +25,6 @@ "Discord", "Twitch", "Tumblr", - "WhatsApp", - "Telegram", - "Signal", - "WeChat", - "Line", ) _POST_TYPES: tuple[str, ...] 
= ( @@ -70,11 +61,6 @@ "heart", "fire", "clap", - "100", - "thumbs_up", - "thumbs_down", - "laugh", - "cry", ) _HASHTAG_WORDS: tuple[str, ...] = ( @@ -98,26 +84,6 @@ "nature", "girl", "fun", - "style", - "smile", - "food", - "travel", - "fitness", - "music", - "beauty", - "photo", - "life", - "motivation", - "family", - "nofilter", - "makeup", - "ootd", - "dog", - "explore", - "viral", - "trending", - "fyp", - "goals", ) _USERNAME_ADJECTIVES: tuple[str, ...] = ( @@ -141,16 +107,6 @@ "wild", "zen", "blue", - "red", - "green", - "black", - "white", - "silver", - "cyber", - "pixel", - "quantum", - "shadow", - "storm", ) _USERNAME_NOUNS: tuple[str, ...] = ( @@ -174,16 +130,6 @@ "sage", "spark", "blaze", - "frost", - "ghost", - "raven", - "viper", - "cobra", - "eagle", - "panda", - "shark", - "whale", - "falcon", ) _CONTENT_SNIPPETS: tuple[str, ...] = ( @@ -202,11 +148,6 @@ "Throwback to this moment", "Tag someone who needs to see this", "Link in bio!", - "What do you think?", - "Swipe for more!", - "Follow for more content like this", - "Drop a comment below!", - "Story time...", ) @@ -238,9 +179,14 @@ class SocialMediaProvider(BaseProvider): "verified": "verified", } - # ------------------------------------------------------------------ + _choice_fields: dict[str, tuple[str, ...]] = { + "platform": _PLATFORMS, + "post_type": _POST_TYPES, + "reaction": _REACTIONS, + "content": _CONTENT_SNIPPETS, + } + # Scalar helpers - # ------------------------------------------------------------------ def _one_username(self) -> str: choice = self._engine._rng.choice @@ -260,182 +206,28 @@ def _one_follower_count(self) -> str: return f"{n / 1_000:.1f}K" return str(n) - # ------------------------------------------------------------------ - # Public API - # ------------------------------------------------------------------ - - @overload - def platform(self) -> str: ... - @overload - def platform(self, count: Literal[1]) -> str: ... 
- @overload - def platform(self, count: int) -> str | list[str]: ... - def platform(self, count: int = 1) -> str | list[str]: - """Generate a social media platform name (e.g. ``"Instagram"``). - - Parameters - ---------- - count : int - Number of items to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_PLATFORMS) - return self._engine.choices(_PLATFORMS, count) + # Public API — custom methods - @overload - def username(self) -> str: ... - @overload - def username(self, count: Literal[1]) -> str: ... - @overload - def username(self, count: int) -> str | list[str]: ... def username(self, count: int = 1) -> str | list[str]: - """Generate a social media username (e.g. ``"cool_wolf42"``). - - Parameters - ---------- - count : int - Number of items to generate. - - Returns - ------- - str or list[str] - """ + """Generate a social media username (e.g. ``"cool_wolf42"``).""" if count == 1: return self._one_username() return [self._one_username() for _ in range(count)] - @overload - def hashtag(self) -> str: ... - @overload - def hashtag(self, count: Literal[1]) -> str: ... - @overload - def hashtag(self, count: int) -> str | list[str]: ... def hashtag(self, count: int = 1) -> str | list[str]: - """Generate a hashtag (e.g. ``"#trending"``). - - Parameters - ---------- - count : int - Number of items to generate. - - Returns - ------- - str or list[str] - """ + """Generate a hashtag (e.g. ``"#trending"``).""" if count == 1: return self._one_hashtag() return [self._one_hashtag() for _ in range(count)] - @overload - def post_type(self) -> str: ... - @overload - def post_type(self, count: Literal[1]) -> str: ... - @overload - def post_type(self, count: int) -> str | list[str]: ... - def post_type(self, count: int = 1) -> str | list[str]: - """Generate a social media post type (e.g. ``"reel"``). - - Parameters - ---------- - count : int - Number of items to generate. 
- - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_POST_TYPES) - return self._engine.choices(_POST_TYPES, count) - - @overload - def reaction(self) -> str: ... - @overload - def reaction(self, count: Literal[1]) -> str: ... - @overload - def reaction(self, count: int) -> str | list[str]: ... - def reaction(self, count: int = 1) -> str | list[str]: - """Generate a social media reaction (e.g. ``"love"``). - - Parameters - ---------- - count : int - Number of items to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_REACTIONS) - return self._engine.choices(_REACTIONS, count) - - @overload - def follower_count(self) -> str: ... - @overload - def follower_count(self, count: Literal[1]) -> str: ... - @overload - def follower_count(self, count: int) -> str | list[str]: ... def follower_count(self, count: int = 1) -> str | list[str]: - """Generate a formatted follower count (e.g. ``"1.2M"``). - - Parameters - ---------- - count : int - Number of items to generate. - - Returns - ------- - str or list[str] - """ + """Generate a formatted follower count (e.g. ``"1.2M"``).""" if count == 1: return self._one_follower_count() return [self._one_follower_count() for _ in range(count)] - @overload - def content(self) -> str: ... - @overload - def content(self, count: Literal[1]) -> str: ... - @overload - def content(self, count: int) -> str | list[str]: ... - def content(self, count: int = 1) -> str | list[str]: - """Generate social media post content. - - Parameters - ---------- - count : int - Number of items to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_CONTENT_SNIPPETS) - return self._engine.choices(_CONTENT_SNIPPETS, count) - - @overload - def verified(self) -> str: ... - @overload - def verified(self, count: Literal[1]) -> str: ... - @overload - def verified(self, count: int) -> str | list[str]: ... 
def verified(self, count: int = 1) -> str | list[str]: - """Generate a verification status (``"verified"`` or ``"unverified"``). - - Parameters - ---------- - count : int - Number of items to generate. - - Returns - ------- - str or list[str] - """ + """Generate a verification status (``"verified"`` or ``"unverified"``).""" _choices = ("verified", "unverified") if count == 1: return self._engine.choice(_choices) diff --git a/src/dataforge/providers/sports.py b/src/dataforge/providers/sports.py index 4096445..68bb617 100644 --- a/src/dataforge/providers/sports.py +++ b/src/dataforge/providers/sports.py @@ -5,13 +5,9 @@ All data is stored as immutable ``tuple[str, ...]`` for cache friendliness. """ -from typing import Literal, overload - from dataforge.providers.base import BaseProvider -# ------------------------------------------------------------------ # Data tuples (immutable, module-level for zero per-call overhead) -# ------------------------------------------------------------------ _SPORTS: tuple[str, ...] = ( "Football", @@ -34,16 +30,6 @@ "Skiing", "Snowboarding", "Surfing", - "Skateboarding", - "Track and Field", - "Gymnastics", - "Fencing", - "Rowing", - "Archery", - "Handball", - "Water Polo", - "Lacrosse", - "Motorsport", ) _TEAMS: tuple[str, ...] = ( @@ -67,26 +53,6 @@ "Vikings", "Spartans", "Rebels", - "Royals", - "Kings", - "Generals", - "Mustangs", - "Cougars", - "Jaguars", - "Dolphins", - "Stallions", - "Falcons", - "Cobras", - "Phoenix", - "Vipers", - "Raptors", - "Hornets", - "Bulldogs", - "Grizzlies", - "Wildcats", - "Rams", - "Chargers", - "Raiders", ) _LEAGUES: tuple[str, ...] = ( @@ -110,16 +76,6 @@ "NASCAR", "UFC", "WWE", - "A-League", - "J-League", - "K League", - "Eredivisie", - "Primeira Liga", - "Scottish Premiership", - "Super Rugby", - "Six Nations", - "BBL", - "CPL", ) _POSITIONS: tuple[str, ...] 
= ( @@ -143,16 +99,6 @@ "Striker", "Winger", "Flanker", - "Fly-half", - "Hooker", - "Lock", - "Prop", - "Scrum-half", - "Setter", - "Libero", - "Outside Hitter", - "Goaltender", - "Defenseman", ) _VENUES: tuple[str, ...] = ( @@ -176,16 +122,6 @@ "Fenway Park", "Oracle Park", "MetLife Stadium", - "SoFi Stadium", - "Levi's Stadium", - "Lincoln Financial Field", - "Hard Rock Stadium", - "Allegiant Stadium", - "State Farm Arena", - "Barclays Center", - "Chase Center", - "United Center", - "TD Garden", ) _EVENTS: tuple[str, ...] = ( @@ -209,16 +145,6 @@ "Pan American Games", "Asian Games", "Rugby World Cup", - "Cricket World Cup", - "FIFA Club World Cup", - "UEFA Euro", - "Copa America", - "African Cup of Nations", - "Ryder Cup", - "All-Star Game", - "Pro Bowl", - "X Games", - "Ironman Triathlon", ) _ATHLETE_FIRST: tuple[str, ...] = ( @@ -237,11 +163,6 @@ "Eric", "Kevin", "Ryan", - "Sofia", - "Emma", - "Mia", - "Serena", - "Venus", ) _ATHLETE_LAST: tuple[str, ...] = ( @@ -260,11 +181,6 @@ "Jackson", "Martin", "Lee", - "Thompson", - "White", - "Harris", - "Clark", - "Robinson", ) @@ -299,196 +215,35 @@ class SportsProvider(BaseProvider): "score": "score", } - # ------------------------------------------------------------------ + _choice_fields: dict[str, tuple[str, ...]] = { + "sport": _SPORTS, + "team": _TEAMS, + "league": _LEAGUES, + "position": _POSITIONS, + "venue": _VENUES, + "event": _EVENTS, + } + # Scalar helpers - # ------------------------------------------------------------------ def _one_athlete(self) -> str: - """Generate a single athlete name.""" choice = self._engine._rng.choice return f"{choice(_ATHLETE_FIRST)} {choice(_ATHLETE_LAST)}" def _one_score(self) -> str: - """Generate a single game score.""" ri = self._engine.random_int return f"{ri(0, 120)}-{ri(0, 120)}" - # ------------------------------------------------------------------ - # Public API - # ------------------------------------------------------------------ - - @overload - def sport(self) -> 
str: ... - @overload - def sport(self, count: Literal[1]) -> str: ... - @overload - def sport(self, count: int) -> str | list[str]: ... - def sport(self, count: int = 1) -> str | list[str]: - """Generate a sport name (e.g. ``"Basketball"``). - - Parameters - ---------- - count : int - Number of sport names to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_SPORTS) - return self._engine.choices(_SPORTS, count) - - @overload - def team(self) -> str: ... - @overload - def team(self, count: Literal[1]) -> str: ... - @overload - def team(self, count: int) -> str | list[str]: ... - def team(self, count: int = 1) -> str | list[str]: - """Generate a team name (e.g. ``"Eagles"``). - - Parameters - ---------- - count : int - Number of team names to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_TEAMS) - return self._engine.choices(_TEAMS, count) - - @overload - def league(self) -> str: ... - @overload - def league(self, count: Literal[1]) -> str: ... - @overload - def league(self, count: int) -> str | list[str]: ... - def league(self, count: int = 1) -> str | list[str]: - """Generate a league name (e.g. ``"NBA"``). - - Parameters - ---------- - count : int - Number of league names to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_LEAGUES) - return self._engine.choices(_LEAGUES, count) - - @overload - def position(self) -> str: ... - @overload - def position(self, count: Literal[1]) -> str: ... - @overload - def position(self, count: int) -> str | list[str]: ... - def position(self, count: int = 1) -> str | list[str]: - """Generate a sports position (e.g. ``"Quarterback"``). + # Public API — custom methods - Parameters - ---------- - count : int - Number of positions to generate. 
- - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_POSITIONS) - return self._engine.choices(_POSITIONS, count) - - @overload - def venue(self) -> str: ... - @overload - def venue(self, count: Literal[1]) -> str: ... - @overload - def venue(self, count: int) -> str | list[str]: ... - def venue(self, count: int = 1) -> str | list[str]: - """Generate a sports venue name (e.g. ``"Wembley Stadium"``). - - Parameters - ---------- - count : int - Number of venue names to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_VENUES) - return self._engine.choices(_VENUES, count) - - @overload - def event(self) -> str: ... - @overload - def event(self, count: Literal[1]) -> str: ... - @overload - def event(self, count: int) -> str | list[str]: ... - def event(self, count: int = 1) -> str | list[str]: - """Generate a sports event name (e.g. ``"Olympics"``). - - Parameters - ---------- - count : int - Number of event names to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_EVENTS) - return self._engine.choices(_EVENTS, count) - - @overload - def athlete(self) -> str: ... - @overload - def athlete(self, count: Literal[1]) -> str: ... - @overload - def athlete(self, count: int) -> str | list[str]: ... def athlete(self, count: int = 1) -> str | list[str]: - """Generate an athlete name (e.g. ``"Marcus Johnson"``). - - Parameters - ---------- - count : int - Number of athlete names to generate. - - Returns - ------- - str or list[str] - """ + """Generate an athlete name (e.g. ``"Marcus Johnson"``).""" if count == 1: return self._one_athlete() return [self._one_athlete() for _ in range(count)] - @overload - def score(self) -> str: ... - @overload - def score(self, count: Literal[1]) -> str: ... - @overload - def score(self, count: int) -> str | list[str]: ... 
def score(self, count: int = 1) -> str | list[str]: - """Generate a game score (e.g. ``"24-17"``). - - Parameters - ---------- - count : int - Number of scores to generate. - - Returns - ------- - str or list[str] - """ + """Generate a game score (e.g. ``"24-17"``).""" if count == 1: return self._one_score() return [self._one_score() for _ in range(count)] diff --git a/src/dataforge/providers/text.py b/src/dataforge/providers/text.py index 9a66024..1af6343 100644 --- a/src/dataforge/providers/text.py +++ b/src/dataforge/providers/text.py @@ -1,7 +1,5 @@ """Text provider — paragraphs, headlines, slugs, quotes, etc.""" -from typing import Literal, overload - from dataforge.providers.base import BaseProvider _QUOTE_AUTHORS: tuple[str, ...] = ( @@ -20,11 +18,6 @@ "William Shakespeare", "Maya Angelou", "Nelson Mandela", - "Theodore Roosevelt", - "Steve Jobs", - "Walt Disney", - "Thomas Edison", - "Nikola Tesla", ) _QUOTE_TEMPLATES: tuple[str, ...] = ( @@ -76,11 +69,6 @@ "Transportation Infrastructure Bill Advances", "Mental Health Awareness Campaign Launches", "Digital Privacy Regulations Proposed", - "Agricultural Innovation Addresses Food Security", - "Marine Conservation Efforts Expand", - "Quantum Computing Research Progresses", - "Public Health Initiative Gains Momentum", - "Urban Development Plans Spark Debate", ) _BUZZWORDS: tuple[str, ...] = ( @@ -99,16 +87,6 @@ "Internet of Things", "edge computing", "deep learning", - "containerization", - "serverless", - "zero trust", - "Web3", - "metaverse", - "sustainability", - "circular economy", - "stakeholder alignment", - "value proposition", - "growth hacking", ) _TEXT_WORDS: tuple[str, ...] 
= ( @@ -162,56 +140,6 @@ "which", "go", "me", - "when", - "make", - "can", - "like", - "time", - "no", - "just", - "him", - "know", - "take", - "people", - "into", - "year", - "your", - "good", - "some", - "could", - "them", - "see", - "other", - "than", - "then", - "now", - "look", - "only", - "come", - "its", - "over", - "think", - "also", - "back", - "after", - "use", - "two", - "how", - "our", - "work", - "first", - "well", - "way", - "even", - "new", - "want", - "because", - "any", - "these", - "give", - "day", - "most", - "us", ) @@ -230,7 +158,9 @@ class TextProvider(BaseProvider): "text_block": "text_block", } - # --- Scalar helpers --- + _choice_fields: dict[str, tuple[str, ...]] = { + "buzzword": _BUZZWORDS, + } def _one_quote(self) -> str: quote = self._engine.choice(_QUOTE_TEMPLATES) @@ -261,61 +191,26 @@ def _one_text_block(self) -> str: para_count = self._engine.random_int(2, 5) return "\n\n".join(self._one_paragraph() for _ in range(para_count)) - # --- Public API --- - - @overload - def quote(self) -> str: ... - @overload - def quote(self, count: Literal[1]) -> str: ... - @overload - def quote(self, count: int) -> str | list[str]: ... def quote(self, count: int = 1) -> str | list[str]: """Generate a fake quote with attribution.""" if count == 1: return self._one_quote() - # Batch: generate all quotes and authors in bulk _quotes = self._engine.choices(_QUOTE_TEMPLATES, count) _authors = self._engine.choices(_QUOTE_AUTHORS, count) return [f'"{q}" — {a}' for q, a in zip(_quotes, _authors)] - @overload - def headline(self) -> str: ... - @overload - def headline(self, count: Literal[1]) -> str: ... - @overload - def headline(self, count: int) -> str | list[str]: ... 
def headline(self, count: int = 1) -> str | list[str]: """Generate a news-style headline.""" if count == 1: return self._one_headline() - # Batch: generate all starters and topics in bulk _starters = self._engine.choices(_HEADLINE_STARTERS, count) _topics = self._engine.choices(_HEADLINE_TOPICS, count) return [f"{s} {t}" for s, t in zip(_starters, _topics)] - @overload - def buzzword(self) -> str: ... - @overload - def buzzword(self, count: Literal[1]) -> str: ... - @overload - def buzzword(self, count: int) -> str | list[str]: ... - def buzzword(self, count: int = 1) -> str | list[str]: - """Generate a business/tech buzzword.""" - if count == 1: - return self._engine.choice(_BUZZWORDS) - return self._engine.choices(_BUZZWORDS, count) - - @overload - def paragraph(self) -> str: ... - @overload - def paragraph(self, count: Literal[1]) -> str: ... - @overload - def paragraph(self, count: int) -> str | list[str]: ... def paragraph(self, count: int = 1) -> str | list[str]: """Generate a random paragraph of sentences.""" if count == 1: return self._one_paragraph() - # Inlined batch with local-bound helpers _choices = self._engine.choices _ri = self._engine.random_int _words = _TEXT_WORDS @@ -332,17 +227,10 @@ def paragraph(self, count: int = 1) -> str | list[str]: result.append(_join(sentences)) return result - @overload - def text_block(self) -> str: ... - @overload - def text_block(self, count: Literal[1]) -> str: ... - @overload - def text_block(self, count: int) -> str | list[str]: ... 
def text_block(self, count: int = 1) -> str | list[str]: """Generate a multi-paragraph text block.""" if count == 1: return self._one_text_block() - # Inlined batch with local-bound helpers _choices = self._engine.choices _ri = self._engine.random_int _words = _TEXT_WORDS diff --git a/src/dataforge/providers/weather.py b/src/dataforge/providers/weather.py index 7e89b3b..0469520 100644 --- a/src/dataforge/providers/weather.py +++ b/src/dataforge/providers/weather.py @@ -5,13 +5,9 @@ All data is stored as immutable ``tuple[str, ...]`` for cache friendliness. """ -from typing import Literal, overload - from dataforge.providers.base import BaseProvider -# ------------------------------------------------------------------ # Data tuples (immutable, module-level for zero per-call overhead) -# ------------------------------------------------------------------ _CONDITIONS: tuple[str, ...] = ( "Sunny", @@ -34,16 +30,6 @@ "Haze", "Windy", "Clear", - "Tornado", - "Hurricane", - "Blizzard", - "Dust Storm", - "Tropical Storm", - "Ice Storm", - "Scattered Showers", - "Partly Sunny", - "Mostly Sunny", - "Mostly Cloudy", ) _WIND_DIRECTIONS: tuple[str, ...] = ( @@ -81,11 +67,6 @@ "High Wind Warning", "Dense Fog Advisory", "Frost Advisory", - "Freeze Warning", - "Hurricane Watch", - "Hurricane Warning", - "Tropical Storm Warning", - "Coastal Flood Advisory", ) _CLOUD_TYPES: tuple[str, ...] 
= ( @@ -151,302 +132,73 @@ class WeatherProvider(BaseProvider): "visibility": "visibility", } - # ------------------------------------------------------------------ + _choice_fields: dict[str, tuple[str, ...]] = { + "condition": _CONDITIONS, + "wind_direction": _WIND_DIRECTIONS, + "air_quality": _AIR_QUALITY, + "alert": _ALERTS, + "cloud_type": _CLOUD_TYPES, + "season": _SEASONS, + } + # Scalar helpers - # ------------------------------------------------------------------ def _one_temperature(self) -> str: - """Generate a single temperature string in Fahrenheit.""" return f"{self._engine.random_int(-20, 120)}°F" def _one_humidity(self) -> str: - """Generate a single humidity percentage string.""" return f"{self._engine.random_int(0, 100)}%" def _one_wind_speed(self) -> str: - """Generate a single wind speed string.""" return f"{self._engine.random_int(0, 150)} mph" def _one_uv_index(self) -> str: - """Generate a single UV index string.""" return str(self._engine.random_int(0, 11)) def _one_pressure(self) -> str: - """Generate a single barometric pressure string.""" ri = self._engine.random_int whole = ri(28, 31) frac = ri(0, 99) return f"{whole}.{frac:02d} inHg" def _one_visibility(self) -> str: - """Generate a single visibility string.""" ri = self._engine.random_int return f"{ri(0, 30)} mi" - # ------------------------------------------------------------------ - # Public API - # ------------------------------------------------------------------ - - @overload - def condition(self) -> str: ... - @overload - def condition(self, count: Literal[1]) -> str: ... - @overload - def condition(self, count: int) -> str | list[str]: ... - def condition(self, count: int = 1) -> str | list[str]: - """Generate a weather condition (e.g. ``"Sunny"``). - - Parameters - ---------- - count : int - Number of conditions to generate. 
+ # Public API — custom methods - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_CONDITIONS) - return self._engine.choices(_CONDITIONS, count) - - @overload - def temperature(self) -> str: ... - @overload - def temperature(self, count: Literal[1]) -> str: ... - @overload - def temperature(self, count: int) -> str | list[str]: ... def temperature(self, count: int = 1) -> str | list[str]: - """Generate a temperature (e.g. ``"72°F"``). - - Parameters - ---------- - count : int - Number of temperatures to generate. - - Returns - ------- - str or list[str] - """ + """Generate a temperature (e.g. ``"72°F"``).""" if count == 1: return self._one_temperature() return [self._one_temperature() for _ in range(count)] - @overload - def humidity(self) -> str: ... - @overload - def humidity(self, count: Literal[1]) -> str: ... - @overload - def humidity(self, count: int) -> str | list[str]: ... def humidity(self, count: int = 1) -> str | list[str]: - """Generate a humidity percentage (e.g. ``"65%"``). - - Parameters - ---------- - count : int - Number of humidity values to generate. - - Returns - ------- - str or list[str] - """ + """Generate a humidity percentage (e.g. ``"65%"``).""" if count == 1: return self._one_humidity() return [self._one_humidity() for _ in range(count)] - @overload - def wind_speed(self) -> str: ... - @overload - def wind_speed(self, count: Literal[1]) -> str: ... - @overload - def wind_speed(self, count: int) -> str | list[str]: ... def wind_speed(self, count: int = 1) -> str | list[str]: - """Generate a wind speed (e.g. ``"15 mph"``). - - Parameters - ---------- - count : int - Number of wind speeds to generate. - - Returns - ------- - str or list[str] - """ + """Generate a wind speed (e.g. ``"15 mph"``).""" if count == 1: return self._one_wind_speed() return [self._one_wind_speed() for _ in range(count)] - @overload - def wind_direction(self) -> str: ... 
- @overload - def wind_direction(self, count: Literal[1]) -> str: ... - @overload - def wind_direction(self, count: int) -> str | list[str]: ... - def wind_direction(self, count: int = 1) -> str | list[str]: - """Generate a wind direction (e.g. ``"NW"``). - - Parameters - ---------- - count : int - Number of wind directions to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_WIND_DIRECTIONS) - return self._engine.choices(_WIND_DIRECTIONS, count) - - @overload - def uv_index(self) -> str: ... - @overload - def uv_index(self, count: Literal[1]) -> str: ... - @overload - def uv_index(self, count: int) -> str | list[str]: ... def uv_index(self, count: int = 1) -> str | list[str]: - """Generate a UV index (e.g. ``"7"``). - - Parameters - ---------- - count : int - Number of UV index values to generate. - - Returns - ------- - str or list[str] - """ + """Generate a UV index (e.g. ``"7"``).""" if count == 1: return self._one_uv_index() return [self._one_uv_index() for _ in range(count)] - @overload - def air_quality(self) -> str: ... - @overload - def air_quality(self, count: Literal[1]) -> str: ... - @overload - def air_quality(self, count: int) -> str | list[str]: ... - def air_quality(self, count: int = 1) -> str | list[str]: - """Generate an air quality level (e.g. ``"Good"``). - - Parameters - ---------- - count : int - Number of air quality levels to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_AIR_QUALITY) - return self._engine.choices(_AIR_QUALITY, count) - - @overload - def alert(self) -> str: ... - @overload - def alert(self, count: Literal[1]) -> str: ... - @overload - def alert(self, count: int) -> str | list[str]: ... - def alert(self, count: int = 1) -> str | list[str]: - """Generate a weather alert (e.g. ``"Tornado Watch"``). - - Parameters - ---------- - count : int - Number of alerts to generate. 
- - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_ALERTS) - return self._engine.choices(_ALERTS, count) - - @overload - def cloud_type(self) -> str: ... - @overload - def cloud_type(self, count: Literal[1]) -> str: ... - @overload - def cloud_type(self, count: int) -> str | list[str]: ... - def cloud_type(self, count: int = 1) -> str | list[str]: - """Generate a cloud type (e.g. ``"Cumulus"``). - - Parameters - ---------- - count : int - Number of cloud types to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_CLOUD_TYPES) - return self._engine.choices(_CLOUD_TYPES, count) - - @overload - def season(self) -> str: ... - @overload - def season(self, count: Literal[1]) -> str: ... - @overload - def season(self, count: int) -> str | list[str]: ... - def season(self, count: int = 1) -> str | list[str]: - """Generate a season (e.g. ``"Summer"``). - - Parameters - ---------- - count : int - Number of seasons to generate. - - Returns - ------- - str or list[str] - """ - if count == 1: - return self._engine.choice(_SEASONS) - return self._engine.choices(_SEASONS, count) - - @overload - def pressure(self) -> str: ... - @overload - def pressure(self, count: Literal[1]) -> str: ... - @overload - def pressure(self, count: int) -> str | list[str]: ... def pressure(self, count: int = 1) -> str | list[str]: - """Generate barometric pressure (e.g. ``"29.92 inHg"``). - - Parameters - ---------- - count : int - Number of pressure values to generate. - - Returns - ------- - str or list[str] - """ + """Generate barometric pressure (e.g. ``"29.92 inHg"``).""" if count == 1: return self._one_pressure() return [self._one_pressure() for _ in range(count)] - @overload - def visibility(self) -> str: ... - @overload - def visibility(self, count: Literal[1]) -> str: ... - @overload - def visibility(self, count: int) -> str | list[str]: ... 
def visibility(self, count: int = 1) -> str | list[str]: - """Generate visibility distance (e.g. ``"10 mi"``). - - Parameters - ---------- - count : int - Number of visibility values to generate. - - Returns - ------- - str or list[str] - """ + """Generate visibility distance (e.g. ``"10 mi"``).""" if count == 1: return self._one_visibility() return [self._one_visibility() for _ in range(count)] diff --git a/src/dataforge/registry.py b/src/dataforge/registry.py index 5742d07..6f43bb4 100644 --- a/src/dataforge/registry.py +++ b/src/dataforge/registry.py @@ -1,54 +1,20 @@ -"""Provider registry — auto-discovery and field resolution for providers. - -The registry scans all ``BaseProvider`` subclasses that define -``_provider_name`` and builds two lookup tables: - -- **provider_info**: maps provider name → ``(provider_class, locale_modules)`` -- **field_map**: maps shorthand field name → ``(provider_name, method_name)`` - -These tables are built **lazily** on first access so that import-time -cost is zero. Once built, lookups are O(1) dict reads. - -**Built-in providers** are loaded via direct imports (fastest path). -**External plugins** are discovered through the -``dataforge.providers`` entry-point group, allowing third-party -packages to register providers with zero configuration. - -Adding a built-in provider requires only: - -1. Create a new ``BaseProvider`` subclass with ``_provider_name``, - ``_locale_modules``, and ``_field_map`` class attributes. -2. Add a direct import below in :func:`_ensure_loaded`. -3. Add an entry point in ``pyproject.toml`` under - ``[project.entry-points."dataforge.providers"]``. 
-""" +"""Provider registry — auto-discovery and field resolution for providers.""" import importlib import importlib.metadata from dataforge.providers.base import BaseProvider -# Lazy-initialized lookup tables _provider_info: dict[str, tuple[type, tuple[str, ...]]] | None = None _field_map: dict[str, tuple[str, str]] | None = None def _ensure_loaded() -> None: - """Import all provider modules and build the registry tables. - - Called lazily on first access. Subsequent calls are no-ops. - - Built-in providers are loaded via direct imports for speed. - External plugins are discovered through the - ``dataforge.providers`` entry-point group. - """ + """Import all provider modules and build the registry tables.""" global _provider_info, _field_map if _provider_info is not None: return - # ------------------------------------------------------------------ - # 1. Import all built-in provider modules (fast path). - # ------------------------------------------------------------------ import dataforge.providers.address # noqa: F401 import dataforge.providers.automotive # noqa: F401 import dataforge.providers.barcode # noqa: F401 @@ -75,7 +41,6 @@ def _ensure_loaded() -> None: import dataforge.providers.text # noqa: F401 import dataforge.providers.ai_prompt # noqa: F401 import dataforge.providers.llm # noqa: F401 - import dataforge.providers.ai_chat # noqa: F401 import dataforge.providers.social_media # noqa: F401 import dataforge.providers.music # noqa: F401 import dataforge.providers.sports # noqa: F401 @@ -86,17 +51,11 @@ def _ensure_loaded() -> None: import dataforge.providers.hardware # noqa: F401 import dataforge.providers.logistics # noqa: F401 - # ------------------------------------------------------------------ - # 2. Discover external plugins via entry points. - # ------------------------------------------------------------------ eps = importlib.metadata.entry_points(group="dataforge.providers") for ep in eps: - # Each entry point value is a module path. 
- # Loading it triggers the class registration via __subclasses__. try: importlib.import_module(ep.value) except Exception: - # Don't let a broken plugin prevent the registry from loading. import warnings warnings.warn( @@ -106,9 +65,6 @@ def _ensure_loaded() -> None: stacklevel=2, ) - # ------------------------------------------------------------------ - # 3. Build lookup tables from all registered subclasses. - # ------------------------------------------------------------------ info: dict[str, tuple[type, tuple[str, ...]]] = {} fields: dict[str, tuple[str, str]] = {} @@ -128,7 +84,7 @@ def _ensure_loaded() -> None: def get_provider_info() -> dict[str, tuple[type, tuple[str, ...]]]: - """Return the provider info table: ``{name: (class, locale_modules)}``.""" + """Return the provider info table.""" _ensure_loaded() if _provider_info is None: raise RuntimeError("Provider registry failed to initialize.") @@ -136,7 +92,7 @@ def get_provider_info() -> dict[str, tuple[type, tuple[str, ...]]]: def get_field_map() -> dict[str, tuple[str, str]]: - """Return the field map: ``{field_name: (provider_name, method_name)}``.""" + """Return the field map.""" _ensure_loaded() if _field_map is None: raise RuntimeError("Provider registry failed to initialize.") @@ -148,20 +104,7 @@ def register_runtime_provider( cls: type, locale_modules: tuple[str, ...] = (), ) -> None: - """Register a provider at runtime (called by DataForge.register_provider). - - Updates the global registry tables so that Schema and field - resolution can find the new provider's fields. - - Parameters - ---------- - name : str - The provider name (e.g. ``"weather"``). - cls : type - The provider class. - locale_modules : tuple[str, ...] - Locale modules required by the provider. 
- """ + """Register a provider at runtime (called by DataForge.register_provider).""" _ensure_loaded() if _provider_info is None or _field_map is None: raise RuntimeError("Provider registry failed to initialize.") diff --git a/src/dataforge/relational.py b/src/dataforge/relational.py index 54c8da5..bd7ee2e 100644 --- a/src/dataforge/relational.py +++ b/src/dataforge/relational.py @@ -1,41 +1,4 @@ -"""Relational schema — multi-table data generation with foreign keys. - -Generates related tables with referential integrity. Parent tables are -generated first, then child tables reference parent rows via foreign -keys with configurable cardinality. - -Usage:: - - from dataforge import DataForge - - forge = DataForge(seed=42) - - tables = forge.relational({ - "users": { - "fields": ["first_name", "last_name", "email"], - "count": 100, - }, - "orders": { - "fields": ["date", "city"], - "count": 500, - "parent": "users", # FK → users - "parent_key": "user_id", # column name for the FK - "children_per_parent": (1, 10), # 1-10 orders per user - }, - "order_items": { - "fields": {"product": "ecommerce.product_name", - "price": "ecommerce.price"}, - "count": 2000, - "parent": "orders", - "parent_key": "order_id", - "children_per_parent": (1, 5), - }, - }) - - users = tables["users"] # list[dict] with auto-id - orders = tables["orders"] # list[dict] with user_id FK - items = tables["order_items"] # list[dict] with order_id FK -""" +"""Relational schema — multi-table data generation with foreign keys.""" from __future__ import annotations @@ -47,29 +10,7 @@ class RelationalSchema: - """Multi-table schema with foreign key relationships. - - Tables are defined as a dict of table specs. 
Each spec can include: - - - ``fields`` — list or dict of field specs (same as Schema) - - ``count`` — number of rows to generate - - ``parent`` — name of the parent table (creates a FK relationship) - - ``parent_key`` — column name for the foreign key (default: ``{parent}_id``) - - ``children_per_parent`` — tuple ``(min, max)`` controlling how many - child rows each parent gets. If omitted, children are distributed - randomly across parents. - - ``null_fields`` — optional null probability dict (same as Schema) - - Tables are generated in topological order (parents before children). - Each table gets an auto-incrementing ``id`` column (1-based). - - Parameters - ---------- - forge : DataForge - The parent generator instance. - tables : dict[str, dict] - Table specifications. - """ + """Multi-table schema with foreign key relationships.""" __slots__ = ("_forge", "_table_specs", "_order") @@ -81,12 +22,10 @@ def __init__(self, forge: DataForge, tables: dict[str, dict[str, Any]]) -> None: @staticmethod def _topological_sort(tables: dict[str, dict[str, Any]]) -> list[str]: """Sort table names so parents come before children.""" - # Build dependency graph deps: dict[str, str | None] = {} for name, spec in tables.items(): deps[name] = spec.get("parent") - # Validate references for name, parent in deps.items(): if parent is not None and parent not in tables: raise ValueError( @@ -94,7 +33,6 @@ def _topological_sort(tables: dict[str, dict[str, Any]]) -> list[str]: f"which is not defined. Available tables: {list(tables)}" ) - # Kahn's algorithm in_degree: dict[str, int] = {name: 0 for name in tables} children_of: dict[str, list[str]] = {name: [] for name in tables} for name, parent in deps.items(): @@ -121,20 +59,11 @@ def _topological_sort(tables: dict[str, dict[str, Any]]) -> list[str]: return order def generate(self) -> dict[str, list[dict[str, Any]]]: - """Generate all tables with referential integrity. 
- - Returns - ------- - dict[str, list[dict[str, Any]]] - Mapping of table name → list of row dicts. Each row - includes an ``id`` column and, for child tables, a - foreign key column pointing to the parent's ``id``. - """ + """Generate all tables with referential integrity.""" forge = self._forge rng = forge._engine._rng result: dict[str, list[dict[str, Any]]] = {} - # Cache Schema objects to avoid re-building them on repeated calls schemas: dict[str, Any] = {} for table_name in self._order: @@ -146,17 +75,14 @@ def generate(self) -> dict[str, list[dict[str, Any]]]: parent_key = spec.get("parent_key") children_per_parent = spec.get("children_per_parent") - # Generate base data — cache Schema per table if table_name not in schemas: schemas[table_name] = forge.schema(fields, null_fields=null_fields) schema = schemas[table_name] rows = schema.generate(count=count) - # Add auto-increment id for i, row in enumerate(rows, 1): row["id"] = i - # Add foreign key if this is a child table if parent_name is not None: if parent_key is None: parent_key = f"{parent_name}_id" @@ -165,11 +91,9 @@ def generate(self) -> dict[str, list[dict[str, Any]]]: parent_ids = [r["id"] for r in parent_rows] if not parent_ids: - # No parent rows — all FKs are None for row in rows: row[parent_key] = None elif children_per_parent is not None: - # Distribute children across parents with cardinality bounds min_c, max_c = children_per_parent assignments = self._distribute_children( rng, parent_ids, count, min_c, max_c @@ -177,7 +101,6 @@ def generate(self) -> dict[str, list[dict[str, Any]]]: for row, fk_val in zip(rows, assignments): row[parent_key] = fk_val else: - # Random assignment — each child gets a random parent assigned = rng.choices(parent_ids, k=count) for row, fk_val in zip(rows, assigned): row[parent_key] = fk_val @@ -194,13 +117,8 @@ def _distribute_children( min_per_parent: int, max_per_parent: int, ) -> list[int]: - """Distribute children across parents respecting cardinality bounds. 
- - Returns a list of parent IDs (one per child row), shuffled - so children aren't grouped by parent. - """ + """Distribute children across parents respecting cardinality bounds.""" n_parents = len(parent_ids) - # Generate per-parent child counts counts: list[int] = [] remaining = total_children @@ -208,7 +126,6 @@ def _distribute_children( if remaining <= 0: counts.append(0) continue - # Clamp to ensure we can still fill remaining parents parents_left = n_parents - i - 1 max_here = min( max_per_parent, @@ -221,7 +138,6 @@ def _distribute_children( counts.append(c) remaining -= c - # If we still have remaining children, distribute them while remaining > 0: for i in range(n_parents): if remaining <= 0: @@ -231,19 +147,15 @@ def _distribute_children( counts[i] += add remaining -= add - # Build assignment list assignments: list[int] = [] for pid, c in zip(parent_ids, counts): assignments.extend([pid] * c) - # Shuffle so children aren't ordered by parent rng.shuffle(assignments) - # Trim or pad to exact count if len(assignments) > total_children: assignments = assignments[:total_children] elif len(assignments) < total_children: - # Fill remainder with random parents extra = rng.choices(parent_ids, k=total_children - len(assignments)) assignments.extend(extra) @@ -253,18 +165,7 @@ def to_sql( self, dialect: str = "sqlite", ) -> str: - """Generate all tables and return as SQL INSERT statements. - - Parameters - ---------- - dialect : str - SQL dialect: ``"sqlite"``, ``"mysql"``, or ``"postgresql"``. - - Returns - ------- - str - SQL INSERT statements for all tables, ordered parents-first. 
- """ + """Generate all tables and return as SQL INSERT statements.""" data = self.generate() parts: list[str] = [] _str = str @@ -276,7 +177,6 @@ def to_sql( columns = list(rows[0].keys()) - # Quote identifiers per dialect if dialect == "mysql": col_list = ", ".join(f"`{c}`" for c in columns) tbl = f"`{table_name}`" diff --git a/src/dataforge/schema.py b/src/dataforge/schema.py index 3b20488..c13dcf7 100644 --- a/src/dataforge/schema.py +++ b/src/dataforge/schema.py @@ -1,29 +1,4 @@ -"""Schema — zero-overhead bulk data generation via pre-resolved fields. - -A ``Schema`` pre-resolves provider/method lookups once at creation time, -then generates rows with a tight loop over pre-bound callables — no -per-row field resolution, no ``getattr`` calls during generation. - -Usage:: - - from dataforge import DataForge - - forge = DataForge(seed=42) - schema = forge.schema(["first_name", "email", "city"]) - rows = schema.generate(count=1_000_000) - - # Lambda / correlated fields: - schema = forge.schema({ - "name": "full_name", - "email": "email", - "username": lambda row: row["name"].lower().replace(" ", "."), - }) - - # Typed schema — values preserve native Python types: - schema = forge.schema(["first_name", "port", "boolean"]) - rows = schema.generate(count=10) - # rows[0]["port"] → 8080 (int, not str) -""" +"""Schema — zero-overhead bulk data generation via pre-resolved fields.""" from __future__ import annotations @@ -36,7 +11,6 @@ if TYPE_CHECKING: from dataforge.core import DataForge -# Sentinel for columns that depend on the current row _ROW_LAMBDA = object() @@ -48,32 +22,16 @@ def _open_file( compress: bool | None = None, newline: str | None = None, ) -> Iterator[Any]: - """Context manager that opens a file, auto-detecting gzip from extension. - - Parameters - ---------- - path : str - File path. If it ends with ``.gz``, gzip compression is used - unless *compress* is explicitly ``False``. - mode : str - Open mode (``"w"`` or ``"wb"``). 
- encoding : str - Text encoding (ignored for binary modes). - compress : bool | None - Force gzip on/off. ``None`` = auto-detect from extension. - newline : str | None - Newline mode for text files (e.g. ``""`` for CSV). - """ + """Context manager that opens a file, auto-detecting gzip from extension.""" use_gzip = compress if compress is not None else path.endswith(".gz") if use_gzip: - # gzip.open returns a binary stream; wrap in TextIOWrapper for text mode if "b" not in mode: raw = _gzip.open(path, mode + "b") f: Any = _io.TextIOWrapper(raw, encoding=encoding, newline=newline) # type: ignore[arg-type] try: yield f finally: - f.close() # closes underlying raw too + f.close() else: f = _gzip.open(path, mode) # type: ignore[assignment] try: @@ -89,37 +47,7 @@ def _open_file( class Schema: - """Pre-resolved generation blueprint for maximum throughput. - - All field lookups are performed **once** during ``__init__``. - Subsequent ``generate()`` calls execute only the bound methods - with zero overhead from name resolution. - - Values are preserved in their native Python types by default. - Export methods (``to_csv``, ``to_jsonl``, ``to_sql``) convert - values to strings as needed by the output format. - - Parameters - ---------- - forge : DataForge - The parent generator instance. - fields : list[str] | dict[str, str | Callable] - Fields to generate. String values are resolved to provider - methods. Callable values receive the current row dict and - can reference previously generated columns. - null_fields : dict[str, float] | None - Optional mapping of column names to null probabilities - (0.0–1.0). After generation, values in the specified columns - are randomly replaced with ``None`` at the given rate. - Example: ``{"email": 0.2}`` makes ~20% of email values ``None``. - unique_together : list[tuple[str, ...]] | None - Optional list of column-name tuples that must be unique - **in combination**. 
For example, - ``[("first_name", "last_name")]`` ensures no two rows share - the same (first_name, last_name) pair. Rows that violate - the constraint are re-generated up to a configurable retry - limit. - """ + """Pre-resolved generation blueprint for maximum throughput.""" __slots__ = ( "_columns", @@ -145,7 +73,6 @@ def __init__( unique_together: "list[tuple[str, ...]] | None" = None, chaos: "Any | None" = None, ) -> None: - # Check for dict-based field specs (constraint engine) has_dict_specs = False if isinstance(fields, dict): for v in fields.values(): @@ -153,20 +80,16 @@ def __init__( has_dict_specs = True break - # Only store forge ref and chaos when actually needed — avoids - # extra attribute assignments in the common (standard) path. self._forge_ref = forge if (has_dict_specs or chaos is not None) else None # type: ignore[assignment] self._chaos = chaos if has_dict_specs: - # Use constraint engine for two-pass generation from dataforge.constraints import build_dependency_order independent, dependent_order, constraint_map = build_dependency_order( fields # type: ignore[arg-type] ) - # Build columns and callables for independent columns only columns: list[str] = [] callables: list[object] = [] row_lambdas: dict[int, Callable[..., Any]] = {} @@ -189,11 +112,9 @@ def __init__( columns.append(col_name) callables.append(method) - # Add placeholders for dependent columns (filled per-row) for col_name, _constraint in dependent_order: columns.append(col_name) callables.append(_ROW_LAMBDA) - # Don't add to row_lambdas — handled by constraint engine self._columns = tuple(columns) self._callables = tuple(callables) @@ -202,8 +123,6 @@ def __init__( self._dependent_order = dependent_order self._constraints = constraint_map else: - # Standard path — no constraints - # Normalize to (column_name, field_spec) pairs if isinstance(fields, list): field_defs: list[tuple[str, str | Callable[..., Any]]] = [ (f, f) for f in fields @@ -218,35 +137,25 @@ def __init__( for idx, 
(col_name, field_spec) in enumerate(field_defs): columns.append(col_name) if callable(field_spec): - # Row-dependent lambda — stored separately, executed - # per-row after batch columns are generated. callables.append(_ROW_LAMBDA) row_lambdas[idx] = field_spec else: - # String field name — resolve to provider method provider_attr, method_name = forge._resolve_field(field_spec) provider = getattr(forge, provider_attr) method = getattr(provider, method_name) callables.append(method) - # Store as tuples for fastest iteration (bytecode LOAD_FAST) self._columns = tuple(columns) self._callables = tuple(callables) self._row_lambdas = row_lambdas - # Standard path: use None sentinels — avoids creating - # empty containers and saves 3 allocations per Schema. self._independent_cols = None # type: ignore[assignment] self._dependent_order = None # type: ignore[assignment] self._constraints = None # type: ignore[assignment] - # Remember the original field spec for schema serialization self._fields_spec: list[str] | dict[str, Any] = fields - # Nullable field support: store (column_index, probability) pairs - # and the RNG for fast null injection self._rng = forge._engine._rng if null_fields: - # Validate all column names exist col_set = set(columns) for name in null_fields: if name not in col_set: @@ -262,7 +171,6 @@ def __init__( else: self._null_fields = {} - # unique_together: pre-compute column index tuples for fast lookup if unique_together: col_set = set(columns) idx_groups: list[tuple[int, ...]] = [] @@ -280,38 +188,12 @@ def __init__( self._unique_together = [] self._unique_together_indices = [] - # ------------------------------------------------------------------ - # Core generation - # ------------------------------------------------------------------ - def _generate_columns(self, count: int) -> list[list[Any]]: - """Generate column data in bulk (column-first). - - Shared by :meth:`generate`, :meth:`stream`, and export helpers. 
- Each field is generated in one batch call via its ``count=N`` - path — no per-row field resolution overhead. - - Values are preserved in their native Python types. No ``str()`` - coercion is applied — that responsibility belongs to export - methods that need string output. - - Row-lambda columns are filled with ``None`` here and - populated later by :meth:`_apply_row_lambdas`. - - Parameters - ---------- - count : int - Number of values per column. - - Returns - ------- - list[list[Any]] - """ + """Generate column data in bulk (column-first).""" col_data: list[list[Any]] = [] _sentinel = _ROW_LAMBDA for fn in self._callables: if fn is _sentinel: - # Placeholder — filled by _apply_row_lambdas col_data.append([None] * count) elif count == 1: val = fn() # type: ignore[operator] @@ -320,14 +202,11 @@ def _generate_columns(self, count: int) -> list[list[Any]]: values = fn(count=count) # type: ignore[operator] col_data.append(values if isinstance(values, list) else [values]) - # Apply null injection if any null_fields are configured null_fields = self._null_fields if null_fields: _rng = self._rng for col_idx, prob in null_fields.items(): col = col_data[col_idx] - # Use bulk index selection via sample() instead of - # per-element random() — fewer Python-level calls. n_nulls = _rng.binomialvariate(count, prob) if n_nulls > 0: for i in _rng.sample(range(count), k=min(n_nulls, count)): @@ -337,20 +216,7 @@ def _generate_columns(self, count: int) -> list[list[Any]]: @staticmethod def _stringify_columns(col_data: list[list[Any]]) -> list[list[str]]: - """Convert column data to strings for text-based exports. - - Optimized: skips conversion for columns that are already all - strings (the common case for most providers). - - Parameters - ---------- - col_data : list[list[Any]] - Native-typed column data. 
- - Returns - ------- - list[list[str]] - """ + """Convert column data to strings for text-based exports.""" result: list[list[str]] = [] _str = str _isinstance = isinstance @@ -358,25 +224,13 @@ def _stringify_columns(col_data: list[list[Any]]) -> list[list[str]]: if col and _isinstance(col[0], _str): result.append(col) # type: ignore[arg-type] elif not col or col[0] is None: - # First element is None or column is empty — must stringify - # all elements to be safe. result.append([_str(v) if v is not None else "" for v in col]) else: result.append([_str(v) if v is not None else "" for v in col]) return result def _generate_string_columns(self, count: int) -> list[list[str]]: - """Generate columns and stringify them, handling row lambdas. - - Shared helper used by CSV, Parquet, Arrow, and Polars exports. - Avoids duplicating the generate→lambda→stringify pattern in - every export method. - - Returns - ------- - list[list[str]] - Stringified column data. - """ + """Generate columns and stringify them, handling row lambdas.""" columns = self._columns col_data = self._generate_columns(count) _str = str @@ -390,13 +244,7 @@ def _generate_string_columns(self, count: int) -> list[list[str]]: return self._stringify_columns(col_data) def _apply_row_lambdas(self, rows: list[dict[str, Any]]) -> list[dict[str, Any]]: - """Apply row-dependent lambdas to generated rows in-place. - - Each lambda receives the current row dict and its return - value is stored in the row with its native type. - Lambdas are applied in column order, so later lambdas - can reference earlier lambda-generated columns. - """ + """Apply row-dependent lambdas to generated rows in-place.""" if not self._row_lambdas: return rows columns = self._columns @@ -406,41 +254,16 @@ def _apply_row_lambdas(self, rows: list[dict[str, Any]]) -> list[dict[str, Any]] return rows def generate(self, count: int = 10) -> list[dict[str, Any]]: - """Generate *count* rows as a list of dicts. 
- - Uses **column-first generation**: each field is generated in - bulk via its ``count=N`` batch path, then columns are zipped - into row dicts. This replaces ``count × num_fields`` scalar - calls with ``num_fields`` batch calls — significantly faster - for large counts. - - Values are preserved in their native Python types (``int``, - ``float``, ``bool``, ``str``, etc.). - - When ``unique_together`` constraints are active, duplicate - combinations are detected and replaced with freshly generated - rows (up to 100 retry rounds). - - Parameters - ---------- - count : int - Number of rows to generate. - - Returns - ------- - list[dict[str, Any]] - """ + """Generate *count* rows as a list of dicts with native Python types.""" if count == 0: return [] columns = self._columns col_data = self._generate_columns(count) - # Zip columns into row dicts — transposed vectorized assembly rows = [dict(zip(columns, row)) for row in zip(*col_data)] rows = self._apply_row_lambdas(rows) - # Apply constraint-based dependent columns (two-pass) dep_order = self._dependent_order if dep_order: engine = self._forge_ref._engine @@ -449,11 +272,9 @@ def generate(self, count: int = 10) -> list[dict[str, Any]]: for col_name, constraint in dep_order: row[col_name] = constraint.generate(row, engine, forge) - # Enforce unique_together constraints if self._unique_together: rows = self._enforce_unique_together(rows, count) - # Apply chaos transformer if configured chaos = self._chaos if chaos is not None: rows = self._apply_chaos(rows) @@ -465,7 +286,6 @@ def _apply_chaos(self, rows: list[dict[str, Any]]) -> list[dict[str, Any]]: chaos = self._chaos if chaos is None: return rows - # Accept ChaosTransformer instance or config dict if isinstance(chaos, dict): from dataforge.chaos import ChaosTransformer @@ -477,17 +297,10 @@ def _apply_chaos(self, rows: list[dict[str, Any]]) -> list[dict[str, Any]]: def _enforce_unique_together( self, rows: list[dict[str, Any]], target: int ) -> list[dict[str, 
Any]]: - """Re-generate rows until all unique_together constraints are met. - - Optimized: maintains a persistent ``seen`` set across rounds - so only replacement rows need to be re-checked, avoiding a - full table rescan every round. - """ + """Re-generate rows until all unique_together constraints are met.""" columns = self._columns _MAX_ROUNDS = 100 - # Build persistent seen sets — one per constraint group. - # These survive across rounds so we only re-check new rows. seen_per_group: list[set[tuple[Any, ...]]] = [ set() for _ in self._unique_together_indices ] @@ -507,7 +320,6 @@ def _enforce_unique_together( if dup_indices: all_ok = False - # Re-generate only the duplicate rows n_dups = len(dup_indices) new_col_data = self._generate_columns(n_dups) new_rows = [dict(zip(columns, row)) for row in zip(*new_col_data)] @@ -518,7 +330,6 @@ def _enforce_unique_together( if all_ok: return rows - # After max rounds, return what we have (best-effort) import warnings warnings.warn( @@ -535,33 +346,10 @@ def stream( count: int, batch_size: int | None = None, ) -> Iterator[dict[str, Any]]: - """Yield rows lazily in batches — avoids materializing all rows. - - Internally generates data in column-first batches for - performance, but yields rows one at a time. - - Parameters - ---------- - count : int - Total number of rows to yield. - batch_size : int | None - Internal batch size for column-first generation. - When ``None`` (default), the batch size is auto-tuned - based on the number of columns and total count to - balance throughput and memory usage. - - Yields - ------ - dict[str, Any] - """ + """Yield rows lazily in batches — avoids materializing all rows.""" columns = self._columns num_cols = len(columns) - # Auto-tune batch size when not explicitly set. - # More columns → smaller batches to bound memory; fewer columns - # → larger batches to amortize per-batch overhead. The floor - # of 1000 keeps overhead low; the ceiling avoids over-allocating - # when count is small. 
if batch_size is None: batch_size = min(count, max(1000, 100_000 // max(num_cols, 1))) @@ -571,7 +359,6 @@ def stream( while remaining > 0: chunk = min(remaining, batch_size) col_data = self._generate_columns(chunk) - # Yield row dicts — transposed vectorized assembly if row_lambdas: batch_rows = [dict(zip(columns, row)) for row in zip(*col_data)] self._apply_row_lambdas(batch_rows) @@ -586,30 +373,7 @@ async def async_stream( count: int, batch_size: int | None = None, ) -> AsyncIterator[dict[str, Any]]: - """Yield rows lazily via ``async for`` — one row at a time. - - Internally uses the same column-first batch generation as - :meth:`stream` for maximum throughput. Each batch is generated - synchronously (CPU-bound work), then rows are yielded with an - ``await``-compatible suspend point between batches so the event - loop can service other coroutines. - - Usage:: - - async for row in schema.async_stream(100_000): - await process(row) - - Parameters - ---------- - count : int - Total number of rows to yield. - batch_size : int | None - Internal batch size. Auto-tuned when ``None``. - - Yields - ------ - dict[str, Any] - """ + """Yield rows lazily via ``async for`` — one row at a time.""" import asyncio columns = self._columns @@ -634,13 +398,8 @@ async def async_stream( for row in zip(*col_data): yield dict(zip(columns, row)) remaining -= chunk - # Yield control to the event loop between batches await _sleep(0) - # ------------------------------------------------------------------ - # Export helpers - # ------------------------------------------------------------------ - def to_csv( self, count: int = 10, @@ -649,26 +408,7 @@ def to_csv( encoding: str = "utf-8", compress: bool | None = None, ) -> str: - """Generate rows and return as CSV string. - - Parameters - ---------- - count : int - Number of rows. - path : str | None - Optional file path to write. - delimiter : str - Field delimiter (default: comma). 
- encoding : str - Character encoding for file output (default: utf-8). - compress : bool | None - If ``True``, gzip the output file. ``None`` auto-detects - from a ``.gz`` file extension. - - Returns - ------- - str - """ + """Generate rows and return as CSV string.""" import csv import io @@ -684,7 +424,6 @@ def to_csv( _str = str if self._row_lambdas: - # Row-lambda path: must materialize rows rows = [dict(zip(columns, row)) for row in zip(*col_data)] self._apply_row_lambdas(rows) writer.writerows( @@ -714,32 +453,7 @@ def stream_to_csv( encoding: str = "utf-8", compress: bool | None = None, ) -> int: - """Stream rows to a CSV file without materializing all data. - - Writes rows in batches to keep memory usage constant - regardless of *count*. - - Parameters - ---------- - path : str - File path to write. - count : int - Total number of rows. - batch_size : int | None - Internal batch size. Auto-tuned when ``None``. - delimiter : str - Field delimiter (default: comma). - encoding : str - Character encoding (default: utf-8). - compress : bool | None - If ``True``, gzip the output. ``None`` auto-detects - from a ``.gz`` file extension. - - Returns - ------- - int - Number of rows written. - """ + """Stream rows to a CSV file without materializing all data.""" import csv columns = self._columns @@ -772,33 +486,11 @@ def to_jsonl( encoding: str = "utf-8", compress: bool | None = None, ) -> str: - """Generate rows and return as JSON Lines string. - - Values are serialized in their native types — integers stay - as numbers, booleans as ``true``/``false``, etc. - - Parameters - ---------- - count : int - Number of rows. - path : str | None - Optional file path to write. - encoding : str - Character encoding for file output (default: utf-8). - compress : bool | None - If ``True``, gzip the output file. ``None`` auto-detects - from a ``.gz`` file extension. 
- - Returns - ------- - str - """ + """Generate rows and return as JSON Lines string.""" import json rows = self.generate(count) _dumps = json.dumps - # Build final string with trailing newline in one pass — - # avoids an extra string copy from ``content += "\n"``. if not rows: return "" content = "\n".join(_dumps(row, ensure_ascii=False) for row in rows) + "\n" @@ -817,26 +509,7 @@ def to_json( encoding: str = "utf-8", compress: bool | None = None, ) -> str: - """Generate rows and return as a JSON array string. - - Parameters - ---------- - count : int - Number of rows. - path : str | None - Optional file path to write. - indent : int - JSON indentation level (default: 2). - encoding : str - Character encoding for file output (default: utf-8). - compress : bool | None - If ``True``, gzip the output file. ``None`` auto-detects - from a ``.gz`` file extension. - - Returns - ------- - str - """ + """Generate rows and return as a JSON array string.""" import json rows = self.generate(count) @@ -856,30 +529,7 @@ def stream_to_jsonl( encoding: str = "utf-8", compress: bool | None = None, ) -> int: - """Stream rows to a JSON Lines file without materializing all data. - - Writes rows in batches to keep memory usage constant - regardless of *count*. - - Parameters - ---------- - path : str - File path to write. - count : int - Total number of rows. - batch_size : int | None - Internal batch size. Auto-tuned when ``None``. - encoding : str - Character encoding (default: utf-8). - compress : bool | None - If ``True``, gzip the output. ``None`` auto-detects - from a ``.gz`` file extension. - - Returns - ------- - int - Number of rows written. 
- """ + """Stream rows to a JSON Lines file without materializing all data.""" import json columns = self._columns @@ -900,14 +550,11 @@ def stream_to_jsonl( if row_lambdas: batch_rows = [dict(zip(columns, row)) for row in zip(*col_data)] self._apply_row_lambdas(batch_rows) - # Buffer entire batch into a single write call _write( "\n".join(_dumps(row, ensure_ascii=False) for row in batch_rows) + "\n" ) else: - # Buffer entire batch — single write per batch instead - # of 2× write per row (data + newline). _write( "\n".join( _dumps(dict(zip(columns, row)), ensure_ascii=False) @@ -929,44 +576,20 @@ def to_sql( encoding: str = "utf-8", compress: bool | None = None, ) -> str: - """Generate rows and return as SQL INSERT statements. - - Parameters - ---------- - table : str - Target table name. - count : int - Number of rows. - dialect : str - SQL dialect: ``"sqlite"``, ``"mysql"``, or ``"postgresql"``. - path : str | None - If provided, write SQL to this file path. - encoding : str - Character encoding for file output (default: utf-8). - compress : bool | None - If ``True``, gzip the output file. ``None`` auto-detects - from a ``.gz`` file extension. - - Returns - ------- - str - """ + """Generate rows and return as SQL INSERT statements.""" rows = self.generate(count) if not rows: return "" columns = self._columns - # Quote identifiers per dialect if dialect == "mysql": col_list = ", ".join(f"`{c}`" for c in columns) tbl = f"`{table}`" - else: # sqlite, postgresql — both use double quotes + else: col_list = ", ".join(f'"{c}"' for c in columns) tbl = f'"{table}"' - # Multi-row INSERT: batch 1000 rows per statement for - # significantly fewer SQL statements and better throughput. _BATCH = 1000 prefix = f"INSERT INTO {tbl} ({col_list}) VALUES" parts: list[str] = [] @@ -993,19 +616,7 @@ def to_sql( return content def to_dataframe(self, count: int = 10) -> "Any": - """Generate rows as a pandas DataFrame. - - Requires ``pandas`` to be installed. 
- - Parameters - ---------- - count : int - Number of rows. - - Returns - ------- - pandas.DataFrame - """ + """Generate rows as a pandas DataFrame.""" try: import pandas as pd except ModuleNotFoundError as exc: @@ -1018,16 +629,12 @@ def to_dataframe(self, count: int = 10) -> "Any": col_data = self._generate_columns(count) if self._row_lambdas: - # Row-lambda path: must materialize rows for inter-column refs rows = [dict(zip(columns, row)) for row in zip(*col_data)] rows = self._apply_row_lambdas(rows) if self._unique_together: rows = self._enforce_unique_together(rows, count) return pd.DataFrame(rows) - # Null injection already applied inside _generate_columns. - # Build DataFrame directly from columnar data — avoids double - # transposition (col→row dicts→DataFrame re-columnarizes). return pd.DataFrame(dict(zip(columns, col_data))) def to_parquet( @@ -1036,28 +643,7 @@ def to_parquet( count: int = 10, batch_size: int | None = None, ) -> int: - """Generate rows and write as a Parquet file. - - Uses **PyArrow** for zero-copy columnar writes. Data is - generated in batches and written as row-groups so memory - stays bounded even for very large counts. - - Requires ``pyarrow`` to be installed. - - Parameters - ---------- - path : str - File path to write. - count : int - Number of rows. - batch_size : int | None - Rows per row-group. Auto-tuned when ``None``. - - Returns - ------- - int - Number of rows written. - """ + """Generate rows and write as a Parquet file via PyArrow.""" try: import pyarrow as pa import pyarrow.parquet as pq @@ -1089,33 +675,11 @@ def to_parquet( return written - # ------------------------------------------------------------------ - # Schema serialization - # ------------------------------------------------------------------ - def to_schema_dict(self, count: int = 10) -> dict[str, Any]: - """Export this schema's definition as a serializable dict. 
- - The returned dict can be saved to JSON/YAML/TOML via - :func:`dataforge.schema_io.save_schema` and later loaded - to recreate an equivalent schema. - - Callable (lambda) fields are **not** serializable and are - silently omitted. - - Parameters - ---------- - count : int - Default row count to include in the dict. - - Returns - ------- - dict[str, Any] - """ + """Export this schema's definition as a serializable dict.""" from dataforge.schema_io import schema_to_dict fields = self._fields_spec - # Filter out lambdas — they can't be serialized if isinstance(fields, dict): serializable: list[str] | dict[str, str] = { k: v for k, v in fields.items() if isinstance(v, str) @@ -1123,7 +687,6 @@ def to_schema_dict(self, count: int = 10) -> dict[str, Any]: else: serializable = list(fields) - # Reverse-map null_fields from index → column name null_fields: dict[str, float] | None = None if self._null_fields: columns = self._columns @@ -1131,7 +694,6 @@ def to_schema_dict(self, count: int = 10) -> dict[str, Any]: columns[idx]: prob for idx, prob in self._null_fields.items() } - # Reverse-map unique_together from index tuples → name tuples unique_together: list[tuple[str, ...]] | None = None if self._unique_together: columns = self._columns @@ -1152,22 +714,7 @@ def save_schema( count: int = 10, format: str | None = None, ) -> None: - """Save this schema's definition to a file. - - Supports JSON, YAML, and TOML formats. The format is - auto-detected from the file extension when *format* is - ``None``. - - Parameters - ---------- - path : str - File path to write. - count : int - Default row count to include in the definition. - format : str | None - Output format (``"json"``, ``"yaml"``, ``"toml"``). - Auto-detected from extension when ``None``. 
- """ + """Save this schema's definition to a file (JSON, YAML, or TOML).""" from dataforge.schema_io import save_schema d = self.to_schema_dict(count=count) @@ -1176,38 +723,12 @@ def save_schema( def __repr__(self) -> str: return f"Schema(columns={list(self._columns)!r})" - # ------------------------------------------------------------------ - # Arrow / Polars output - # ------------------------------------------------------------------ - def to_arrow( self, count: int = 10, batch_size: int | None = None, ) -> "Any": - """Generate rows and return as a PyArrow Table. - - Uses **column-first generation** directly into Arrow arrays — - no intermediate row-dict materialisation. This is the fastest - bulk export path because the data never leaves columnar form. - - When *count* exceeds *batch_size*, data is generated in batches - and concatenated via ``pyarrow.concat_tables`` for bounded - memory usage during generation. - - Requires ``pyarrow`` to be installed. - - Parameters - ---------- - count : int - Number of rows. - batch_size : int | None - Rows per internal batch. Auto-tuned when ``None``. - - Returns - ------- - pyarrow.Table - """ + """Generate rows and return as a PyArrow Table.""" try: import pyarrow as pa except ModuleNotFoundError as exc: @@ -1225,12 +746,10 @@ def to_arrow( schema = pa.schema([(col, pa.string()) for col in columns]) if count <= batch_size: - # Single-shot: no concat overhead str_data = self._generate_string_columns(count) arrays = [pa.array(col, type=pa.string()) for col in str_data] return pa.table(dict(zip(columns, arrays)), schema=schema) - # Multi-batch: generate batches → concat batches: list[Any] = [] remaining = count while remaining > 0: @@ -1247,29 +766,7 @@ def to_polars( count: int = 10, batch_size: int | None = None, ) -> "Any": - """Generate rows and return as a Polars DataFrame. - - Uses **column-first generation** directly into Polars Series — - no intermediate row-dict materialisation. 
This is significantly - faster than converting via pandas because we skip the pandas - intermediate entirely. - - When *count* exceeds *batch_size*, data is generated in batches - and concatenated via ``polars.concat`` for bounded memory. - - Requires ``polars`` to be installed. - - Parameters - ---------- - count : int - Number of rows. - batch_size : int | None - Rows per internal batch. Auto-tuned when ``None``. - - Returns - ------- - polars.DataFrame - """ + """Generate rows and return as a Polars DataFrame.""" try: import polars as pl except ModuleNotFoundError as exc: @@ -1291,7 +788,6 @@ def to_polars( schema={col: pl.Utf8 for col in columns}, ) - # Multi-batch: generate batches → concat frames: list[Any] = [] remaining = count while remaining > 0: @@ -1307,10 +803,6 @@ def to_polars( return pl.concat(frames) - # ------------------------------------------------------------------ - # Streaming to message queues - # ------------------------------------------------------------------ - def stream_to( self, emitter: Any, @@ -1318,24 +810,7 @@ def stream_to( batch_size: int = 100, rate_limit: float | None = None, ) -> int: - """Stream generated data to an emitter (HTTP, Kafka, RabbitMQ). - - Parameters - ---------- - emitter : StreamEmitter - The target emitter instance. - count : int - Total rows to emit. - batch_size : int - Rows per batch. - rate_limit : float | None - Max rows per second. ``None`` = unlimited. - - Returns - ------- - int - Number of rows emitted. - """ + """Stream generated data to an emitter (HTTP, Kafka, RabbitMQ).""" from dataforge.streaming import stream_batch_to_emitter, TokenBucketRateLimiter limiter = None @@ -1353,25 +828,7 @@ def stream_to_http( headers: dict[str, str] | None = None, rate_limit: float | None = None, ) -> int: - """Stream generated data to an HTTP endpoint via POST. - - Parameters - ---------- - url : str - Target URL. - count : int - Total rows. - batch_size : int - Rows per batch POST. 
- headers : dict | None - Additional HTTP headers. - rate_limit : float | None - Max rows per second. - - Returns - ------- - int - """ + """Stream generated data to an HTTP endpoint via POST.""" from dataforge.streaming import HttpEmitter emitter = HttpEmitter(url=url, headers=headers, batch_mode=True) @@ -1387,27 +844,7 @@ def stream_to_kafka( batch_size: int = 100, rate_limit: float | None = None, ) -> int: - """Stream generated data to a Kafka topic. - - Requires ``confluent-kafka``. - - Parameters - ---------- - bootstrap_servers : str - Kafka bootstrap servers. - topic : str - Kafka topic. - count : int - Total rows. - batch_size : int - Rows per batch. - rate_limit : float | None - Max rows per second. - - Returns - ------- - int - """ + """Stream generated data to a Kafka topic.""" from dataforge.streaming import KafkaEmitter emitter = KafkaEmitter(bootstrap_servers=bootstrap_servers, topic=topic) diff --git a/src/dataforge/schema_io.py b/src/dataforge/schema_io.py index 53ae2be..a5759a5 100644 --- a/src/dataforge/schema_io.py +++ b/src/dataforge/schema_io.py @@ -1,40 +1,4 @@ -"""Schema I/O — serialize and deserialize schema definitions. - -Supports JSON, YAML, and TOML formats for defining schemas as config -files that can be loaded by the CLI or Python API. 
- -A schema definition is a dict with the following structure:: - - { - "fields": {"name": "full_name", "email": "email"}, - "null_fields": {"email": 0.2}, - "unique_together": [["name", "email"]], - "count": 1000 - } - -Or with a simple field list:: - - { - "fields": ["first_name", "last_name", "email"], - "count": 100 - } - -Relational schemas use the ``tables`` key:: - - { - "tables": { - "users": { - "fields": ["first_name", "email"], - "count": 100 - }, - "orders": { - "fields": ["date", "city"], - "count": 500, - "parent": "users" - } - } - } -""" +"""Schema I/O — serialize and deserialize schema definitions.""" from __future__ import annotations @@ -48,28 +12,10 @@ def schema_to_dict( null_fields: dict[str, float] | None = None, unique_together: list[tuple[str, ...]] | None = None, ) -> dict[str, Any]: - """Convert schema parameters to a serializable dict. - - Parameters - ---------- - fields : list[str] | dict[str, str] - Field specifications. - count : int - Default row count. - null_fields : dict[str, float] | None - Null probability mapping. - unique_together : list[tuple[str, ...]] | None - Uniqueness constraints. - - Returns - ------- - dict[str, Any] - """ + """Convert schema parameters to a serializable dict.""" d: dict[str, Any] = {} - # Normalize fields to the most compact representation if isinstance(fields, dict): - # Check if all keys equal values (simple list form) if all(k == v for k, v in fields.items()): d["fields"] = list(fields.keys()) else: @@ -96,23 +42,7 @@ def dict_to_schema_args( dict[str, float] | None, list[tuple[str, ...]] | None, ]: - """Parse a schema dict back into constructor arguments. - - Parameters - ---------- - d : dict[str, Any] - Schema definition dict. - - Returns - ------- - tuple - ``(fields, count, null_fields, unique_together)`` - - Raises - ------ - ValueError - If ``fields`` key is missing. 
- """ + """Parse a schema dict back into constructor arguments.""" if "fields" not in d: raise ValueError("Schema definition must contain a 'fields' key.") @@ -141,33 +71,12 @@ def dict_to_schema_args( return fields, count, null_fields, unique_together -# ------------------------------------------------------------------ -# File I/O -# ------------------------------------------------------------------ - - def save_schema( d: dict[str, Any], path: str, format: str | None = None, ) -> None: - """Save a schema definition dict to a file. - - Parameters - ---------- - d : dict[str, Any] - Schema definition dict (from :func:`schema_to_dict`). - path : str - File path to write. - format : str | None - Output format: ``"json"``, ``"yaml"``, or ``"toml"``. - Auto-detected from file extension when ``None``. - - Raises - ------ - ValueError - If the format cannot be determined or is unsupported. - """ + """Save a schema definition dict to a file.""" fmt = format or _detect_format(path) if fmt == "json": @@ -186,28 +95,7 @@ def load_schema( path: str, format: str | None = None, ) -> dict[str, Any]: - """Load a schema definition dict from a file. - - Parameters - ---------- - path : str - File path to read. - format : str | None - Input format: ``"json"``, ``"yaml"``, or ``"toml"``. - Auto-detected from file extension when ``None``. - - Returns - ------- - dict[str, Any] - Parsed schema definition. - - Raises - ------ - ValueError - If the format cannot be determined or is unsupported. - FileNotFoundError - If the file does not exist. 
- """ + """Load a schema definition dict from a file.""" fmt = format or _detect_format(path) if fmt == "json": @@ -238,11 +126,6 @@ def _detect_format(path: str) -> str: ) -# ------------------------------------------------------------------ -# JSON (stdlib — zero dependencies) -# ------------------------------------------------------------------ - - def _save_json(d: dict[str, Any], path: str) -> None: with open(path, "w", encoding="utf-8") as f: _json.dump(d, f, indent=2, ensure_ascii=False) @@ -259,14 +142,6 @@ def _load_json(path: str) -> dict[str, Any]: return data -# ------------------------------------------------------------------ -# YAML (pure-Python fallback — zero dependencies) -# ------------------------------------------------------------------ -# We implement a minimal YAML parser/emitter that handles the subset -# needed for schema definitions (strings, numbers, lists, dicts, -# booleans, null). This avoids requiring PyYAML as a dependency. - - def _save_yaml(d: dict[str, Any], path: str) -> None: lines = _yaml_dump(d, indent=0) with open(path, "w", encoding="utf-8") as f: @@ -288,11 +163,9 @@ def _yaml_dump(obj: Any, indent: int) -> list[str]: lines.append(f"{prefix}{key}:") for item in val: if isinstance(item, list): - # List of lists (unique_together) items_str = ", ".join(_yaml_scalar(v) for v in item) lines.append(f"{prefix} - [{items_str}]") elif isinstance(item, dict): - # First key inline, rest indented first = True for k2, v2 in item.items(): if first: @@ -324,7 +197,6 @@ def _yaml_scalar(val: Any) -> str: if isinstance(val, float): return str(val) if isinstance(val, str): - # Quote if the string could be misinterpreted if ( val == "" or val in ("true", "false", "null", "yes", "no", "on", "off") @@ -333,7 +205,6 @@ def _yaml_scalar(val: Any) -> str: or val[0] == "#" or "," in val ): - # Use double-quoted form with escaping escaped = val.replace("\\", "\\\\").replace('"', '\\"') return f'"{escaped}"' return val @@ -341,11 +212,7 @@ def 
_yaml_scalar(val: Any) -> str: def _load_yaml(path: str) -> dict[str, Any]: - """Minimal YAML parser for schema definitions. - - Handles the subset of YAML needed for schema files: - mappings, sequences, strings, numbers, booleans, null. - """ + """Minimal YAML parser for schema definitions.""" with open(path, "r", encoding="utf-8") as f: text = f.read() @@ -355,13 +222,11 @@ def _load_yaml(path: str) -> dict[str, Any]: def _yaml_parse(text: str) -> dict[str, Any]: """Parse a YAML string into a dict.""" lines = text.split("\n") - # Remove empty lines and comments cleaned: list[tuple[int, str]] = [] for line in lines: stripped = line.rstrip() if not stripped or stripped.lstrip().startswith("#"): continue - # Calculate indent level content = stripped.lstrip() indent_chars = len(stripped) - len(content) cleaned.append((indent_chars, content)) @@ -387,7 +252,6 @@ def _yaml_parse_mapping( if indent > base_indent and i > start: break - # Parse key: value colon_pos = content.find(":") if colon_pos == -1: i += 1 @@ -397,20 +261,16 @@ def _yaml_parse_mapping( rest = content[colon_pos + 1 :].strip() if rest: - # Inline value result[key] = _yaml_parse_value(rest) i += 1 else: - # Block value — check next line if i + 1 < len(lines): next_indent, next_content = lines[i + 1] if next_indent > indent: if next_content.startswith("- "): - # Sequence val, i = _yaml_parse_sequence(lines, i + 1, next_indent) result[key] = val else: - # Nested mapping val, i = _yaml_parse_mapping(lines, i + 1, next_indent) result[key] = val else: @@ -442,7 +302,6 @@ def _yaml_parse_sequence( item_str = content[2:].strip() - # Check for inline list [a, b, c] if item_str.startswith("[") and item_str.endswith("]"): inner = item_str[1:-1] items = [_yaml_parse_scalar(s.strip()) for s in inner.split(",")] @@ -453,13 +312,11 @@ def _yaml_parse_sequence( and not item_str.startswith('"') and not item_str.startswith("'") ): - # Inline mapping item mapping: dict[str, Any] = {} colon_pos = item_str.find(":") k = 
item_str[:colon_pos].strip() v = item_str[colon_pos + 1 :].strip() mapping[k] = _yaml_parse_value(v) if v else None - # Read continuation lines at deeper indent while i + 1 < len(lines): next_indent, next_content = lines[i + 1] if next_indent <= indent: @@ -484,14 +341,12 @@ def _yaml_parse_value(s: str) -> Any: if not s: return None - # Inline list if s.startswith("[") and s.endswith("]"): inner = s[1:-1] if not inner.strip(): return [] return [_yaml_parse_scalar(item.strip()) for item in inner.split(",")] - # Inline dict if s.startswith("{") and s.endswith("}"): inner = s[1:-1] if not inner.strip(): @@ -511,29 +366,24 @@ def _yaml_parse_scalar(s: str) -> Any: if not s: return None - # Remove quotes if (s.startswith('"') and s.endswith('"')) or ( s.startswith("'") and s.endswith("'") ): return s[1:-1].replace('\\"', '"').replace("\\\\", "\\") - # Booleans if s.lower() in ("true", "yes", "on"): return True if s.lower() in ("false", "no", "off"): return False - # Null if s.lower() in ("null", "~"): return None - # Integer try: return int(s) except ValueError: pass - # Float try: return float(s) except ValueError: @@ -542,11 +392,6 @@ def _yaml_parse_scalar(s: str) -> Any: return s -# ------------------------------------------------------------------ -# TOML (Python 3.11+ stdlib tomllib, or minimal fallback) -# ------------------------------------------------------------------ - - def _save_toml(d: dict[str, Any], path: str) -> None: """Write a schema dict as TOML.""" lines = _toml_dump(d) @@ -558,7 +403,6 @@ def _save_toml(d: dict[str, Any], path: str) -> None: def _toml_dump(d: dict[str, Any], prefix: str = "") -> list[str]: """Minimal TOML emitter for schema dicts.""" lines: list[str] = [] - # Emit simple key-value pairs first, then tables simple_keys: list[str] = [] table_keys: list[str] = [] @@ -588,7 +432,6 @@ def _toml_dump(d: dict[str, Any], prefix: str = "") -> list[str]: else: lines.append(f"{k2} = {_toml_value(v2)}") elif isinstance(val, list): - # Array 
of arrays or array of tables for item in val: if isinstance(item, dict): lines.append("") @@ -596,9 +439,7 @@ def _toml_dump(d: dict[str, Any], prefix: str = "") -> list[str]: for k2, v2 in item.items(): lines.append(f"{k2} = {_toml_value(v2)}") elif isinstance(item, (list, tuple)): - # unique_together: array of arrays — emit as inline pass - # For unique_together arrays, emit the whole thing as inline if val and isinstance(val[0], (list, tuple)): inner = ", ".join( "[" + ", ".join(f'"{c}"' for c in group) + "]" for group in val @@ -611,7 +452,6 @@ def _toml_dump(d: dict[str, Any], prefix: str = "") -> list[str]: def _toml_value(val: Any) -> str: """Format a value for TOML output.""" if val is None: - # TOML doesn't have null — use empty string return '""' if isinstance(val, bool): return "true" if val else "false" @@ -629,141 +469,8 @@ def _toml_value(val: Any) -> str: def _load_toml(path: str) -> dict[str, Any]: - """Load a TOML file. - - Uses stdlib ``tomllib`` (Python 3.11+) when available, - otherwise falls back to a minimal parser. - """ - try: - import tomllib # Python 3.11+ - except ModuleNotFoundError: - try: - import tomli as tomllib # type: ignore[no-redef] - except ModuleNotFoundError: - return _load_toml_fallback(path) + """Load a TOML file using stdlib tomllib (Python 3.11+).""" + import tomllib with open(path, "rb") as f: return tomllib.load(f) - - -def _load_toml_fallback(path: str) -> dict[str, Any]: - """Minimal TOML parser for schema definitions. - - Handles the subset of TOML needed for schema files: - bare keys, string/number values, arrays, and tables. 
- """ - with open(path, "r", encoding="utf-8") as f: - text = f.read() - - result: dict[str, Any] = {} - current_table: dict[str, Any] = result - - for line in text.split("\n"): - stripped = line.strip() - if not stripped or stripped.startswith("#"): - continue - - # Table header [key] or [[key]] - if stripped.startswith("[[") and stripped.endswith("]]"): - table_key = stripped[2:-2].strip() - parts = [p.strip() for p in table_key.split(".")] - # Array of tables - current_table = result - for part in parts[:-1]: - current_table = current_table.setdefault(part, {}) - last = parts[-1] - if last not in current_table: - current_table[last] = [] - entry: dict[str, Any] = {} - current_table[last].append(entry) - current_table = entry - continue - - if stripped.startswith("[") and stripped.endswith("]"): - table_key = stripped[1:-1].strip() - parts = [p.strip() for p in table_key.split(".")] - current_table = result - for part in parts: - current_table = current_table.setdefault(part, {}) - continue - - # Key = value - if "=" in stripped: - eq_pos = stripped.index("=") - key = stripped[:eq_pos].strip().strip('"').strip("'") - val_str = stripped[eq_pos + 1 :].strip() - current_table[key] = _toml_parse_value(val_str) - - return result - - -def _toml_parse_value(s: str) -> Any: - """Parse a TOML value string.""" - s = s.strip() - if not s: - return "" - - # Quoted string - if s.startswith('"""') and s.endswith('"""'): - return s[3:-3] - if s.startswith("'''") and s.endswith("'''"): - return s[3:-3] - if s.startswith('"') and s.endswith('"'): - return s[1:-1].replace('\\"', '"').replace("\\\\", "\\") - if s.startswith("'") and s.endswith("'"): - return s[1:-1] - - # Boolean - if s == "true": - return True - if s == "false": - return False - - # Array - if s.startswith("[") and s.endswith("]"): - return _toml_parse_array(s[1:-1].strip()) - - # Integer - try: - return int(s) - except ValueError: - pass - - # Float - try: - return float(s) - except ValueError: - pass - - return 
s - - -def _toml_parse_array(s: str) -> list[Any]: - """Parse a TOML array interior (without outer brackets).""" - if not s: - return [] - - items: list[Any] = [] - depth = 0 - current = "" - - for ch in s: - if ch == "[": - depth += 1 - current += ch - elif ch == "]": - depth -= 1 - current += ch - elif ch == "," and depth == 0: - val = current.strip() - if val: - items.append(_toml_parse_value(val)) - current = "" - else: - current += ch - - val = current.strip() - if val: - items.append(_toml_parse_value(val)) - - return items diff --git a/src/dataforge/seeder.py b/src/dataforge/seeder.py index 66e820e..64fa4f1 100644 --- a/src/dataforge/seeder.py +++ b/src/dataforge/seeder.py @@ -1,32 +1,4 @@ -"""Database seeding — populate databases with realistic fake data. - -Uses SQLAlchemy (optional dependency) to introspect table structures, -generate matching fake data, and insert it with dialect-specific -optimizations (PostgreSQL COPY, MySQL FK checks, SQLite pragmas). - -Usage:: - - from dataforge import DataForge - from dataforge.seeder import DatabaseSeeder - - forge = DataForge(seed=42) - seeder = DatabaseSeeder(forge, "sqlite:///test.db") - - # Seed a single table - seeder.seed_table("users", count=1000) - - # Seed with field overrides - seeder.seed_table("users", count=1000, field_overrides={ - "email": "email", - "created_at": "datetime", - }) - - # Seed related tables - seeder.seed_relational({ - "users": {"count": 100}, - "orders": {"count": 500, "parent": "users"}, - }) -""" +"""Database seeding — populate databases with realistic fake data.""" from __future__ import annotations @@ -36,26 +8,8 @@ from dataforge.core import DataForge -# Column-name → DataForge field heuristic (reuse from core) -def _get_heuristic_map() -> dict[str, str]: - """Import and return field heuristic mappings.""" - from dataforge.core import _FIELD_ALIASES, _SA_TYPE_MAP - - return _FIELD_ALIASES, _SA_TYPE_MAP - - class DatabaseSeeder: - """Database seeder with SQLAlchemy table 
introspection. - - Parameters - ---------- - forge : DataForge - The DataForge instance for generating data. - connection_string : str - SQLAlchemy connection string (e.g. ``"sqlite:///test.db"``). - echo : bool - If True, echo SQL statements to stdout. - """ + """Database seeder with SQLAlchemy table introspection.""" __slots__ = ("_forge", "_connection_string", "_echo", "_engine", "_metadata") @@ -95,18 +49,7 @@ def _get_metadata(self) -> Any: return self._metadata def _introspect_table(self, table_name: str) -> dict[str, str]: - """Introspect a table and map columns to DataForge fields. - - Parameters - ---------- - table_name : str - Name of the database table. - - Returns - ------- - dict[str, str] - Column name → DataForge field name. - """ + """Introspect a table and map columns to DataForge fields.""" from dataforge.core import _FIELD_ALIASES, _SA_TYPE_MAP from dataforge.registry import get_field_map @@ -124,26 +67,21 @@ def _introspect_table(self, table_name: str) -> dict[str, str]: for col in table.columns: col_name = col.name - # Skip auto-increment primary keys if col.primary_key and col.autoincrement: continue - # Skip foreign keys (handled separately in relational seeding) if col.foreign_keys: continue - # Tier 1: exact name match if col_name in field_map: mapped[col_name] = col_name continue - # Tier 2: alias match alias = _FIELD_ALIASES.get(col_name) if alias and alias in field_map: mapped[col_name] = alias continue - # Tier 3: column type fallback type_name = type(col.type).__name__ type_field = _SA_TYPE_MAP.get(type_name) if type_field and type_field in field_map: @@ -159,30 +97,12 @@ def seed_table( field_overrides: dict[str, str] | None = None, batch_size: int = 1000, ) -> int: - """Seed a single table with fake data. - - Parameters - ---------- - table_name : str - Name of the table to seed. - count : int - Number of rows to insert. - field_overrides : dict[str, str] | None - Override column → field mappings. 
- batch_size : int - Insert batch size. - - Returns - ------- - int - Number of rows inserted. - """ + """Seed a single table with fake data.""" engine = self._get_engine() metadata = self._get_metadata() table = metadata.tables[table_name] dialect = engine.dialect.name - # Build field mapping field_map = self._introspect_table(table_name) if field_overrides: field_map.update(field_overrides) @@ -193,10 +113,8 @@ def seed_table( f"Use field_overrides to specify mappings." ) - # Generate data using Schema schema = self._forge.schema(field_map) - # Dialect-specific optimizations with engine.begin() as conn: self._apply_dialect_optimizations(conn, dialect, before=True) @@ -218,28 +136,11 @@ def seed_relational( tables: dict[str, dict[str, Any]], batch_size: int = 1000, ) -> dict[str, int]: - """Seed multiple related tables with referential integrity. - - Uses the existing ``RelationalSchema`` for data generation, - then inserts the results into the database. - - Parameters - ---------- - tables : dict[str, dict] - Table specifications (same format as ``forge.relational()``). - batch_size : int - Insert batch size per table. - - Returns - ------- - dict[str, int] - Number of rows inserted per table. 
- """ + """Seed multiple related tables with referential integrity.""" engine = self._get_engine() metadata = self._get_metadata() dialect = engine.dialect.name - # For each table, auto-detect fields if not specified for name, spec in tables.items(): if "fields" not in spec: field_overrides = spec.get("field_overrides", {}) @@ -247,11 +148,9 @@ def seed_relational( detected.update(field_overrides) spec["fields"] = detected - # Generate data with referential integrity rel_schema = self._forge.relational(tables) data = rel_schema.generate() - # Insert in topological order result: dict[str, int] = {} with engine.begin() as conn: self._apply_dialect_optimizations(conn, dialect, before=True) @@ -262,7 +161,6 @@ def seed_relational( table = metadata.tables[table_name] rows = data[table_name] - # Insert in batches inserted = 0 for batch_start in range(0, len(rows), batch_size): batch = rows[batch_start : batch_start + batch_size] @@ -297,20 +195,15 @@ def _apply_dialect_optimizations( conn.execute(text("PRAGMA synchronous = OFF")) conn.execute(text("PRAGMA cache_size = -64000")) except Exception: - pass # PRAGMAs may fail inside transactions + pass else: try: conn.execute(text("PRAGMA synchronous = FULL")) except Exception: - pass # PRAGMAs may fail inside transactions + pass def list_tables(self) -> list[str]: - """List all tables in the database. - - Returns - ------- - list[str] - """ + """List all tables in the database.""" metadata = self._get_metadata() return sorted(metadata.tables.keys()) diff --git a/src/dataforge/streaming.py b/src/dataforge/streaming.py index 90f748f..5cb1d16 100644 --- a/src/dataforge/streaming.py +++ b/src/dataforge/streaming.py @@ -1,25 +1,4 @@ -"""Streaming to message queues — emit generated data to HTTP, Kafka, RabbitMQ. - -Provides abstract and concrete emitters for streaming fake data to -external systems in real time with rate limiting. 
- -Usage:: - - from dataforge import DataForge - from dataforge.streaming import HttpEmitter, TokenBucketRateLimiter - - forge = DataForge(seed=42) - schema = forge.schema(["first_name", "email", "city"]) - - # Stream to HTTP endpoint - emitter = HttpEmitter("https://api.example.com/ingest") - schema.stream_to(emitter, count=10000, rate_limit=100) - - # With rate limiting - limiter = TokenBucketRateLimiter(rate=50, burst=10) - emitter = HttpEmitter("https://api.example.com/ingest") - stream_to_emitter(schema, emitter, count=1000, rate_limiter=limiter) -""" +"""Streaming to message queues — emit generated data to HTTP, Kafka, RabbitMQ.""" from __future__ import annotations @@ -31,21 +10,8 @@ from dataforge.schema import Schema -# ------------------------------------------------------------------ -# Rate limiter -# ------------------------------------------------------------------ - - class TokenBucketRateLimiter: - """Token bucket rate limiter using ``time.monotonic()``. - - Parameters - ---------- - rate : float - Tokens per second (sustained rate). - burst : int - Maximum burst size (bucket capacity). - """ + """Token bucket rate limiter using ``time.monotonic()``.""" __slots__ = ("_rate", "_burst", "_tokens", "_last_time") @@ -65,22 +31,12 @@ def acquire(self, n: int = 1) -> None: if self._tokens >= n: self._tokens -= n return - # Sleep for the time needed to accumulate enough tokens deficit = n - self._tokens _time.sleep(deficit / self._rate) -# ------------------------------------------------------------------ -# Abstract emitter -# ------------------------------------------------------------------ - - class StreamEmitter: - """Abstract base class for stream emitters. - - Subclasses must implement :meth:`emit` and optionally - :meth:`open` and :meth:`close` for resource management. 
- """ + """Abstract base class for stream emitters.""" __slots__ = () @@ -92,7 +48,7 @@ def emit(self, row: dict[str, Any]) -> None: raise NotImplementedError def emit_batch(self, rows: list[dict[str, Any]]) -> None: - """Emit a batch of rows. Default: emit one by one.""" + """Emit a batch of rows.""" for row in rows: self.emit(row) @@ -107,28 +63,8 @@ def __exit__(self, *args: Any) -> None: self.close() -# ------------------------------------------------------------------ -# HTTP emitter (zero-dep, stdlib urllib) -# ------------------------------------------------------------------ - - class HttpEmitter(StreamEmitter): - """Stream data to an HTTP endpoint via POST requests. - - Uses stdlib ``urllib`` — zero external dependencies. - - Parameters - ---------- - url : str - Target URL for POST requests. - headers : dict[str, str] | None - Additional HTTP headers. - batch_mode : bool - If True, emit_batch sends the whole batch as a JSON array. - If False, each row is sent individually. - timeout : float - Request timeout in seconds. - """ + """Stream data to an HTTP endpoint via POST requests.""" __slots__ = ("_url", "_headers", "_batch_mode", "_timeout") @@ -175,25 +111,8 @@ def __repr__(self) -> str: return f"HttpEmitter(url={self._url!r})" -# ------------------------------------------------------------------ -# Kafka emitter (optional confluent-kafka) -# ------------------------------------------------------------------ - - class KafkaEmitter(StreamEmitter): - """Stream data to Apache Kafka. - - Requires ``confluent-kafka`` to be installed. - - Parameters - ---------- - bootstrap_servers : str - Kafka bootstrap servers. - topic : str - Kafka topic to produce to. - config : dict | None - Additional Kafka producer configuration. 
- """ + """Stream data to Apache Kafka (requires ``confluent-kafka``).""" __slots__ = ("_servers", "_topic", "_config", "_producer") @@ -242,29 +161,8 @@ def __repr__(self) -> str: return f"KafkaEmitter(servers={self._servers!r}, topic={self._topic!r})" -# ------------------------------------------------------------------ -# RabbitMQ emitter (optional pika) -# ------------------------------------------------------------------ - - class RabbitMQEmitter(StreamEmitter): - """Stream data to RabbitMQ. - - Requires ``pika`` to be installed. - - Parameters - ---------- - host : str - RabbitMQ host. - queue : str - Queue name. - exchange : str - Exchange name. - routing_key : str - Routing key. - port : int - RabbitMQ port. - """ + """Stream data to RabbitMQ (requires ``pika``).""" __slots__ = ( "_host", @@ -325,11 +223,6 @@ def __repr__(self) -> str: return f"RabbitMQEmitter(host={self._host!r}, queue={self._queue!r})" -# ------------------------------------------------------------------ -# Streaming helper -# ------------------------------------------------------------------ - - def stream_to_emitter( schema: "Schema", emitter: StreamEmitter, @@ -337,28 +230,7 @@ def stream_to_emitter( batch_size: int = 100, rate_limiter: TokenBucketRateLimiter | None = None, ) -> int: - """Stream schema-generated data to an emitter. - - Uses batch generation and batch emission for better throughput. - - Parameters - ---------- - schema : Schema - The DataForge Schema to generate data from. - emitter : StreamEmitter - The target emitter. - count : int - Total number of rows to emit. - batch_size : int - Rows per batch. - rate_limiter : TokenBucketRateLimiter | None - Optional rate limiter. - - Returns - ------- - int - Number of rows emitted. 
- """ + """Stream schema-generated data to an emitter.""" emitted = 0 remaining = count @@ -375,44 +247,4 @@ def stream_to_emitter( return emitted -def stream_batch_to_emitter( - schema: "Schema", - emitter: StreamEmitter, - count: int = 1000, - batch_size: int = 100, - rate_limiter: TokenBucketRateLimiter | None = None, -) -> int: - """Stream schema-generated data in batches to an emitter. - - Parameters - ---------- - schema : Schema - The DataForge Schema to generate data from. - emitter : StreamEmitter - The target emitter. - count : int - Total number of rows to emit. - batch_size : int - Rows per batch. - rate_limiter : TokenBucketRateLimiter | None - Optional rate limiter. - - Returns - ------- - int - Number of rows emitted. - """ - emitted = 0 - remaining = count - - with emitter: - while remaining > 0: - chunk = min(remaining, batch_size) - rows = schema.generate(count=chunk) - if rate_limiter is not None: - rate_limiter.acquire(chunk) - emitter.emit_batch(rows) - emitted += chunk - remaining -= chunk - - return emitted +stream_batch_to_emitter = stream_to_emitter diff --git a/src/dataforge/timeseries.py b/src/dataforge/timeseries.py index af5bdab..0ff40a9 100644 --- a/src/dataforge/timeseries.py +++ b/src/dataforge/timeseries.py @@ -1,35 +1,4 @@ -"""Time-series generation — synthetic time-series data with trends and patterns. - -Generates realistic time-series data with configurable trend, seasonality, -noise, anomalies, regime changes, missing data gaps, and spiky patterns. 
- -Usage:: - - from dataforge import DataForge, TimeSeriesSchema - - forge = DataForge(seed=42) - ts = TimeSeriesSchema( - forge, - start="2024-01-01", - end="2024-12-31", - interval="1h", - fields={ - "temperature": { - "trend": 0.01, - "seasonality": {"period": 24, "amplitude": 5.0}, - "noise": 0.5, - "base": 20.0, - }, - "humidity": { - "trend": -0.005, - "seasonality": {"period": 24, "amplitude": 10.0}, - "noise": 2.0, - "base": 60.0, - }, - }, - ) - rows = ts.generate() -""" +"""Time-series generation — synthetic time-series data with trends and patterns.""" from __future__ import annotations @@ -41,10 +10,6 @@ if TYPE_CHECKING: from dataforge.core import DataForge -# ------------------------------------------------------------------ -# Interval parsing -# ------------------------------------------------------------------ - _INTERVAL_UNITS: dict[str, int] = { "s": 1, "m": 60, @@ -63,20 +28,15 @@ def _parse_interval(interval: str) -> int: num_str = interval[: -len(suffix)].strip() num = int(num_str) if num_str else 1 return num * multiplier - # Try pure numeric (assume seconds) return int(interval) def _parse_datetime(dt_str: str) -> float: - """Parse an ISO datetime string to a POSIX timestamp. - - Naive datetimes (without timezone info) are treated as UTC. - """ + """Parse an ISO datetime string to a POSIX timestamp.""" if "T" in dt_str: dt = _datetime.datetime.fromisoformat(dt_str) else: dt = _datetime.datetime.fromisoformat(dt_str + "T00:00:00") - # Treat naive datetimes as UTC if dt.tzinfo is None: dt = dt.replace(tzinfo=_datetime.timezone.utc) return dt.timestamp() @@ -92,40 +52,8 @@ def _timestamp_to_iso(ts: float) -> str: return dt.isoformat(timespec="seconds").replace("+00:00", "Z") -# ------------------------------------------------------------------ -# TimeSeriesSchema -# ------------------------------------------------------------------ - - class TimeSeriesSchema: - """Pre-configured time-series generator with trend, seasonality, and noise. 
- - Parameters - ---------- - forge : DataForge - The parent generator instance. - start : str - Start datetime (ISO format). - end : str - End datetime (ISO format). - interval : str - Time step between points (e.g. ``"1h"``, ``"30m"``, ``"1d"``). - fields : dict[str, dict] - Field specifications. Each field config can include: - - - ``base`` — base value (default: 0.0) - - ``trend`` — linear trend per step (default: 0.0) - - ``seasonality`` — dict with ``period`` (in steps) and - ``amplitude`` (default: no seasonality) - - ``noise`` — Gaussian noise std dev (default: 0.0) - - ``anomaly_rate`` — probability of anomaly per point (default: 0.0) - - ``anomaly_scale`` — anomaly multiplier (default: 3.0) - - ``spike_rate`` — probability of spike per point (default: 0.0) - - ``spike_scale`` — spike multiplier (default: 5.0) - - ``min_val`` / ``max_val`` — clamp range - - ``regime_changes`` — list of ``{"at_step": N, "base": X, "trend": Y}`` - - ``missing_rate`` — probability of missing data per point (default: 0.0) - """ + """Pre-configured time-series generator with trend, seasonality, and noise.""" __slots__ = ( "_forge", @@ -152,7 +80,6 @@ def __init__( self._fields = fields or {} self._rng = forge._engine._rng - # Pre-compute timestamps ts_list: list[float] = [] t = self._start while t <= self._end: @@ -166,13 +93,7 @@ def num_points(self) -> int: return len(self._timestamps) def generate(self) -> list[dict[str, Any]]: - """Generate the full time-series as a list of row dicts. - - Returns - ------- - list[dict[str, Any]] - Each dict has a ``"timestamp"`` key plus one key per field. 
- """ + """Generate the full time-series as a list of row dicts.""" n = self.num_points if n == 0: return [] @@ -180,17 +101,14 @@ def generate(self) -> list[dict[str, Any]]: rng = self._rng timestamps = self._timestamps - # Pre-convert all timestamps (avoids per-row function call overhead) _to_iso = _timestamp_to_iso ts_strings = [_to_iso(ts) for ts in timestamps] - # Column-first generation: build all field columns, then assemble rows once field_columns: list[tuple[str, list[Any]]] = [] for field_name, config in self._fields.items(): values = self._generate_field(config, n, rng) field_columns.append((field_name, values)) - # Assemble rows in a single pass if field_columns: rows: list[dict[str, Any]] = [None] * n # type: ignore[list-item] for i in range(n): @@ -221,7 +139,6 @@ def _generate_field( min_val = config.get("min_val") max_val = config.get("max_val") - # Pre-compute clamping as floats once has_min = min_val is not None has_max = max_val is not None if has_min: @@ -229,7 +146,6 @@ def _generate_field( if has_max: max_val_f = float(max_val) - # Seasonality season_cfg = config.get("seasonality") has_season = season_cfg is not None if has_season: @@ -240,13 +156,11 @@ def _generate_field( else: period = amplitude = phase = 0.0 - # Pre-compute feature flags for tight loop has_noise = noise_std > 0.0 has_anomaly = anomaly_rate > 0.0 has_spike = spike_rate > 0.0 has_missing = missing_rate > 0.0 - # Regime changes: sorted by step — pre-check emptiness regimes = config.get("regime_changes") has_regimes = bool(regimes) if has_regimes: @@ -257,7 +171,6 @@ def _generate_field( else: regime_map = None # type: ignore[assignment] - # Generate values — tight loop with pre-computed flags values: list[Any] = [None] * n current_base = base current_trend = trend @@ -266,11 +179,9 @@ def _generate_field( _sin = _math.sin _pi2 = 2.0 * _math.pi - # Pre-compute anomaly noise scale anomaly_noise = noise_std * anomaly_scale if has_noise else anomaly_scale for i in range(n): - # 
Check for regime change (skip dict lookup when no regimes) if has_regimes and i in regime_map: rc = regime_map[i] if "base" in rc: @@ -278,32 +189,24 @@ def _generate_field( if "trend" in rc: current_trend = float(rc["trend"]) - # Missing data if has_missing and _random() < missing_rate: - # values[i] already None continue - # Base + trend val = current_base + current_trend * i - # Seasonality (sinusoidal) if has_season: val += amplitude * _sin(_pi2 * (i + phase) / period) - # Noise if has_noise: val += _gauss(0.0, noise_std) - # Anomaly injection if has_anomaly and _random() < anomaly_rate: val += _gauss(0.0, anomaly_noise) - # Spike injection if has_spike and _random() < spike_rate: direction = 1.0 if _random() > 0.5 else -1.0 val += direction * abs(val) * spike_scale if val != 0 else spike_scale - # Clamping if has_min and val < min_val_f: val = min_val_f if has_max and val > max_val_f: @@ -314,17 +217,7 @@ def _generate_field( return values def stream(self, batch_size: int = 1000) -> Iterator[dict[str, Any]]: - """Yield rows lazily in batches. - - Parameters - ---------- - batch_size : int - Number of rows per batch. - - Yields - ------ - dict[str, Any] - """ + """Yield rows lazily in batches.""" rows = self.generate() yield from rows @@ -333,20 +226,7 @@ def to_csv( path: str | None = None, delimiter: str = ",", ) -> str: - """Export time-series as CSV. - - Parameters - ---------- - path : str | None - File path to write. Returns string if None. - delimiter : str - CSV delimiter. - - Returns - ------- - str - CSV content. - """ + """Export time-series as CSV.""" import csv import io @@ -377,19 +257,7 @@ def to_json( path: str | None = None, indent: int = 2, ) -> str: - """Export time-series as JSON array. - - Parameters - ---------- - path : str | None - File path to write. - indent : int - JSON indentation. 
- - Returns - ------- - str - """ + """Export time-series as JSON array.""" import json rows = self.generate() @@ -400,12 +268,7 @@ def to_json( return content def to_dataframe(self) -> Any: - """Export as pandas DataFrame. - - Returns - ------- - pandas.DataFrame - """ + """Export as pandas DataFrame.""" try: import pandas as pd except ModuleNotFoundError as exc: diff --git a/src/dataforge/tui/app.py b/src/dataforge/tui/app.py index 1723b50..6f22622 100644 --- a/src/dataforge/tui/app.py +++ b/src/dataforge/tui/app.py @@ -34,9 +34,7 @@ ) -# ------------------------------------------------------------------ # Export dialog -# ------------------------------------------------------------------ class ExportDialog(ModalScreen[dict[str, Any] | None]): @@ -118,9 +116,7 @@ def action_cancel(self) -> None: self.dismiss(None) -# ------------------------------------------------------------------ # Main TUI application -# ------------------------------------------------------------------ class DataForgeTUI(App): diff --git a/src/dataforge/unique.py b/src/dataforge/unique.py index ef4f880..2cf7251 100644 --- a/src/dataforge/unique.py +++ b/src/dataforge/unique.py @@ -1,26 +1,4 @@ -"""UniqueProxy — wrapper for unique value generation. - -Intercepts provider method calls and ensures each returned value is -unique within the lifetime of the proxy (or until :meth:`clear` is -called). - -Usage:: - - forge = DataForge(seed=42) - forge.unique.person.first_name() # guaranteed unique per call - forge.unique.clear() # reset tracking - -Performance ------------ -The proxy adds a thin ``set``-membership check per scalar value -(O(1) amortised) and retries on collision. Batch calls are -generated in bulk with a single ``set`` deduplication pass, -requesting extra items to compensate for expected collisions. - -The proxy itself is **lazily created** — accessing ``forge.unique`` -for the first time constructs it; all subsequent accesses return -the cached instance. 
-""" +"""UniqueProxy — wrapper for unique value generation.""" from __future__ import annotations @@ -67,7 +45,6 @@ def _generate_batch(self, count: int, **kwargs: Any) -> list[Any]: max_total_retries = count * 100 retries = 0 - # Start with 20% over-sample; adapt based on collision rate oversample_ratio = 0.20 while remaining > 0: if retries > max_total_retries: @@ -76,7 +53,6 @@ def _generate_batch(self, count: int, **kwargs: Any) -> list[Any]: f"{retries} retries for {self._method!r}. " f"Generated {len(result)}/{count}." ) - # Adaptive: increase over-sampling as saturation grows request = remaining + max(int(remaining * oversample_ratio), 10) batch = method(count=request, **kwargs) batch_collisions = 0 @@ -91,10 +67,8 @@ def _generate_batch(self, count: int, **kwargs: Any) -> list[Any]: retries += 1 batch_collisions += 1 - # Adapt over-sample ratio based on collision rate in this batch if batch_collisions > 0 and len(batch) > 0: collision_rate = batch_collisions / len(batch) - # Scale up: at 50% collision rate, request 2x; at 90%, ~10x oversample_ratio = min( collision_rate / (1 - collision_rate + 0.01), 10.0 ) @@ -133,20 +107,7 @@ def clear(self) -> None: class UniqueProxy: - """Top-level unique proxy — accessed via ``forge.unique``. - - Lazily wraps each provider the first time it is accessed. - Maintains per-method seen-value sets across calls. - - Examples - -------- - >>> forge = DataForge(seed=42) - >>> a = forge.unique.person.first_name() - >>> b = forge.unique.person.first_name() - >>> a != b # guaranteed unique - True - >>> forge.unique.clear() # reset all tracking - """ + """Top-level unique proxy — accessed via ``forge.unique``.""" __slots__ = ("_forge", "_proxies") @@ -166,14 +127,7 @@ def __getattr__(self, name: str) -> Any: return provider def clear(self, provider_name: str | None = None) -> None: - """Clear tracked unique values. - - Parameters - ---------- - provider_name : str | None - If given, clear only that provider's tracking. 
- If ``None``, clear all providers. - """ + """Clear tracked unique values.""" if provider_name is not None: proxy = self._proxies.get(provider_name) if proxy is not None: diff --git a/tests/test_address.py b/tests/test_address.py index d6fefcf..ea4e70d 100644 --- a/tests/test_address.py +++ b/tests/test_address.py @@ -12,8 +12,6 @@ class TestAddressScalar: - """Tests for single-item address generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -96,8 +94,6 @@ def test_coordinate_returns_tuple(self) -> None: class TestAddressBatch: - """Tests for batch address generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) diff --git a/tests/test_advanced_realism.py b/tests/test_advanced_realism.py index dcdfafe..1ee6a30 100644 --- a/tests/test_advanced_realism.py +++ b/tests/test_advanced_realism.py @@ -13,9 +13,7 @@ from dataforge import DataForge -# ------------------------------------------------------------------ # # Weighted blood type distribution -# ------------------------------------------------------------------ # class TestRealisticBloodType: @@ -41,7 +39,6 @@ def test_count_one_returns_str(self) -> None: assert isinstance(result, str) def test_distribution_weighted(self) -> None: - """O+ and A+ should dominate; AB- should be very rare.""" results = self.forge.medical.realistic_blood_type(count=10_000) counts = Counter(results) # O+ (~37.4%) and A+ (~35.7%) together should be >50% of total @@ -52,7 +49,6 @@ def test_distribution_weighted(self) -> None: assert rare < 300, f"AB- = {rare}, expected <300" def test_schema_field_resolution(self) -> None: - """realistic_blood_type should be usable in Schema via field map.""" rows = self.forge.to_dict( fields=["realistic_blood_type"], count=10, @@ -70,9 +66,7 @@ def test_reproducible_with_seed(self) -> None: assert r1 == r2 -# ------------------------------------------------------------------ # # Weighted engine methods -# 
------------------------------------------------------------------ # class TestWeightedEngine: @@ -103,9 +97,7 @@ def test_weighted_choice_respects_weights(self) -> None: assert heavy_count > 900, f"heavy={heavy_count}, expected >900" -# ------------------------------------------------------------------ # # Lambda / callable fields in Schema -# ------------------------------------------------------------------ # class TestSchemaLambdaFields: @@ -113,7 +105,6 @@ def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) def test_lambda_basic(self) -> None: - """Lambda can transform a previously generated column.""" schema = self.forge.schema( { "name": "first_name", @@ -126,7 +117,6 @@ def test_lambda_basic(self) -> None: assert row["upper_name"] == row["name"].upper() def test_lambda_multiple(self) -> None: - """Multiple lambdas can reference each other in order.""" schema = self.forge.schema( { "first": "first_name", @@ -145,7 +135,6 @@ def test_lambda_multiple(self) -> None: assert row["email_like"] == expected_email def test_lambda_count_one(self) -> None: - """count=1 should still work with lambdas.""" schema = self.forge.schema( { "city": "city", @@ -157,7 +146,6 @@ def test_lambda_count_one(self) -> None: assert rows[0]["label"] == f"City: {rows[0]['city']}" def test_lambda_stream(self) -> None: - """Lambdas should work via stream() as well.""" schema = self.forge.schema( { "name": "first_name", @@ -170,7 +158,6 @@ def test_lambda_stream(self) -> None: assert row["upper"] == row["name"].upper() def test_lambda_generate_empty(self) -> None: - """count=0 should return empty list even with lambdas.""" schema = self.forge.schema( { "name": "first_name", @@ -180,7 +167,6 @@ def test_lambda_generate_empty(self) -> None: assert schema.generate(count=0) == [] def test_lambda_no_lambdas_unchanged(self) -> None: - """Schemas without lambdas should behave identically to before.""" s1 = self.forge.schema(["first_name", "email"]) f2 = 
DataForge(locale="en_US", seed=42) s2 = f2.schema({"first_name": "first_name", "email": "email"}) @@ -189,7 +175,6 @@ def test_lambda_no_lambdas_unchanged(self) -> None: assert rows1 == rows2 def test_lambda_to_csv(self) -> None: - """Lambdas should work in to_csv().""" schema = self.forge.schema( { "name": "first_name", @@ -202,7 +187,6 @@ def test_lambda_to_csv(self) -> None: assert lines[0].strip() == "name,upper" def test_lambda_to_jsonl(self) -> None: - """Lambdas should work in to_jsonl().""" import json schema = self.forge.schema( @@ -218,7 +202,6 @@ def test_lambda_to_jsonl(self) -> None: assert row["upper"] == row["name"].upper() def test_lambda_stream_to_csv(self) -> None: - """Lambdas should work in stream_to_csv().""" schema = self.forge.schema( { "name": "first_name", @@ -237,7 +220,6 @@ def test_lambda_stream_to_csv(self) -> None: os.unlink(path) def test_lambda_stream_to_jsonl(self) -> None: - """Lambdas should work in stream_to_jsonl().""" import json schema = self.forge.schema( @@ -259,7 +241,6 @@ def test_lambda_stream_to_jsonl(self) -> None: os.unlink(path) def test_lambda_non_string_return(self) -> None: - """Lambda returning non-string preserves native type.""" schema = self.forge.schema( { "name": "first_name", @@ -272,7 +253,6 @@ def test_lambda_non_string_return(self) -> None: assert isinstance(row["name_len"], int) def test_lambda_repr(self) -> None: - """Schema repr should list all columns including lambda ones.""" schema = self.forge.schema( { "name": "first_name", diff --git a/tests/test_ai_chat.py b/tests/test_ai_chat.py index a8ff187..f87a67a 100644 --- a/tests/test_ai_chat.py +++ b/tests/test_ai_chat.py @@ -1,4 +1,4 @@ -"""Tests for the AiChatProvider (compound, _needs_forge=True).""" +"""Tests for AI Chat methods (merged into LlmProvider).""" from dataforge import DataForge @@ -8,20 +8,19 @@ def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) def test_returns_str(self) -> None: - result = 
self.forge.ai_chat.chat_role() + result = self.forge.llm.chat_role() assert isinstance(result, str) assert result in {"system", "user", "assistant", "tool"} def test_batch(self) -> None: - results = self.forge.ai_chat.chat_role(count=100) + results = self.forge.llm.chat_role(count=100) assert isinstance(results, list) assert len(results) == 100 valid = {"system", "user", "assistant", "tool"} assert all(r in valid for r in results) def test_weighted_distribution(self) -> None: - """user and assistant should appear more often than system and tool.""" - results = self.forge.ai_chat.chat_role(count=1000) + results = self.forge.llm.chat_role(count=1000) counts = {r: results.count(r) for r in {"user", "assistant", "system", "tool"}} # user and assistant have weight 40 each, system 15, tool 5 assert counts["user"] > counts["system"] @@ -33,11 +32,11 @@ def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) def test_returns_str(self) -> None: - result = self.forge.ai_chat.chat_model() + result = self.forge.llm.chat_model() assert isinstance(result, str) def test_batch(self) -> None: - results = self.forge.ai_chat.chat_model(count=50) + results = self.forge.llm.chat_model(count=50) assert len(results) == 50 @@ -46,12 +45,12 @@ def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) def test_returns_str(self) -> None: - result = self.forge.ai_chat.chat_content() + result = self.forge.llm.chat_content() assert isinstance(result, str) assert len(result) > 10 def test_batch(self) -> None: - results = self.forge.ai_chat.chat_content(count=50) + results = self.forge.llm.chat_content(count=50) assert len(results) == 50 @@ -60,13 +59,13 @@ def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) def test_returns_str(self) -> None: - result = self.forge.ai_chat.chat_tokens() + result = self.forge.llm.chat_tokens() assert isinstance(result, str) val = int(result) assert 1 <= val <= 16384 def test_batch(self) -> 
None: - results = self.forge.ai_chat.chat_tokens(count=50) + results = self.forge.llm.chat_tokens(count=50) assert len(results) == 50 @@ -75,11 +74,11 @@ def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) def test_returns_str(self) -> None: - result = self.forge.ai_chat.chat_finish_reason() + result = self.forge.llm.chat_finish_reason() assert isinstance(result, str) def test_batch(self) -> None: - results = self.forge.ai_chat.chat_finish_reason(count=50) + results = self.forge.llm.chat_finish_reason(count=50) assert len(results) == 50 @@ -88,7 +87,7 @@ def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) def test_returns_dict(self) -> None: - msg = self.forge.ai_chat.chat_message() + msg = self.forge.llm.chat_message() assert isinstance(msg, dict) assert "role" in msg assert "model" in msg @@ -99,7 +98,7 @@ def test_returns_dict(self) -> None: assert len(msg["content"]) > 0 def test_batch(self) -> None: - msgs = self.forge.ai_chat.chat_message(count=20) + msgs = self.forge.llm.chat_message(count=20) assert isinstance(msgs, list) assert len(msgs) == 20 for msg in msgs: @@ -109,8 +108,8 @@ def test_batch(self) -> None: assert "content" in msg def test_deterministic(self) -> None: - a = DataForge(seed=99).ai_chat.chat_message() - b = DataForge(seed=99).ai_chat.chat_message() + a = DataForge(seed=99).llm.chat_message() + b = DataForge(seed=99).llm.chat_message() assert a == b @@ -139,7 +138,6 @@ def test_schema_fields(self) -> None: assert row["chat_role"] in {"system", "user", "assistant", "tool"} def test_schema_mixed_providers(self) -> None: - """AI chat fields should work alongside other provider fields.""" rows = self.forge.to_dict( fields=["chat_role", "chat_model", "first_name", "email"], count=5, diff --git a/tests/test_ai_prompt.py b/tests/test_ai_prompt.py index a53abae..b04276f 100644 --- a/tests/test_ai_prompt.py +++ b/tests/test_ai_prompt.py @@ -153,7 +153,6 @@ def test_schema_fields(self) -> None: 
assert len(row["system_prompt"]) > 0 def test_schema_all_fields(self) -> None: - """All _field_map entries should be resolvable.""" fields = [ "user_prompt", "coding_prompt", diff --git a/tests/test_anonymizer.py b/tests/test_anonymizer.py index cbafd87..80b4767 100644 --- a/tests/test_anonymizer.py +++ b/tests/test_anonymizer.py @@ -12,9 +12,7 @@ from dataforge.anonymizer import Anonymizer -# ------------------------------------------------------------------ # Fixtures -# ------------------------------------------------------------------ @pytest.fixture @@ -36,9 +34,7 @@ def sample_rows() -> list[dict]: ] -# ------------------------------------------------------------------ # Construction -# ------------------------------------------------------------------ class TestAnonymizerConstruction: @@ -60,9 +56,7 @@ def test_slots(self, anon: Anonymizer) -> None: anon.nonexistent = True # type: ignore[attr-defined] -# ------------------------------------------------------------------ # Deterministic seed derivation -# ------------------------------------------------------------------ class TestSeedDerivation: @@ -82,9 +76,7 @@ def test_different_fields_different_seeds(self, anon: Anonymizer) -> None: assert s1 != s2 -# ------------------------------------------------------------------ # Anonymize rows -# ------------------------------------------------------------------ class TestAnonymizeRows: @@ -146,7 +138,6 @@ def test_different_secrets_different_output( assert any(a["name"] != b["name"] for a, b in zip(r1, r2)) def test_same_value_same_fake(self, anon: Anonymizer) -> None: - """Duplicate real values should map to the same fake value.""" rows = [ {"name": "Alice Smith"}, {"name": "Alice Smith"}, @@ -157,9 +148,7 @@ def test_same_value_same_fake(self, anon: Anonymizer) -> None: assert result[0]["name"] != result[2]["name"] -# ------------------------------------------------------------------ # Cache management -# 
------------------------------------------------------------------ class TestCache: @@ -182,9 +171,7 @@ def test_repr_reflects_cache( assert "cached_mappings=0" not in r -# ------------------------------------------------------------------ # Format-preserving anonymization -# ------------------------------------------------------------------ class TestFormatPreserving: @@ -202,9 +189,7 @@ def test_phone_format_preservation(self, anon: Anonymizer) -> None: assert "-" in result -# ------------------------------------------------------------------ # CSV anonymization -# ------------------------------------------------------------------ class TestAnonymizeCSV: @@ -240,7 +225,6 @@ def test_csv_anonymization(self, anon: Anonymizer) -> None: os.unlink(output_path) def test_csv_batch_processing(self, anon: Anonymizer) -> None: - """Ensure batch_size works correctly with more rows.""" with tempfile.NamedTemporaryFile( mode="w", suffix=".csv", delete=False, newline="", encoding="utf-8" ) as f: diff --git a/tests/test_automotive.py b/tests/test_automotive.py index ff796e2..a8e81c1 100644 --- a/tests/test_automotive.py +++ b/tests/test_automotive.py @@ -11,8 +11,6 @@ class TestAutomotiveScalar: - """Tests for single-item automotive data generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -35,7 +33,6 @@ def test_vin_length(self) -> None: assert len(result) == 17, f"VIN wrong length: {result}" def test_vin_no_invalid_chars(self) -> None: - """VINs must not contain I, O, or Q.""" for _ in range(100): result = self.forge.automotive.vin() assert "I" not in result @@ -43,7 +40,6 @@ def test_vin_no_invalid_chars(self) -> None: assert "Q" not in result def test_vin_check_digit(self) -> None: - """Verify the VIN check digit is computed correctly.""" from dataforge.providers.automotive import _VIN_TRANSLITERATE, _VIN_WEIGHTS for _ in range(50): @@ -91,8 +87,6 @@ def test_deterministic_with_seed(self) -> None: class TestAutomotiveBatch: - 
"""Tests for batch automotive data generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) diff --git a/tests/test_backend.py b/tests/test_backend.py index b03f84c..aa5c56b 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -4,8 +4,6 @@ class TestRandomEngineScalar: - """Tests for single-item generation.""" - def test_choice_returns_string(self) -> None: engine = RandomEngine(seed=1) data = ("a", "b", "c") @@ -33,8 +31,6 @@ def test_numerify_preserves_non_hash(self) -> None: class TestRandomEngineChoices: - """Tests for batch generation.""" - def test_choices_returns_list(self) -> None: engine = RandomEngine(seed=1) data = ("a", "b", "c") @@ -52,8 +48,6 @@ def test_choices_large_batch(self) -> None: class TestRandomEngineSeed: - """Tests for reproducibility.""" - def test_same_seed_same_output(self) -> None: engine1 = RandomEngine(seed=42) engine2 = RandomEngine(seed=42) @@ -81,8 +75,6 @@ def test_reseed_resets_state(self) -> None: class TestWeightedChoices: - """Tests for weighted_choices on RandomEngine.""" - def test_weighted_choices_returns_list(self) -> None: engine = RandomEngine(seed=1) data = ("a", "b", "c") diff --git a/tests/test_barcode.py b/tests/test_barcode.py index 7dfc699..8a122eb 100644 --- a/tests/test_barcode.py +++ b/tests/test_barcode.py @@ -24,8 +24,6 @@ def _validate_isbn10_check_digit(code: str) -> bool: class TestBarcodeScalar: - """Tests for single-item barcode generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -75,8 +73,6 @@ def test_isbn10_length(self) -> None: class TestBarcodeCheckDigits: - """Validate check digits are computed correctly.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -106,8 +102,6 @@ def test_isbn10_check_digit_valid(self) -> None: class TestBarcodeBatch: - """Tests for batch barcode generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", 
seed=42) diff --git a/tests/test_bulk.py b/tests/test_bulk.py index 7578d48..2c9a58f 100644 --- a/tests/test_bulk.py +++ b/tests/test_bulk.py @@ -11,8 +11,6 @@ class TestToDict: - """Tests for DataForge.to_dict().""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -61,7 +59,6 @@ def test_reproducible_with_seed(self) -> None: assert rows1 == rows2 def test_all_provider_fields(self) -> None: - """Verify that shorthand fields from various providers work.""" fields = [ "first_name", "last_name", @@ -82,8 +79,6 @@ def test_all_provider_fields(self) -> None: class TestToCsv: - """Tests for DataForge.to_csv().""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -130,8 +125,6 @@ def test_csv_empty_count(self) -> None: class TestToJsonl: - """Tests for DataForge.to_jsonl().""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -169,8 +162,6 @@ def test_jsonl_write_to_file(self) -> None: class TestToSql: - """Tests for DataForge.to_sql().""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -207,8 +198,6 @@ def test_sql_postgresql_dialect(self) -> None: class TestCopy: - """Tests for DataForge.copy().""" - def test_copy_returns_new_instance(self) -> None: forge = DataForge(locale="en_US", seed=42) copy = forge.copy() @@ -224,13 +213,10 @@ def test_copy_with_seed(self) -> None: class TestToDataframe: - """Tests for DataForge.to_dataframe().""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) def test_requires_pandas(self) -> None: - """to_dataframe should work if pandas is installed, or raise if not.""" try: import pandas # noqa: F401 diff --git a/tests/test_chaos.py b/tests/test_chaos.py index ef3ca45..482b039 100644 --- a/tests/test_chaos.py +++ b/tests/test_chaos.py @@ -8,9 +8,7 @@ from dataforge.chaos import ChaosTransformer -# ------------------------------------------------------------------ # 
Fixtures -# ------------------------------------------------------------------ @pytest.fixture @@ -24,9 +22,7 @@ def sample_rows(forge: DataForge) -> list[dict]: return schema.generate(count=100) -# ------------------------------------------------------------------ # ChaosTransformer construction -# ------------------------------------------------------------------ class TestChaosTransformerConstruction: @@ -58,9 +54,7 @@ def test_seed_reproducibility(self, sample_rows: list[dict]) -> None: assert a == b -# ------------------------------------------------------------------ # Null injection -# ------------------------------------------------------------------ class TestNullInjection: @@ -81,9 +75,7 @@ def test_zero_null_rate(self, sample_rows: list[dict]) -> None: assert null_count == 0 -# ------------------------------------------------------------------ # Type mismatch injection -# ------------------------------------------------------------------ class TestTypeMismatch: @@ -100,9 +92,7 @@ def test_type_mismatch_injects(self, sample_rows: list[dict]) -> None: assert non_str_count > 0 -# ------------------------------------------------------------------ # Boundary value injection -# ------------------------------------------------------------------ class TestBoundaryInjection: @@ -115,9 +105,7 @@ def test_boundary_values_injected(self, sample_rows: list[dict]) -> None: assert len(boundary_hits) > 0 -# ------------------------------------------------------------------ # Duplicate injection -# ------------------------------------------------------------------ class TestDuplicateInjection: @@ -128,9 +116,7 @@ def test_duplicates_added(self, sample_rows: list[dict]) -> None: assert len(result) >= len(sample_rows) -# ------------------------------------------------------------------ # String-specific transformations -# ------------------------------------------------------------------ class TestStringTransformations: @@ -185,9 +171,7 @@ def test_truncation(self, 
sample_rows: list[dict]) -> None: assert shorter_count > 0 -# ------------------------------------------------------------------ # Column targeting -# ------------------------------------------------------------------ class TestColumnTargeting: @@ -201,9 +185,7 @@ def test_only_target_columns(self, sample_rows: list[dict]) -> None: assert row["city"] is not None -# ------------------------------------------------------------------ # Empty input -# ------------------------------------------------------------------ class TestEdgeCases: @@ -220,9 +202,7 @@ def test_does_not_mutate_input(self, sample_rows: list[dict]) -> None: assert sample_rows == originals -# ------------------------------------------------------------------ # Schema integration -# ------------------------------------------------------------------ class TestChaosSchemaIntegration: diff --git a/tests/test_cli.py b/tests/test_cli.py index 21b4598..ee9162e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -11,8 +11,6 @@ class TestCliListFields: - """Test --list-fields flag.""" - def test_list_fields_returns_zero(self) -> None: result = main(["--list-fields"]) assert result == 0 @@ -26,8 +24,6 @@ def test_list_fields_output(self, capsys: pytest.CaptureFixture[str]) -> None: class TestCliErrorHandling: - """Test error cases.""" - def test_unknown_field_returns_one(self) -> None: result = main(["--count", "1", "nonexistent_field_xyz"]) assert result == 1 @@ -40,8 +36,6 @@ def test_unknown_field_stderr(self, capsys: pytest.CaptureFixture[str]) -> None: class TestCliTextFormat: - """Test default text output format.""" - def test_default_format(self, capsys: pytest.CaptureFixture[str]) -> None: result = main(["--count", "3", "--seed", "42", "first_name", "email"]) assert result == 0 @@ -51,7 +45,6 @@ def test_default_format(self, capsys: pytest.CaptureFixture[str]) -> None: assert len(lines) == 5 def test_default_fields(self, capsys: pytest.CaptureFixture[str]) -> None: - """With no fields specified, 
should default to first_name, last_name, email.""" result = main(["--count", "2", "--seed", "42"]) assert result == 0 captured = capsys.readouterr() @@ -64,8 +57,6 @@ def test_default_fields(self, capsys: pytest.CaptureFixture[str]) -> None: class TestCliCsvFormat: - """Test CSV output format.""" - def test_csv_output(self, capsys: pytest.CaptureFixture[str]) -> None: result = main( ["--count", "5", "--format", "csv", "--seed", "42", "first_name", "city"] @@ -98,8 +89,6 @@ def test_csv_headers_match_fields(self, capsys: pytest.CaptureFixture[str]) -> N class TestCliJsonFormat: - """Test JSON output format.""" - def test_json_output(self, capsys: pytest.CaptureFixture[str]) -> None: result = main( ["--count", "3", "--format", "json", "--seed", "42", "first_name", "email"] @@ -114,8 +103,6 @@ def test_json_output(self, capsys: pytest.CaptureFixture[str]) -> None: class TestCliJsonlFormat: - """Test JSONL output format.""" - def test_jsonl_output(self, capsys: pytest.CaptureFixture[str]) -> None: result = main( [ @@ -140,8 +127,6 @@ def test_jsonl_output(self, capsys: pytest.CaptureFixture[str]) -> None: class TestCliLocale: - """Test locale option.""" - def test_locale_option(self) -> None: result = main( ["--count", "3", "--locale", "de_DE", "--seed", "42", "first_name"] @@ -150,8 +135,6 @@ def test_locale_option(self) -> None: class TestCliSeed: - """Test seed option for reproducibility.""" - def test_seed_reproducible(self, capsys: pytest.CaptureFixture[str]) -> None: main(["--count", "5", "--format", "json", "--seed", "123", "first_name"]) out1 = capsys.readouterr().out @@ -172,8 +155,6 @@ def test_different_seeds_differ(self, capsys: pytest.CaptureFixture[str]) -> Non class TestCliAllFields: - """Test that every field in the registry can be generated without error.""" - @pytest.mark.parametrize("field", sorted(get_field_map().keys())) def test_field_generates(self, field: str) -> None: result = main(["--count", "1", "--seed", "42", field]) @@ -181,8 +162,6 @@ 
def test_field_generates(self, field: str) -> None: class TestCliNoHeader: - """Test --no-header flag.""" - def test_text_no_header(self, capsys: pytest.CaptureFixture[str]) -> None: result = main(["--count", "3", "--no-header", "--seed", "42", "first_name"]) assert result == 0 @@ -214,8 +193,6 @@ def test_csv_no_header(self, capsys: pytest.CaptureFixture[str]) -> None: class TestCliOutput: - """Test --output flag.""" - def test_output_to_file(self, tmp_path) -> None: out_file = str(tmp_path / "out.csv") result = main( diff --git a/tests/test_color.py b/tests/test_color.py index 61a26fc..9e62f1a 100644 --- a/tests/test_color.py +++ b/tests/test_color.py @@ -7,8 +7,6 @@ class TestColorScalar: - """Tests for single-item color generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -79,8 +77,6 @@ def test_hsl_string_format(self) -> None: class TestColorBatch: - """Tests for batch color generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) diff --git a/tests/test_company.py b/tests/test_company.py index 9231cc5..7bdb206 100644 --- a/tests/test_company.py +++ b/tests/test_company.py @@ -4,8 +4,6 @@ class TestCompanyScalar: - """Tests for single-item company generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -32,8 +30,6 @@ def test_job_title_returns_str(self) -> None: class TestCompanyBatch: - """Tests for batch company generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -54,8 +50,6 @@ def test_catch_phrase_batch(self) -> None: class TestCompanyLocales: - """Tests for company across locales.""" - def test_de_DE_company(self) -> None: forge = DataForge(locale="de_DE", seed=42) name = forge.company.company_name() diff --git a/tests/test_constraints.py b/tests/test_constraints.py index 22680b4..370bb02 100644 --- a/tests/test_constraints.py +++ b/tests/test_constraints.py @@ -16,9 +16,7 @@ ) 
-# ------------------------------------------------------------------ # Fixtures -# ------------------------------------------------------------------ @pytest.fixture @@ -26,16 +24,11 @@ def forge() -> DataForge: return DataForge(locale="en_US", seed=42) -# ------------------------------------------------------------------ # Unit tests for individual constraint classes -# ------------------------------------------------------------------ class TestDependsOnConstraint: - """Test geographic dependency constraint.""" - def test_state_depends_on_country(self, forge: DataForge) -> None: - """state depending on country should pick from that country's states.""" from dataforge.data.correlations.geo import COUNTRY_STATES c = DependsOnConstraint("address.state", "state", "country") @@ -64,7 +57,6 @@ def test_currency_depends_on_country(self, forge: DataForge) -> None: assert val == "JPY" def test_unknown_country_fallback(self, forge: DataForge) -> None: - """Unknown country should still produce a value (fallback provinces).""" c = DependsOnConstraint("address.state", "state", "country") row = {"country": "Atlantis"} val = c.generate(row, forge._engine, forge) @@ -72,8 +64,6 @@ def test_unknown_country_fallback(self, forge: DataForge) -> None: class TestTemporalConstraint: - """Test temporal ordering constraint.""" - def test_after_reference(self, forge: DataForge) -> None: c = TemporalConstraint("date", "end_date", "after", "start_date", (1, 30)) row = {"start_date": "2024-01-01"} @@ -87,7 +77,6 @@ def test_before_reference(self, forge: DataForge) -> None: assert val < "2024-12-31" def test_none_reference_fallback(self, forge: DataForge) -> None: - """If reference is None, should still generate a value.""" c = TemporalConstraint("date", "end_date", "after", "start_date", (1, 30)) row = {"start_date": None} val = c.generate(row, forge._engine, forge) @@ -95,8 +84,6 @@ def test_none_reference_fallback(self, forge: DataForge) -> None: class TestCorrelateConstraint: - """Test 
statistical correlation constraint.""" - def test_basic_correlation(self, forge: DataForge) -> None: c = CorrelateConstraint("value", "y", "x", correlation=0.9, mean=0.0, std=1.0) row = {"x": 2.0} @@ -110,7 +97,6 @@ def test_no_reference_fallback(self, forge: DataForge) -> None: assert isinstance(val, float) def test_correlation_bounded(self) -> None: - """Correlation should be clamped to [-1, 1].""" c = CorrelateConstraint("v", "y", "x", correlation=5.0) assert c.correlation == 1.0 c2 = CorrelateConstraint("v", "y", "x", correlation=-5.0) @@ -118,8 +104,6 @@ def test_correlation_bounded(self) -> None: class TestConditionalConstraint: - """Test conditional value pools.""" - def test_conditional_picks_from_pool(self, forge: DataForge) -> None: pools = {"M": ("Mr.",), "F": ("Ms.", "Mrs.")} c = ConditionalConstraint("title", "title", "gender", pools, ("Mx.",)) @@ -136,8 +120,6 @@ def test_conditional_default_pool(self, forge: DataForge) -> None: class TestRangeConstraint: - """Test numeric range constraint.""" - def test_static_range(self, forge: DataForge) -> None: c = RangeConstraint("price", "price", min_val=10.0, max_val=100.0, precision=2) row = {} @@ -153,21 +135,16 @@ def test_dynamic_range_from_ref(self, forge: DataForge) -> None: assert val >= 50.0 def test_inverted_bounds_swapped(self, forge: DataForge) -> None: - """If min > max, they should be swapped.""" c = RangeConstraint("v", "v", min_val=100.0, max_val=10.0) row = {} val = c.generate(row, forge._engine, forge) assert 10.0 <= val <= 100.0 -# ------------------------------------------------------------------ # parse_field_spec tests -# ------------------------------------------------------------------ class TestParseFieldSpec: - """Test parsing of dict-based field specs.""" - def test_depends_on_spec(self) -> None: spec = {"field": "address.city", "depends_on": "country"} constraint, deps = parse_field_spec("city", spec) @@ -202,21 +179,16 @@ def test_range_spec(self) -> None: assert 
isinstance(constraint, RangeConstraint) def test_plain_field_spec(self) -> None: - """A dict with only 'field' should return no constraint.""" spec = {"field": "email"} constraint, deps = parse_field_spec("email", spec) assert constraint is None assert deps == [] -# ------------------------------------------------------------------ # build_dependency_order tests -# ------------------------------------------------------------------ class TestBuildDependencyOrder: - """Test DAG building and topological sort.""" - def test_simple_dag(self) -> None: specs = { "country": "country", @@ -248,16 +220,11 @@ def test_circular_dependency_raises(self) -> None: build_dependency_order(specs) -# ------------------------------------------------------------------ # Full Schema integration tests -# ------------------------------------------------------------------ class TestConstraintSchemaIntegration: - """Test constraint-based schemas end-to-end via forge.schema().""" - def test_geographic_chain(self, forge: DataForge) -> None: - """country → state → city chain should produce consistent data.""" from dataforge.data.correlations.geo import ( COUNTRY_STATES, STATE_CITIES, @@ -300,7 +267,6 @@ def test_temporal_ordering(self, forge: DataForge) -> None: assert row["end_date"] > row["start_date"] def test_mixed_independent_and_dependent(self, forge: DataForge) -> None: - """Schema with both plain fields and constraints.""" schema = forge.schema( { "name": "first_name", diff --git a/tests/test_core.py b/tests/test_core.py index dbb440d..9f4dc52 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -4,8 +4,6 @@ class TestDataForgeInit: - """Tests for DataForge initialization.""" - def test_default_locale(self) -> None: forge = DataForge() assert forge.locale == "en_US" @@ -34,8 +32,6 @@ def test_invalid_locale_falls_back_to_en_US(self) -> None: class TestDataForgeSeed: - """Tests for seed reproducibility.""" - def test_seeded_person_reproducible(self) -> None: forge1 = 
DataForge(seed=99) forge2 = DataForge(seed=99) @@ -55,8 +51,6 @@ def test_reseed_produces_same_output(self) -> None: class TestDataForgeLazyLoading: - """Tests for lazy provider loading.""" - def test_person_not_loaded_initially(self) -> None: forge = DataForge() assert "person" not in forge._providers # noqa: SLF001 diff --git a/tests/test_crypto.py b/tests/test_crypto.py index 3ca2e87..66d5b89 100644 --- a/tests/test_crypto.py +++ b/tests/test_crypto.py @@ -6,8 +6,6 @@ class TestCryptoScalar: - """Tests for single-item crypto hash generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -47,8 +45,6 @@ def test_deterministic_with_seed(self) -> None: class TestCryptoBatch: - """Tests for batch crypto hash generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -71,7 +67,6 @@ def test_sha256_batch(self) -> None: assert all(re.match(r"^[0-9a-f]{64}$", h) for h in result) def test_batch_uniqueness(self) -> None: - """Batch of 1000 hashes should have high uniqueness.""" result = self.forge.crypto.sha256(count=1000) unique = set(result) # With 256-bit hashes, collisions are astronomically unlikely diff --git a/tests/test_datetime.py b/tests/test_datetime.py index 6fc343f..7ce38f4 100644 --- a/tests/test_datetime.py +++ b/tests/test_datetime.py @@ -7,8 +7,6 @@ class TestDateTimeScalar: - """Tests for single-item datetime generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -72,8 +70,6 @@ def test_date_with_start_end(self) -> None: class TestDateTimeBatch: - """Tests for batch datetime generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -123,8 +119,6 @@ def test_unix_timestamp_batch(self) -> None: class TestDateTimeLocaleIndependent: - """DateTime should work the same regardless of locale.""" - def test_dt_works_with_any_locale(self) -> None: for locale in ("en_US", "de_DE", "fr_FR", "es_ES", 
"ja_JP"): forge = DataForge(locale=locale, seed=42) diff --git a/tests/test_education.py b/tests/test_education.py index 861d48b..eef010b 100644 --- a/tests/test_education.py +++ b/tests/test_education.py @@ -9,8 +9,6 @@ class TestEducationScalar: - """Tests for single-item education data generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -38,8 +36,6 @@ def test_deterministic_with_seed(self) -> None: class TestEducationBatch: - """Tests for batch education data generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -62,7 +58,6 @@ def test_field_of_study_batch(self) -> None: assert all(f in _FIELDS_OF_STUDY for f in result) def test_variety(self) -> None: - """Large batch should include multiple distinct values.""" result = self.forge.education.university(count=500) unique = set(result) # With 50 universities, 500 draws should hit many of them diff --git a/tests/test_file.py b/tests/test_file.py index 126207d..819108e 100644 --- a/tests/test_file.py +++ b/tests/test_file.py @@ -13,8 +13,6 @@ class TestFileScalar: - """Tests for single-item file generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -74,8 +72,6 @@ def test_file_category_returns_str(self) -> None: class TestFileBatch: - """Tests for batch file generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) diff --git a/tests/test_finance.py b/tests/test_finance.py index e0cd213..bccd102 100644 --- a/tests/test_finance.py +++ b/tests/test_finance.py @@ -19,8 +19,6 @@ def _luhn_valid(number: str) -> bool: class TestFinanceScalar: - """Tests for single-item finance generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -138,7 +136,6 @@ def test_routing_number_returns_str(self) -> None: assert result.isdigit() def test_routing_number_aba_checksum(self) -> None: - """Verify ABA checksum is 
valid.""" for _ in range(100): rn = self.forge.finance.routing_number() digits = [int(d) for d in rn] @@ -155,8 +152,6 @@ def test_bitcoin_address_returns_str(self) -> None: class TestFinanceBatch: - """Tests for batch finance generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) diff --git a/tests/test_inference.py b/tests/test_inference.py index fbc33ea..da5dfe7 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -19,9 +19,7 @@ ) -# ------------------------------------------------------------------ # Fixtures -# ------------------------------------------------------------------ @pytest.fixture @@ -34,9 +32,7 @@ def inferrer(forge: DataForge) -> SchemaInferrer: return SchemaInferrer(forge) -# ------------------------------------------------------------------ # Base type detection -# ------------------------------------------------------------------ class TestDetectBaseType: @@ -47,7 +43,6 @@ def test_all_ints(self) -> None: assert _detect_base_type([1, 2, 3]) == "int" def test_string_ints(self) -> None: - """Numeric strings should be detected as int.""" assert _detect_base_type(["1", "2", "3"]) == "int" def test_string_floats(self) -> None: @@ -72,19 +67,15 @@ def test_mixed_types(self) -> None: assert result in ("str", "mixed") def test_with_nulls(self) -> None: - """Nulls should be excluded from type decision.""" result = _detect_base_type([None, 1, 2, None, 3]) assert result == "int" -# ------------------------------------------------------------------ # Semantic type detection -# ------------------------------------------------------------------ class TestDetectSemanticType: def test_email_column_name(self) -> None: - """Column named 'email' should match via alias.""" result = _detect_semantic_type("email", ["test@x.com"], "str") assert result == "email" @@ -93,7 +84,6 @@ def test_phone_column_name(self) -> None: assert result == "phone_number" def test_email_pattern_detection(self) -> None: - 
"""Regex should detect emails even if column name is generic.""" values = ["alice@test.com", "bob@test.com", "carol@test.com"] result = _detect_semantic_type("contact_info", values, "str") assert result == "email" @@ -137,14 +127,11 @@ def test_no_match(self) -> None: assert result is None def test_prefixed_column_name(self) -> None: - """user_email should strip prefix and match 'email'.""" result = _detect_semantic_type("user_email", ["test@x.com"], "str") assert result is not None -# ------------------------------------------------------------------ # Null rate computation -# ------------------------------------------------------------------ class TestComputeNullRate: @@ -164,9 +151,7 @@ def test_empty_input(self) -> None: assert _compute_null_rate([]) == 0.0 -# ------------------------------------------------------------------ # Statistics computation -# ------------------------------------------------------------------ class TestComputeStats: @@ -193,9 +178,7 @@ def test_count_always_present(self) -> None: assert stats["count"] == 3 -# ------------------------------------------------------------------ # ColumnAnalysis -# ------------------------------------------------------------------ class TestColumnAnalysis: @@ -211,9 +194,7 @@ def test_slots(self) -> None: ca.nonexistent = True # type: ignore[attr-defined] -# ------------------------------------------------------------------ # SchemaInferrer — from_records -# ------------------------------------------------------------------ class TestSchemaInferrerFromRecords: @@ -263,9 +244,7 @@ def test_sample_size_limit(self, forge: DataForge) -> None: assert len(rows) == 3 -# ------------------------------------------------------------------ # SchemaInferrer — from_csv -# ------------------------------------------------------------------ class TestSchemaInferrerFromCSV: @@ -307,9 +286,7 @@ def test_csv_with_custom_delimiter(self, inferrer: SchemaInferrer) -> None: os.unlink(path) -# 
------------------------------------------------------------------ # SchemaInferrer — describe -# ------------------------------------------------------------------ class TestSchemaInferrerDescribe: @@ -330,9 +307,7 @@ def test_describe_after_inference(self, inferrer: SchemaInferrer) -> None: assert "mapped" in desc.lower() -# ------------------------------------------------------------------ # SchemaInferrer repr -# ------------------------------------------------------------------ class TestSchemaInferrerRepr: diff --git a/tests/test_integrations.py b/tests/test_integrations.py index 6531b1f..72fa3e9 100644 --- a/tests/test_integrations.py +++ b/tests/test_integrations.py @@ -52,9 +52,7 @@ class _PydanticCheck(pydantic.BaseModel): _has_sqlalchemy = False -# --------------------------------------------------------------- # Fixtures -# --------------------------------------------------------------- @pytest.fixture @@ -62,15 +60,11 @@ def forge() -> DataForge: return DataForge(locale="en_US", seed=42) -# --------------------------------------------------------------- # to_arrow -# --------------------------------------------------------------- @pytest.mark.skipif(not _has_pyarrow, reason="pyarrow not installed") class TestToArrow: - """Tests for Schema.to_arrow() and DataForge.to_arrow().""" - def test_basic_arrow_table(self, forge: DataForge) -> None: s = forge.schema(["first_name", "email", "city"]) table = s.to_arrow(count=100) @@ -85,7 +79,6 @@ def test_arrow_single_row(self, forge: DataForge) -> None: assert len(table.column("first_name")) == 1 def test_arrow_large_count_batched(self, forge: DataForge) -> None: - """Multi-batch path should produce correct total.""" s = forge.schema(["first_name", "email"]) table = s.to_arrow(count=5000, batch_size=1000) assert table.num_rows == 5000 @@ -119,27 +112,21 @@ def test_arrow_with_lambda(self, forge: DataForge) -> None: assert u == n.upper() def test_arrow_small_count(self, forge: DataForge) -> None: - """Small counts 
should work fine in single-shot path.""" s = forge.schema(["first_name"]) table = s.to_arrow(count=2) assert table.num_rows == 2 def test_delegation_to_arrow(self, forge: DataForge) -> None: - """DataForge.to_arrow() should delegate to Schema.to_arrow().""" table = forge.to_arrow(["first_name", "email"], count=20) assert table.num_rows == 20 assert table.column_names == ["first_name", "email"] -# --------------------------------------------------------------- # to_polars -# --------------------------------------------------------------- @pytest.mark.skipif(not _has_polars, reason="polars not installed") class TestToPolars: - """Tests for Schema.to_polars() and DataForge.to_polars().""" - def test_basic_polars_df(self, forge: DataForge) -> None: s = forge.schema(["first_name", "email", "city"]) df = s.to_polars(count=100) @@ -188,9 +175,7 @@ def test_delegation_to_polars(self, forge: DataForge) -> None: assert df.columns == ["first_name", "email"] -# --------------------------------------------------------------- # schema_from_pydantic -# --------------------------------------------------------------- @pytest.mark.skipif( @@ -198,8 +183,6 @@ def test_delegation_to_polars(self, forge: DataForge) -> None: reason="pydantic not installed or incompatible with this Python version", ) class TestSchemaFromPydantic: - """Tests for DataForge.schema_from_pydantic().""" - def test_basic_pydantic_mapping(self, forge: DataForge) -> None: from pydantic import BaseModel @@ -217,7 +200,6 @@ class User(BaseModel): assert "city" in row def test_alias_mapping(self, forge: DataForge) -> None: - """Fields like 'phone' should map to 'phone_number'.""" from pydantic import BaseModel class Contact(BaseModel): @@ -277,7 +259,6 @@ class User(BaseModel): assert rows1 == rows2 def test_pydantic_many_aliases(self, forge: DataForge) -> None: - """Test several heuristic aliases work.""" from pydantic import BaseModel class Profile(BaseModel): @@ -298,15 +279,11 @@ class Profile(BaseModel): assert 
"uuid" in row -# --------------------------------------------------------------- # schema_from_sqlalchemy -# --------------------------------------------------------------- @pytest.mark.skipif(not _has_sqlalchemy, reason="sqlalchemy not installed") class TestSchemaFromSQLAlchemy: - """Tests for DataForge.schema_from_sqlalchemy().""" - def _make_base(self): from sqlalchemy.orm import DeclarativeBase diff --git a/tests/test_internet.py b/tests/test_internet.py index b4feb52..e24e01f 100644 --- a/tests/test_internet.py +++ b/tests/test_internet.py @@ -4,8 +4,6 @@ class TestInternetScalar: - """Tests for single-item internet generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -61,8 +59,6 @@ def test_safe_email_uses_example_domain(self) -> None: class TestInternetBatch: - """Tests for batch internet generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -103,8 +99,6 @@ def test_safe_email_batch(self) -> None: class TestInternetLocales: - """Tests for internet across locales.""" - def test_de_DE_email(self) -> None: forge = DataForge(locale="de_DE", seed=42) email = forge.internet.email() diff --git a/tests/test_llm.py b/tests/test_llm.py index 2712fee..f5da36e 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -51,7 +51,6 @@ def test_batch(self) -> None: assert all(len(k) > 40 for k in results) def test_has_prefix(self) -> None: - """API keys should start with a known prefix.""" prefixes = ( "sk-", "sk-proj-", @@ -383,7 +382,6 @@ def test_schema_fields(self) -> None: assert "token_count" in row def test_schema_all_field_map_entries(self) -> None: - """All _field_map entries should be resolvable.""" fields = [ "model_name", "provider_name", diff --git a/tests/test_locales.py b/tests/test_locales.py index 9d06a36..80b9261 100644 --- a/tests/test_locales.py +++ b/tests/test_locales.py @@ -23,8 +23,6 @@ class TestLocalePersonProvider: - """Verify person provider works for all 
locales.""" - @pytest.mark.parametrize("locale", LOCALES) def test_first_name(self, locale: str) -> None: forge = DataForge(locale=locale, seed=42) @@ -55,8 +53,6 @@ def test_full_name_batch(self, locale: str) -> None: class TestLocaleAddressProvider: - """Verify address provider works for all locales.""" - @pytest.mark.parametrize("locale", LOCALES) def test_city(self, locale: str) -> None: forge = DataForge(locale=locale, seed=42) @@ -81,8 +77,6 @@ def test_zip_code(self, locale: str) -> None: class TestLocaleInternetProvider: - """Verify internet provider works for all locales.""" - @pytest.mark.parametrize("locale", LOCALES) def test_email(self, locale: str) -> None: forge = DataForge(locale=locale, seed=42) @@ -100,8 +94,6 @@ def test_ipv4(self, locale: str) -> None: class TestLocaleCompanyProvider: - """Verify company provider works for all locales.""" - @pytest.mark.parametrize("locale", LOCALES) def test_company_name(self, locale: str) -> None: forge = DataForge(locale=locale, seed=42) @@ -117,8 +109,6 @@ def test_job_title(self, locale: str) -> None: class TestLocalePhoneProvider: - """Verify phone provider works for all locales.""" - @pytest.mark.parametrize("locale", LOCALES) def test_phone_number(self, locale: str) -> None: forge = DataForge(locale=locale, seed=42) diff --git a/tests/test_locales_new.py b/tests/test_locales_new.py index cab0337..668f7ae 100644 --- a/tests/test_locales_new.py +++ b/tests/test_locales_new.py @@ -14,8 +14,6 @@ def locale_forge(request) -> DataForge: class TestLocaleSmoke: - """Basic smoke tests for all new English locales.""" - def test_person_first_name(self, locale_forge: DataForge) -> None: name = locale_forge.person.first_name() assert isinstance(name, str) and len(name) > 0 @@ -53,7 +51,6 @@ def test_internet_safe_email(self, locale_forge: DataForge) -> None: assert isinstance(email, str) and "@" in email def test_schema_integration(self, locale_forge: DataForge) -> None: - """Schema should work with locale-specific 
data.""" schema = locale_forge.schema( ["first_name", "last_name", "email", "city", "phone_number"] ) @@ -66,8 +63,6 @@ def test_schema_integration(self, locale_forge: DataForge) -> None: class TestEnGB: - """en_GB specific tests.""" - @pytest.fixture def forge(self) -> DataForge: return DataForge(locale="en_GB", seed=42) @@ -81,7 +76,6 @@ def test_female_first_name(self, forge: DataForge) -> None: assert isinstance(name, str) and len(name) > 0 def test_uk_email_domain(self, forge: DataForge) -> None: - """At least some emails should use UK domains.""" emails = forge.internet.email(count=500) uk_domains = [e for e in emails if ".co.uk" in e or ".uk" in e] assert len(uk_domains) > 0, "Expected some UK email domains" @@ -94,14 +88,11 @@ def test_deterministic(self, forge: DataForge) -> None: class TestEnAU: - """en_AU specific tests.""" - @pytest.fixture def forge(self) -> DataForge: return DataForge(locale="en_AU", seed=42) def test_au_states(self, forge: DataForge) -> None: - """Australian states should be short abbreviations.""" from dataforge.locales.en_AU.address import states assert "NSW" in states @@ -116,8 +107,6 @@ def test_au_email_domain(self, forge: DataForge) -> None: class TestEnCA: - """en_CA specific tests.""" - @pytest.fixture def forge(self) -> DataForge: return DataForge(locale="en_CA", seed=42) @@ -136,7 +125,6 @@ def test_ca_email_domain(self, forge: DataForge) -> None: assert len(ca_domains) > 0, "Expected some CA email domains" def test_french_canadian_names(self, forge: DataForge) -> None: - """Canadian last names should include French-Canadian names.""" from dataforge.locales.en_CA.person import last_names french_names = {"Tremblay", "Roy", "Gagnon", "Bouchard", "Gauthier"} diff --git a/tests/test_lorem.py b/tests/test_lorem.py index 53649fe..9c83929 100644 --- a/tests/test_lorem.py +++ b/tests/test_lorem.py @@ -4,8 +4,6 @@ class TestLoremScalar: - """Tests for single-item lorem generation.""" - def setup_method(self) -> None: self.forge = 
DataForge(locale="en_US", seed=42) @@ -42,8 +40,6 @@ def test_text_max_chars(self) -> None: class TestLoremBatch: - """Tests for batch lorem generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -65,8 +61,6 @@ def test_paragraph_batch(self) -> None: class TestLoremLocaleIndependent: - """Lorem should work the same regardless of locale.""" - def test_lorem_works_with_any_locale(self) -> None: for locale in ("en_US", "de_DE", "fr_FR", "es_ES", "ja_JP"): forge = DataForge(locale=locale, seed=42) diff --git a/tests/test_misc.py b/tests/test_misc.py index 0b0b136..b3af2ce 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -15,8 +15,6 @@ class TestMiscScalar: - """Tests for single-item misc generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -55,8 +53,6 @@ def test_null_or_returns_none_or_value(self) -> None: class TestMiscBatch: - """Tests for batch misc generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -74,8 +70,6 @@ def test_boolean_batch(self) -> None: class TestMiscSeed: - """Tests for seed reproducibility.""" - def test_uuid4_reproducible_with_same_seed(self) -> None: forge1 = DataForge(locale="en_US", seed=42) forge2 = DataForge(locale="en_US", seed=42) @@ -88,8 +82,6 @@ def test_uuid4_batch_reproducible_with_same_seed(self) -> None: class TestUUID7Scalar: - """Tests for single-item uuid7 generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -108,7 +100,6 @@ def test_uuid7_is_valid_uuid(self) -> None: assert parsed.version == 7 def test_uuid7_time_ordered(self) -> None: - """UUID7 values generated sequentially should be time-ordered.""" import time u1 = self.forge.misc.uuid7() @@ -119,8 +110,6 @@ def test_uuid7_time_ordered(self) -> None: class TestUUID7Batch: - """Tests for batch uuid7 generation.""" - def setup_method(self) -> None: self.forge = 
DataForge(locale="en_US", seed=42) @@ -138,10 +127,7 @@ def test_uuid7_batch_all_valid(self) -> None: class TestUUID7Seed: - """Tests for uuid7 seed reproducibility (random bits only).""" - def test_uuid7_random_bits_reproducible(self) -> None: - """With same seed, the random portion of uuid7 should match.""" forge1 = DataForge(locale="en_US", seed=42) forge2 = DataForge(locale="en_US", seed=42) u1 = uuid.UUID(forge1.misc.uuid7()) diff --git a/tests/test_network.py b/tests/test_network.py index 4ef50ba..7cfcd78 100644 --- a/tests/test_network.py +++ b/tests/test_network.py @@ -6,8 +6,6 @@ class TestNetworkScalar: - """Tests for single-item network generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -85,8 +83,6 @@ def test_http_status_code_format(self) -> None: class TestNetworkBatch: - """Tests for batch network generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) diff --git a/tests/test_openapi.py b/tests/test_openapi.py index e93cbd3..ee907c6 100644 --- a/tests/test_openapi.py +++ b/tests/test_openapi.py @@ -12,9 +12,7 @@ from dataforge.openapi import OpenAPIParser, _TYPE_FORMAT_MAP, _PROPERTY_NAME_MAP -# ------------------------------------------------------------------ # Fixtures -# ------------------------------------------------------------------ @pytest.fixture @@ -27,9 +25,7 @@ def parser(forge: DataForge) -> OpenAPIParser: return OpenAPIParser(forge) -# ------------------------------------------------------------------ # Construction -# ------------------------------------------------------------------ class TestOpenAPIParserConstruction: @@ -41,9 +37,7 @@ def test_slots(self, parser: OpenAPIParser) -> None: parser.nonexistent = True # type: ignore[attr-defined] -# ------------------------------------------------------------------ # Type format map -# ------------------------------------------------------------------ class TestTypeFormatMap: @@ -63,9 +57,7 @@ def 
test_date_time_mapping(self) -> None: assert _TYPE_FORMAT_MAP[("string", "date-time")] == "datetime" -# ------------------------------------------------------------------ # Property name map -# ------------------------------------------------------------------ class TestPropertyNameMap: @@ -79,9 +71,7 @@ def test_city_maps(self) -> None: assert _PROPERTY_NAME_MAP["city"] == "city" -# ------------------------------------------------------------------ # JSON Schema parsing -# ------------------------------------------------------------------ class TestFromJsonSchema: @@ -130,7 +120,6 @@ def test_boolean_field(self, parser: OpenAPIParser) -> None: assert isinstance(row["active"], bool) def test_property_name_heuristic(self, parser: OpenAPIParser) -> None: - """Property names like 'email', 'city' should be auto-mapped.""" schema_def = { "type": "object", "properties": { @@ -149,7 +138,6 @@ def test_no_properties_raises(self, parser: OpenAPIParser) -> None: parser.from_json_schema(schema_def) def test_enum_skipped(self, parser: OpenAPIParser) -> None: - """Enum properties are currently skipped.""" schema_def = { "type": "object", "properties": { @@ -175,7 +163,6 @@ def test_array_skipped(self, parser: OpenAPIParser) -> None: assert "name" in rows[0] def test_string_fallback_to_word(self, parser: OpenAPIParser) -> None: - """Unknown string property should fall back to lorem.word.""" schema_def = { "type": "object", "properties": { @@ -188,9 +175,7 @@ def test_string_fallback_to_word(self, parser: OpenAPIParser) -> None: assert isinstance(rows[0]["xyzzy_field"], str) -# ------------------------------------------------------------------ # OpenAPI document parsing -# ------------------------------------------------------------------ class TestFromOpenAPI: @@ -280,9 +265,7 @@ def test_non_object_schemas_skipped(self, parser: OpenAPIParser) -> None: assert "User" in schemas -# ------------------------------------------------------------------ # File parsing -# 
------------------------------------------------------------------ class TestFromFile: diff --git a/tests/test_person.py b/tests/test_person.py index 08731b0..733f7a8 100644 --- a/tests/test_person.py +++ b/tests/test_person.py @@ -5,8 +5,6 @@ class TestPersonScalar: - """Tests for single-item person generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -50,8 +48,6 @@ def test_female_first_name_returns_str(self) -> None: class TestPersonBatch: - """Tests for batch person generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) diff --git a/tests/test_phase1.py b/tests/test_phase1.py index 96935fb..f134de7 100644 --- a/tests/test_phase1.py +++ b/tests/test_phase1.py @@ -9,14 +9,10 @@ from dataforge import DataForge, __version__ -# ====================================================================== # Native type preservation -# ====================================================================== class TestNativeTypes: - """Test that generate() and to_dict() preserve native Python types.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -75,7 +71,6 @@ def test_jsonl_output_preserves_types(self) -> None: assert isinstance(row["first_name"], str) def test_csv_stringifies(self) -> None: - """CSV output should convert all values to strings.""" schema = self.forge.schema(["port", "boolean"]) csv_str = schema.to_csv(count=3) reader = csv.DictReader(io.StringIO(csv_str)) @@ -93,7 +88,6 @@ def test_sql_handles_native_types(self) -> None: assert ";" in sql def test_generate_count_one_preserves_types(self) -> None: - """Even count=1 should return native types, not str-coerced.""" schema = self.forge.schema(["port", "boolean"]) rows = schema.generate(count=1) assert len(rows) == 1 @@ -101,14 +95,10 @@ def test_generate_count_one_preserves_types(self) -> None: assert isinstance(rows[0]["boolean"], bool) -# 
====================================================================== -# Schema.to_json() -# ====================================================================== +# Schema.to_json() class TestSchemaToJson: - """Test the new Schema.to_json() method.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -134,14 +124,10 @@ def test_json_writes_to_file(self, tmp_path) -> None: assert "first_name" in data[0] -# ====================================================================== # CSV delimiter support -# ====================================================================== class TestCsvDelimiter: - """Test CSV delimiter parameter.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -172,7 +158,6 @@ def test_stream_csv_with_delimiter(self, tmp_path) -> None: assert "\t" in content def test_core_to_csv_delimiter(self) -> None: - """Test that core.to_csv passes delimiter through.""" result = self.forge.to_csv(["first_name", "email"], count=3, delimiter="|") lines = result.strip().split("\n") assert "|" in lines[0] @@ -187,14 +172,10 @@ def test_core_stream_to_csv_delimiter(self, tmp_path) -> None: assert "\t" in content -# ====================================================================== -# DataForge.to_json() convenience method -# ====================================================================== +# DataForge.to_json() convenience method class TestCoreToJson: - """Test core.to_json() convenience method.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -212,14 +193,10 @@ def test_writes_to_file(self, tmp_path) -> None: assert len(data) == 5 -# ====================================================================== # Introspection API: list_providers, list_fields -# ====================================================================== class TestIntrospectionAPI: - """Test DataForge.list_providers() and list_fields().""" - def
test_list_providers_returns_sorted_list(self) -> None: providers = DataForge.list_providers() assert isinstance(providers, list) @@ -259,14 +236,10 @@ def test_list_fields_email_maps_to_internet(self) -> None: assert fields["email"] == ("internet", "email") -# ====================================================================== # Type-aware Pydantic mapping -# ====================================================================== class TestPydanticTypeAware: - """Test type-aware Pydantic model introspection.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -402,14 +375,10 @@ class BadModel(BaseModel): self.forge.schema_from_pydantic(BadModel) -# ====================================================================== # Type-aware SQLAlchemy mapping -# ====================================================================== class TestSQLAlchemyTypeAware: - """Test type-aware SQLAlchemy model introspection.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -530,14 +499,10 @@ class Item(Base): assert "first_name" in rows[0] -# ====================================================================== # CLI enhancements -# ====================================================================== class TestCliVersion: - """Test --version flag.""" - def test_version_output(self, capsys: pytest.CaptureFixture[str]) -> None: from dataforge.cli import main @@ -549,8 +514,6 @@ def test_version_output(self, capsys: pytest.CaptureFixture[str]) -> None: class TestCliListProviders: - """Test --list-providers flag.""" - def test_list_providers_returns_zero(self) -> None: from dataforge.cli import main @@ -567,8 +530,6 @@ def test_list_providers_output(self, capsys: pytest.CaptureFixture[str]) -> None class TestCliSqlFormat: - """Test --format sql.""" - def test_sql_output(self, capsys: pytest.CaptureFixture[str]) -> None: from dataforge.cli import main @@ -635,8 +596,6 @@ def test_sql_postgresql_dialect(self, 
capsys: pytest.CaptureFixture[str]) -> Non class TestCliTsvFormat: - """Test --format tsv.""" - def test_tsv_output(self, capsys: pytest.CaptureFixture[str]) -> None: from dataforge.cli import main @@ -653,8 +612,6 @@ def test_tsv_output(self, capsys: pytest.CaptureFixture[str]) -> None: class TestCliDelimiter: - """Test --delimiter flag.""" - def test_custom_delimiter(self, capsys: pytest.CaptureFixture[str]) -> None: from dataforge.cli import main @@ -679,8 +636,6 @@ def test_custom_delimiter(self, capsys: pytest.CaptureFixture[str]) -> None: class TestCliColumnRenaming: - """Test column renaming via Name=field_name syntax.""" - def test_column_renaming_json(self, capsys: pytest.CaptureFixture[str]) -> None: from dataforge.cli import main @@ -741,8 +696,6 @@ def test_column_renaming_text(self, capsys: pytest.CaptureFixture[str]) -> None: class TestCliUnique: - """Test --unique flag.""" - def test_unique_values(self, capsys: pytest.CaptureFixture[str]) -> None: from dataforge.cli import main @@ -767,8 +720,6 @@ def test_unique_values(self, capsys: pytest.CaptureFixture[str]) -> None: class TestCliStream: - """Test --stream flag.""" - def test_stream_requires_output(self, capsys: pytest.CaptureFixture[str]) -> None: from dataforge.cli import main @@ -849,14 +800,10 @@ def test_stream_not_supported_with_sql(self) -> None: assert result == 1 -# ====================================================================== # Backend new methods -# ====================================================================== class TestBackendNewMethods: - """Test new methods added to RandomEngine.""" - def setup_method(self) -> None: from dataforge.backend import RandomEngine @@ -950,14 +897,10 @@ def test_zipf(self) -> None: assert 1 <= val <= 100 -# ====================================================================== # SQL null handling -# ====================================================================== class TestSqlNullHandling: - """Test that None values produce SQL 
NULL.""" - def test_none_values_become_null(self) -> None: forge = DataForge(seed=42) schema = forge.schema( @@ -970,26 +913,10 @@ def test_none_values_become_null(self) -> None: assert "NULL" in sql -# ====================================================================== -# Version bump -# ====================================================================== - - -class TestVersionBump: - """Test version was bumped.""" - - def test_version_is_0_3_0(self) -> None: - assert __version__ == "0.3.0" - - -# ====================================================================== # Type resolution helpers -# ====================================================================== class TestTypeResolution: - """Test _resolve_type_annotation and _type_fallback helpers.""" - def test_resolve_plain_type(self) -> None: from dataforge.core import _resolve_type_annotation @@ -1022,19 +949,16 @@ def test_type_fallback_bool(self) -> None: assert _type_fallback(bool) == "boolean" def test_type_fallback_str_returns_none(self) -> None: - """str is too ambiguous — should return None.""" from dataforge.core import _type_fallback assert _type_fallback(str) is None def test_type_fallback_int_returns_none(self) -> None: - """int is too ambiguous — should return None.""" from dataforge.core import _type_fallback assert _type_fallback(int) is None def test_type_fallback_float_returns_none(self) -> None: - """float is too ambiguous — should return None.""" from dataforge.core import _type_fallback assert _type_fallback(float) is None diff --git a/tests/test_phase2.py b/tests/test_phase2.py index 3fafdc2..d4877db 100644 --- a/tests/test_phase2.py +++ b/tests/test_phase2.py @@ -12,19 +12,14 @@ from dataforge import DataForge -# ------------------------------------------------------------------ # Nullable field support -# ------------------------------------------------------------------ class TestNullableFields: - """Test null_fields parameter on Schema.""" - def setup_method(self) -> None: 
self.forge = DataForge(seed=42) def test_null_fields_basic(self) -> None: - """Fields with null_probability should produce some None values.""" schema = self.forge.schema( ["first_name", "email"], null_fields={"email": 0.5}, @@ -38,7 +33,6 @@ def test_null_fields_basic(self) -> None: assert all(r["first_name"] is not None for r in rows) def test_null_fields_zero_probability(self) -> None: - """Probability 0.0 should produce no None values.""" schema = self.forge.schema( ["first_name", "email"], null_fields={"email": 0.0}, @@ -47,7 +41,6 @@ def test_null_fields_zero_probability(self) -> None: assert all(r["email"] is not None for r in rows) def test_null_fields_full_probability(self) -> None: - """Probability 1.0 should make all values None.""" schema = self.forge.schema( ["first_name", "email"], null_fields={"email": 1.0}, @@ -56,7 +49,6 @@ def test_null_fields_full_probability(self) -> None: assert all(r["email"] is None for r in rows) def test_null_fields_multiple_columns(self) -> None: - """Multiple columns can have different null probabilities.""" schema = self.forge.schema( ["first_name", "email", "city"], null_fields={"email": 0.5, "city": 0.3}, @@ -70,7 +62,6 @@ def test_null_fields_multiple_columns(self) -> None: assert all(r["first_name"] is not None for r in rows) def test_null_fields_invalid_column_raises(self) -> None: - """Invalid column name in null_fields should raise ValueError.""" with pytest.raises(ValueError, match="not a column"): self.forge.schema( ["first_name", "email"], @@ -78,7 +69,6 @@ def test_null_fields_invalid_column_raises(self) -> None: ) def test_null_fields_none_arg(self) -> None: - """null_fields=None should work (no nulls).""" schema = self.forge.schema( ["first_name", "email"], null_fields=None, @@ -87,7 +77,6 @@ def test_null_fields_none_arg(self) -> None: assert all(r["email"] is not None for r in rows) def test_null_fields_in_csv(self) -> None: - """Null values should appear as empty strings in CSV output.""" schema = 
self.forge.schema( ["first_name", "email"], null_fields={"email": 1.0}, @@ -102,7 +91,6 @@ def test_null_fields_in_csv(self) -> None: assert parts[1] == "", f"Expected empty email, got {parts[1]!r}" def test_null_fields_in_json(self) -> None: - """Null values should appear as null in JSON output.""" schema = self.forge.schema( ["first_name", "email"], null_fields={"email": 1.0}, @@ -113,7 +101,6 @@ def test_null_fields_in_json(self) -> None: assert row["email"] is None def test_null_fields_in_sql(self) -> None: - """Null values should appear as NULL in SQL output.""" schema = self.forge.schema( ["first_name", "email"], null_fields={"email": 1.0}, @@ -122,7 +109,6 @@ def test_null_fields_in_sql(self) -> None: assert "NULL" in sql_output def test_null_fields_with_stream(self) -> None: - """Null injection should work with stream().""" schema = self.forge.schema( ["first_name", "email"], null_fields={"email": 1.0}, @@ -132,7 +118,6 @@ def test_null_fields_with_stream(self) -> None: assert all(r["email"] is None for r in rows) def test_null_fields_with_lambda(self) -> None: - """Null fields should work alongside lambda fields.""" schema = self.forge.schema( { "name": "first_name", @@ -147,7 +132,6 @@ def test_null_fields_with_lambda(self) -> None: assert row["upper_name"] == row["name"].upper() def test_null_fields_via_core_schema(self) -> None: - """DataForge.schema() should pass null_fields through.""" schema = self.forge.schema( ["first_name", "email"], null_fields={"email": 1.0}, @@ -156,19 +140,14 @@ def test_null_fields_via_core_schema(self) -> None: assert all(r["email"] is None for r in rows) -# ------------------------------------------------------------------ # Encoding support -# ------------------------------------------------------------------ class TestEncoding: - """Test encoding parameter on export methods.""" - def setup_method(self) -> None: self.forge = DataForge(seed=42) def test_csv_encoding_utf8(self) -> None: - """Default UTF-8 encoding should 
work.""" with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as f: path = f.name try: @@ -181,7 +160,6 @@ def test_csv_encoding_utf8(self) -> None: os.unlink(path) def test_csv_encoding_latin1(self) -> None: - """Latin-1 encoding should be usable.""" with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as f: path = f.name try: @@ -194,7 +172,6 @@ def test_csv_encoding_latin1(self) -> None: os.unlink(path) def test_jsonl_encoding(self) -> None: - """JSONL encoding parameter should work.""" with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as f: path = f.name try: @@ -207,7 +184,6 @@ def test_jsonl_encoding(self) -> None: os.unlink(path) def test_json_encoding(self) -> None: - """JSON encoding parameter should work.""" with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as f: path = f.name try: @@ -220,7 +196,6 @@ def test_json_encoding(self) -> None: os.unlink(path) def test_stream_csv_encoding(self) -> None: - """stream_to_csv should accept encoding parameter.""" with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as f: path = f.name try: @@ -234,7 +209,6 @@ def test_stream_csv_encoding(self) -> None: os.unlink(path) def test_stream_jsonl_encoding(self) -> None: - """stream_to_jsonl should accept encoding parameter.""" with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as f: path = f.name try: @@ -245,7 +219,6 @@ def test_stream_jsonl_encoding(self) -> None: os.unlink(path) def test_core_csv_encoding_passthrough(self) -> None: - """DataForge.to_csv() should pass encoding through.""" with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as f: path = f.name try: @@ -262,19 +235,14 @@ def test_core_csv_encoding_passthrough(self) -> None: os.unlink(path) -# ------------------------------------------------------------------ # Compression support (gzip) -# ------------------------------------------------------------------ class TestCompression: - """Test gzip compression on export methods.""" - 
def setup_method(self) -> None: self.forge = DataForge(seed=42) def test_csv_gzip_auto(self) -> None: - """CSV with .gz extension should auto-compress.""" with tempfile.NamedTemporaryFile(suffix=".csv.gz", delete=False) as f: path = f.name try: @@ -292,7 +260,6 @@ def test_csv_gzip_auto(self) -> None: os.unlink(path) def test_csv_gzip_explicit(self) -> None: - """CSV with compress=True should gzip even without .gz extension.""" with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as f: path = f.name try: @@ -305,7 +272,6 @@ def test_csv_gzip_explicit(self) -> None: os.unlink(path) def test_csv_gzip_suppress(self) -> None: - """compress=False should disable auto-gzip even with .gz extension.""" with tempfile.NamedTemporaryFile(suffix=".csv.gz", delete=False) as f: path = f.name try: @@ -319,7 +285,6 @@ def test_csv_gzip_suppress(self) -> None: os.unlink(path) def test_jsonl_gzip(self) -> None: - """JSONL with .gz extension should auto-compress.""" with tempfile.NamedTemporaryFile(suffix=".jsonl.gz", delete=False) as f: path = f.name try: @@ -336,7 +301,6 @@ def test_jsonl_gzip(self) -> None: os.unlink(path) def test_json_gzip(self) -> None: - """JSON with .gz extension should auto-compress.""" with tempfile.NamedTemporaryFile(suffix=".json.gz", delete=False) as f: path = f.name try: @@ -349,7 +313,6 @@ def test_json_gzip(self) -> None: os.unlink(path) def test_sql_gzip(self) -> None: - """SQL with .gz extension should auto-compress.""" with tempfile.NamedTemporaryFile(suffix=".sql.gz", delete=False) as f: path = f.name try: @@ -362,7 +325,6 @@ def test_sql_gzip(self) -> None: os.unlink(path) def test_stream_csv_gzip(self) -> None: - """stream_to_csv with .gz extension should auto-compress.""" with tempfile.NamedTemporaryFile(suffix=".csv.gz", delete=False) as f: path = f.name try: @@ -376,7 +338,6 @@ def test_stream_csv_gzip(self) -> None: os.unlink(path) def test_stream_jsonl_gzip(self) -> None: - """stream_to_jsonl with .gz extension should 
auto-compress.""" with tempfile.NamedTemporaryFile(suffix=".jsonl.gz", delete=False) as f: path = f.name try: @@ -390,7 +351,6 @@ def test_stream_jsonl_gzip(self) -> None: os.unlink(path) def test_core_csv_gzip_passthrough(self) -> None: - """DataForge.to_csv() should pass compress through.""" with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as f: path = f.name try: @@ -407,7 +367,6 @@ def test_core_csv_gzip_passthrough(self) -> None: os.unlink(path) def test_core_jsonl_gzip_passthrough(self) -> None: - """DataForge.to_jsonl() should pass compress through.""" with tempfile.NamedTemporaryFile(suffix=".jsonl.gz", delete=False) as f: path = f.name try: @@ -423,7 +382,6 @@ def test_core_jsonl_gzip_passthrough(self) -> None: os.unlink(path) def test_core_stream_csv_gzip_passthrough(self) -> None: - """DataForge.stream_to_csv() should pass compress through.""" with tempfile.NamedTemporaryFile(suffix=".csv.gz", delete=False) as f: path = f.name try: @@ -440,16 +398,11 @@ def test_core_stream_csv_gzip_passthrough(self) -> None: os.unlink(path) -# ------------------------------------------------------------------ # CLI — null-fields, encoding, compression -# ------------------------------------------------------------------ class TestCLIPhase2: - """Test CLI flags added in Phase 2.""" - def test_null_fields_flag(self) -> None: - """--null-fields should produce some null values.""" from dataforge.cli import main import io import sys @@ -478,7 +431,6 @@ def test_null_fields_flag(self) -> None: assert all(row["email"] is None for row in data) def test_null_fields_invalid_format(self) -> None: - """--null-fields with bad format should error.""" from dataforge.cli import main import io import sys @@ -498,7 +450,6 @@ def test_null_fields_invalid_format(self) -> None: assert ret == 1 def test_null_fields_invalid_probability(self) -> None: - """--null-fields with probability > 1 should error.""" from dataforge.cli import main import io import sys @@ -519,7 +470,6 @@ 
def test_null_fields_invalid_probability(self) -> None: assert ret == 1 def test_compress_flag_stream(self) -> None: - """--compress with --stream should produce gzip output.""" from dataforge.cli import main with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as f: @@ -557,7 +507,6 @@ def test_compress_flag_stream(self) -> None: os.unlink(path) def test_compress_flag_non_stream(self) -> None: - """--compress with -o should produce gzip output.""" from dataforge.cli import main with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as f: @@ -586,7 +535,6 @@ def test_compress_flag_non_stream(self) -> None: os.unlink(path) def test_encoding_flag(self) -> None: - """--encoding should be passed through to file output.""" from dataforge.cli import main with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as f: @@ -616,66 +564,52 @@ def test_encoding_flag(self) -> None: os.unlink(path) -# ------------------------------------------------------------------ # Statistical distributions (from backend, added in Phase 1 but tested here) -# ------------------------------------------------------------------ class TestStatisticalDistributions: - """Test statistical distribution methods on RandomEngine.""" - def setup_method(self) -> None: from dataforge.backend import RandomEngine self.engine = RandomEngine(seed=42) def test_gauss_range(self) -> None: - """Gaussian values should be roughly within expected range.""" values = [self.engine.gauss(mu=100, sigma=10) for _ in range(1000)] mean = sum(values) / len(values) assert 90 < mean < 110 def test_gauss_int_clamped(self) -> None: - """gauss_int should clamp to [min, max].""" values = [self.engine.gauss_int(50, 20, 0, 100) for _ in range(1000)] assert all(0 <= v <= 100 for v in values) def test_exponential_positive(self) -> None: - """Exponential values should all be positive.""" values = [self.engine.exponential(1.0) for _ in range(100)] assert all(v > 0 for v in values) def test_log_normal_positive(self) 
-> None: - """Log-normal values should all be positive.""" values = [self.engine.log_normal(0, 1) for _ in range(100)] assert all(v > 0 for v in values) def test_triangular_bounds(self) -> None: - """Triangular values should be within [low, high].""" values = [self.engine.triangular(10.0, 20.0) for _ in range(100)] assert all(10.0 <= v <= 20.0 for v in values) def test_pareto_positive(self) -> None: - """Pareto values should be positive.""" values = [self.engine.pareto(2.0) for _ in range(100)] assert all(v > 0 for v in values) def test_beta_unit_interval(self) -> None: - """Beta values should be in (0, 1).""" values = [self.engine.beta(2.0, 5.0) for _ in range(100)] assert all(0 < v < 1 for v in values) def test_gamma_positive(self) -> None: - """Gamma values should be positive.""" values = [self.engine.gamma(2.0, 1.0) for _ in range(100)] assert all(v > 0 for v in values) def test_zipf_bounds(self) -> None: - """Zipf values should be in [1, n].""" values = [self.engine.zipf(1.5, 50) for _ in range(100)] assert all(1 <= v <= 50 for v in values) def test_vonmises_returns_float(self) -> None: - """Von Mises should return a float.""" val = self.engine.vonmises(0.0, 2.0) assert isinstance(val, float) diff --git a/tests/test_phase3.py b/tests/test_phase3.py index bb2eff9..8be9c25 100644 --- a/tests/test_phase3.py +++ b/tests/test_phase3.py @@ -5,9 +5,7 @@ from dataforge import DataForge, RelationalSchema -# ===================================================================== # Fixtures -# ===================================================================== @pytest.fixture @@ -16,16 +14,11 @@ def forge() -> DataForge: return DataForge(locale="en_US", seed=42) -# ===================================================================== # unique_together — Schema constraint -# ===================================================================== class TestUniqueTogether: - """Tests for the unique_together constraint on Schema.""" - def test_basic_unique_together(self, 
forge: DataForge) -> None: - """Pairs of (first_name, last_name) should be unique.""" schema = forge.schema( ["first_name", "last_name", "email"], unique_together=[("first_name", "last_name")], @@ -35,7 +28,6 @@ def test_basic_unique_together(self, forge: DataForge) -> None: assert len(pairs) == len(set(pairs)) def test_unique_together_single_column(self, forge: DataForge) -> None: - """unique_together with a single column acts like 'unique'.""" schema = forge.schema( ["city", "state"], unique_together=[("city",)], @@ -45,7 +37,6 @@ def test_unique_together_single_column(self, forge: DataForge) -> None: assert len(cities) == len(set(cities)) def test_unique_together_multiple_groups(self, forge: DataForge) -> None: - """Multiple unique_together groups should all be enforced.""" schema = forge.schema( ["first_name", "last_name", "city", "state"], unique_together=[ @@ -60,7 +51,6 @@ def test_unique_together_multiple_groups(self, forge: DataForge) -> None: assert len(city_pairs) == len(set(city_pairs)) def test_unique_together_with_null_fields(self, forge: DataForge) -> None: - """unique_together should work alongside null_fields.""" schema = forge.schema( ["first_name", "last_name", "email"], null_fields={"email": 0.3}, @@ -76,7 +66,6 @@ def test_unique_together_with_null_fields(self, forge: DataForge) -> None: assert null_count >= 0 # just ensure it doesn't crash def test_unique_together_invalid_column(self, forge: DataForge) -> None: - """Should raise ValueError for unknown column names.""" with pytest.raises(ValueError, match="unique_together column 'nonexistent'"): forge.schema( ["first_name", "email"], @@ -84,7 +73,6 @@ def test_unique_together_invalid_column(self, forge: DataForge) -> None: ) def test_unique_together_count_zero(self, forge: DataForge) -> None: - """Generating zero rows with unique_together should return [].""" schema = forge.schema( ["first_name", "last_name"], unique_together=[("first_name", "last_name")], @@ -93,7 +81,6 @@ def 
test_unique_together_count_zero(self, forge: DataForge) -> None: assert rows == [] def test_unique_together_count_one(self, forge: DataForge) -> None: - """Generating one row with unique_together should work.""" schema = forge.schema( ["first_name", "last_name"], unique_together=[("first_name", "last_name")], @@ -103,7 +90,6 @@ def test_unique_together_count_one(self, forge: DataForge) -> None: assert "first_name" in rows[0] def test_unique_together_preserves_row_count(self, forge: DataForge) -> None: - """Should always return exactly the requested number of rows.""" schema = forge.schema( ["first_name", "last_name", "email"], unique_together=[("first_name", "last_name")], @@ -113,7 +99,6 @@ def test_unique_together_preserves_row_count(self, forge: DataForge) -> None: assert len(rows) == count def test_unique_together_with_lambdas(self, forge: DataForge) -> None: - """unique_together should work with lambda/computed fields.""" schema = forge.schema( { "first_name": "first_name", @@ -130,7 +115,6 @@ def test_unique_together_with_lambdas(self, forge: DataForge) -> None: assert row["full"] == f"{row['first_name']} {row['last_name']}" def test_schema_repr_unchanged(self, forge: DataForge) -> None: - """Schema repr should still work with unique_together.""" schema = forge.schema( ["first_name", "email"], unique_together=[("first_name",)], @@ -138,16 +122,11 @@ def test_schema_repr_unchanged(self, forge: DataForge) -> None: assert "Schema" in repr(schema) -# ===================================================================== # RelationalSchema — multi-table data generation -# ===================================================================== class TestRelationalSchemaBasic: - """Basic RelationalSchema tests.""" - def test_single_table(self, forge: DataForge) -> None: - """A single table with no relationships should work.""" rel = forge.relational( { "users": { @@ -167,7 +146,6 @@ def test_single_table(self, forge: DataForge) -> None: assert "email" in row def 
test_auto_increment_ids(self, forge: DataForge) -> None: - """IDs should be 1-based auto-incrementing integers.""" rel = forge.relational( { "users": { @@ -181,7 +159,6 @@ def test_auto_increment_ids(self, forge: DataForge) -> None: assert ids == list(range(1, 11)) def test_default_count(self, forge: DataForge) -> None: - """Default count should be 10 when not specified.""" rel = forge.relational( { "users": { @@ -194,10 +171,7 @@ def test_default_count(self, forge: DataForge) -> None: class TestRelationalSchemaParentChild: - """Tests for parent-child FK relationships.""" - def test_simple_parent_child(self, forge: DataForge) -> None: - """Child table should have FK column referencing parent IDs.""" rel = forge.relational( { "users": { @@ -221,7 +195,6 @@ def test_simple_parent_child(self, forge: DataForge) -> None: assert order["users_id"] in parent_ids def test_custom_parent_key(self, forge: DataForge) -> None: - """Custom parent_key should be used as the FK column name.""" rel = forge.relational( { "users": { @@ -243,7 +216,6 @@ def test_custom_parent_key(self, forge: DataForge) -> None: assert order["user_id"] in parent_ids def test_three_level_hierarchy(self, forge: DataForge) -> None: - """Three-level hierarchy: users → orders → order_items.""" rel = forge.relational( { "users": { @@ -278,7 +250,6 @@ def test_three_level_hierarchy(self, forge: DataForge) -> None: assert item["order_id"] in order_ids def test_referential_integrity(self, forge: DataForge) -> None: - """Every FK value must point to an existing parent ID.""" rel = forge.relational( { "departments": { @@ -299,7 +270,6 @@ def test_referential_integrity(self, forge: DataForge) -> None: assert emp["dept_id"] in dept_ids def test_dict_fields_spec(self, forge: DataForge) -> None: - """Fields can be specified as a dict with column renaming.""" rel = forge.relational( { "products": { @@ -320,10 +290,7 @@ def test_dict_fields_spec(self, forge: DataForge) -> None: class TestRelationalSchemaCardinality: - 
"""Tests for children_per_parent cardinality bounds.""" - def test_bounded_cardinality(self, forge: DataForge) -> None: - """Each parent should get between min and max children.""" rel = forge.relational( { "users": { @@ -353,7 +320,6 @@ def test_bounded_cardinality(self, forge: DataForge) -> None: assert count >= 0 # sanity check — some parents may get 0 if total is tight def test_cardinality_one_to_one(self, forge: DataForge) -> None: - """(1, 1) cardinality — each parent gets exactly one child.""" rel = forge.relational( { "users": { @@ -379,10 +345,7 @@ def test_cardinality_one_to_one(self, forge: DataForge) -> None: class TestRelationalSchemaTopologicalSort: - """Tests for topological ordering and error handling.""" - def test_topological_order(self, forge: DataForge) -> None: - """Tables should be generated parents-first.""" rel = forge.relational( { "order_items": { @@ -410,7 +373,6 @@ def test_topological_order(self, forge: DataForge) -> None: assert len(data["order_items"]) == 10 def test_circular_dependency_raises(self, forge: DataForge) -> None: - """Circular references should raise ValueError.""" with pytest.raises(ValueError, match="Circular dependency"): forge.relational( { @@ -428,7 +390,6 @@ def test_circular_dependency_raises(self, forge: DataForge) -> None: ) def test_undefined_parent_raises(self, forge: DataForge) -> None: - """Referencing a non-existent parent should raise ValueError.""" with pytest.raises(ValueError, match="references parent 'nonexistent'"): forge.relational( { @@ -441,7 +402,6 @@ def test_undefined_parent_raises(self, forge: DataForge) -> None: ) def test_multiple_children_same_parent(self, forge: DataForge) -> None: - """Multiple child tables can reference the same parent.""" rel = forge.relational( { "users": { @@ -471,10 +431,7 @@ def test_multiple_children_same_parent(self, forge: DataForge) -> None: class TestRelationalSchemaSQLOutput: - """Tests for RelationalSchema.to_sql() output.""" - def test_to_sql_basic(self, 
forge: DataForge) -> None: - """to_sql() should return valid INSERT statements.""" rel = forge.relational( { "users": { @@ -490,7 +447,6 @@ def test_to_sql_basic(self, forge: DataForge) -> None: assert '"email"' in sql def test_to_sql_parent_child(self, forge: DataForge) -> None: - """SQL output should include both parent and child tables.""" rel = forge.relational( { "users": { @@ -514,7 +470,6 @@ def test_to_sql_parent_child(self, forge: DataForge) -> None: assert users_pos < orders_pos def test_to_sql_mysql_dialect(self, forge: DataForge) -> None: - """MySQL dialect should use backtick quoting.""" rel = forge.relational( { "users": { @@ -529,7 +484,6 @@ def test_to_sql_mysql_dialect(self, forge: DataForge) -> None: assert "`first_name`" in sql def test_to_sql_null_values(self, forge: DataForge) -> None: - """SQL output should render None as NULL.""" rel = forge.relational( { "users": { @@ -544,8 +498,6 @@ def test_to_sql_null_values(self, forge: DataForge) -> None: class TestRelationalSchemaRepr: - """Test __repr__ method.""" - def test_repr(self, forge: DataForge) -> None: rel = forge.relational( { @@ -560,10 +512,7 @@ def test_repr(self, forge: DataForge) -> None: class TestRelationalSchemaWithNullFields: - """Tests for null_fields integration in relational schemas.""" - def test_null_fields_in_child_table(self, forge: DataForge) -> None: - """null_fields should work in child table specs.""" rel = forge.relational( { "users": { @@ -585,16 +534,11 @@ def test_null_fields_in_child_table(self, forge: DataForge) -> None: assert null_cities >= 0 # just ensure it doesn't crash -# ===================================================================== # Integration: forge.relational() convenience method -# ===================================================================== class TestForgeRelationalMethod: - """Tests for the DataForge.relational() convenience method.""" - def test_returns_relational_schema(self, forge: DataForge) -> None: - """forge.relational() 
should return a RelationalSchema.""" rel = forge.relational( { "users": {"fields": ["first_name"], "count": 5}, @@ -603,7 +547,6 @@ def test_returns_relational_schema(self, forge: DataForge) -> None: assert isinstance(rel, RelationalSchema) def test_generate_via_forge(self, forge: DataForge) -> None: - """Should be able to generate data via the convenience method.""" rel = forge.relational( { "users": {"fields": ["first_name"], "count": 5}, @@ -620,14 +563,10 @@ def test_generate_via_forge(self, forge: DataForge) -> None: assert len(data["orders"]) == 10 -# ===================================================================== # Import test -# ===================================================================== class TestImport: - """Test that RelationalSchema is importable from the package.""" - def test_import_from_package(self) -> None: from dataforge import RelationalSchema as RS @@ -639,16 +578,11 @@ def test_in_all(self) -> None: assert "RelationalSchema" in dataforge.__all__ -# ===================================================================== # Edge cases -# ===================================================================== class TestEdgeCases: - """Edge cases and boundary conditions.""" - def test_empty_parent_table(self, forge: DataForge) -> None: - """Child table with zero-row parent should handle gracefully.""" rel = forge.relational( { "users": { @@ -670,7 +604,6 @@ def test_empty_parent_table(self, forge: DataForge) -> None: assert order["user_id"] is None def test_large_hierarchy(self, forge: DataForge) -> None: - """Stress test with a deeper hierarchy.""" rel = forge.relational( { "companies": { @@ -706,7 +639,6 @@ def test_large_hierarchy(self, forge: DataForge) -> None: assert emp["dept_id"] in dept_ids def test_unique_together_schema_passthrough(self, forge: DataForge) -> None: - """unique_together passed through forge.schema() should work.""" schema = forge.schema( ["first_name", "last_name"], unique_together=[("first_name", "last_name")], 
diff --git a/tests/test_phase4.py b/tests/test_phase4.py index 14aa2b7..a47eb96 100644 --- a/tests/test_phase4.py +++ b/tests/test_phase4.py @@ -17,9 +17,7 @@ from dataforge import DataForge -# ------------------------------------------------------------------ # Fixtures -# ------------------------------------------------------------------ @pytest.fixture @@ -27,9 +25,7 @@ def forge() -> DataForge: return DataForge(seed=42) -# ================================================================== # SocialMediaProvider -# ================================================================== class TestSocialMediaProvider: @@ -96,9 +92,7 @@ def test_schema_integration(self, forge: DataForge) -> None: assert all(r["hashtag"].startswith("#") for r in rows) -# ================================================================== # MusicProvider -# ================================================================== class TestMusicProvider: @@ -160,9 +154,7 @@ def test_schema_integration(self, forge: DataForge) -> None: assert all("genre" in r for r in rows) -# ================================================================== # SportsProvider -# ================================================================== class TestSportsProvider: @@ -213,9 +205,7 @@ def test_schema_integration(self, forge: DataForge) -> None: assert len(rows) == 5 -# ================================================================== # FoodProvider -# ================================================================== class TestFoodProvider: @@ -265,9 +255,7 @@ def test_schema_integration(self, forge: DataForge) -> None: assert len(rows) == 5 -# ================================================================== # LegalProvider -# ================================================================== class TestLegalProvider: @@ -321,9 +309,7 @@ def test_schema_integration(self, forge: DataForge) -> None: assert len(rows) == 5 -# ================================================================== # RealEstateProvider 
-# ================================================================== class TestRealEstateProvider: @@ -382,9 +368,7 @@ def test_schema_integration(self, forge: DataForge) -> None: assert len(rows) == 5 -# ================================================================== # WeatherProvider -# ================================================================== class TestWeatherProvider: @@ -449,9 +433,7 @@ def test_schema_integration(self, forge: DataForge) -> None: assert len(rows) == 5 -# ================================================================== # HardwareProvider -# ================================================================== class TestHardwareProvider: @@ -500,9 +482,7 @@ def test_schema_integration(self, forge: DataForge) -> None: assert len(rows) == 5 -# ================================================================== # LogisticsProvider -# ================================================================== class TestLogisticsProvider: @@ -566,14 +546,10 @@ def test_schema_integration(self, forge: DataForge) -> None: assert len(rows) == 5 -# ================================================================== # Registry & Cross-provider tests -# ================================================================== class TestRegistryDiscovery: - """Verify all 9 new providers are discovered by the registry.""" - def test_all_providers_in_registry(self) -> None: from dataforge.registry import get_provider_info @@ -639,8 +615,6 @@ def test_list_fields_includes_new(self) -> None: class TestBatchConsistency: - """Ensure batch generation matches scalar generation semantics.""" - @pytest.mark.parametrize( "provider_name,method_name", [ @@ -690,8 +664,6 @@ def test_composed_batch_returns_correct_count( class TestReproducibility: - """Verify seeded providers produce deterministic output.""" - @pytest.mark.parametrize( "provider_name,method_name", [ diff --git a/tests/test_phase5.py b/tests/test_phase5.py index bae8479..b91f898 100644 --- a/tests/test_phase5.py 
+++ b/tests/test_phase5.py @@ -29,9 +29,7 @@ from dataforge.decorators import provider, _wrap_with_count -# ------------------------------------------------------------------ # Fixtures -# ------------------------------------------------------------------ @pytest.fixture @@ -46,9 +44,7 @@ def tmp_dir(): yield d -# ================================================================== # schema_to_dict / dict_to_schema_args -# ================================================================== class TestSchemaToDict: @@ -60,7 +56,6 @@ def test_list_fields(self) -> None: assert "unique_together" not in d def test_dict_fields_compact(self) -> None: - """When all keys == values, it should compact to list form.""" d = schema_to_dict({"first_name": "first_name", "email": "email"}, count=10) assert d["fields"] == ["first_name", "email"] @@ -134,9 +129,7 @@ def test_invalid_fields_type_raises(self) -> None: dict_to_schema_args({"fields": "not_valid"}) -# ================================================================== # JSON serialization round-trip -# ================================================================== class TestJsonRoundTrip: @@ -177,9 +170,7 @@ def test_with_unique_together(self, tmp_dir: str) -> None: assert loaded == original -# ================================================================== # YAML serialization round-trip -# ================================================================== class TestYamlRoundTrip: @@ -192,7 +183,6 @@ def test_simple_list(self, tmp_dir: str) -> None: assert loaded["count"] == original["count"] def test_yml_extension(self, tmp_dir: str) -> None: - """Test .yml extension is also recognized.""" path = os.path.join(tmp_dir, "schema.yml") original = schema_to_dict(["first_name"], count=5) save_schema(original, path) @@ -245,9 +235,7 @@ def test_yaml_comments_ignored(self) -> None: assert result["fields"] == ["email"] -# ================================================================== # TOML serialization round-trip -# 
================================================================== class TestTomlRoundTrip: @@ -270,9 +258,7 @@ def test_dict_fields(self, tmp_dir: str) -> None: assert fields["Email"] == "email" -# ================================================================== # Format detection errors -# ================================================================== class TestFormatDetection: @@ -293,9 +279,7 @@ def test_load_nonexistent_file_raises(self) -> None: load_schema("nonexistent_file.json") -# ================================================================== # Schema.to_schema_dict() and Schema.save_schema() -# ================================================================== class TestSchemaSerializationMethods: @@ -365,9 +349,7 @@ def test_save_schema_toml(self, forge: DataForge, tmp_dir: str) -> None: assert loaded["fields"] == ["first_name", "email"] -# ================================================================== # DataForge.schema_from_dict() and schema_from_file() -# ================================================================== class TestSchemaFromDict: @@ -443,7 +425,6 @@ def test_explicit_format(self, forge: DataForge, tmp_dir: str) -> None: assert len(rows) == 3 def test_round_trip_generate(self, forge: DataForge, tmp_dir: str) -> None: - """Full round-trip: create schema -> save -> load -> generate.""" path = os.path.join(tmp_dir, "round_trip.json") # Create and save @@ -459,9 +440,7 @@ def test_round_trip_generate(self, forge: DataForge, tmp_dir: str) -> None: assert all("city" in r for r in rows) -# ================================================================== # CLI --schema and --save-schema flags -# ================================================================== class TestCliSchemaFlags: @@ -558,9 +537,7 @@ def test_schema_nonexistent_file(self, tmp_dir: str) -> None: assert ret == 1 -# ================================================================== # New Locales: sv_SE, da_DK, nb_NO, fi_FI, tr_TR -# 
================================================================== NEW_LOCALES = ["sv_SE", "da_DK", "nb_NO", "fi_FI", "tr_TR"] @@ -632,7 +609,6 @@ def test_internet_email(self, locale: str) -> None: @pytest.mark.parametrize("locale", NEW_LOCALES) def test_schema_with_locale(self, locale: str) -> None: - """Test that new locales work with schema generation.""" forge = DataForge(locale=locale, seed=42) schema = forge.schema(["first_name", "city", "email"]) rows = schema.generate(count=10) @@ -643,8 +619,6 @@ def test_schema_with_locale(self, locale: str) -> None: class TestLocaleDataPresence: - """Verify that locale data modules contain expected attributes.""" - @pytest.mark.parametrize("locale", NEW_LOCALES) def test_person_data_has_names(self, locale: str) -> None: import importlib @@ -680,7 +654,6 @@ def test_phone_data_exists(self, locale: str) -> None: @pytest.mark.parametrize("locale", NEW_LOCALES) def test_data_is_immutable_tuples(self, locale: str) -> None: - """All locale data should be immutable tuples, not lists.""" import importlib mod = importlib.import_module(f"dataforge.locales.{locale}.person") @@ -688,14 +661,11 @@ def test_data_is_immutable_tuples(self, locale: str) -> None: assert isinstance(mod.last_names, tuple) -# ================================================================== # @provider decorator -# ================================================================== class TestProviderDecorator: def test_basic_transformation(self) -> None: - """@provider transforms a plain class into a BaseProvider subclass.""" from dataforge.providers.base import BaseProvider @provider("test_greet") @@ -712,7 +682,6 @@ def goodbye(self) -> str: assert "goodbye" in GreetProvider._field_map def test_scalar_return(self) -> None: - """count=1 returns a scalar.""" @provider("test_scalar") class ScalarProvider: @@ -726,7 +695,6 @@ def greeting(self) -> str: assert isinstance(result, str) def test_batch_return(self) -> None: - """count>1 returns a list.""" 
@provider("test_batch") class BatchProvider: @@ -741,7 +709,6 @@ def greeting(self) -> str: assert all(x == "Hi" for x in result) def test_custom_field_map(self) -> None: - """Explicit field_map overrides auto-generated one.""" @provider("test_custom_fm", field_map={"temp": "temperature"}) class CustomFM: @@ -751,7 +718,6 @@ def temperature(self) -> str: assert CustomFM._field_map == {"temp": "temperature"} def test_private_methods_excluded(self) -> None: - """Methods starting with _ are not included in field_map.""" @provider("test_private") class PrivateProvider: @@ -765,7 +731,6 @@ def _private_method(self) -> str: assert "_private_method" not in PrivateProvider._field_map def test_schema_integration(self) -> None: - """Provider created via @provider works with Schema.""" @provider("test_schema_int") class SchemaIntProvider: @@ -780,7 +745,6 @@ def value(self) -> str: assert all(r["test_schema_int.value"] == "test_value" for r in rows) def test_provider_name_preserved(self) -> None: - """The decorated class keeps its original name.""" @provider("test_name_pres") class MySpecialProvider: @@ -790,7 +754,6 @@ def foo(self) -> str: assert MySpecialProvider.__name__ == "MySpecialProvider" def test_slots_empty(self) -> None: - """Decorated providers have __slots__ = ().""" @provider("test_slots") class SlotProvider: @@ -800,7 +763,6 @@ def x(self) -> str: assert SlotProvider.__slots__ == () def test_locale_modules(self) -> None: - """locale_modules parameter is stored on the class.""" @provider("test_locale", locale_modules=("person",)) class LocaleProvider: @@ -846,9 +808,7 @@ def original(self): assert wrapped(d, count=1) == "x" -# ================================================================== # Edge cases / error handling -# ================================================================== class TestEdgeCases: @@ -867,7 +827,6 @@ def test_json_top_level_not_dict_raises(self, tmp_dir: str) -> None: load_schema(path) def 
test_schema_from_dict_preserves_fields_spec(self, forge: DataForge) -> None: - """_fields_spec should be set for serialization round-trip.""" schema = forge.schema(["first_name", "email"]) assert schema._fields_spec == ["first_name", "email"] diff --git a/tests/test_phone.py b/tests/test_phone.py index 795cad7..d8f646b 100644 --- a/tests/test_phone.py +++ b/tests/test_phone.py @@ -4,8 +4,6 @@ class TestPhoneScalar: - """Tests for single-item phone generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -27,8 +25,6 @@ def test_phone_contains_digits(self) -> None: class TestPhoneBatch: - """Tests for batch phone generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -44,8 +40,6 @@ def test_cell_batch(self) -> None: class TestPhoneLocales: - """Tests for phone across locales.""" - def test_de_DE_phone(self) -> None: forge = DataForge(locale="de_DE", seed=42) phone = forge.phone.phone_number() diff --git a/tests/test_profile.py b/tests/test_profile.py index e76d87b..37385b2 100644 --- a/tests/test_profile.py +++ b/tests/test_profile.py @@ -4,8 +4,6 @@ class TestProfileScalar: - """Tests for single-item profile generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -33,7 +31,6 @@ def test_profile_values_are_strings(self) -> None: assert isinstance(value, str), f"{key} is not str: {type(value)}" def test_profile_email_coherence(self) -> None: - """Email should contain first and last name.""" result = self.forge.profile.profile() email = result["email"] first = result["first_name"].lower() @@ -55,8 +52,6 @@ def test_deterministic_with_seed(self) -> None: class TestProfileBatch: - """Tests for batch profile generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -86,8 +81,6 @@ def test_profile_batch_all_have_keys(self) -> None: class TestProfileFieldMap: - """Tests for individual _field_map methods 
(Schema compatibility).""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -132,13 +125,11 @@ def test_profile_job_title(self) -> None: assert len(result) > 0 def test_field_map_batch(self) -> None: - """All _field_map methods should support batch mode.""" result = self.forge.profile.profile_first_name(count=10) assert isinstance(result, list) assert len(result) == 10 def test_schema_compatibility(self) -> None: - """ProfileProvider fields should work in Schema.""" schema = self.forge.schema( { "first_name": "profile_first_name", diff --git a/tests/test_pytest_plugin.py b/tests/test_pytest_plugin.py index 7474815..878243d 100644 --- a/tests/test_pytest_plugin.py +++ b/tests/test_pytest_plugin.py @@ -6,13 +6,10 @@ class TestForgeFixture: - """Tests for the ``forge`` fixture.""" - def test_fixture_is_dataforge(self, forge: DataForge) -> None: assert isinstance(forge, DataForge) def test_fixture_is_seeded(self, forge: DataForge) -> None: - """Seeded fixture should produce deterministic output.""" a = forge.person.first_name() # Create another with same seed — should match # (conftest.py overrides plugin fixture with seed=42) @@ -27,8 +24,6 @@ def test_fixture_generates_data(self, forge: DataForge) -> None: class TestFakeFixture: - """Tests for the ``fake`` alias fixture.""" - def test_fake_is_forge(self, forge: DataForge, fake: DataForge) -> None: assert fake is forge @@ -38,8 +33,6 @@ def test_fake_generates_data(self, fake: DataForge) -> None: class TestForgeUnseeded: - """Tests for the ``forge_unseeded`` fixture.""" - def test_unseeded_is_dataforge(self, forge_unseeded: DataForge) -> None: assert isinstance(forge_unseeded, DataForge) @@ -49,11 +42,8 @@ def test_unseeded_generates_data(self, forge_unseeded: DataForge) -> None: class TestForgeSeedMarker: - """Tests for the ``@pytest.mark.forge_seed`` marker.""" - @pytest.mark.forge_seed(42) def test_marker_sets_seed(self, forge: DataForge) -> None: - """forge_seed(42) should 
produce same output as DataForge(seed=42).""" a = forge.person.first_name() forge2 = DataForge(seed=42) b = forge2.person.first_name() @@ -61,7 +51,6 @@ def test_marker_sets_seed(self, forge: DataForge) -> None: @pytest.mark.forge_seed(99) def test_different_seed_different_output(self, forge: DataForge) -> None: - """Different seed should (very likely) produce different output.""" a = forge.person.first_name() forge2 = DataForge(seed=0) _b = forge2.person.first_name() diff --git a/tests/test_registration.py b/tests/test_registration.py index b180882..e7efd1f 100644 --- a/tests/test_registration.py +++ b/tests/test_registration.py @@ -52,8 +52,6 @@ def temperature(self, count: int = 1) -> str | list[str]: class TestRegisterProvider: - """Tests for DataForge.register_provider().""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) diff --git a/tests/test_schema.py b/tests/test_schema.py index eef3756..e2bb3be 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -6,8 +6,6 @@ class TestSchemaImport: - """Tests that Schema can be imported directly.""" - def test_schema_importable(self) -> None: assert Schema is not None @@ -18,8 +16,6 @@ def test_schema_created_via_forge(self) -> None: class TestSchemaGenerate: - """Tests for Schema.generate().""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -51,8 +47,6 @@ def test_generate_count_zero(self) -> None: class TestSchemaStream: - """Tests for Schema.stream().""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -72,8 +66,6 @@ def test_stream_is_lazy(self) -> None: class TestSchemaCsv: - """Tests for Schema.to_csv().""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -103,8 +95,6 @@ def test_csv_with_dict_fields(self) -> None: class TestSchemaJsonl: - """Tests for Schema.to_jsonl().""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ 
-126,8 +116,6 @@ def test_jsonl_each_line_is_valid_json(self) -> None: class TestSchemaSql: - """Tests for Schema.to_sql().""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -157,8 +145,6 @@ def test_sql_postgresql_uses_double_quotes(self) -> None: class TestSchemaRepr: - """Tests for Schema.__repr__().""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) diff --git a/tests/test_seeder.py b/tests/test_seeder.py index 29b6103..37d1b22 100644 --- a/tests/test_seeder.py +++ b/tests/test_seeder.py @@ -13,9 +13,7 @@ sa = pytest.importorskip("sqlalchemy") -# ------------------------------------------------------------------ # Fixtures -# ------------------------------------------------------------------ @pytest.fixture @@ -72,9 +70,7 @@ def seeder(forge, engine): return s -# ------------------------------------------------------------------ # Construction -# ------------------------------------------------------------------ class TestDatabaseSeederConstruction: @@ -93,9 +89,7 @@ def test_lazy_engine(self, forge) -> None: assert s._engine is None -# ------------------------------------------------------------------ # Table introspection -# ------------------------------------------------------------------ class TestTableIntrospection: @@ -126,9 +120,7 @@ def test_introspect_nonexistent_table(self, seeder) -> None: seeder._introspect_table("nonexistent") -# ------------------------------------------------------------------ # Seed single table -# ------------------------------------------------------------------ class TestSeedTable: @@ -150,7 +142,6 @@ def test_seed_with_overrides(self, seeder, engine) -> None: assert count == 10 def test_seed_batched(self, seeder, engine) -> None: - """Verify batch_size works for larger inserts.""" count = seeder.seed_table("users", count=250, batch_size=100) assert count == 250 @@ -159,7 +150,6 @@ def test_seed_batched(self, seeder, engine) -> None: assert 
result.scalar() == 250 def test_seed_empty_mapping_raises(self, seeder, engine) -> None: - """Table with no mappable columns should raise.""" # The 'items' table has 'sku' which may be mapped via type fallback. # Verify by checking if introspection returns an empty map field_map = seeder._introspect_table("items") @@ -172,9 +162,7 @@ def test_seed_empty_mapping_raises(self, seeder, engine) -> None: assert count == 10 -# ------------------------------------------------------------------ # Dialect optimizations -# ------------------------------------------------------------------ class TestDialectOptimizations: diff --git a/tests/test_streaming.py b/tests/test_streaming.py index 823fa84..c9b1ece 100644 --- a/tests/test_streaming.py +++ b/tests/test_streaming.py @@ -21,8 +21,6 @@ def schema(forge: DataForge): class TestStreamToCsv: - """Tests for Schema.stream_to_csv.""" - def test_writes_file(self, schema) -> None: with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: path = f.name @@ -50,7 +48,6 @@ def test_csv_content(self, schema) -> None: os.unlink(path) def test_batch_size(self, schema) -> None: - """Small batch size should still produce correct output.""" with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: path = f.name try: @@ -80,7 +77,6 @@ def test_zero_count(self, schema) -> None: os.unlink(path) def test_via_forge(self, forge: DataForge) -> None: - """Test convenience method on DataForge.""" with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: path = f.name try: @@ -91,8 +87,6 @@ def test_via_forge(self, forge: DataForge) -> None: class TestStreamToJsonl: - """Tests for Schema.stream_to_jsonl.""" - def test_writes_file(self, schema) -> None: with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f: path = f.name @@ -142,8 +136,6 @@ def test_via_forge(self, forge: DataForge) -> None: class TestToParquet: - """Tests for Schema.to_parquet (requires pyarrow).""" - 
@pytest.fixture(autouse=True) def _skip_no_pyarrow(self) -> None: pytest.importorskip("pyarrow") @@ -202,7 +194,6 @@ def test_via_forge(self, forge: DataForge) -> None: os.unlink(path) def test_no_pyarrow_error(self, forge: DataForge, monkeypatch) -> None: - """Verify clear error when pyarrow is not installed.""" import builtins real_import = builtins.__import__ @@ -219,8 +210,6 @@ def mock_import(name, *args, **kwargs): class TestAsyncStream: - """Tests for Schema.async_stream.""" - @pytest.fixture def schema(self, forge: DataForge): return forge.schema(["first_name", "email", "city"]) diff --git a/tests/test_streaming_new.py b/tests/test_streaming_new.py index d501df3..448d82c 100644 --- a/tests/test_streaming_new.py +++ b/tests/test_streaming_new.py @@ -19,9 +19,7 @@ ) -# ------------------------------------------------------------------ # Fixtures -# ------------------------------------------------------------------ @pytest.fixture @@ -34,9 +32,7 @@ def schema(forge: DataForge): return forge.schema(["first_name", "email"]) -# ------------------------------------------------------------------ # TokenBucketRateLimiter -# ------------------------------------------------------------------ class TestTokenBucketRateLimiter: @@ -71,9 +67,7 @@ def test_slots(self) -> None: limiter.nonexistent = True # type: ignore[attr-defined] -# ------------------------------------------------------------------ # StreamEmitter (abstract base) -# ------------------------------------------------------------------ class TestStreamEmitter: @@ -121,9 +115,7 @@ def test_slots(self) -> None: emitter.nonexistent = True # type: ignore[attr-defined] -# ------------------------------------------------------------------ # HttpEmitter -# ------------------------------------------------------------------ class TestHttpEmitter: @@ -173,9 +165,7 @@ def test_emit_batch_non_batch_mode(self, mock_urlopen) -> None: assert mock_urlopen.call_count == 2 -# 
------------------------------------------------------------------ # KafkaEmitter -# ------------------------------------------------------------------ class TestKafkaEmitter: @@ -206,9 +196,7 @@ def test_slots(self) -> None: emitter.nonexistent = True # type: ignore[attr-defined] -# ------------------------------------------------------------------ # RabbitMQEmitter -# ------------------------------------------------------------------ class TestRabbitMQEmitter: @@ -248,9 +236,7 @@ def test_close_noop_when_not_connected(self) -> None: emitter.close() -# ------------------------------------------------------------------ # stream_to_emitter helper -# ------------------------------------------------------------------ class TestStreamToEmitter: diff --git a/tests/test_timeseries.py b/tests/test_timeseries.py index 518cc90..386bc52 100644 --- a/tests/test_timeseries.py +++ b/tests/test_timeseries.py @@ -15,9 +15,7 @@ ) -# ------------------------------------------------------------------ # Fixtures -# ------------------------------------------------------------------ @pytest.fixture @@ -25,9 +23,7 @@ def forge() -> DataForge: return DataForge(locale="en_US", seed=42) -# ------------------------------------------------------------------ # Interval parsing -# ------------------------------------------------------------------ class TestIntervalParsing: @@ -53,9 +49,7 @@ def test_min_suffix(self) -> None: assert _parse_interval("15min") == 900 -# ------------------------------------------------------------------ # Datetime parsing -# ------------------------------------------------------------------ class TestDatetimeParsing: @@ -74,9 +68,7 @@ def test_iso_roundtrip(self) -> None: assert "2024-06-15" in iso -# ------------------------------------------------------------------ # TimeSeriesSchema creation -# ------------------------------------------------------------------ class TestTimeSeriesSchema: @@ -113,9 +105,7 @@ def test_repr(self, forge: DataForge) -> None: assert 
"temp" in r -# ------------------------------------------------------------------ # Data generation -# ------------------------------------------------------------------ class TestTimeSeriesGeneration: @@ -266,9 +256,7 @@ def test_generate_empty_range(self, forge: DataForge) -> None: assert rows == [] -# ------------------------------------------------------------------ # Export methods -# ------------------------------------------------------------------ class TestTimeSeriesExport: @@ -311,14 +299,11 @@ def test_stream_yields_all_rows(self, forge: DataForge) -> None: assert len(rows) == 4 -# ------------------------------------------------------------------ # Integration via DataForge.timeseries() -# ------------------------------------------------------------------ class TestDataForgeTimeSeriesMethod: def test_timeseries_method_exists(self, forge: DataForge) -> None: - """DataForge should have a timeseries() method.""" assert hasattr(forge, "timeseries") def test_timeseries_via_forge(self, forge: DataForge) -> None: diff --git a/tests/test_tui.py b/tests/test_tui.py index 02db310..5280578 100644 --- a/tests/test_tui.py +++ b/tests/test_tui.py @@ -10,9 +10,7 @@ import pytest -# ------------------------------------------------------------------ # Helper -# ------------------------------------------------------------------ def _has_textual() -> bool: @@ -25,9 +23,7 @@ def _has_textual() -> bool: return False -# ------------------------------------------------------------------ # Import guards -# ------------------------------------------------------------------ class TestTUIImports: @@ -36,7 +32,6 @@ class TestTUIImports: reason="textual not installed", ) def test_launch_function_importable(self) -> None: - """The launch() function should be importable from tui package.""" from dataforge.tui import launch assert callable(launch) @@ -46,7 +41,6 @@ def test_launch_function_importable(self) -> None: reason="textual not installed", ) def test_app_class_importable(self) 
-> None: - """DataForgeTUI class should be importable when textual is available.""" from dataforge.tui.app import DataForgeTUI assert DataForgeTUI is not None @@ -61,9 +55,7 @@ def test_export_dialog_importable(self) -> None: assert ExportDialog is not None -# ------------------------------------------------------------------ # App construction (only if textual is installed) -# ------------------------------------------------------------------ @pytest.mark.skipif(not _has_textual(), reason="textual not installed") @@ -95,9 +87,7 @@ def test_forge_starts_none(self) -> None: assert app._forge is None -# ------------------------------------------------------------------ # Export dialog (only if textual is installed) -# ------------------------------------------------------------------ @pytest.mark.skipif(not _has_textual(), reason="textual not installed") diff --git a/tests/test_unique.py b/tests/test_unique.py index 7ba1053..8de900a 100644 --- a/tests/test_unique.py +++ b/tests/test_unique.py @@ -6,8 +6,6 @@ class TestUniqueScalar: - """Tests for unique scalar generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -36,7 +34,6 @@ def test_unique_boolean(self) -> None: assert {a, b} == {True, False} def test_unique_exhaustion_raises(self) -> None: - """When all possible values are exhausted, should raise.""" # boolean only has True/False self.forge.unique.misc.boolean() self.forge.unique.misc.boolean() @@ -45,8 +42,6 @@ def test_unique_exhaustion_raises(self) -> None: class TestUniqueBatch: - """Tests for unique batch generation.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -57,20 +52,17 @@ def test_batch_all_unique(self) -> None: assert len(set(results)) == 50 # all unique def test_batch_extends_uniqueness(self) -> None: - """Batch values don't repeat values from earlier calls.""" first = self.forge.unique.person.first_name() batch = self.forge.unique.person.first_name(count=10) 
assert first not in batch def test_batch_large(self) -> None: - results = self.forge.unique.address.city(count=100) - assert len(results) == 100 - assert len(set(results)) == 100 + results = self.forge.unique.address.city(count=40) + assert len(results) == 40 + assert len(set(results)) == 40 class TestUniqueClear: - """Tests for clearing unique tracking.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) @@ -99,8 +91,6 @@ def test_clear_one_doesnt_affect_other(self) -> None: class TestUniqueProxy: - """Tests for UniqueProxy behavior.""" - def setup_method(self) -> None: self.forge = DataForge(locale="en_US", seed=42) diff --git a/uv.lock b/uv.lock index 3d63014..d082e66 100644 --- a/uv.lock +++ b/uv.lock @@ -36,7 +36,7 @@ wheels = [ [[package]] name = "dataforge-py" -version = "0.3.0" +version = "0.4.0" source = { editable = "." } [package.optional-dependencies] @@ -83,7 +83,7 @@ provides-extras = ["kafka", "rabbitmq", "tui", "db", "all"] dev = [ { name = "pytest", specifier = ">=8.0" }, { name = "pytest-asyncio", specifier = ">=0.24" }, - { name = "ruff", specifier = ">=0.9" }, + { name = "ruff", specifier = ">=0.15.4,<0.16" }, ] [[package]]