diff --git a/dbzero/dbzero/atomic.py b/dbzero/dbzero/atomic.py index 621e0511..267e5955 100644 --- a/dbzero/dbzero/atomic.py +++ b/dbzero/dbzero/atomic.py @@ -1,23 +1,122 @@ +from __future__ import annotations + +from typing import Any, Dict +from .interfaces import Memo from .dbzero import begin_atomic, assign class AtomicManager: - def __enter__(self): - self.__ctx = begin_atomic() - return self.__ctx + """Context manager that provides atomic context functionality for dbzero operations. + + It is intended for use in a 'with' statement. + """ + + def __init__(self): + self.__ctx = None + + def __enter__(self) -> AtomicManager: + self.begin() + return self def __exit__(self, exc_type, exc_value, traceback): if exc_type is None: - self.__ctx.close() + self.close() else: - self.__ctx.cancel() - self.__ctx = None + self.cancel() + + def begin(self): + """Begin the atomic context""" + if self.__ctx is None: + self.__ctx = begin_atomic() + + def close(self): + """Close the atomic context, staging the changes for commit""" + if self.__ctx is None: + return + + self.__ctx.close() + self.__ctx = None + + def cancel(self): + """Cancel the atomic context, reverting all changes""" + if self.__ctx is None: + return + + self.__ctx.cancel() + self.__ctx = None + +def atomic() -> AtomicManager: + """Create a context manager to group multiple mutating operations into a single indivisible transaction. + + This function ensures that all modifications within the `with` block are applied together, or none are applied at all. + If the block completes successfully, all changes are merged into the current + transaction. If an exception occurs inside the block, or if the transaction is + manually canceled, all changes are reverted, leaving the data in its previous state. + + Returns + ------- + AtomicManager + A context manager that provides atomic context functionality. 
+ + Examples + -------- + Grouping successful operations: + + >>> obj1 = MyMemoClass("initial value") + >>> with dbzero.atomic(): + ... obj1.value = "updated value" + ... obj2 = MyMemoClass("new object") + ... dbzero.tags(obj2).add("new") + >>> # Both changes are now visible + >>> assert obj1.value == "updated value" + + Automatic rollback on exception: -def atomic(): + >>> obj = MyMemoClass(value=100) + >>> try: + ... with dbzero.atomic(): + ... obj.value = 200 # This change will be reverted + ... raise ValueError("Something went wrong") + ... except ValueError: + ... print("Caught expected error.") + >>> # The object's value is unchanged + >>> assert obj.value == 100 + + Manual rollback with cancel(): + + >>> obj = MyMemoClass(value=100) + >>> with dbzero.atomic() as atomic: + ... obj.value = 200 + ... if obj.value > 150: + ... print("Value too high, canceling.") + ... atomic.cancel() + >>> # The changes were discarded + >>> assert obj.value == 100 + + Notes + ----- + An atomic() block does not immediately create a new, committed transaction or + increment the global state number. Instead, the changes are staged + and applied as part of the surrounding transaction, which is then committed + either manually via dbzero.commit() or by the autocommit mechanism. + """ return AtomicManager() -def atomic_assign(*args, **kwargs): +def atomic_assign(*objects: Memo, **attributes: Dict[str, Any]) -> None: + """Perform bulk attribute updates on one or more Memo objects within an atomic transaction. + + This is a helper function that performs `dbzero.assign` operation in an atomic context. + + Parameters + ---------- + *objects : Any + A variable number of Memo objects to modify. + **attributes : Dict[str, Any] + The attributes to update, provided as name=value pairs where each key is + the name of an attribute to update and the corresponding value is the new + value to assign. 
+ """ with atomic(): - assign(*args, **kwargs) + assign(*objects, **attributes) diff --git a/dbzero/dbzero/compare.py b/dbzero/dbzero/compare.py index 21724b79..b96e3913 100644 --- a/dbzero/dbzero/compare.py +++ b/dbzero/dbzero/compare.py @@ -1,8 +1,68 @@ +from typing import Optional, List +from .interfaces import Memo, Tag import dbzero as db0 from .dbzero import _compare -def compare(obj_1, obj_2, tags=None): +def compare(obj_1: Memo, obj_2: Memo, tags: Optional[List[Tag]] = None) -> bool: + """Perform a deep, content-based comparison of two Memo objects to check if they are identical. + + By default, it only compares the objects' data fields. + When optional `tags` parameter is provided, their tag assignments are included in the comparison. + + Parameters + ---------- + obj_1 : Memo + The first Memo object for comparison. + obj_2 : Memo + The second Memo object for comparison. + tags : list of Tag, optional + A list of tags to include in the comparison. If provided, the method + will check if both objects have an identical assignment (or lack) of each + tag in the list. + + Returns + ------- + bool + True if the objects are of the same type and their data content (and + specified tags, if checked) are identical. False if the objects are of + different types, their data differs, or their specified tag assignments differ. + + Examples + -------- + Basic content comparison: + + >>> obj_1 = MemoTestClass(100) + >>> obj_2 = MemoTestClass(100) + >>> dbzero.compare(obj_1, obj_2) # Returns True + >>> + >>> # Change the content of one object + >>> obj_2.value = 200 + >>> dbzero.compare(obj_1, obj_2) # Returns False + + Comparing with tags: + + >>> obj_A = MemoTestClass(100) + >>> obj_B = MemoTestClass(100) + >>> + >>> dbzero.tags(obj_B).add("featured") + >>> dbzero.commit() + >>> + >>> # Default comparison ignores tags and returns True + >>> # because their content is the same. 
+ >>> assert dbzero.compare(obj_A, obj_B) == True + >>> + >>> # Including the 'featured' tag in the comparison + >>> # returns False because obj_A lacks the tag. + >>> assert dbzero.compare(obj_A, obj_B, tags=['featured']) == False + >>> + >>> # Now add the tag to obj_A as well + >>> dbzero.tags(obj_A).add("featured") + >>> dbzero.commit() + >>> + >>> # The comparison now returns True + >>> assert dbzero.compare(obj_A, obj_B, tags=['featured']) == True + """ if _compare(obj_1, obj_2): # if objects are identical then also compare tags if tags is not None: diff --git a/dbzero/dbzero/dbzero.pyi b/dbzero/dbzero/dbzero.pyi index 27271ec0..dc5b6ebd 100644 --- a/dbzero/dbzero/dbzero.pyi +++ b/dbzero/dbzero/dbzero.pyi @@ -2,11 +2,16 @@ Type stubs for dbzero module. """ -from typing import Any, Optional, Iterable, Set, Dict, List, Tuple, Iterator, Union, Callable +from typing import Any, Optional, Iterable, Dict, List, Tuple, Union, Callable +from .interfaces import ( + Memo, MemoWeakProxy, QueryObject, Tag, TagSet, EnumValue, + ListObject, IndexObject, TupleObject, SetObject, DictObject, ByteArrayObject, + ObjectTagManager, Snapshot +) # Core workspace management functions -def init(db0_dir: str, config: Optional[Dict[str, Any]] = None) -> None: +def init(path: str, config: Optional[Dict[str, Any]] = None) -> None: """Initialize the dbzero environment in a specified directory and apply global configurations. This function sets up the underlying state management engine. @@ -16,7 +21,7 @@ def init(db0_dir: str, config: Optional[Dict[str, Any]] = None) -> None: Parameters ---------- - db0_dir : str + path : str The path to the data files directory. If the directory doesn't exist, it will be created. config : dict, optional @@ -41,7 +46,7 @@ def init(db0_dir: str, config: Optional[Dict[str, Any]] = None) -> None: """ ... 
-def open(prefix: str, mode: str = "rw", **kwargs: Any) -> None: +def open(prefix_name: str, open_mode: str = "rw", **kwargs: Any) -> None: """Open a data prefix and set it as the current working context. This function is the primary way to access a specific dataset within the dbzero environment. @@ -49,9 +54,9 @@ def open(prefix: str, mode: str = "rw", **kwargs: Any) -> None: Parameters ---------- - prefix : str + prefix_name : str The unique name for the data partition you want to open. - mode : {"rw", "r"}, default "rw" + open_mode : {"rw", "r"}, default "rw" The mode for opening the prefix. "rw" for read-write mode (allows both reading and modifying objects), "r" for read-only mode (prevents any changes to the data). @@ -83,7 +88,7 @@ def open(prefix: str, mode: str = "rw", **kwargs: Any) -> None: """ ... -def close(prefix: Optional[str] = None) -> None: +def close(prefix_name: Optional[str] = None) -> None: """Gracefully shut down dbzero, persisting changes and releasing resources. When called without arguments, closes all open prefixes and terminates the entire @@ -92,7 +97,7 @@ def close(prefix: Optional[str] = None) -> None: Parameters ---------- - prefix : str, optional + prefix_name : str, optional Optional name of the prefix to close. If omitted, all prefixes and the dbzero instance are closed. @@ -113,14 +118,14 @@ def close(prefix: Optional[str] = None) -> None: """ ... -def commit(prefix: Optional[str] = None) -> None: +def commit(prefix_name: Optional[str] = None) -> None: """Save all in-memory object changes to persistent storage. Finalizes the current open transaction, ensuring data is durable and consistent. Parameters ---------- - prefix : str, optional + prefix_name : str, optional Optional specific data prefix to commit. If omitted, commits changes for all open prefixes. 
@@ -143,7 +148,7 @@ def commit(prefix: Optional[str] = None) -> None: # Object retrieval and management -def fetch(id: Union[str, type], expected_type: Optional[type] = None, prefix: Optional[str] = None) -> Any: +def fetch(id: Union[str, type], type: Optional[type] = None, prefix: Optional[str] = None) -> Memo: """Retrieve a single object directly from memory using its unique identifier. The fastest way to access an object, operating in constant time O(1). @@ -156,7 +161,7 @@ def fetch(id: Union[str, type], expected_type: Optional[type] = None, prefix: Op * UUID string: Returns the specific object instance for that UUID * type (singleton class): Returns the unique instance of that singleton - expected_type : type, optional + type : type, optional Optional type to validate the retrieved object. Raises exception if the fetched object is not an instance of this type. prefix : str, optional @@ -165,8 +170,8 @@ def fetch(id: Union[str, type], expected_type: Optional[type] = None, prefix: Op Returns ------- - Any - The requested object instance. Subsequent calls with the same UUID return + Memo + The requested Memo object instance. Subsequent calls with the same UUID return the exact same Python object, not a copy. Raises @@ -197,7 +202,7 @@ def fetch(id: Union[str, type], expected_type: Optional[type] = None, prefix: Op """ ... -def exists(identifier: Union[str, type], object_type: Optional[type] = None, prefix: Optional[str] = None) -> bool: +def exists(id: Union[str, type], type: Optional[type] = None, prefix: Optional[str] = None) -> bool: """Check if a dbzero object exists. Can check by UUID or by singleton type. @@ -205,12 +210,12 @@ def exists(identifier: Union[str, type], object_type: Optional[type] = None, pre Parameters ---------- - identifier : str or type + id : str or type The object to check for. 
* str: Check for object with its unique identifier * type: Check for instance of this singleton type - object_type : type, optional + type : type, optional Optional expected type when checking by UUID. Verifies the found object is an instance of this type. prefix : str, optional @@ -220,7 +225,7 @@ def exists(identifier: Union[str, type], object_type: Optional[type] = None, pre Returns ------- bool - True if the object exists (and matches object_type if specified), False otherwise. + True if the object exists (and matches type if specified), False otherwise. Examples -------- @@ -246,7 +251,7 @@ def exists(identifier: Union[str, type], object_type: Optional[type] = None, pre """ ... -def uuid(obj: Any) -> str: +def uuid(obj: Memo, /) -> str: """Get the unique, persistent identifier (UUID) for a dbzero-managed object. Returns a stable handle that allows the object to be reliably fetched @@ -281,7 +286,7 @@ def uuid(obj: Any) -> str: """ ... -def load(obj: Any, *, exclude: Optional[Union[List[str], Tuple[str, ...]]] = None, **kwargs: Any) -> Any: +def load(obj: Any, /, *, exclude: Optional[Union[List[str], Tuple[str, ...]]] = None, **kwargs: Any) -> Any: """Recursively convert any object into its equivalent native Python representation. Useful for exporting application state for APIs or functions expecting standard Python types, @@ -361,7 +366,7 @@ def load(obj: Any, *, exclude: Optional[Union[List[str], Tuple[str, ...]]] = Non """ ... -def hash(obj: Any) -> int: +def hash(obj: Any, /) -> int: """Compute a deterministic 64-bit integer hash for any object. Generates a hash value guaranteed to be consistent across different Python @@ -407,14 +412,15 @@ def hash(obj: Any) -> int: """ ... -def set_prefix(instance: Any, prefix: Optional[str]) -> None: - """Set the persistence prefix for a @dbzero.memo class instance dynamically at runtime. 
+def set_prefix(object: Memo, prefix: Optional[str]) -> None: + """Set the persistence prefix for a Memo instance dynamically at runtime. Allows to control which data prefix an object belongs to. + MUST be called as the first statement inside __init__ constructor. Parameters ---------- - instance : Any + object : Memo The class instance being initialized. You should always pass 'self'. prefix : str, optional Name of the prefix (scope) for the instance. If None, uses current default prefix. @@ -432,30 +438,23 @@ def set_prefix(instance: Any, prefix: Optional[str]) -> None: ... ... def set(self, key, value): ... self.data[key] = value - - Notes - ----- - **Use constraints:** - - * MUST be called as the first statement inside __init__ constructor - * Objects can only hold direct references to others from the same prefix """ ... -def materialized(obj: Any) -> Any: - """Provide a reference to a @dbzero.memo object that is safe for use within its own __init__. +def materialized(obj: Memo, /) -> Memo: + """Provide a reference to a Memo object that is safe for use within its own __init__. Solves the chicken-and-egg problem where an object isn't fully initialized in dbzero until its __init__ completes, but you need to reference it during construction. Parameters ---------- - obj : Any + obj : Memo The object instance (typically 'self') being initialized. Returns ------- - Any + Memo A stable handle to the object that can be used with other dbzero functions. Examples @@ -486,7 +485,7 @@ def materialized(obj: Any) -> Any: """ ... -def assign(*objects: Any, **attributes: Any) -> None: +def assign(*objects: Memo, **attributes: Dict[str, Any]) -> None: """Perform bulk attribute updates on one or more Memo objects. Convenient way to set multiple attributes to new values in a single, @@ -494,9 +493,9 @@ def assign(*objects: Any, **attributes: Any) -> None: Parameters ---------- - *objects : Any + *objects : Memo One or more Memo objects whose attributes you want to update. 
- **attributes : dict + **attributes : Dict[str, Any] The attributes to update, provided as name=value pairs. Examples @@ -526,13 +525,13 @@ def assign(*objects: Any, **attributes: Any) -> None: """ ... -def touch(*objects: Any) -> None: - """Mark one or more memo objects as modified without changing their data. +def touch(*objects: Memo) -> None: + """Mark one or more Memo objects as modified without changing their data. Parameters ---------- - *objects : Any - The memo object(s) to be touched/marked as modified. + *objects : Memo + The Memo object(s) to be touched/marked as modified. Examples -------- @@ -550,7 +549,7 @@ def touch(*objects: Any) -> None: ... def rename_field(class_obj: type, old_name: str, new_name: str) -> None: - """Rename a field for a given class. + """Rename a field for a given Memo class. Modifies the internal field layout for all existing and future instances of the class. After execution, field is accessible only by its new name. @@ -625,7 +624,7 @@ def clear_cache() -> None: """ ... -def set_cache_size(size: int) -> None: +def set_cache_size(size: int, /) -> None: """Set the maximum size of the in-memory cache in bytes. Allows dynamic adjustment of memory ceiling for cache at runtime. Increase @@ -647,7 +646,7 @@ def set_cache_size(size: int) -> None: # Collection creation functions -def list(iterable: Optional[Iterable[Any]] = None) -> List: +def list(iterable: Optional[Iterable[Any]] = None, /) -> ListObject: """Create a persistent, mutable sequence object. Parameters @@ -657,8 +656,8 @@ def list(iterable: Optional[Iterable[Any]] = None) -> List: Returns ------- - dbzero.list - A new dbzero.list object that has the same interface as python list. + ListObject + A new ListObject that has the same interface as Python list. Examples -------- @@ -681,7 +680,7 @@ def list(iterable: Optional[Iterable[Any]] = None) -> List: """ ... 
-def index() -> Any: +def index() -> IndexObject: """Create a persistent, ordered data structure for efficient queries. An index is like a dictionary where keys are always sorted, enabling fast range scans @@ -689,13 +688,8 @@ def index() -> Any: Returns ------- - dbzero.index - A new dbzero.index object supporting following methods: - - * add(key, value): Associate a value with a sortable key - * remove(key, value): Remove specific key-value pair - * select(min_key, max_key, null_first): Query objects within key range - * sort(iterable, desc, null_first): Sort objects by their keys in this index + IndexObject + A new IndexObject. Examples -------- @@ -735,7 +729,7 @@ def index() -> Any: """ ... -def tuple(iterable: Iterable[Any] = ()) -> Tuple: +def tuple(iterable: Iterable[Any] = (), /) -> TupleObject: """Create a persistent, immutable sequence object. Parameters @@ -746,8 +740,8 @@ def tuple(iterable: Iterable[Any] = ()) -> Tuple: Returns ------- - dbzero.tuple - A new dbzero.tuple object that has the same interface as python tuple. + TupleObject + A new TupleObject that has the same interface as Python tuple. Examples -------- @@ -780,7 +774,7 @@ def tuple(iterable: Iterable[Any] = ()) -> Tuple: """ ... -def set(iterable: Optional[Iterable[Any]] = None) -> Set: +def set(iterable: Optional[Iterable[Any]] = None, /) -> SetObject: """Create a persistent, mutable, unordered collection of unique elements. Parameters @@ -791,8 +785,8 @@ def set(iterable: Optional[Iterable[Any]] = None) -> Set: Returns ------- - dbzero.set - A new dbzero.set object that has the same interface as python set. + SetObject + A new SetObject that has the same interface as Python set. Examples -------- @@ -822,20 +816,20 @@ def set(iterable: Optional[Iterable[Any]] = None) -> Set: """ ... -def dict(iterable: Optional[Iterable], **kwargs: Any) -> Dict: +def dict(iterable: Optional[Iterable], /, **kwargs: Any) -> DictObject: """Create a persistent, mutable mapping object. 
 Parameters ---------- iterable : Iterable, optional - Mapping object ot iterable of key-value pairs. + Mapping object or iterable of key-value pairs. **kwargs : Any Initialize dictionary with keyword arguments Returns ------- - dbzero.dict - A new dbzero.dict object that has the same interface as python dict. + DictObject + A new DictObject that has the same interface as Python dict. Examples -------- @@ -864,7 +858,7 @@ def dict(iterable: Optional[Iterable], **kwargs: Any) -> Dict: """ ... -def bytearray(source: Union[bytes, Iterable[int]] = b'') -> Any: +def bytearray(source: Union[bytes, Iterable[int]] = b'', /) -> ByteArrayObject: """Create a mutable persisted sequence of bytes. Parameters @@ -875,8 +869,8 @@ def bytearray(source: Union[bytes, Iterable[int]] = b'') -> Any: Returns ------- - dbzero.bytearray - A new dbzero.bytearray object that has the same interface as python bytearray. + ByteArrayObject + A new ByteArrayObject that has the same interface as Python bytearray. Examples -------- @@ -895,24 +889,18 @@ def bytearray(source: Union[bytes, Iterable[int]] = b'') -> Any: # Tag and query functions -def tags(*objects: Any) -> Any: - """Get a tag controller for managing tags on dbzero objects. - - Returns a special tag controller object that provides interface to add or remove - tags from the specified object(s). +def tags(*objects: Memo) -> ObjectTagManager: + """Get a tag manager instance for Memo objects. Parameters ---------- - *objects : Any - One or more dbzero-managed objects to manage tags for. + *objects : Memo + One or more Memo objects to manage tags for. Returns ------- - Any - A TagController instance with methods: - - * add(*tags): Associate one or more tags with the object(s) - * remove(*tags): Disassociate one or more tags from the object(s) + ObjectTagManager + An ObjectTagManager instance for given Memo objects. Examples -------- @@ -938,7 +926,7 @@ def tags(*objects: Any) -> Any: """ ... 
-def find(*query_criteria: Any, prefix: Optional[str] = None) -> Iterator[Any]: +def find(*query_criteria: Union[Tag, List[Tag], Tuple[Tag], QueryObject, TagSet], prefix: Optional[str] = None) -> QueryObject: """Query for memo objects based on search criteria such as tags, types, or subqueries. The primary way to search for objects. All top-level criteria are combined @@ -946,7 +934,7 @@ def find(*query_criteria: Any, prefix: Optional[str] = None) -> Iterator[Any]: Parameters ---------- - *query_criteria : Any + *query_criteria : Union[Tag, List[Tag], Tuple[Tag], Query, TagSet] Variable number of criteria to filter objects: * Type: A class to filter by type (includes subclasses) @@ -954,17 +942,16 @@ def find(*query_criteria: Any, prefix: Optional[str] = None) -> Iterator[Any]: * Object tag: Any memo object used as a tag * List of tags (OR): Objects with at least one of the specified tags * Tuple of tags (AND): Objects with all of the specified tags - * Subquery: Result of another dbzero.find or index.select query - * Logical NOT: Use dbzero.no() to negate a query + * Query: Result of another query + * TagSet: Logical set operation. prefix : str, optional Optional data prefix to run the query on. - If omitted, uses the current default prefix. + If omitted, the prefix to run the query is resolved from query criteria. Returns ------- - Iterator[Any] - A lazily-evaluated iterable query object. Supports iteration, len(), slicing, - and boolean evaluation. Actual lookup is deferred until iteration. + Query + An iterable query object. Examples -------- @@ -1003,7 +990,7 @@ def find(*query_criteria: Any, prefix: Optional[str] = None) -> Iterator[Any]: """ ... -def no(predicate: Union[str, Any]) -> Any: +def no(predicate: Union[str, QueryObject], /) -> TagSet: """Create a negative predicate (NOT condition) for find queries. 
 Allows to exclude objects that match the given predicate, @@ -1011,13 +998,13 @@ def no(predicate: Union[str, Any]) -> Any: Parameters ---------- - predicate : str or Any + predicate : str or Query The condition to negate. Returns ------- - Any - A predicate object that dbzero.find interprets as a logical NOT operation. + TagSet + A predicate object representing logical NOT operation. Examples -------- @@ -1046,22 +1033,22 @@ def no(predicate: Union[str, Any]) -> Any: """ ... -def as_tag(obj: Any) -> Any: +def as_tag(obj: Union[Memo, MemoWeakProxy, type]) -> Tag: """Create a searchable Tag object from a Memo instance or class. - Allows to use any object or class as a label for other objects. + Allows to use Memo object or class as a label for other objects. Tags created from objects are stable identifiers that will work even if the original object is deleted. Parameters ---------- - obj : Any - The object instance or class to convert into a tag. + obj : Union[Memo, type] + The Memo object or class to convert into a tag. Returns ------- - Any - A special Tag object + Tag + Objects' tag. Examples -------- @@ -1084,34 +1071,31 @@ def as_tag(obj: Any) -> Any: """ ... -def split_by(groups: Union[List[str], List[Any]], query: Any, exclusive: bool = True) -> Any: +def split_by(tags: List[Tag], query: QueryObject, exclusive: bool = True) -> QueryObject: """Transform a query by decorating result items with specified groups, such as tags or enum values which they are tagged with. - Effectively, it partitions or categorizes query results. For each item returned by - the input query that is associated with a group, it additionaly yields a tag in a (item, decorator) tuple. + Effectively, it categorizes query results. For each item returned by + the input query that is associated with a group, it additionally yields + a tag in a (item, decorator) tuple. 
Parameters ---------- - groups : List[str] or List[Any] - Collection of groups to split results by: - - * List of string tags - * List of enum values - query : Any - The input query whose results will be categorized. - Any dbzero query object from dbzero.find(), dbzero.filter(), etc. + tags : List[Tag] + A list of tags to split results by. + query : Query + The input query whose result set will be categorized. exclusive : bool, default True Controls handling of items belonging to multiple groups: - * True: Item appears once, paired with one matching group - * False: Item appears for every matching group (multiple times) + * True: Item appears only once, paired with one matching group + * False: Item appears for every matching group Returns ------- - Any - A new query iterator yielding (item, decorator) tuples where item is - from the original query and decorator is the matched tag/enum group. + Query + A new query yielding (item, decorator) tuples where item is from + the original query and decorator is the matched tag/enum group. Examples -------- @@ -1146,27 +1130,25 @@ def split_by(groups: Union[List[str], List[Any]], query: Any, exclusive: bool = """ ... -def filter(filter_func: Callable[[Any], bool], iterable: Any) -> Any: - """Apply fine-grained, custom filtering logic to a sequence of objects. +def filter(filter: Callable[[Any], bool], query: QueryObject) -> QueryObject: + """Apply fine-grained, custom filtering logic to a query. - Useful in situations where complex filtering conditions cannot be expressed with - dbzero.find(). Works similarly to Python's built-in filter(), but seamlessly integrates - into the dbzero query pipeline. + Useful in situations where complex filtering conditions cannot be expressed + with tags and dbzero.find(). Works similarly to Python's built-in filter(), + but seamlessly integrates into the dbzero query pipeline. 
Parameters ---------- - filter_func : Callable[[Any], bool] + filter : Callable[[Any], bool] A function or lambda that takes a single object as argument. Must return True to include the object, False to exclude it. - iterable : Any - The sequence of objects to filter. Typically result of another - dbzero operation like dbzero.find(), dbzero.index().range(), etc. + query : Query + A query to filter. Returns ------- - Any - An iterable that yields only items from input iterable - for which filter_func returns True. + Query + A query that only yields items for which filter function returned True. Examples -------- @@ -1256,7 +1238,7 @@ def get_state_num(prefix: Optional[str] = None, finalized: bool = False) -> int: # Snapshot functions -def snapshot(state_spec: Optional[Union[int, Dict[str, int]]] = None) -> Any: +def snapshot(state_spec: Optional[Union[int, Dict[str, int]]] = None) -> Snapshot: """Create a read-only, point-in-time view of the prefix for time-travel queries. Essential for isolating long-running queries from concurrent writes, analyzing @@ -1273,14 +1255,8 @@ def snapshot(state_spec: Optional[Union[int, Dict[str, int]]] = None) -> Any: Returns ------- - Any - A Snapshot context manager object with methods: - - * find(): Query objects within the snapshot - * fetch(): Retrieve single object by UUID or singleton by class - * get_state_num(): Get state number for a prefix within snapshot - * deserialize(): Deserialize and run query against snapshot's state - * close(): Manually close snapshot + Snapshot + A Snapshot context manager object. Examples -------- @@ -1309,17 +1285,17 @@ def snapshot(state_spec: Optional[Union[int, Dict[str, int]]] = None) -> Any: """ ... -def get_snapshot_of(obj: Any) -> Any: +def get_snapshot_of(obj: Memo, /) -> Snapshot: """Retrieve the Snapshot instance from which a given object originates. Parameters ---------- - obj : Any + obj : Memo An object instance previously fetched from a snapshot. 
Returns ------- - Any + Snapshot The Snapshot object corresponding to the state from which obj was loaded. Examples @@ -1338,7 +1314,7 @@ def get_snapshot_of(obj: Any) -> Any: """ ... -def is_memo(obj: Any) -> bool: +def is_memo(obj: Any, /) -> bool: """Check if a given object is a dbzero memo class or instance of one. Parameters @@ -1384,7 +1360,7 @@ def is_memo(obj: Any) -> bool: """ ... -def is_singleton(obj: Any) -> bool: +def is_singleton(obj: Any, /) -> bool: """Check if a given object is a dbzero singleton instance. Parameters @@ -1418,7 +1394,7 @@ def is_singleton(obj: Any) -> bool: """ ... -def is_enum(value: Any) -> bool: +def is_enum(value: Any, /) -> bool: """Check if an object is a dbzero enum type. Parameters @@ -1446,7 +1422,7 @@ def is_enum(value: Any) -> bool: """ ... -def is_enum_value(value: Any) -> bool: +def is_enum_value(value: Any, /) -> bool: """Check if an object is a dbzero enum value. Parameters @@ -1477,7 +1453,7 @@ def is_enum_value(value: Any) -> bool: """ ... -def get_schema(cls: type) -> Dict[str, Dict[str, Any]]: +def get_schema(cls: type, /) -> Dict[str, Dict[str, Any]]: """Introspect all in-memory instances of a @dbzero.memo class to generate dynamic schema. Provides current overview of attributes and their most common data types @@ -1578,7 +1554,7 @@ def get_config() -> Dict[str, Any]: # Serialization functions -def serialize(obj: Any) -> bytes: +def serialize(obj: Union[QueryObject, EnumValue], /) -> bytes: """Convert a dbzero query iterable or enum value into platform-independent binary representation. Parameters @@ -1586,8 +1562,8 @@ def serialize(obj: Any) -> bytes: obj : Any The dbzero object to serialize: - * Query iterable (result of dbzero.find(...)) - * dbzero enum value (e.g., Colors.RED) + * Query iterable + * dbzero enum value Returns ------- @@ -1616,8 +1592,8 @@ def serialize(obj: Any) -> bytes: """ ... 
-def deserialize(data: bytes) -> Any: - """Reconstruct a dbzero query iterable or enum value from serialized bytes. +def deserialize(data: bytes, /) -> Any: + """Reconstruct a dbzero object from serialized bytes. Parameters ---------- @@ -1627,14 +1603,13 @@ def deserialize(data: bytes) -> Any: Returns ------- Any - A fully functional dbzero object (query iterable or enum value) that was - encoded in the data bytes. + A dbzero object that was encoded in the data bytes. """ ... # Synchronization functions -def wait(prefix: str, state_num: int, timeout: Optional[int] = None) -> bool: +def wait(prefix: str, state: int, timeout: Optional[int] = None) -> bool: """Block execution until desired prefix reaches target state or timeout occurs. Low-level mechanism for synchronizing processes by waiting on data updates. @@ -1646,7 +1621,7 @@ def wait(prefix: str, state_num: int, timeout: Optional[int] = None) -> bool: prefix : str Name of the prefix to monitor for changes. Use dbzero.get_current_prefix().name for current prefix. - state_num : int + state : int Target state number to wait for. Use dbzero.get_state_num(prefix) for current state. timeout : int, optional @@ -1655,7 +1630,7 @@ def wait(prefix: str, state_num: int, timeout: Optional[int] = None) -> bool: Returns ------- bool - True if database reached/surpassed target state_num within timeout. + True if prefix reached/surpassed target state_num within timeout. False if timeout occured. Returns True immediately if current state is already >= target state_num. @@ -1693,7 +1668,7 @@ def wait(prefix: str, state_num: int, timeout: Optional[int] = None) -> bool: # Object lifecycle functions -def getrefcount(obj: Any) -> int: +def getrefcount(obj: Union[Memo, type]) -> int: """Get the number of strong references to a memo object or class. Low-level utility useful for debugging and understanding memory management within dbzero. @@ -1766,22 +1741,22 @@ def getrefcount(obj: Any) -> int: """ ... 
-def weak_proxy(obj: Any) -> Any: - """Create a weak reference to a dbzero managed object. +def weak_proxy(obj: Memo) -> MemoWeakProxy: + """Create a weak reference to a Memo object. Allows storing reference to an object without increasing its reference count. Crucial for preventing circular dependencies and enabling cross-prefix references. Parameters ---------- - obj : Any + obj : Memo A dbzero managed object to create a weak reference to. Returns ------- - Any + MemoWeakProxy A special weak proxy object that behaves like the original for most operations - but doesn't keep the original object alive. + but doesn't extend the original objects' lifetime. Examples -------- @@ -1814,15 +1789,15 @@ def weak_proxy(obj: Any) -> Any: """ ... -def expired(proxy_object: Any) -> bool: +def expired(proxy_object: MemoWeakProxy) -> bool: """Check if the target object of a dbzero.weak_proxy has been garbage-collected. Used to determine if the original object still exists and can be accessed. Parameters ---------- - proxy_object : Any - The weak proxy object (from dbzero.weak_proxy()) to check. + proxy_object : MemoWeakProxy + The weak proxy object to check. 
Returns ------- diff --git a/dbzero/dbzero/decorators.py b/dbzero/dbzero/decorators.py index 400769ee..d42b5392 100644 --- a/dbzero/dbzero/decorators.py +++ b/dbzero/dbzero/decorators.py @@ -8,6 +8,7 @@ def check_params_not_equal(params, count): return len(params.args) != count or params.varargs or params.varkw or params.kwonlyargs def immutable(f): + """A decorator to mark a function as not modifying.""" @functools.wraps(f) def wrapper(*args, **kwargs): retval = f(*args, **kwargs) @@ -19,6 +20,7 @@ def wrapper(*args, **kwargs): def fulltext(f): + """A decorator to mark a function as fulltext query.""" @functools.wraps(f) def wrapper(*args, **kwargs): retval = f(*args, **kwargs) @@ -57,6 +59,7 @@ def _get_function_context(f): return context def complete_with(action): + """A decorator for specifying confirmed action.""" def decorator(f): # context = _get_function_context(f) # if not hasattr(context, action): diff --git a/dbzero/dbzero/enum.py b/dbzero/dbzero/enum.py index fd789285..eb713a46 100644 --- a/dbzero/dbzero/enum.py +++ b/dbzero/dbzero/enum.py @@ -1,7 +1,80 @@ +from typing import Any, Optional, List, Union, overload +from .interfaces import EnumType from .dbzero import _make_enum -def enum(cls=None, *args, **kwargs): +@overload +def enum(cls: str, values: List[str], *, prefix: Optional[str] = None) -> EnumType: ... + +@overload +def enum(*, values: List[str], prefix: Optional[str] = None) -> EnumType: ... + +@overload +def enum(cls: type) -> EnumType: ... + +def enum(cls: Optional[Union[str, type]] = None, *args: Any, **kwargs: Any) -> EnumType: + """Create a persistent, type-safe enumerated type. + + Useful for defining a set of named constants. + Using dbzero enums instead of raw strings for tags or object members prevents accidental + clashes and makes the data model more robust and explicit. + + Parameters + ---------- + cls : str + The name for the enum type. + When used as a decorator, the name is inferred from the class name.
+ values : list of str + A list of unique string names for the enum members. The order of values + is preserved. + prefix : str, optional + Scopes the enum definition to a specific data prefix. If None (default), + the enum is defined in the prefix that is active at the time of creation. + This allows for creating data-model-specific enums. + + Returns + ------- + EnumType + A new EnumType object. + + Examples + -------- + Decorator with values: + + >>> @dbzero.enum(values=["PENDING", "ACTIVE", "INACTIVE"]) + ... class Status: + ... pass + >>> print(Status.ACTIVE) # "ACTIVE" + + As a function: + + >>> Color = dbzero.enum("Color", ["RED", "GREEN", "BLUE"]) + >>> print(Color.RED) # "RED" + + Accessing members: + + >>> # Access by attribute + >>> active_status = Status.ACTIVE + >>> # Access by string key + >>> red_color = Color['RED'] + >>> # Access by integer index + >>> green_color = Color[1] # Corresponds to "GREEN" + + Type safety for tagging: + + >>> Color = dbzero.enum("Color", ["RED", "GREEN", "BLUE"]) + >>> Palette = dbzero.enum("Palette", ["RED", "ORANGE", "YELLOW"]) + >>> + >>> # Tag different objects + >>> dbzero.tags(obj1).add(Color.RED) + >>> dbzero.tags(obj2).add(Palette.RED) + >>> dbzero.tags(obj3).add("RED") + >>> + >>> # Each find query returns distinct sets + >>> list(dbzero.find(Color.RED)) # [obj1] + >>> list(dbzero.find(Palette.RED)) # [obj2] + >>> list(dbzero.find("RED")) # [obj3] + """ def wrap(cls_): return _make_enum(cls_, **kwargs) diff --git a/dbzero/dbzero/fast_query.py b/dbzero/dbzero/fast_query.py index 721808f6..11e196dd 100644 --- a/dbzero/dbzero/fast_query.py +++ b/dbzero/dbzero/fast_query.py @@ -1,5 +1,7 @@ # This is an experimental version of a possible Query Engine # implementation for dbzero +from typing import Union, Callable, Tuple, Dict +from .interfaces import QueryObject, Tag import dbzero as db0 import inspect import typing @@ -11,7 +13,19 @@ __lambda_regex = re.compile(r'lambda\s.*?:\s*([^,)]*)') -def 
init_fast_query(prefix=None): +def init_fast_query(prefix: str) -> None: + """Initialize the fast query caching system using a specified prefix. + + This function designates a specific prefix to act as a cache for computationally + intensive queries, most notably `dbzero.group_by`. Useful when primary data prefix + is accessed in a read-only mode, so the cache prefix can remain writable. + + Parameters + ---------- + prefix : str + The name of the prefix to use for caching. This prefix must be opened in + read-write ("rw") mode. + """ global __px_fast_query if prefix: __px_fast_query = prefix @@ -293,9 +307,88 @@ def release(self): return result -def group_by(group_defs, query, ops=(count_op,)) -> typing.Dict: - """ - Group query results by the given key +def group_by(group_defs: Union[Callable, Tag, Tuple], query: QueryObject, ops: Tuple[Callable, ...] = (count_op,)) -> Dict: + """Perform cached group-and-aggregate queries over a set of objects. + + The group_by() method categorizes objects returned by input query based on + one or more criteria and then applies aggregation operations to each category. + + The first time a specific query is run, it performs a full scan and caches the result. + Subsequent group_by operations of the exact same query use cached result to speed up computation. + + Parameters + ---------- + group_defs : lambda | Iterable[EnumValue] | str | tuple + The criteria used to group the objects. This can be: + + * A lambda function: Applied to each object to determine its grouping key. + For caching to work, the lambda's source code must be identical between calls. + * Tag: To group objects by tags they are tagged with. + The group keys will be the string names of the enum members. + * A tuple of the above: For multi-level grouping. The resulting dictionary keys + will be tuples. + query : Any + A dbzero QueryObject to be grouped. + ops : tuple of callable, default (count_op,) + A tuple of aggregation operations to perform on each group.
+ + Returns + ------- + dict + A dictionary where: + + * Keys are the group identifiers determined by the group_defs criteria. If multiple + criteria are used, the key will be a tuple. + * Values are the results of the aggregation(s). If a single operation is provided + in ops, the value is a single result (e.g., an int). If multiple operations are + provided, the value is a tuple containing the result of each operation in the + specified order. + + Examples + -------- + Simple grouping by attribute: + + >>> # Assume objects are instances of a class with a 'key' attribute + >>> objects = [] + >>> keys = ["one", "two", "three"] + >>> for i in range(10): + ... objects.append(SomeClass(key=keys[i % 3])) + >>> dbzero.tags(*objects).add("my-tag") + >>> + >>> # Group objects with "my-tag" by their 'key' + >>> groups = dbzero.group_by(lambda row: row.key, dbzero.find("my-tag")) + >>> # Example result: {'one': 4, 'two': 3, 'three': 3} + + Multi-level grouping: + + >>> Colors = dbzero.enum("Colors", ["RED", "GREEN", "BLUE"]) + >>> + >>> # Group by color tag and then by whether the value is even (0) or odd (1) + >>> groups = dbzero.group_by( + ... (Colors.values(), lambda x: x.value % 2), + ... dbzero.find(MemoTestClass) + ... ) + >>> # Example result: {('RED', 0): 2, ('RED', 1): 2, ('GREEN', 1): 3, ('BLUE', 0): 2, ...} + + Grouping with custom aggregations: + + >>> # Define two operations: default count and a sum of the 'value' attribute + >>> query_ops = (dbzero.count_op, dbzero.make_sum(lambda x: x.value)) + >>> + >>> groups = dbzero.group_by( + ... lambda x: "even" if x.value % 2 == 0 else "odd", + ... dbzero.find(MemoTestClass), + ... ops=query_ops + ... ) + >>> # Example result where each value is a tuple (count, sum_of_values): + >>> # {'even': (5, 20), 'odd': (5, 25)} + + Notes + ----- + This method creates and updates an internal cache to speed up subsequent identical queries.
+ For the cache to be persistent across program runs, it must first be initialized using + dbzero.init_fast_query(). A query is considered "identical" if its parameters and its + group_defs are the same as from the previous call """ def delta(start, end): # compute delta between the 2 snapshots diff --git a/dbzero/dbzero/interfaces.py b/dbzero/dbzero/interfaces.py new file mode 100644 index 00000000..68bab78e --- /dev/null +++ b/dbzero/dbzero/interfaces.py @@ -0,0 +1,251 @@ +""" +Definitions of interfaces and types used in dbzero. + +These classes serve as type annotations and documentation only. +They correspond to their C++ implementations but are not a part of actual Python API. +""" + +from __future__ import annotations +from typing import Any, Optional, Union, Iterator, Iterable, List, Tuple + +class Memo: + """A dbzero.memo decorated class type.""" + ... + +class MemoWeakProxy: + """A weak reference to a Memo object that doesn't extend its lifetime.""" + ... + +class EnumType: + """A dbzero.enum object.""" + ... + +class EnumValue: + """A member of dbzero.enum.""" + ... + +class QueryObject(Iterator[Memo]): + """A dbzero objects query type.""" + + def __len__(self) -> int: + """The number of queried elements.""" + ... + + def __getitem__(self, obj: slice) -> QueryObject: + """Get a slice of query result set.""" + ... + +class Tag: + """Memo object/class tag, EnumValue or a simple 'str' tag.""" + ... + +class TagSet: + """A tag set operation, e.g. logical complement, being a result of query negation.""" + ... + +class ListObject(list): + """Persistent list.""" + ... + +class IndexObject: + """Persistent ordered data structure for efficient queries.""" + + def add(self, key: Any, value: Memo, /) -> None: + """Associate a value with a sortable key in the index. + + Parameters + ---------- + key : Any + The sortable key to associate with the value. Can be any comparable type + including None, numbers, strings or datetime objects. 
+ value : Memo + A Memo object to be associated with the key. + """ + ... + + def remove(self, key: Any, value: Memo, /) -> None: + """Remove a specific key-value pair from the index. + + Both the key and value must match exactly for the removal to succeed. + + Parameters + ---------- + key : Any + The exact key that was used when adding the value. + value : Memo + The exact Memo object to remove. + """ + ... + + def select(self, low: Optional[Any] = None, high: Optional[Any] = None, null_first: bool = False) -> QueryObject: + """Query objects within a key range with inclusive bounds. + + Performs a range query returning all objects whose keys fall within the specified + range. Both bounds are inclusive. If no arguments are provided, returns all objects. + + Parameters + ---------- + low : Any, optional + The minimum key value. If None, no lower bound is applied. + high : Any, optional + The maximum key value. If None, no upper bound is applied. + null_first : bool, default False + If True, None keys are considered 'less' than all other values. + If False, None keys are considered 'greater' than all other values. + + Returns + ------- + QueryObject + A query object containing all values whose keys fall within [low, high]. + """ + ... + + def sort(self, query: QueryObject, /, *, desc: bool = False, null_first: bool = False) -> QueryObject: + """Sort objects by their keys in this index. + + Takes a query result and returns the same objects sorted according to their + keys in this index. Objects not present in the index are excluded from the result. + + Parameters + ---------- + query : QueryObject + A query of dbzero objects to be sorted. + desc : bool, default False + If True, sort in descending order. If False, sort in ascending order. + null_first : bool, default False + If True, None keys are considered 'less' than all other values. + If False, None keys are considered 'greater' than all other values. 
+ + Returns + ------- + QueryObject + A new query containing the input objects sorted by their keys in this index. + """ + ... + +class TupleObject(tuple): + """Persistent immutable sequence.""" + ... + +class SetObject(set): + """Persistent unordered collection of unique elements.""" + ... + +class DictObject(dict): + """Persistent mapping object.""" + ... + +class ByteArrayObject(bytearray): + """Persisted sequence of bytes.""" + ... + +class ObjectTagManager: + """Manages tags of one or more Memo instances.""" + + def add(self, *tag: Union[Tag, Iterable[Tag]]) -> None: + """Add one or more tags to the managed objects. + + Parameters + ---------- + *tag : Union[Tag, Iterable[Tag]] + Tags to add. Can be individual tags as separate arguments, or collections of tags. + """ + ... + + def remove(self, *tag: Union[Tag, Iterable[Tag]]) -> None: + """Remove one or more tags from the managed objects. + + Parameters + ---------- + *tag : Union[Tag, Iterable[Tag]] + Tags to remove. Can be individual tags as separate arguments, or collections of tags. + Tags that weren't previously assigned to managed objects are ignored. + """ + ... + +class Snapshot: + """A dbzero snapshot context. + + It is intended for use in a 'with' statement. + """ + + def __enter__(self) -> Snapshot: + """Enter dbzero snapshot context.""" + ... + + def __exit__(self, exc_type, exc_value, traceback): + """Exit dbzero snapshot's context.""" + ... + + def fetch(self, id: Union[str, type], type: Optional[type] = None, prefix: Optional[str] = None) -> Memo: + """Retrieve a single object directly from memory using its unique identifier. + + Parameters + ---------- + id : str or type + The identifier for the object you want to retrieve. + + * UUID string: Returns the specific object instance for that UUID + * type (singleton class): Returns the unique instance of that singleton + type : type, optional + Optional type to validate the retrieved object. 
+ Raises exception if the fetched object is not an instance of this type. + prefix : str, optional + Optional name of the data prefix to fetch the object from. + Useful for retrieving singletons from non-default prefixes. + + Returns + ------- + Memo + The requested Memo object instance. + """ + ... + + def find(self, *query_criteria: Union[Tag, List[Tag], Tuple[Tag], QueryObject, TagSet]) -> QueryObject: + """Query for memo objects based on search criteria such as tags, types, or subqueries. + + Parameters + ---------- + *query_criteria : Union[Tag, List[Tag], Tuple[Tag], QueryObject, TagSet] + Variable number of criteria to filter objects: + + * Type: A class to filter by type (includes subclasses) + * String tag: Simple string tag + * Object tag: Any memo object used as a tag + * List of tags (OR): Objects with at least one of the specified tags + * Tuple of tags (AND): Objects with all of the specified tags + * QueryObject: Result of another query + * TagSet: Logical set operation. + + Returns + ------- + QueryObject + An iterable query object. + """ + + def deserialize(self, data: bytes, /) -> Any: + """Reconstruct a dbzero object from serialized bytes, within the snapshot context. + + Parameters + ---------- + data : bytes + The bytes object previously created by dbzero.serialize(). + + Returns + ------- + Any + A dbzero object that was encoded in the data bytes. + """ + + def close(self) -> None: + """Close dbzero snapshot.""" + ... + + def get_state_num(self) -> int: + """Get state number of a snapshot. + + Returns + ------- + int + State number of a snapshot.
+ """ \ No newline at end of file diff --git a/dbzero/dbzero/locked.py b/dbzero/dbzero/locked.py index 3374999c..dfca946b 100644 --- a/dbzero/dbzero/locked.py +++ b/dbzero/dbzero/locked.py @@ -3,9 +3,59 @@ from .dbzero import begin_locked, _async_wait, get_config, commit -def async_wait(prefix_name, state_number): +def async_wait(prefix: str, state_num: int) -> asyncio.Future[None]: + """Pause an asyncio coroutine until a specific data prefix reaches a target state number. + + Async variant of `dbzero.wait` function, suitable for use in coroutines. + + Parameters + ---------- + prefix : str + Name of the prefix to monitor for changes. + state_num : int + Target state number to wait for. + + Returns + ------- + asyncio.Future[None] + An awaitable object (asyncio.Future). + Awaiting this future blocks the coroutine until prefix state number is reached. + + Examples + -------- + Waiting for the next state change: + + >>> import asyncio + >>> # Get the current state number of the default prefix + >>> current_state = dbzero.get_state_num("default") + >>> print("Waiting for the next commit...") + >>> + >>> # In another part of your code, make and commit a change + >>> obj = MyMemoClass(value="initial") + >>> obj.value = "updated" # This mutation will increment the state number on commit + >>> # dbzero automatically commits the change + >>> + >>> await dbzero.async_wait("default", current_state + 1) + >>> print("State change detected!") + + Timeout handling: + + >>> import asyncio + >>> prefix_name = "default" + >>> current_state = dbzero.get_state_num(prefix_name) + >>> + >>> # Wait for a future state with timeout protection + >>> try: + ... await asyncio.wait_for( + ... asyncio.shield(dbzero.async_wait(prefix_name, current_state + 1)), + ... timeout=5.0 + ... ) + ... print("State change detected within timeout!") + ... except asyncio.TimeoutError: + ... 
print("Timeout: No state change occurred within 5 seconds") + """ future = asyncio.get_running_loop().create_future() - _async_wait(future, prefix_name, state_number) + _async_wait(future, prefix, state_num) return future @@ -21,6 +71,8 @@ async def await_commit(mutation_log: List[Tuple[str, int]]): class LockedManager: + """Locked context manager class""" + def __init__(self, await_commit): self.__await_commit = await_commit @@ -45,5 +97,70 @@ async def __aexit__(self, exc_type, _exc_value, _traceback): self.__ctx = None -def locked(await_commit=False): +def locked(await_commit: bool = False) -> LockedManager: + """Blocks the autocommit, ensuring that all changes will be made in a scope of single transaction. + + Allows to capture information about prefix modifications and their current state. This information + can be used for synchronizing reader processes or analyzing what mutations take place in a block of + operations. + + Parameters + ---------- + await_commit : bool, default False + If True, the context manager will wait asynchronously until changes made in the locked + block are committed. Setting this parameter to True is only allowed in `async with` statement. + + **With autocommit disabled, the commit is triggered automatically when closing context manager + to ensure expected behavior** + + Returns + ------- + LockedManager + A context manager that can be used with either `with` (synchronous) or + `async with` (asynchronous) statements. + + Examples + -------- + Synchronous mutation analysis: + + >>> # Create some objects in different prefixes + >>> obj_1 = MemoTestClass(951) + >>> dbzero.open("some-new-prefix", "rw") + >>> obj_2 = MemoTestClass(952) + >>> + >>> # Use locked() to track changes + >>> with dbzero.locked() as lock: + ... # A read operation does not create a log entry + ... x = obj_1.value + ... # A mutating operation does + ...
obj_2.value = 123123 + >>> + >>> # After the block, get the mutation log + >>> mutation_log = lock.get_mutation_log() + >>> # The log shows that only "some-new-prefix" was modified + >>> assert len(mutation_log) == 1 + >>> assert mutation_log[0][0] == "some-new-prefix" + + Asynchronous commit waiting: + + >>> obj = MemoTestClass(1234) + >>> dbzero.commit() # Ensure initial state is saved + >>> + >>> # Get the number that the *next* commit will have + >>> next_state_num = dbzero.get_state_num() + >>> + >>> # This async block will pause until the change to `obj` is committed + >>> async with dbzero.locked(await_commit=True): + ... obj.value = 5678 + >>> + >>> # Code here will only run after the commit is complete + >>> print("Commit finished!") + >>> # We can verify that the state number has advanced as expected + >>> assert dbzero.get_state_num(finalized=True) == next_state_num + + Notes + ----- + This function can be used as a lightweight alternative to `dbzero.atomic`, + with a difference that it doesn't revert changes in case an error occurs. + """ return LockedManager(await_commit) diff --git a/dbzero/dbzero/memo.py b/dbzero/dbzero/memo.py index fdf9e545..59f8303f 100644 --- a/dbzero/dbzero/memo.py +++ b/dbzero/dbzero/memo.py @@ -1,17 +1,69 @@ import inspect import dis +from typing import Callable, Optional from .dbzero import _wrap_memo_type, set_prefix -def migration(func): - """ - Decorator for marking a function as a migration function. - """ +def migration(func: Callable) -> Callable: + """Decorator for marking a function as a migration function""" func._db0_migration = None return func + +def memo(cls: Optional[type] = None, **kwargs) -> type: + """Transform a standard Python class into a persistent, dbzero-managed object. + + The objects' serialization, storage and lifecycle is handled transparently, + allowing to interact with it as if it was a regular python object. 
+ + Parameters + ---------- + singleton : bool, default False + When True, the decorated class becomes a singleton within its prefix. The first + time you instantiate the class, the object is created and persisted. All subsequent + calls to the constructor within the same prefix will return the existing instance. + prefix : str, optional + Specifies a static prefix for the class and all its instances. + If not provided, the class uses the current active prefix set by dbzero.open(). + no_default_tags : bool, default False + If True, dbzero will not automatically add default system tags (such as the class + name) to new instances of this class. + + Returns + ------- + type + Decorated Memo class. + + Examples + -------- + Basic persistent class: + + >>> @dbzero.memo + ... class Task: + ... def __init__(self, description): + ... self.description = description + ... self.completed = False + >>> + >>> # Creates a new persistent object + >>> task1 = Task("Write documentation") + >>> # Attribute modifications are automatically persisted + >>> task1.completed = True + + Singleton pattern: -def memo(cls=None, **kwargs): + >>> @dbzero.memo(singleton=True) + ... class AppSettings: + ... def __init__(self, theme="dark"): + ... self.theme = theme + >>> + >>> # First call creates the object + >>> settings1 = AppSettings(theme="light") + >>> print(settings1.theme) # "light" + >>> + >>> # Subsequent calls return the *same* object; arguments are ignored + >>> settings2 = AppSettings(theme="dark") + >>> print(settings2.theme) # "light" + """ def getfile(cls_): # inspect.getfile() can raise TypeError if cls_ is a built-in class (e.g. defined in a notebook). 
try: diff --git a/dbzero/dbzero/reflection_api.py b/dbzero/dbzero/reflection_api.py index a7aa469b..7c785324 100644 --- a/dbzero/dbzero/reflection_api.py +++ b/dbzero/dbzero/reflection_api.py @@ -1,3 +1,4 @@ +from __future__ import annotations from collections import namedtuple from enum import Enum import itertools @@ -10,6 +11,7 @@ import importlib.util import os import sys +from typing import Any, List from .decorators import check_params_not_equal from .storage_api import PrefixMetaData @@ -33,37 +35,45 @@ def _get_callable_params(parameters): class AttributeInfo: - def __init__(self, name: str, metaclass): + """Metadata info of Memo class attribute.""" + + def __init__(self, name: str, metaclass: MemoMetaClass): self.name = name self.metaclass = metaclass class MethodInfo(inspect.Signature): - def __init__(self, name: str, signature: inspect.Signature, metaclass): + """Metadata info of Memo class method.""" + + def __init__(self, name: str, signature: inspect.Signature, metaclass: MemoMetaClass): super().__init__(signature.parameters.values(), return_annotation=signature.return_annotation) self.name = name self.metaclass = metaclass self.__params = _get_callable_params(self.parameters) - def get_params(self): + def get_params(self) -> List[MethodParam]: return [MethodParam(param, self) for param in self.__params.params] @property - def has_args(self): + def has_args(self) -> bool: return self.__params.has_args @property - def has_kwargs(self): + def has_kwargs(self) -> bool: return self.__params.has_kwargs class MethodParam(inspect.Parameter): + """Metadata info of Memo class method parameter.""" + def __init__(self, param: inspect.Parameter, method: MethodInfo): super().__init__(param.name, param.kind, default=param.default, annotation=param.annotation) self.method = method class MemoMetaClass: + """Memo class metadata info.""" + def __init__(self, name, module, class_uuid, is_singleton=False, instance_uuid=None): self.__name = name self.__module = module @@
-73,46 +83,50 @@ def __init__(self, name, module, class_uuid, is_singleton=False, instance_uuid=N self.__cls = None @property - def name(self): + def name(self) -> str: + """Memo class name.""" return self.__name @property def module(self): + """Module containing memo class.""" return self.__module @property - def class_uuid(self): + def class_uuid(self) -> str: + """UUID of memo class.""" return self.__class_uuid - def get_class(self): + def get_class(self) -> Any: + """Get Memo class object.""" if self.__cls is None: self.__cls = db0.fetch(self.__class_uuid) return self.__cls - def type_exists(self): + def type_exists(self) -> bool: + """Check if Memo class Python type exist and can be resolved.""" return self.get_class().type_exists() - def get_type(self): + def get_type(self)-> type: + """Get Memo class Python type.""" return self.get_class().type() @property def is_singleton(self): + """Is Memo class a singleton.""" return self.__is_singleton @property def instance_uuid(self): + """Memo class singleton object instance UUID.""" return self.__instance_uuid def get_instance(self): - """ - Get the associated singleton instance of this class. 
- """ + """Get the associated singleton instance of this class.""" return db0.fetch(self.__instance_uuid) def get_attributes(self, include_properties=False): - """ - get_attributes works for known and unknown types - """ + """Get attribute info of a Memo class.""" for attr in self.get_class().get_attributes(): yield AttributeInfo(attr[0], self) @@ -124,9 +138,7 @@ def get_attributes(self, include_properties=False): yield AttributeInfo(attr_name, self) def get_methods(self): - """ - get_methods works for known types only - """ + """Get Memo class methods of known Python type.""" def is_private(name): return name.startswith("_") @@ -138,9 +150,11 @@ def is_private(name): yield MethodInfo(attr_name, inspect.signature(attr), self) def get_schema(self): + """Get Memo class attributes schema.""" return db0.get_schema(self.get_type()) def all(self, snapshot=None, as_memo_base=False): + """Find all instances of this Memo class.""" if not as_memo_base and self.get_class().is_known_type(): if snapshot is not None: return snapshot.find(self.get_class().type()) @@ -154,6 +168,7 @@ def all(self, snapshot=None, as_memo_base=False): return db0.find(db0.MemoBase, self.get_class()) def get_instance_count(self): + """Get number of instances of this Memo class.""" return db0.getrefcount(self.get_class()) def __str__(self): @@ -167,6 +182,7 @@ def __eq__(self, value): def get_memo_classes(prefix: PrefixMetaData = None): + """Get metadata info of all Memo classes.""" if type(prefix) is str: # fallback to prefix name for memo_class in (_get_memo_classes(prefix) if prefix is not None else _get_memo_classes()): @@ -177,11 +193,13 @@ def get_memo_classes(prefix: PrefixMetaData = None): def get_memo_class(arg: str | db0.MemoBase): + """Get Memo class metadata info from class UUID or of a Memo object instance.""" type_info = _get_memo_class(arg) if db0.is_memo(arg) else db0.fetch(arg).type_info() return MemoMetaClass(*type_info) class Query(inspect.Signature): + """dbzero query function 
metadata info.""" def __init__(self, name: str, function_obj: typing.Callable): signature = inspect.signature(function_obj) super().__init__(signature.parameters.values(), return_annotation=signature.return_annotation) @@ -191,27 +209,34 @@ def __init__(self, name: str, function_obj: typing.Callable): @property def name(self): + """Function name.""" return self.__name @property def function_object(self): + """Function callable.""" return self.__function_obj @property def has_kwargs(self): + """Has **kwargs arguments.""" return self.__params.has_kwargs @property def has_params(self): + """Has any named or kwargs arguments.""" return len(self.__params.params) > 0 or self.has_kwargs def get_params(self): + """Get query function parameters.""" return [QueryParam(param, self) for param in self.__params.params] def execute(self, *args, **kwargs): + """Execute query function.""" return self.__function_obj(*args, **kwargs) class QueryParam(inspect.Parameter): + """Query function parameter metadata info.""" def __init__(self, param: inspect.Parameter, query: Query): super().__init__(param.name, param.kind, default=param.default, annotation=param.annotation) self.query = query @@ -241,7 +266,7 @@ def __import_from_file(file_path, submodule_search_locations): return importlib.import_module(path_obj.stem) -def import_submodules(package, module): +def __import_submodules(package, module): if not hasattr(package, "__path__"): return for loader, module_name, is_pkg in pkgutil.iter_modules(package.__path__): @@ -257,7 +282,7 @@ def __import_module(module_or_file_name, package = None): module = importlib.import_module(module_or_file_name, package) # Optionally import all public attributes sys.modules[module_or_file_name] = module - import_submodules(module, package) + __import_submodules(module, package) return [module] except Exception as ex: pass @@ -272,10 +297,12 @@ def __import_module(module_or_file_name, package = None): def import_model(module_or_file_name, package=None): +
"""Import dbzero Memo classes from a Python package.""" return importlib.import_module(module_or_file_name, package) def get_queries(*module_names): + """Get dbzero query functions from Python modules.""" # Dynamically import modules for name in module_names: module = importlib.import_module(name) diff --git a/dbzero/dbzero/select.py b/dbzero/dbzero/select.py index cd557ecf..251263f8 100644 --- a/dbzero/dbzero/select.py +++ b/dbzero/dbzero/select.py @@ -1,12 +1,50 @@ +from typing import Optional, Callable, Iterable, Tuple +from .interfaces import Memo, QueryObject, Snapshot import dbzero as db0 from .dbzero import _select_mod_candidates, _split_by_snapshots -def select_new(query, pre_snapshot, last_snapshot): - """ - Refines the query to include only new objects in the given state range (scope) - :param query: the query to refine - :param context: context object which needs to be managed by the calling function +def select_new(query: QueryObject, pre_snapshot: Optional[Snapshot], last_snapshot: Snapshot) -> QueryObject: + """Refine the query to only include objects that were created between two snapshots. + + Objects that match the query condition in later snapshot and do not match in earlier + snapshot are considered 'new' and are included in the result set. + + Parameters + ---------- + query : QueryObject + A dbzero query. + pre_snapshot : Snapshot or None + The snapshot representing the starting point for the comparison. If None or doesn't + contain the prefix query, all objects matching the query found in last_snapshot will + be returned as new. + last_snapshot : Snapshot + The snapshot representing the ending point for the comparison. + + Returns + ------- + QueryObject + A new query containing the objects that match the input query and were created + between pre_snapshot and last_snapshot. 
+ + Examples + -------- + Find newly created objects: + + >>> # Create an initial state and snapshot + >>> dbzero.commit() + >>> snap_1 = dbzero.snapshot() + >>> + >>> # Create a new District object and commit the change + >>> new_district = District("some_District") + >>> dbzero.commit() + >>> snap_2 = dbzero.snapshot() + >>> + >>> # Query for new District objects created between snap_1 and snap_2 + >>> new_objects = dbzero.select_new(dbzero.find(District), snap_1, snap_2) + >>> + >>> assert len(new_objects) == 1 + >>> assert next(iter(new_objects)) == new_district """ # there's no initial state, therefore all results from the last_snapshot will be "new" query_data = db0.serialize(query) @@ -19,9 +57,60 @@ def select_new(query, pre_snapshot, last_snapshot): ) -def select_deleted(query, pre_snapshot, last_snapshot): - """ - Refines the query to include only objects which were deleted within the given state range (scope) +def select_deleted(query: QueryObject, pre_snapshot: Optional[Snapshot], last_snapshot: Snapshot) -> QueryObject: + """Refine the query to only include objects that were deleted between two snapshots. + + Objects that match the query condition in earlier snapshot and do not match in later + snapshot are considered 'deleted' and are included in the result set. + + Parameters + ---------- + query : QueryObject + A dbzero query. + pre_snapshot : Snapshot or None + The snapshot representing the starting point for the comparison. + If None, returns an empty list since there's no initial state to compare against. + last_snapshot : Snapshot + The snapshot representing the ending point for the comparison. + + Returns + ------- + QueryObject + A new query containing the objects that match the input query and were deleted + between pre_snapshot and last_snapshot. + + Examples + -------- + Find objects that were untagged between snapshots: + + >>> @dbzero.memo + ... class Task: + ... def __init__(self, name): + ... 
self.name = name + >>> + >>> # Create two tasks and tag them as 'active' + >>> task1 = Task("Write docs") + >>> dbzero.tags(task1).add("active") + >>> task2 = Task("Review code") + >>> dbzero.tags(task2).add("active") + >>> dbzero.commit() + >>> + >>> # Capture the "before" snapshot + >>> snap_before = dbzero.snapshot() + >>> + >>> # "Delete" task1 from the 'active' view by removing its tag + >>> dbzero.tags(task1).remove("active") + >>> dbzero.commit() + >>> + >>> # Capture the "after" snapshot + >>> snap_after = dbzero.snapshot() + >>> + >>> # Find which tasks were removed from the query results + >>> active_tasks_query = dbzero.find(Task, "active") + >>> deleted_tasks = dbzero.select_deleted(active_tasks_query, snap_before, snap_after) + >>> + >>> assert len(deleted_tasks) == 1 + >>> assert deleted_tasks[0].name == "Write docs" """ # there's no initiali state, so no pre-existing objects could've been deleted if not pre_snapshot: @@ -69,10 +158,81 @@ def __len__(self): return size -def select_modified(query, pre_snapshot, last_snapshot, compare_with = None): - """ - Refines the query to include only objects which were modified within the given state range (scope) - not including new objects +def select_modified(query: QueryObject, pre_snapshot: Optional[Snapshot], last_snapshot: Snapshot, compare_with: Optional[Callable] = None) -> Iterable[Tuple[Memo, Memo]]: + """Refines the query to include only objects which were modified between two snapshots, + not including new objects. + + Objects that match the query condition in both snapshots and their state was modified + between two snapshots are included in the result set. + + Parameters + ---------- + query : QueryObject + A dbzero query. + pre_snapshot : Snapshot or None + The snapshot representing the starting point for the comparison. + If None, returns an empty list since there's no initial state to compare against. + last_snapshot : Snapshot + The snapshot representing the ending point for the comparison. 
+ compare_with : callable, optional + A custom comparator that takes two arguments: the object from pre_snapshot and the object + from last_snapshot. It should return True if the objects are considered unchanged. + + Returns + ------- + Iterable[Tuple[Memo, Memo]] + An iterable of tuples, where each tuple contains the "before" and "after" versions + of a modified object: (old_version, new_version). old_version is the object's state + from pre_snapshot, new_version is the object's state from last_snapshot. + + Examples + -------- + Basic usage - find all modified districts: + + >>> # Initial state + >>> district_a = District(name="Oldtown") + >>> dbzero.commit() + >>> snap_1 = dbzero.snapshot() + >>> + >>> # Make a change + >>> district_a.name = "Newtown" + >>> dbzero.commit() + >>> snap_2 = dbzero.snapshot() + >>> + >>> # Find modified objects + >>> modified_districts = dbzero.select_modified(dbzero.find(District), snap_1, snap_2) + >>> + >>> for old_ver, new_ver in modified_districts: + ... print(f"District name changed from '{old_ver.name}' to '{new_ver.name}'") + ... assert old_ver.name == "Oldtown" + ... assert new_ver.name == "Newtown" + + Advanced usage - custom comparison logic: + + >>> class Product: + ... def __init__(self, name, price, last_updated): + ... self.name = name + ... self.price = price + ... self.last_updated = last_updated + >>> + >>> # Initial state + >>> product = Product(name="Laptop", price=1200, last_updated=1672531200) + >>> dbzero.commit() + >>> snap_1 = dbzero.snapshot() + >>> + >>> # Modify only the timestamp + >>> product.last_updated = 1672617600 + >>> dbzero.commit() + >>> snap_2 = dbzero.snapshot() + >>> + >>> # Custom comparison function + >>> def a_meaningful_change(obj1, obj2): + ... # Return True if only the timestamp is different (i.e., they are "equal" for our purposes) + ... 
return obj1.name == obj2.name and obj1.price == obj2.price + >>> + >>> # This will find no "meaningful" modifications + >>> results = dbzero.select_modified(dbzero.find(Product), snap_1, snap_2, compare_with=a_meaningful_change) + >>> assert len(results) == 0 """ # there's no state before 1, so no pre-existing objects could've been modified if not pre_snapshot: diff --git a/dbzero/dbzero/storage_api.py b/dbzero/dbzero/storage_api.py index 42091b18..6e81ec9c 100644 --- a/dbzero/dbzero/storage_api.py +++ b/dbzero/dbzero/storage_api.py @@ -1,23 +1,156 @@ from collections import namedtuple -import dbzero as db0 +from typing import Iterator, Any from .dbzero import _get_prefixes, _get_current_prefix, _get_prefix_of, _get_mutable_prefixes PrefixMetaData = namedtuple("PrefixMetaData", ["name", "uuid"]) -def get_prefixes(): +def get_prefixes() -> Iterator[PrefixMetaData]: + """Discover and return all available prefixes from the configured dbzero storage location. + + Returns + ------- + Iterator[PrefixMetaData] + An iterator that yields metadata objects for each discovered prefix. + + Examples + -------- + Discovering and opening all available prefixes: + + >>> # First, initialize the dbzero environment + >>> dbzero.init(path="/path/to/my/data") + >>> + >>> # Discover all available prefixes + >>> all_prefixes = dbzero.get_prefixes() + >>> print(f"Discovered {len(list(all_prefixes))} prefixes.") + >>> + >>> # Iterate and open each one + >>> for prefix in dbzero.get_prefixes(): + ... print(f"Opening prefix: {prefix.name}") + ... dbzero.open(prefix.name, "r") + >>> + >>> # Now all prefixes are open and their data can be queried + """ for prefix in _get_prefixes(): yield PrefixMetaData(*prefix) -def get_mutable_prefixes(): +def get_mutable_prefixes() -> Iterator[PrefixMetaData]: + """Return all currently open prefixes that can be modified (read-write mode). 
+ + Returns + ------- + Iterator[PrefixMetaData] + An iterator of PrefixMetaData objects, where each object corresponds to a prefix + that is currently open in read-write ('rw') mode. + + Examples + -------- + Listing all mutable prefixes: + + >>> # The default prefix is open and mutable + >>> print([p.name for p in dbzero.get_mutable_prefixes()]) + ['main'] + >>> + >>> # Open two more prefixes + >>> dbzero.open('prefix1') + >>> dbzero.open('prefix2') + >>> + >>> # The list will now include all three open prefixes + >>> print([p.name for p in dbzero.get_mutable_prefixes()]) + ['main', 'prefix1', 'prefix2'] + """ for prefix in _get_mutable_prefixes(): yield PrefixMetaData(*prefix) -def get_current_prefix(): +def get_current_prefix() -> PrefixMetaData: + """Retrieve the currently active prefix. + + Returns + ------- + PrefixMetaData + A PrefixMetaData object that represents the current prefix. + + Examples + -------- + Getting the current prefix name: + + >>> # Assuming a connection is open with a default prefix like 'main' + >>> current_px = dbzero.get_current_prefix() + >>> print(current_px.name) + main + + How dbzero.open() and dbzero.close() affect the current prefix: + + >>> # Get the initial prefix + >>> initial_prefix = dbzero.get_current_prefix() + >>> print(f"Initial prefix: {initial_prefix.name}") + >>> + >>> # Open a new prefix, which becomes the current one + >>> dbzero.open("secondary-prefix") + >>> print(f"New current prefix: {dbzero.get_current_prefix().name}") + >>> + >>> # Close the new prefix + >>> dbzero.close("secondary-prefix") + >>> + >>> # The current prefix is restored to the initial one + >>> print(f"Restored prefix: {dbzero.get_current_prefix().name}") + >>> # Expected output: + >>> # Initial prefix: main + >>> # New current prefix: secondary-prefix + >>> # Restored prefix: main + """ return PrefixMetaData(*_get_current_prefix()) -def get_prefix_of(obj): +def get_prefix_of(obj: Any) -> PrefixMetaData: + """Return the prefix 
where the given dbzero-managed object resides. + + Parameters + ---------- + obj : Any + The dbzero item whose prefix you want to find. This can be an object instance, + a class decorated with @dbzero.memo, an enum, or a dbzero query object. + + Returns + ------- + PrefixMetaData + A PrefixMetaData object representing the object's prefix. + + Examples + -------- + Getting the prefix of an object instance: + + >>> # Create an object on the default prefix + >>> obj_1 = MemoTestClass(100) + >>> print(f"obj_1 lives on prefix: {dbzero.get_prefix_of(obj_1).name}") + >>> + >>> # Open a new prefix, making it the current one + >>> dbzero.open("secondary-db") + >>> obj_2 = MemoTestClass(200) + >>> print(f"obj_2 lives on prefix: {dbzero.get_prefix_of(obj_2).name}") + >>> # Expected output: + >>> # obj_1 lives on prefix: main + >>> # obj_2 lives on prefix: secondary-db + + Getting the prefix of a class type: + + >>> @dbzero.memo(prefix="scoped-class-prefix") + ... class ScopedDataClass: + ... def __init__(self, value): + ... 
self.value = value + >>> + >>> # Get the prefix directly from the type + >>> class_prefix = dbzero.get_prefix_of(ScopedDataClass) + >>> print(f"ScopedDataClass belongs to: {class_prefix.name}") + >>> + >>> # An instance of the class will belong to the same prefix + >>> instance = ScopedDataClass(42) + >>> instance_prefix = dbzero.get_prefix_of(instance) + >>> print(f"An instance belongs to: {instance_prefix.name}") + >>> # Expected output: + >>> # ScopedDataClass belongs to: scoped-class-prefix + >>> # An instance belongs to: scoped-class-prefix + """ return PrefixMetaData(*_get_prefix_of(obj)) diff --git a/dbzero/dbzero/utilities.py b/dbzero/dbzero/utilities.py index 4fbac90e..59eccc9e 100644 --- a/dbzero/dbzero/utilities.py +++ b/dbzero/dbzero/utilities.py @@ -1,4 +1,4 @@ -import typing +from typing import Iterator _NORMALIZE_TRANSLATION_SOURCE = 'ĄĆĘŁŃÓŚŻŹ' _NORMALIZE_TRANSLATION_MAPPINGS = 'ACELNOSZZ' @@ -7,36 +7,62 @@ _EXTRA_SPLIT_DELIMITERS = '.,;:-!?\t\n"\'' _EXTRA_SPLIT_TABLE = str.maketrans(_EXTRA_SPLIT_DELIMITERS, ' ' * len(_EXTRA_SPLIT_DELIMITERS)) -def taggify(input_text: str, max_len = 3, min_len = 1, suffix = False) -> typing.Iterator[str]: - """ - This function tokenizes an arbitrary string and breaks it into an iterable of tags - which are constructed by taking a prefix of up to - a specific length, removing any whitespaces and delimiters and normalizing to uppercase and latin characters. 
- Parameters: - input_text: the arbitrary input text (unicode) - max_len: the maximum prefix length (must be > 0), if None then return unlimited length tags - min_len: the minimum prefix length to be returned - Returns: - An iterable of unique tags (see example below) - - Example: - # returns ["MINS", "MAZO"] - db0.taggify("--Mińsk Mazowiecki", max_len = 4) - - # returns ["MAR"] - db0.taggify("Markowski, Marek", max_len = 3) - - # returns ["KOW"] - db0.taggify("A.Kowalski", min_len = 3) - - # returns ["A", "KOWALSKI"] - db0.taggify("A.Kowalski", max_len = None) - - How input text is processed: - * The input is split into tokens by whitespace (and other delimiters, like '.') - * Non-alphanumeric characters are removed - * Tokens shorter than 'min_len' are filtered out - * Letters are converted to upper-case. Diacritic characters are transliterated to latin counterparts. - * Slice of 'max_len' characters is returned for each token +def taggify(input_text: str, max_len: int = 3, min_len: int = 1, suffix: bool = False) -> Iterator[str]: + """Tokenize an arbitrary string and convert it into an iterable of normalized tags. + + This function breaks text into tokens, removes whitespaces and delimiters, and + normalizes to uppercase Latin characters. Tags are constructed by taking a prefix + (or suffix) of up to a specific length from each token. + + Parameters + ---------- + input_text : str + The arbitrary input text (unicode) to tokenize. + max_len : int or None, default 3 + The maximum prefix length (must be > 0). If None, tags of unlimited length are returned. + min_len : int, default 1 + The minimum prefix length to be returned. Tokens shorter than this are filtered out. + suffix : bool, default False + If True, take suffix instead of prefix from tokens. + + Returns + ------- + Iterator[str] + An iterator of unique tags. 
+ + Examples + -------- + Basic usage with maximum length: + + >>> list(dbzero.taggify("--Mińsk Mazowiecki", max_len=4)) + ['MINS', 'MAZO'] + + Filtering by maximum length: + + >>> list(dbzero.taggify("Markowski, Marek", max_len=3)) + ['MAR'] + + Filtering by minimum length: + + >>> list(dbzero.taggify("A.Kowalski", min_len=3)) + ['KOW'] + + Unlimited length tags: + + >>> list(dbzero.taggify("A.Kowalski", max_len=None)) + ['A', 'KOWALSKI'] + + Notes + ----- + Text Processing Steps: + + 1. The input is split into tokens by whitespace and delimiters (.,;:-!?\\t\\n"') + 2. Non-alphanumeric characters are removed from each token + 3. Tokens shorter than `min_len` are filtered out + 4. Letters are converted to upper-case + 5. Diacritic characters are transliterated to Latin counterparts (Ą→A, Ć→C, etc.) + 6. A slice of `max_len` characters is taken from each token + 7. Only unique tags are yielded """ yielded_tags = set()