diff --git a/design/METASPACE_DESIGN.md b/design/METASPACE_DESIGN.md new file mode 100644 index 00000000..197c737c --- /dev/null +++ b/design/METASPACE_DESIGN.md @@ -0,0 +1,295 @@ +# MetaSpace Design + +This document describes the planned `MetaPrefix` / `MetaSpace` storage model +for dbzero metadata pages, including the multi-slot extension and integration +with `BDevStorage`. + +## Goal + +`MetaPrefix` stores durable metadata, with durable-page mapping metadata being +the primary use case. It is responsible for capturing and persisting only the +most recent head state of metadata pages. It is not intended to retain a full +history of metadata states. + +The design builds on the existing in-memory `DRAM_Prefix` and +`DRAM_Allocator` machinery for data-page management and bookkeeping, but it +changes the persistence layer: + +- Page contents are stored through a `Diff_IO` backed store. +- Logical-page-to-storage-location mappings are stored in an additional sparse + pair managed outside the `MetaPrefix`. +- Updates prefer sequential diff-stream appends over random full-page + overwrites. +- Periodic compaction rewrites head pages as full pages, clears old diffs, and + bounds replay cost. + +## Terminology + +The design uses these terms: + +- `DP`: a durable data page. +- `head state`: the newest committed state that must be reopened after restart. +- `historical state`: the previous committed state retained for crash safety. +- `full DP`: a complete page image stored at a specific `Diff_IO` location. +- `diff block`: an append-only delta against a previous full DP or diff chain. +- `DiffIndex`: the in-memory or durable index that tracks appended diff blocks. +- `sparse pair`: the external mapping from logical page id to storage location + and diff sequence. +- `slot`: an independently managed metadata address-space partition in the + multi-slot extension. 
+ +## Storage Model + +`MetaPrefix` keeps the same in-memory page-management responsibilities as a +`DRAM_Prefix`. Allocated metadata pages have local logical page ids and are +managed by a `DRAM_Allocator`. + +Persistent page locations are not stored directly inside the `MetaPrefix`. +Instead, `MetaPrefix` requires an additional sparse pair whose values describe +the current storage chain for each logical page: + +```text +local logical page id -> full DP location + ordered diff locations +``` + +The sparse pair is maintained elsewhere so that the `MetaPrefix` can be used as +a metadata host without recursively depending on itself for its own location +mapping. For the multi-slot `MetaSpace` used by `BDevStorage`, this sparse pair +is maintained by the root-level `DRAM_Prefix`. + +## Diff_IO + +`Diff_IO` is the persistent store used by `MetaPrefix`. In production it is +typically embedded as a separate page-IO channel in the underlying +`BDevStorage`. + +Required operations: + +- Read a full DP from a specific location. +- Write a full DP to a specific location. +- Update or overwrite a full DP at a specific location. +- Append a diff block to the diff stream. +- Apply or replay diffs from a specific location or chain. +- Clear the diff stream so the space can be reused after compaction. + +The implementation should treat full-page writes and diff appends differently. +Full-page writes are used for initial materialization, crash-safe state +rotation, and compaction. Ordinary metadata updates should generally be +persisted as appended diffs. + +## Persistence Semantics + +`MetaPrefix` persists only the head state, plus one previous historical state +needed for crash recovery. Retaining one historical state protects against a +crash that happens after part of the new head state has been persisted but +before all metadata needed to reopen it has become durable. 
+ +This implies a two-generation storage discipline: + +- The current head generation is the generation reopened during normal startup. +- The previous generation is retained until the next head generation is fully + durable. +- Full DP locations from older generations may be reused after they are no + longer needed for crash recovery. + +The sparse pair update must be ordered so that recovery can always choose a +complete generation. A crash must not expose a sparse-pair entry that points to +a partially written full DP or an incomplete diff sequence as the only +available state. + +## Flush Mode + +The default flush path should prefer appending diff blocks over overwriting full +DP locations. + +Sequential appends are preferred because: + +- They are usually faster than random writes. +- They match SSD write behavior better than repeatedly overwriting the same + physical locations. +- They reduce premature cell wear caused by hot random overwrite patterns. +- They allow commits to persist small metadata changes without rewriting entire + pages. + +Full DP overwrites remain necessary for compaction, initial page creation, and +state-generation management, but they should not be the common path for small +metadata mutations. + +## Diff Growth And Compaction + +The diff stream must not grow without bound. Long diff chains increase startup +or page-load replay time and place unnecessary pressure on the `DiffIndex`. + +Compaction is the administrative operation that bounds this cost: + +1. Materialize every dirty or live head DP as a full DP. +2. Update the sparse pair so each logical page points to the new full-page + location without old diff chains. +3. Ensure the new head generation is durable. +4. Retain the previous generation until it is safe to reclaim. +5. Clear the diff stream for reuse. +6. Clear or rebuild the `DiffIndex`. + +Compaction may extend commit latency because it rewrites all head metadata +pages that need a compact full representation. 
The runtime should expose a +programmatic mechanism to suspend or postpone compaction when the system is +under load. While compaction is suspended, ordinary diff appends may continue +until the configured diff-stream cap forces the system to either resume +compaction or reject further growth with a clear operational error. + +## Crash Consistency Invariants + +The implementation must preserve these invariants: + +- Startup can always recover either the latest complete head state or the + previous complete historical state. +- A sparse-pair entry published as part of the head generation never points to + storage that was not fully written. +- Diff replay for a page is ordered and deterministic. +- Clearing the diff stream only happens after all head DPs have full-page + representations and the sparse pair no longer needs the old diff locations. +- Reusing full DP locations from old generations only happens after the + previous generation is no longer needed for crash recovery. +- Compaction is atomic at the `MetaSpace` level, not per page. + +## Multi-Slot MetaSpace + +Multi-slot `MetaSpace` extends the regular `MetaSpace` model with independently +managed slots. A slot is a separate metadata address space with its own memory +mapping lifecycle. The term `slot` matches the allocator interface, although +the concept is closer to a realm. + +Slots improve memory management by allowing metadata groups to be mapped and +evicted independently. A slot should correspond to a fixed-size or limited-scope +resource, such as one allocator slab. Slots are intended for metadata, not for +unbounded application data. + +The persistence model is still global. All changed slots are persisted as part +of one atomic `MetaSpace` commit. Compaction is also global across all slots. + +## Slot Address Encoding + +Slot identity is encoded in the logical page number. 
The proposed split is: + +```text +high 40 bits: slot id +low 24 bits: within-slot page id +``` + +With 16 KiB DPs, a 24-bit within-slot page id addresses roughly 256 GiB per +slot. If the implementation reserves ids or uses a smaller effective range, the +addressable space is still expected to be far larger than needed for +fixed-scope metadata slots. + +The page-id encoding must be treated as part of the durable format once +persisted. Helpers should be used instead of open-coded bit manipulation so the +split can be audited and versioned. + +## Slot Mapping Policies + +The multi-slot runtime supports three mapping policies: + +- `eager`: all slots are memory-mapped on startup. This is the default. +- `lazy`: slots are mapped on demand when data from the slot is accessed. +- `mixed`: selected slot groups are mapped lazily while others are mapped + eagerly. + +The expected mixed-mode use case is to keep critical or frequently used +metadata eager while mapping no-cache or low-priority metadata lazily. + +Lazy loading uses range queries over the associated sparse pair. Because slot id +is encoded into the high bits of the page number, a slot load can retrieve all +logical page mappings in the slot with a range scan: + +```text +[slot_id << 24, (slot_id + 1) << 24) +``` + +Each returned mapping gives the full DP location and diff sequence needed to +materialize the page into the slot-local mapping. + +## Atomic Commit Across Slots + +Slot independence is a memory-management property, not a transactional +property. The persistence algorithm must commit all slot changes atomically. + +Commit requirements: + +- Dirty pages from all mapped slots participate in the same head-state commit. +- Lazy slots with no loaded or dirty pages do not need to be materialized merely + because another slot is committed. +- Sparse-pair updates for all changed slots are published as one generation. 
+- Recovery must not observe a commit where only some slots advanced to the new + generation. +- Compaction rewrites the head state consistently across all slots. + +## BDevStorage Integration + +The multi-slot `MetaSpace` store is integrated with `BDevStorage` as a separate +dedicated page-IO channel. + +Its primary responsibility is hosting the main sparse pair that maps +application-level data pages to their physical storage locations and diff +chains. The `MetaSpace` itself also needs metadata describing its own page +locations. That self-metadata sparse pair is maintained by the root-level +`DRAM_Prefix`, which avoids a recursive dependency on the multi-slot +`MetaSpace` already being open. + +The storage layering is: + +```text +BDevStorage + application data page channel + MetaSpace page-IO channel + main sparse pair for application data pages + root-level DRAM_Prefix + sparse pair for MetaSpace's own metadata pages +``` + +## Open Questions + +These details should be resolved before implementation begins: + +- The exact durable format for sparse-pair values: full DP location, diff + sequence encoding, generation id, and checksums. +- The generation publication protocol used to choose head versus historical + state during recovery. +- The diff-stream size cap and whether it is configured by byte size, block + count, replay cost estimate, or a combination. +- The operational behavior when compaction is suspended and the diff cap is + reached. +- The public or internal API shape for suspending and resuming compaction. +- The slot policy configuration format and whether policies are global, + per-slot, or per slot group. + +## Test Plan + +Follow TDD when implementing this design. + +Required storage-level tests: + +- A metadata page can be written as a full DP and reopened through the sparse + pair mapping. +- Multiple updates to a metadata page are persisted as diff appends and replay + in order.
+- Recovery uses the previous historical generation if a crash is simulated + before the new head generation is fully published. +- Old full DP locations are reused only after the previous generation is no + longer needed. +- Compaction rewrites diff-backed head pages as full DPs and clears the diff + stream. +- Suspended compaction postpones administrative rewrite work without breaking + ordinary diff-backed commits below the cap. + +Required multi-slot tests: + +- Eager mode maps all slots on startup. +- Lazy mode maps a slot only after accessing a page in that slot. +- Mixed mode eagerly maps configured slots and lazily maps configured lazy + slots. +- Slot load uses sparse-pair range lookup and reconstructs all pages in the + slot. +- A commit containing dirty pages from multiple slots is recovered atomically. +- Compaction covers all slots and leaves no stale diff dependencies for the new + head generation. + diff --git a/design/PANDAS_DATAFRAME_INTEGRATION_DESIGN.md b/design/PANDAS_DATAFRAME_INTEGRATION_DESIGN.md new file mode 100644 index 00000000..cf5349f0 --- /dev/null +++ b/design/PANDAS_DATAFRAME_INTEGRATION_DESIGN.md @@ -0,0 +1,584 @@ +# Pandas DataFrame Integration Design + +This document describes a first-class pandas DataFrame integration for dbzero. +The integration should allow pandas DataFrames to be stored as durable memo +members while keeping the storage model based on overlaid types, `v_object`, and +the existing `ObjectBase` lifecycle. + +## Goal + +dbzero should support pandas DataFrames as durable live objects: + +- Assigning a `pandas.DataFrame` to a memo field persists the frame in dbzero. +- Reading the field returns a dbzero DataFrame wrapper backed by durable storage. +- Mutating common DataFrame locations through the wrapper updates durable state. +- Reopening the prefix reconstructs the same frame contents, labels, and core + dtypes. 
+- Users can convert explicitly between pandas and dbzero with `db0.dataframe(df)` + and `db0_df.to_pandas()`. + +The storage representation must not be a pickle or opaque serialized blob. The +frame should be decomposed into durable metadata and column storage that dbzero +can validate, reference count, detach, commit, and eventually optimize. + +## Non-Goals + +The first implementation should not try to implement the whole pandas API. +Pandas is a large Python library with a wide surface area, and a partial wrapper +that claims complete compatibility would be brittle. + +The following are out of scope for v1: + +- Full pandas method parity. +- Persistence of arbitrary object columns. +- Pickle/blob fallback for unsupported columns. +- MultiIndex for rows or columns. +- Categorical columns. +- Pandas extension arrays and nullable extension dtypes. +- Sparse arrays. +- Timezone-aware datetime columns. +- Depending on pandas internals such as `BlockManager` layout. +- Adding pandas as a mandatory dbzero runtime dependency. + +Unsupported features should fail clearly at construction or assignment time. +They should not silently degrade to object storage or lossy conversion. + +## Python API + +The feature has both transparent and explicit construction paths. + +Transparent memo assignment: + +```python +@db0.memo +class Model: + pass + +obj = Model() +obj.frame = pandas.DataFrame({"a": [1, 2], "b": [3.0, 4.0]}) + +assert obj.frame.shape == (2, 2) +``` + +Explicit construction: + +```python +df = pandas.DataFrame({"a": [1, 2], "b": [3.0, 4.0]}) +durable = db0.dataframe(df) +obj.frame = durable +``` + +Reading a DataFrame field returns a dbzero wrapper, not a pandas copy: + +```python +frame = obj.frame +frame.loc[0, "a"] = 10 +db0.commit() +``` + +Conversion back to pandas is explicit: + +```python +pandas_df = obj.frame.to_pandas() +``` + +The returned pandas DataFrame is a copy. 
Mutating that copy does not persist +unless the user assigns it back through the dbzero wrapper or through a memo +field. + +## Wrapper Surface + +The v1 wrapper should expose common pandas-style access and mutation: + +- `frame.shape` +- `frame.columns` +- `frame.index` +- `frame.dtypes` +- `frame.to_pandas()` +- `frame["column"]` +- `frame["column"] = values` +- `frame.loc[row_label, column_label]` +- `frame.loc[row_selector, column_selector] = values` +- `frame.iloc[row_index, column_index]` +- `frame.iloc[row_selector, column_selector] = values` + +Scalar `loc` and `iloc` reads return Python scalar values. Slice/list reads may +return pandas `Series` or `DataFrame` copies. Mutations through `loc`, `iloc`, +and column assignment update durable storage. + +The wrapper should not expose direct mutable views into durable storage. Any +pandas `Series` or `DataFrame` returned from read operations is a copy unless it +is another dbzero wrapper explicitly documented as durable. + +## Dependency Model + +Pandas and numpy must remain optional. + +dbzero module import must not import pandas or numpy. The integration should use +lazy runtime imports only when DataFrame functionality is used: + +- `db0.dataframe(...)` +- transparent assignment of a pandas DataFrame +- `.to_pandas()` +- pandas-copy read paths such as slice reads + +If pandas is not installed: + +- Normal dbzero usage is unchanged. +- `db0.dataframe(...)` raises a clear import/runtime error. +- Reading an existing dbzero DataFrame should either raise a clear error when a + pandas object is required or support metadata/scalar access that does not need + pandas. The v1 default can require pandas for wrapper use. + +Packaging should not add pandas to `project.dependencies`. A future optional +extra such as `dbzero[pandas]` is acceptable. + +## Type And Storage Registration + +Add new type identifiers: + +- `TypeId::PANDAS_DATAFRAME`: a native pandas DataFrame input object. 
+- `TypeId::DB0_DATAFRAME`: a dbzero DataFrame wrapper object. +- `StorageClass::DB0_DATAFRAME`: a durable DataFrame member reference. + +Registration must be added to: + +- `PyTypeManager` detection and extraction helpers. +- `StorageClassMapper`. +- `createMember` and `unloadMember`. +- `unrefMember`. +- schema reporting and type names. +- GC0 type registration. +- fetch/load handling. +- module initialization. +- Python stubs. + +The existing `PyTypeManager` should detect pandas DataFrames without importing +pandas at dbzero import time. A reasonable strategy is to lazily import pandas +on first DataFrame check and cache the `pandas.DataFrame` type object if the +import succeeds. + +## Native Object Model + +Add a new native subsystem under `src/dbzero/object_model/pandas/`. + +The primary object should follow the project-wide constructor convention: + +```cpp +class DataFrame + : public db0::ObjectBase<DataFrame, db0::v_object<o_dataframe>, StorageClass::DB0_DATAFRAME> +{ +public: + DataFrame() = default; + DataFrame(db0::swine_ptr<Fixture> &, PyObject *pandas_df, AccessFlags = {}); + DataFrame(db0::swine_ptr<Fixture> &, Address, AccessFlags = {}); +}; +``` + +The overlaid root stores metadata and addresses for column storage: + +```text +o_dataframe + o_unique_header + row_count + column_count + index_kind + column_metadata_address + index_metadata_address +``` + +Column metadata should be durable and fixed-size where possible: + +```text +o_dataframe_column + dtype_kind + null_mask_address + data_address + label_address + flags +``` + +The root object owns its column metadata, row index metadata, labels, null masks, +and column data blocks. Destruction and unref paths must release all owned +allocations.
+ +## Column Storage + +v1 should support core dtypes: + +- signed integers: `int8`, `int16`, `int32`, `int64` +- unsigned integers: `uint8`, `uint16`, `uint32`, `uint64` +- floating point: `float32`, `float64` +- boolean +- naive `datetime64[ns]` +- string/object-string columns with string or null values only + +Fixed-width columns should use typed durable vectors: + +```text +v_bvector<int64_t> +v_bvector<double> +v_bvector<uint8_t> +... +``` + +Each nullable column should store a null mask separately. For v1 this can be a +durable byte vector or bitset-like overlaid structure. Null handling should +round-trip pandas missing values as closely as possible within the supported +dtype set. + +String columns should not store Python object pointers. Store strings as durable +overlaid data, for example: + +```text +string column + offsets: v_bvector<uint64_t> + null mask + payload bytes or string pool references +``` + +The exact string-column layout can be optimized later. The v1 requirement is +that the representation is durable, overlaid, and not a pickle. + +## Pandas Column Injection Interface + +Pandas DataFrames are column-oriented. The durable dbzero DataFrame should expose +each stored column through a pandas-compatible one-dimensional array object +rather than trying to make the whole DataFrame look like one contiguous NumPy +array. + +The supported pandas integration point is `ExtensionArray` plus +`ExtensionDtype`. Pandas documents these as the custom one-dimensional array and +dtype interface. `ExtensionArray` instances may be stored directly inside a +`DataFrame` or `Series`, and pandas does not require a specific backing storage +layout. This is a better fit for dbzero than imitating every `numpy.ndarray` +operation because dbzero column storage may be backed by `v_bvector`, null-mask +blocks, string payload blocks, or other overlaid structures.
+ +The dbzero design should use a thin Python-visible array wrapper backed by a +C++ durable column object: + +```text +pandas Series/DataFrame column + Db0ExtensionArray Python object + Db0Column C++ wrapper + typed durable column storage + durable null mask + durable label/dtype metadata +``` + +The C++ column wrapper is the low-level interface. The pandas `ExtensionArray` +methods delegate to this wrapper. + +### Required Low-Level Column Operations + +Every durable column implementation should provide these foundational +operations: + +```cpp +class DataFrameColumn +{ +public: + std::size_t size() const; + DataFrameDType dtype() const; + std::size_t nbytes() const; + + PyObject *getScalar(std::size_t row) const; + void setScalar(FixtureLock &, std::size_t row, PyObject *value); + + bool isNull(std::size_t row) const; + void setNull(FixtureLock &, std::size_t row, bool is_null); + + std::shared_ptr<DataFrameColumn> slice(SliceSpec) const; + std::shared_ptr<DataFrameColumn> take( + FixtureLock *, const std::vector<std::int64_t> &indices, + bool allow_fill, PyObject *fill_value + ) const; + + void setMany(FixtureLock &, SelectionSpec rows, PyObject *values); + std::shared_ptr<DataFrameColumn> copy(db0::swine_ptr<Fixture> &, bool deep) const; + + PyObject *toNumpy(bool copy, PyObject *dtype, PyObject *na_value) const; + PyObject *toPandasArray() const; +}; +``` + +Required semantics: + +- `size()` is O(1). +- `getScalar()` returns a Python scalar or the dtype-specific missing value. +- `setScalar()` validates and writes one durable value through `modifyExt()`. +- `isNull()` reads the durable null mask. +- `take()` implements pandas positional selection, including `allow_fill`. +- `slice()` may return a view wrapper when safe, but may return a copy for v1. +- `setMany()` is the shared implementation for `.iloc`, `.loc`, and column + assignment. +- `copy(deep=True)` creates independent durable storage in the target fixture.
+- `toNumpy(copy=False)` may return a NumPy view only when the column has one + contiguous memory buffer with a stable lifetime. Otherwise it returns a copy. + +The low-level API should be intentionally smaller than pandas. Pandas-facing +behavior belongs in the `ExtensionArray` adapter; durable storage behavior +belongs in `DataFrameColumn`. + +### Required ExtensionArray Methods + +The pandas-facing wrapper should implement the abstract `ExtensionArray` +surface by delegating to the low-level column API: + +- `_from_sequence` +- `_from_factorized` +- `__getitem__` +- `__len__` +- `__eq__` +- `dtype` +- `nbytes` +- `isna` +- `take` +- `copy` +- `_concat_same_type` +- `interpolate` + +For useful performance and pandas compatibility, also implement: + +- `__setitem__` for durable mutation. +- `to_numpy` and `__array__` for NumPy conversion. +- `_values_for_factorize` and `_from_factorized`. +- `_values_for_argsort`. +- `_reduce` for simple reductions where the dtype supports them. +- `__array_ufunc__` only after the basic storage path is stable. + +For `__array_ufunc__`, return `NotImplemented` when any pandas `Series`, +`DataFrame`, or `Index` is present in the inputs. Pandas expects to unbox the +extension array and re-box the result itself. + +### Required ExtensionDtype Methods + +Each supported dbzero column kind should have a matching dtype object. + +The dtype wrapper must provide: + +- `type` +- `name` +- `construct_array_type` + +It should also provide: + +- `na_value` +- `_is_numeric` for numeric dtypes +- `_is_boolean` for boolean dtype +- `_get_common_dtype` for compatible dtype promotion + +The dtype name should be explicit, for example `dbzero[int64]`, +`dbzero[float64]`, `dbzero[bool]`, `dbzero[datetime64ns]`, and +`dbzero[string]`. The exact public names can be changed before implementation, +but they must be stable once persisted in any user-visible schema. 
+ +### NumPy Protocol Support + +NumPy interoperability is still useful, but it should not be the primary pandas +storage contract. + +For fixed-width columns that can expose a stable contiguous memory range, the +column object may expose: + +- Python buffer protocol. +- `__array_interface__`. +- `__array__`. + +For dbzero's likely block-backed `v_bvector` layout, full-column zero-copy NumPy +views may not be possible. In that case: + +- `__array__` returns a NumPy copy. +- `to_numpy(copy=False)` is best-effort and may still copy. +- pandas mutation must go through `ExtensionArray.__setitem__`, not through a + NumPy view. + +If a future column storage variant is explicitly contiguous, a NumPy view may be +returned with the dbzero column wrapper as the base object so the durable memory +stays alive for the lifetime of the view. + +## Index And Labels + +v1 should support: + +- default `RangeIndex` +- simple single-level indexes containing supported scalar values +- string column labels + +Column labels and row labels should be persisted separately from data columns. +`loc` resolves labels through the durable index metadata. `iloc` uses integer +positions directly. + +MultiIndex is rejected in v1. + +## Mutation Semantics + +All mutating Python APIs must use `PY_MUTATING_API_FUNC` and route native +changes through `modifyExt()`. + +Supported durable mutations: + +- scalar cell assignment by `.loc` and `.iloc` +- shape-compatible row/column slice assignment +- full column add or replacement through `frame["column"] = values` + +Mutation should validate: + +- the target column exists, unless column assignment is intentionally adding a + new column +- row and column selectors resolve to existing positions +- assigned value shape matches the selected region +- assigned values can be represented by the target dtype, or the whole column is + replaced with a supported new dtype + +For v1, scalar assignment should not silently widen column dtype. 
If a value +cannot be stored in the existing dtype, raise a clear error. Column replacement +may choose a new supported dtype based on the replacement values. + +Mutations inside `db0.read_only()` must be rejected. + +## Member Assignment + +When a pandas DataFrame is assigned to a memo field: + +1. `PyTypeManager` detects `TypeId::PANDAS_DATAFRAME`. +2. `StorageClassMapper` maps it to `PreStorageClass::DB0_DATAFRAME`. +3. `createMember` creates a new `DataFrame` object in + the target fixture and imports supported columns. +4. The new durable DataFrame increments its object reference count. +5. The memo field stores the DataFrame address as `StorageClass::DB0_DATAFRAME`. + +When a dbzero DataFrame wrapper is assigned: + +1. `createMember` extracts the native `DataFrame`. +2. If it belongs to the same fixture, increment the reference count and store + its address. +3. If it belongs to a different fixture, either auto-harden by moving the + unreferenced DataFrame to the target fixture or reject cross-prefix + assignment for v1. The conservative v1 default is to reject cross-prefix + assignment until move semantics are implemented for owned column blocks. + +## Unload, Fetch, And Load + +`unloadMember` returns a dbzero DataFrame wrapper. +It should use the language cache when possible, matching the behavior of other +dbzero collection wrappers. + +`db0.fetch(uuid)` should support DataFrame object IDs if fetch-by-UUID for +collection-like objects is expected for the new storage class. + +`db0.load()` and `db0.load_all()` should convert a dbzero DataFrame wrapper to a +pandas DataFrame copy. This keeps load output in ordinary Python/Pandas objects +rather than returning durable wrappers inside loaded graphs. + +## Atomic, Detach, And GC Behavior + +`DataFrame` must participate in the same lifecycle as existing dbzero +collections: + +- `incRef` and `decRef` use the root header. +- `destroy()` releases column metadata, index metadata, null masks, and data + blocks. 
+- `detach()` detaches all owned durable child objects and the root. +- `commit()` commits all owned durable child objects and the root. +- `beginModify()` integration should register wrappers with the atomic context + so rollback can detach stale views. + +If a mutation changes an owned child structure address, the root metadata must +be re-synced immediately, following the same discipline used for morphing +indexes and other address-changing structures. + +## Error Policy + +Errors should be explicit and early: + +- Missing pandas when DataFrame functionality is used: `RuntimeError` or + `ImportError` with an actionable message. +- Unsupported dtype: `TypeError`. +- Unsupported index shape: `TypeError`. +- Out-of-range `iloc`: `IndexError`. +- Missing `loc` label: `KeyError`. +- Shape mismatch on assignment: `ValueError`. +- Mutation in read-only context: `RuntimeError`. + +No unsupported DataFrame content should be silently converted to string, +pickled, or dropped. + +## Implementation Slices + +Use TDD. Start with Python behavior tests, then add native tests for storage +layout and lifecycle. + +Recommended slices: + +1. Add failing Python tests for `db0.dataframe(df)` and memo assignment. +2. Add type IDs, storage class, schema names, and stub registration. +3. Add minimal native `DataFrame` object with row/column metadata and one + numeric column type. +4. Add Python wrapper construction, unload, `.shape`, `.columns`, `.index`, and + `.to_pandas()`. +5. Add fixed-width dtype coverage and null masks. +6. Add string column storage. +7. Add `frame["column"]` read and replacement. +8. Add `.iloc` scalar read/write. +9. Add `.loc` scalar read/write. +10. Add slice/list reads and shape-compatible assignment. +11. Add load/fetch integration and `.pyi` stubs. +12. Add debug/release validation and C++ tests. + +## Tests + +Python tests should use `pytest.importorskip("pandas")` so the suite remains +valid when pandas is not installed. 
+ +Behavior tests: + +- `db0.dataframe(pd.DataFrame(...))` creates a dbzero DataFrame wrapper. +- A pandas DataFrame assigned as a memo member reopens with the same values, + columns, index, and supported dtypes. +- A dbzero DataFrame assigned as a memo member reopens correctly. +- `.to_pandas()` round-trips supported numeric, bool, datetime64, and string + columns. +- `frame["col"]` returns a pandas Series copy. +- `frame["col"] = values` persists across commit/reopen. +- `.iloc[row, col]` scalar get/set persists. +- `.loc[label, column]` scalar get/set persists. +- Slice/list reads return pandas copies. +- Shape-compatible slice/list assignment persists. +- Unsupported dtypes and MultiIndex raise clear errors. +- Mutations inside `db0.read_only()` raise. +- `db0.load(obj)` converts DataFrame members to pandas DataFrames. + +Native tests: + +- `o_dataframe` size and `safeSizeOf` validation. +- Column metadata can be created and reopened. +- Fixed-width column blocks persist values. +- Null masks persist missing values. +- String columns persist offsets and payload. +- `destroy`, `detach`, and `commit` process owned child structures. +- Address-changing child structures update root references. + +## Open Questions + +The following decisions can be deferred until implementation reaches the +relevant slice: + +- Whether string column payloads should use dedicated payload blocks or existing + string pool primitives. +- Whether cross-prefix DataFrame assignment should be rejected or auto-hardened. +- Whether `db0.load_all()` should always return pandas copies or preserve dbzero + wrappers behind an option. +- Whether a future optional `dbzero[pandas]` package extra should be added. + +## Feasibility + +The dbzero architecture can support this feature. 
Existing collection wrappers +already provide most of the required lifecycle patterns: type detection, +storage-class mapping, `ObjectBase` reference counting, wrapper cache use, +member creation/unload, read-only enforcement, and atomic mutation registration. + +The main implementation risk is pandas API breadth, not durable storage. v1 +must keep a narrow compatibility surface and reject unsupported pandas features +clearly. diff --git a/design/PYDANTIC_INTEGRATION_DESIGN.md b/design/PYDANTIC_INTEGRATION_DESIGN.md new file mode 100644 index 00000000..261b22c3 --- /dev/null +++ b/design/PYDANTIC_INTEGRATION_DESIGN.md @@ -0,0 +1,361 @@ +# Pydantic Integration Design + +This document describes a low-risk integration path between dbzero memo classes +and Pydantic. The integration should make memo classes usable in Pydantic +validation and serialization workflows without making memo classes inherit from +`pydantic.BaseModel`. + +## Goal + +dbzero memo classes should be accepted by Pydantic as first-class custom types: + +- Existing memo instances validate as instances of their memo class. +- Dictionaries and other supported mappings can be validated and converted into + new memo instances. +- Memo instances can serialize into plain Python values suitable for Pydantic + model dumping and JSON schema workflows. +- The feature is optional and does not add Pydantic as a mandatory runtime + dependency. + +Example target behavior: + +```python +import dbzero as db0 +from dataclasses import dataclass +from pydantic import BaseModel + + +@db0.memo +@dataclass +class User: + name: str + age: int + + +class Event(BaseModel): + user: User + + +existing = User("Ada", 36) +assert Event(user=existing).user is existing + +created = Event(user={"name": "Grace", "age": 37}).user +assert isinstance(created, User) +assert created.age == 37 +``` + +## Non-Goals + +Memo classes should not be converted into Pydantic models. 
In particular, the +following patterns are not part of this design: + +```python +@db0.memo +class User(pydantic.BaseModel): + ... + + +@db0.memo +@pydantic.dataclasses.dataclass +class User: + ... +``` + +These patterns conflict with dbzero's native Python extension type layout. +Pydantic models and Pydantic dataclasses expect to own instance state such as +`__dict__`, private attributes, validators, and model metadata. dbzero memo +instances instead route attribute access through native `tp_getattro` and +`tp_setattro` hooks and expose a synthetic read-only `__dict__`. + +The integration also should not enable Pydantic assignment validation by +default. Assigning `obj.field = value` on a memo object is a durable mutation, +so automatic assignment validation would need explicit mutation semantics and +read-only-context handling. + +## Current Compatibility + +The following patterns already work without dbzero changes: + +```python +from pydantic import BaseModel, ConfigDict + + +class Holder(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + user: User +``` + +This treats memo instances as opaque arbitrary types. It validates only that +the value is an instance of the target class. + +DTO-style validation also works: + +```python +class UserDTO(BaseModel): + model_config = ConfigDict(from_attributes=True) + name: str + age: int + + +dto = UserDTO.model_validate(user) +``` + +This is useful for read-side schemas but does not make the memo class itself a +Pydantic-supported type. + +## Recommended Integration + +Add Pydantic v2 support by installing `__get_pydantic_core_schema__` on wrapped +memo classes. + +Pydantic v2 uses `__get_pydantic_core_schema__` as the custom type hook. dbzero +can provide this hook on memo classes after `_wrap_memo_type` returns the native +wrapped type. The hook should be generated in the Python layer so Pydantic +remains optional and import failures are isolated. 
+ +Conceptually: + +```python +@classmethod +def __get_pydantic_core_schema__(cls, source_type, handler): + ... +``` + +The generated schema should: + +1. Accept existing instances of `cls`. +2. Validate mappings against fields inferred from annotations, dataclass + metadata, or constructor signature. +3. Construct a new `cls(**validated_values)` when input is a mapping. +4. Serialize memo instances through a dbzero load function. + +This approach avoids changing the native memo object layout and avoids relying +on Pydantic internals beyond its public custom type hook. + +## Dependency Model + +Pydantic must remain optional. + +Implementation should not import Pydantic at module import time. Instead: + +- `dbzero.memo` can install a lightweight classmethod that imports Pydantic only + when Pydantic asks for a schema. +- If Pydantic is unavailable, ordinary dbzero usage is unchanged. +- Packaging metadata should not add Pydantic to `project.dependencies`. +- If a dedicated extra is wanted later, use an optional extra such as + `dbzero[pydantic]`. + +## Field Discovery + +The generated schema needs a stable field list. + +Preferred sources, in order: + +1. `__annotations__`. +2. Dataclass fields from `dataclasses.fields(cls)` when available. +3. `inspect.signature(cls.__init__)`, excluding `self`. + +The existing bytecode-derived `py_init_vars` list is useful for dbzero field +layout and migrations, but it should not be the primary source for Pydantic +validation because it does not preserve type information. + +Dynamic fields are intentionally not fully representable. If a memo class uses +`**kwargs` or assigns fields conditionally, Pydantic support should either: + +- allow extra mapping keys and pass them through to the constructor when the + constructor accepts `**kwargs`, or +- reject unknown fields by default for classes without `**kwargs`. 
+ +The default should be conservative: validate declared fields, pass through only +when the constructor shape makes that clearly intentional. + +## Validation Semantics + +Input handling should follow these rules: + +- If input is already an instance of the memo class, return it unchanged. +- If input is a mapping, validate its declared fields and construct a new memo + instance. +- If input is not a mapping or memo instance, raise a Pydantic validation error. +- Missing required constructor parameters should produce Pydantic validation + errors before calling the memo constructor. +- Default values should be taken from dataclass fields or constructor + signatures. +- Values that Pydantic validates successfully may still be rejected by dbzero if + dbzero cannot persist them. That failure should propagate as a construction + error. + +Validation should not materialize immutable deferred objects unless normal memo +construction would do so. Pydantic validation must not introduce extra durable +side effects beyond constructing the memo object requested by the user. + +## Serialization Semantics + +Serialization should support both Python and JSON-oriented Pydantic dumping. + +Recommended default: + +```python +db0.load(obj) +``` + +This respects custom `__load__` methods and existing dbzero conversion rules. + +A future option may allow `db0.load_all(obj)` for schemas that require every +field, but the initial integration should use the same default serialization +surface dbzero users already know. + +Protected fields and access-control masking must be honored. Serialization +should read through normal Python/dbzero access paths rather than bypassing +field protection in native code. + +## JSON Schema + +Initial JSON schema support can be minimal: + +- For annotated memo classes, expose an object schema with declared properties. +- For classes without useful annotations, expose a generic object schema. 
+- For opaque instance-only use, a plain custom type schema is acceptable. + +Schema generation should not be allowed to force opening prefixes or +materializing dbzero classes. It should operate from Python type metadata only. + +## Constructor And Prefix Handling + +The mapping-to-instance validator should call the memo class constructor through +normal Python invocation: + +```python +return cls(**validated_values) +``` + +This keeps existing dbzero behavior for: + +- Static prefixes. +- Dynamic prefixes resolved inside `__init__`. +- `db0.set_prefix(self, prefix)` patterns. +- Singletons. +- Immutable and interned classes. +- Constructor-side tags and field assignments. + +For singleton classes, Pydantic validation from a mapping may return an existing +singleton and ignore constructor arguments, matching normal dbzero semantics. +This should be documented in user-facing docs if the feature is exposed. + +## Assignment Validation + +Do not implement Pydantic `validate_assignment` support for memo fields in the +initial integration. + +Durable assignment has dbzero-specific behavior: + +- It mutates persistent state. +- It must respect `db0.read_only()`. +- It may materialize referenced immutable objects. +- It may update reference counts, tags, indexes, and atomic context state. + +If assignment validation is added later, it should be an explicit helper such +as: + +```python +db0.pydantic_assign(obj, "field", value) +``` + +or a decorator option that clearly documents durable mutation behavior. + +## Implementation Plan + +Follow TDD. Add failing Python tests first under +`python_tests/test_pydantic_integration.py`. + +Implementation should be Python-side first: + +1. Add tests for existing memo instance validation through a Pydantic model. +2. Add tests for mapping input constructing a memo instance. +3. Add tests for serialization through `model_dump`. +4. Add tests for optional dependency behavior when Pydantic is not imported. +5. 
Add a helper in `dbzero/dbzero/memo.py` that attaches Pydantic hooks to the + wrapped memo class. +6. Keep native C++ changes out of the first implementation unless Python-side + attachment cannot preserve the hook. + +The hook attachment should happen after: + +```python +wrapped = _wrap_memo_type(...) +``` + +The wrapped type currently preserves annotations and dataclass metadata, so the +schema helper can inspect the wrapped class. + +## Test Plan + +Required tests: + +- A memo dataclass field in a Pydantic `BaseModel` accepts an existing memo + instance without `arbitrary_types_allowed=True`. +- A memo dataclass field accepts a dictionary and constructs a memo instance. +- Pydantic coerces simple annotated field values before construction, such as + `"7"` to `int`. +- Missing required fields produce a Pydantic validation error. +- Unknown fields are rejected for a constructor without `**kwargs`. +- Unknown fields are passed through for a memo class whose constructor accepts + `**kwargs`. +- `model_dump()` serializes a memo field to a plain dictionary using normal + dbzero loading. +- Custom memo `__load__` methods are respected by serialization. +- Existing memo instances validate by identity, not by copying. +- Singleton memo classes validate according to normal singleton construction + semantics. +- Immutable memo classes validate without forcing unexpected materialization. +- Pydantic is not imported during normal `import dbzero`. + +Optional tests: + +- JSON schema for an annotated memo class contains object properties. +- A protected field masked from normal reads is not exposed by Pydantic + serialization. +- `db0.read_only()` rejects mapping validation that would construct or mutate a + durable memo instance. + +Do not add tests that require direct inheritance from `BaseModel` or Pydantic +dataclasses. Those patterns are non-goals and should remain unsupported unless +the native object layout changes substantially. 
+ +## Risks + +The main risk is surprising durable side effects. Pydantic validation is often +viewed as a pure data transformation, while constructing a dbzero memo object +persists state. Documentation and examples must make this clear. + +Other risks: + +- Pydantic's custom core-schema APIs may change across major versions. +- Pydantic can validate values that dbzero later rejects as unsupported durable + field types. +- Dynamic memo classes may not have enough static metadata for precise schemas. +- Serialization may be expensive for large object graphs. +- Custom `__load__` methods may return shapes that differ from the validation + schema. + +These risks are acceptable if the first implementation is opt-in through +Pydantic's normal type usage and avoids changing dbzero construction semantics. + +## Open Questions + +- Should serialization use `db0.load` or `db0.load_all` by default? +- Should there be a decorator option to disable Pydantic hook generation for a + specific memo class? +- Should unknown mapping keys default to reject or pass through for non-dataclass + classes with permissive constructors? +- Should user-facing docs recommend DTO models for read-only validation and memo + schemas only for construction? +- Should Pydantic v1 be supported at all via `__get_validators__`, or should the + integration target Pydantic v2 only? + +## Recommendation + +Implement Pydantic v2 support as an optional generated custom-type hook on memo +classes. Do not attempt to make memo classes Pydantic models. Keep the first +iteration Python-only, test-driven, and limited to validation from instances, +validation from mappings, and normal dbzero serialization.
diff --git a/python_tests/test_copy_prefix.py b/python_tests/test_copy_prefix.py index e8ba25b1..b067abab 100644 --- a/python_tests/test_copy_prefix.py +++ b/python_tests/test_copy_prefix.py @@ -9,7 +9,6 @@ from .conftest import DB0_DIR, worker_path import multiprocessing - def test_copy_current_prefix(db0_fixture): file_name = worker_path("./test-copy.db0") # remove file if it exists @@ -193,7 +192,6 @@ def test_copy_prefix_without_opening_it(db0_fixture): @pytest.mark.stress_test -@pytest.mark.skip(reason="https://github.com/dbzero-software/dbzero/issues/662") def test_copy_prefix_continuous_process(db0_fixture): px_name = db0.get_current_prefix().name px_path = os.path.join(DB0_DIR, px_name + ".db0") @@ -453,7 +451,6 @@ def validate_copy(copy_id, expected_len = None, expected_min_len = None): @pytest.mark.stress_test -@pytest.mark.skip(reason="https://github.com/dbzero-software/dbzero/issues/662") def test_copy_prefix_continuous_process_slow_copy(db0_fixture): if 'D' in db0.build_flags(): px_name = db0.get_current_prefix().name diff --git a/python_tests/test_copy_prefix_recovery_regression.py b/python_tests/test_copy_prefix_recovery_regression.py new file mode 100644 index 00000000..9b8be07c --- /dev/null +++ b/python_tests/test_copy_prefix_recovery_regression.py @@ -0,0 +1,164 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# Copyright (c) 2025 DBZero Software sp. z o.o. 
+ +import os +import multiprocessing +import time + +import dbzero as db0 + +from .conftest import DB0_DIR, worker_path +from .memo_test_types import MemoTestClass, MemoTestSingleton + + +def _copy_prefix_live_writer(prefix, obj_count, commit_count, sleep_seconds): + db0.init(DB0_DIR) + db0.open(prefix, "rw") + root = MemoTestSingleton([]) + for _ in range(commit_count): + for _ in range(obj_count): + root.value.append(MemoTestClass("b" * 1024)) + db0.commit() + time.sleep(sleep_seconds) + db0.close() + + +def test_copy_prefix_recovered_file_reopens_read_only(db0_fixture): + copy_file_name = worker_path("./test-copy-recovery.db0") + if os.path.exists(copy_file_name): + os.remove(copy_file_name) + + px_name = db0.get_current_prefix().name + px_path = os.path.join(DB0_DIR, px_name + ".db0") + + root = MemoTestSingleton([]) + for _ in range(50): + root.value.append(MemoTestClass("a" * 1024)) + db0.commit() + + db0.copy_prefix(copy_file_name) + db0.close() + + os.remove(px_path) + os.rename(copy_file_name, px_path) + + db0.init(DB0_DIR, prefix=px_name, read_write=False) + root = db0.fetch(MemoTestSingleton) + assert [item.value for item in root.value] == ["a" * 1024] * 50 + + +def test_copy_closed_prefix_by_name_recovered_file_reopens_read_only(db0_fixture): + copy_file_name = worker_path("./test-copy-closed-prefix.db0") + if os.path.exists(copy_file_name): + os.remove(copy_file_name) + + px_name = db0.get_current_prefix().name + px_path = os.path.join(DB0_DIR, px_name + ".db0") + + root = MemoTestSingleton([]) + for _ in range(5): + root.value.append(MemoTestClass("a" * 1024)) + db0.commit() + db0.close() + + db0.init(DB0_DIR) + db0.copy_prefix(copy_file_name, prefix=px_name) + db0.close() + + os.remove(px_path) + os.rename(copy_file_name, px_path) + + db0.init(DB0_DIR, prefix=px_name, read_write=False) + root = db0.fetch(MemoTestSingleton) + assert [item.value for item in root.value] == ["a" * 1024] * 5 + + +def 
test_copy_prefix_while_writer_active_then_final_copy_recovers(db0_fixture): + live_copy_file_name = worker_path("./test-copy-live-prefix.db0") + final_copy_file_name = worker_path("./test-copy-live-prefix-final.db0") + for file_name in (live_copy_file_name, final_copy_file_name): + if os.path.exists(file_name): + os.remove(file_name) + + px_name = db0.get_current_prefix().name + px_path = os.path.join(DB0_DIR, px_name + ".db0") + db0.close() + + obj_count = 500 + commit_count = 50 + writer = multiprocessing.Process( + target=_copy_prefix_live_writer, args=(px_name, obj_count, commit_count, 0.01)) + writer.start() + + db0.init(DB0_DIR) + db0.open(px_name, "r") + while writer.is_alive(): + try: + if db0.exists(MemoTestSingleton) and len(db0.fetch(MemoTestSingleton).value) > obj_count: + break + except Exception: + pass + time.sleep(0.02) + + assert writer.is_alive() + db0.copy_prefix(live_copy_file_name, prefix=px_name) + writer.join() + + db0.copy_prefix(final_copy_file_name, prefix=px_name) + db0.close() + + os.remove(px_path) + os.rename(final_copy_file_name, px_path) + + db0.init(DB0_DIR, prefix=px_name, read_write=False) + root = db0.fetch(MemoTestSingleton) + assert len(root.value) == obj_count * commit_count + assert [item.value for item in root.value] == ["b" * 1024] * (obj_count * commit_count) + + +def test_copy_prefix_repeated_live_copies_do_not_observe_unreadable_descriptor_diffs(db0_fixture): + px_name = db0.get_current_prefix().name + px_path = os.path.join(DB0_DIR, px_name + ".db0") + db0.close() + + obj_count = 500 + commit_count = 120 + writer = multiprocessing.Process( + target=_copy_prefix_live_writer, args=(px_name, obj_count, commit_count, 0.0)) + writer.start() + + db0.init(DB0_DIR) + db0.open(px_name, "r") + while writer.is_alive(): + try: + if db0.exists(MemoTestSingleton) and len(db0.fetch(MemoTestSingleton).value) > obj_count: + break + except Exception: + pass + time.sleep(0.01) + + assert writer.is_alive() + copy_count = 0 + 
copy_file_names = [] + while writer.is_alive() and copy_count < 12: + copy_file_name = worker_path(f"./test-copy-live-prefix-repeat-{copy_count}.db0") + if os.path.exists(copy_file_name): + os.remove(copy_file_name) + db0.copy_prefix(copy_file_name, prefix=px_name) + copy_file_names.append(copy_file_name) + copy_count += 1 + + writer.join() + db0.close() + assert copy_count > 1 + + last_len = 0 + for copy_file_name in copy_file_names: + os.remove(px_path) + os.rename(copy_file_name, px_path) + db0.init(DB0_DIR, prefix=px_name, read_write=False) + root = db0.fetch(MemoTestSingleton) + assert len(root.value) >= last_len + assert all(item.value == "b" * 1024 for item in root.value) + last_len = len(root.value) + db0.close() diff --git a/python_tests/test_page_io.py b/python_tests/test_page_io.py index 45bccd8c..b505babd 100644 --- a/python_tests/test_page_io.py +++ b/python_tests/test_page_io.py @@ -18,14 +18,17 @@ def test_create_prefix_with_page_io_step_size(db0_fixture): px_size_1 = db0.get_storage_stats()["prefix_size"] assert px_size_1 > (16 << 20) - # after adding more pages, prefix size should not increase until next step is reached + # SparsePairManager stores application sparse pairs in descriptor-backed meta-space. + # DRAM metadata is append-only so concurrent readers can still open the previous + # committed root state while a writer is publishing the next one. The file may + # grow with metadata, but it must not allocate another 16 MB page-IO step. 
for _ in range(50): buf.append(MemoTestClass("a" * 1024)) # 1 KB string # commit after each append db0.commit() - + px_size_2 = db0.get_storage_stats()["prefix_size"] - assert (px_size_2 - px_size_1) < (128 << 10) + assert (px_size_2 - px_size_1) < (8 << 20) def test_continue_append_with_step_size(db0_fixture): @@ -44,6 +47,7 @@ def test_continue_append_with_step_size(db0_fixture): root.value.append(MemoTestClass("a" * 1024)) db0.commit() - # NOTE: this behavior will change after we implement REL_Index - assert db0.get_storage_stats()["prefix_size"] > (32 << 20) - \ No newline at end of file + px_size = db0.get_storage_stats()["prefix_size"] + assert px_size > (16 << 20) + assert px_size < (48 << 20) + diff --git a/python_tests/test_refresh_stress_tests.py b/python_tests/test_refresh_stress_tests.py index 0d43967f..de0cb8e7 100644 --- a/python_tests/test_refresh_stress_tests.py +++ b/python_tests/test_refresh_stress_tests.py @@ -3,6 +3,7 @@ import pytest import multiprocessing +import queue import time import dbzero as db0 import os @@ -66,6 +67,136 @@ def create_process_refresh_query_while_adding(px_name, num_iterations, db0.close() +def _get_sparse_pair_manager_refresh_stress_config(): + # Increase DB0_SPM_REFRESH_STRESS_SECONDS or set DB0_SPM_REFRESH_STRESS_MAX_COMMITS=0 + # for open-ended long-duration runs. Large fast-writer settings are expected + # to exercise SparsePairManager refresh catch-up aggressively. 
+ return { + "duration_seconds": float(os.environ.get("DB0_SPM_REFRESH_STRESS_SECONDS", "10")), + "batch_size": int(os.environ.get("DB0_SPM_REFRESH_STRESS_BATCH_SIZE", "256")), + "payload_size": int(os.environ.get("DB0_SPM_REFRESH_STRESS_PAYLOAD_SIZE", "2048")), + "max_commits": int(os.environ.get("DB0_SPM_REFRESH_STRESS_MAX_COMMITS", "200")), + "reader_sleep_seconds": float(os.environ.get("DB0_SPM_REFRESH_STRESS_READER_SLEEP", "0.01")), + "catch_up_seconds": float(os.environ.get("DB0_SPM_REFRESH_STRESS_CATCH_UP_SECONDS", "60")), + } + + +def _sparse_pair_manager_refresh_writer(px_name, config, result_queue): + try: + db0.init(DB0_DIR) + db0.open(px_name, "rw") + root = MemoTestSingleton([]) + start_time = time.monotonic() + commit_count = 0 + total_count = 0 + + while True: + if config["max_commits"] and commit_count >= config["max_commits"]: + break + if time.monotonic() - start_time >= config["duration_seconds"]: + break + + payload = f"{commit_count:08d}-" + ("x" * config["payload_size"]) + for _ in range(config["batch_size"]): + root.value.append(MemoTestClass(payload)) + db0.commit() + + commit_count += 1 + total_count += config["batch_size"] + if commit_count % 10 == 0: + result_queue.put(("progress", total_count)) + + result_queue.put(("done", total_count)) + db0.close() + except BaseException as exc: + result_queue.put(("error", repr(exc))) + try: + db0.close() + except BaseException: + pass + + +@pytest.mark.stress_test +@pytest.mark.parametrize("stress_config", [_get_sparse_pair_manager_refresh_stress_config()]) +def test_sparse_pair_manager_sparse_indexes_refresh_under_long_running_updates(db0_fixture, stress_config): + root = MemoTestSingleton([]) + px_name = db0.get_current_prefix().name + db0.commit() + db0.close() + + result_queue = multiprocessing.Queue() + writer = multiprocessing.Process( + target=_sparse_pair_manager_refresh_writer, + args=(px_name, stress_config, result_queue), + ) + writer.start() + + final_count = None + last_seen_count = 0 + 
refresh_count = 0 + last_state_num = 0 + last_refresh_result = None + start_time = time.monotonic() + writer_timeout_seconds = max(30.0, stress_config["duration_seconds"] * 4) + catch_up_start_time = None + + try: + db0.init(DB0_DIR) + db0.open(px_name, "r") + while True: + try: + while True: + event, value = result_queue.get_nowait() + if event == "error": + raise AssertionError(f"writer failed: {value}") + if event == "done": + final_count = value + catch_up_start_time = time.monotonic() + except queue.Empty: + pass + + last_refresh_result = db0.refresh() + refresh_count += 1 + last_state_num = db0.get_state_num(px_name) + + with db0.snapshot() as snap: + root = snap.fetch(MemoTestSingleton) + current_count = len(root.value) + assert current_count >= last_seen_count + if current_count: + first = root.value[0].value + last = root.value[current_count - 1].value + assert isinstance(first, str) and first.endswith("x" * stress_config["payload_size"]) + assert isinstance(last, str) and last.endswith("x" * stress_config["payload_size"]) + last_seen_count = current_count + + if final_count is not None and last_seen_count >= final_count: + break + if final_count is None and time.monotonic() - start_time > writer_timeout_seconds: + raise AssertionError( + f"writer did not finish: seen={last_seen_count}, refresh_count={refresh_count}" + ) + if (catch_up_start_time is not None + and time.monotonic() - catch_up_start_time > stress_config["catch_up_seconds"]): + raise AssertionError( + f"reader did not catch up: seen={last_seen_count}, final={final_count}, " + f"refresh_count={refresh_count}, state_num={last_state_num}, " + f"last_refresh_result={last_refresh_result}" + ) + time.sleep(stress_config["reader_sleep_seconds"]) + + writer.join(timeout=5) + assert writer.exitcode == 0 + assert final_count is not None + assert last_seen_count == final_count + assert refresh_count > 0 + finally: + if writer.is_alive(): + writer.terminate() + writer.join() + db0.close() + + 
@pytest.mark.stress_test def test_refresh_query_while_adding_new_objects(db0_fixture): px_name = db0.get_current_prefix().name diff --git a/src/dbzero/bindings/python/PyInternalAPI.cpp b/src/dbzero/bindings/python/PyInternalAPI.cpp index 5fd44791..23842f01 100644 --- a/src/dbzero/bindings/python/PyInternalAPI.cpp +++ b/src/dbzero/bindings/python/PyInternalAPI.cpp @@ -39,6 +39,8 @@ #include #include #include +#include +#include namespace db0::python @@ -1260,15 +1262,19 @@ namespace db0::python meta_io_step_size = in_meta_step_size > 1 ? in_meta_step_size : (1u << 20); } + std::unique_ptr out; try { BDevStorage::create(output_file_name, src_storage.getPageSize(), src_storage.getDRAMPageSize(), page_io_step_size); - BDevStorage out(output_file_name, db0::AccessType::READ_WRITE); + out = std::unique_ptr(new BDevStorage(output_file_name, db0::AccessType::READ_WRITE)); // copy entire prefix - src_storage.copyTo(out); - out.close(); + src_storage.copyTo(*out); + out->close(); } catch (...) { // cleanup try { + if (out) { + out->close(); + } if (db0::CFile::exists(output_file_name)) { db0::CFile::remove(output_file_name); } @@ -1324,29 +1330,40 @@ namespace db0::python } } - std::unique_ptr storage; - try { - auto prefix = tryFindPrefixFromArgs(py_prefix); - StorageFlags flags = { StorageOptions::NO_LOAD }; - if (prefix) { - // open as a copy of an existing prefix - auto &in = prefix->getPrefix().getStorage().asFile(); - storage = std::unique_ptr(new BDevStorage( - in.getFileName(), AccessType::READ_ONLY, {}, in.getMetaIO().getStepSize(), flags) - ); - } else { - // NOTE: for copy we open the storage as NO_LOAD - storage = getPrefixStorage(py_prefix, meta_io_step_size, flags); - } - auto result = Py_OWN(tryCopyPrefixImpl(*storage, output_file_name, page_io_step_size, meta_io_step_size)); - storage->close(); - return result.steal(); - } catch (...) 
{ - if (storage) { + constexpr unsigned int copy_attempt_count = 8; + for (unsigned int attempt = 0; attempt < copy_attempt_count; ++attempt) { + std::unique_ptr storage; + try { + auto prefix = tryFindPrefixFromArgs(py_prefix); + StorageFlags flags = { StorageFlagOption::NO_LOAD }; + if (prefix) { + // open as a copy of an existing prefix + auto &in = prefix->getPrefix().getStorage().asFile(); + storage = std::unique_ptr(new BDevStorage( + in.getFileName(), AccessType::READ_ONLY, {}, in.getMetaIO().getStepSize(), flags) + ); + } else { + storage = getPrefixStorage(py_prefix, meta_io_step_size, flags); + } + auto result = Py_OWN(tryCopyPrefixImpl(*storage, output_file_name, page_io_step_size, meta_io_step_size)); storage->close(); + return result.steal(); + } catch (db0::IOException &) { + if (storage) { + storage->close(); + } + if (attempt + 1 == copy_attempt_count) { + throw; + } + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } catch (...) { + if (storage) { + storage->close(); + } + throw; } - throw; } + Py_UNREACHABLE(); } #ifndef NDEBUG diff --git a/src/dbzero/core/collections/SGB_Tree/SGB_CompressedLookupTree.hpp b/src/dbzero/core/collections/SGB_Tree/SGB_CompressedLookupTree.hpp index 8819a173..87ea1adb 100644 --- a/src/dbzero/core/collections/SGB_Tree/SGB_CompressedLookupTree.hpp +++ b/src/dbzero/core/collections/SGB_Tree/SGB_CompressedLookupTree.hpp @@ -227,22 +227,24 @@ DB0_PACKED_END using CompT = typename super_t::CompT; using NodeItemCompT = typename super_t::NodeItemCompT; using NodeItemEqualT = typename super_t::NodeItemEqualT; + using HeapCompT = typename super_t::HeapCompT; using const_iterator = typename super_t::const_iterator; + static constexpr unsigned int DEFAULT_SORT_THRESHOLD = super_t::DEFAULT_SORT_THRESHOLD; // as null / invalid SGB_CompressedLookupTree() = default; SGB_CompressedLookupTree(Memspace &memspace, std::size_t node_capacity, AccessType access_type, const CompT &comp = {}, const NodeItemCompT &item_cmp = {}, 
const NodeItemEqualT &item_eq = {}, - unsigned int sort_thr = super_t::DEFAULT_SORT_THRESHOLD) - : super_t(memspace, node_capacity, access_type, comp, item_cmp, item_eq, sort_thr) + unsigned int sort_thr = super_t::DEFAULT_SORT_THRESHOLD, Allocator::SlotId slot_num = 0) + : super_t(memspace, node_capacity, access_type, comp, item_cmp, item_eq, sort_thr, slot_num) { } SGB_CompressedLookupTree(mptr ptr, std::size_t node_capacity, AccessType access_type, const CompT &comp = {}, const NodeItemCompT &item_cmp = {}, const NodeItemEqualT &item_eq = {}, - unsigned int sort_thr = super_t::DEFAULT_SORT_THRESHOLD) - : super_t(ptr, node_capacity, access_type, comp, item_cmp, item_eq, sort_thr) + unsigned int sort_thr = super_t::DEFAULT_SORT_THRESHOLD, Allocator::SlotId slot_num = 0) + : super_t(ptr, node_capacity, access_type, comp, item_cmp, item_eq, sort_thr, slot_num) { } @@ -264,6 +266,72 @@ DB0_PACKED_END insert_into(node, 0, item); } + template bool erase_equal(const KeyT &key) + { + assert(this->m_access_type == AccessType::READ_WRITE); + auto node = base_t::lower_equal_bound(key); + if (node == base_t::end() || !node->header().canFit(key)) { + return false; + } + if (!node.modify().erase(node->header().compress(key), this->m_heap_comp)) { + return false; + } + --base_t::modify().m_sgb_size; + if (node->empty()) { + base_t::erase(node); + } + return true; + } + + std::size_t erase_range(const ItemT &first, const ItemT &last) + { + assert(this->m_access_type == AccessType::READ_WRITE); + if (base_t::empty() || !m_raw_item_comp(first, last)) { + return 0; + } + + auto node = base_t::lower_equal_bound(last); + if (node == base_t::end()) { + return 0; + } + + std::size_t removed = 0; + for (;;) { + auto max_item_ptr = node->find_max(this->m_heap_comp); + assert(max_item_ptr); + if (m_raw_item_comp(node->header().uncompress(*max_item_ptr), first)) { + break; + } + + auto prev_node = node; + bool has_prev_node = prev_node != base_t::begin(); + if (has_prev_node) { + 
--prev_node; + } + + auto header = node->header(); + auto removed_from_node = node.modify().erase_if([&](const CompressedItemT &item) { + auto uncompressed = header.uncompress(item); + return !m_raw_item_comp(uncompressed, first) && m_raw_item_comp(uncompressed, last); + }, this->m_heap_comp); + + removed += removed_from_node; + if (removed_from_node && node->empty()) { + base_t::erase(node); + } + + if (!has_prev_node) { + break; + } + node = prev_node; + } + + if (removed) { + base_t::modify().m_sgb_size -= removed; + } + return removed; + } + AddressT getAddress() const { return base_t::getAddress(); } @@ -283,6 +351,13 @@ DB0_PACKED_END std::size_t size() const { return super_t::size(); } + + void clear() + { + assert(this->m_access_type == AccessType::READ_WRITE); + base_t::clear(); + base_t::modify().m_sgb_size = 0; + } void commit() const { super_t::commit(); @@ -295,11 +370,6 @@ DB0_PACKED_END return { nullptr, sg_tree_const_iterator() }; } - // node will be sorted if needed (only if in READ/WRITE mode) - if (this->m_access_type == AccessType::READ_WRITE) { - this->onNodeLookup(node); - } - if (node->header().canFit(key)) { // within the node look up by compressed key (only if able to fit) return { node->lower_equal_bound(node->header().compress(key), this->m_heap_comp), node }; @@ -324,11 +394,6 @@ DB0_PACKED_END THROWF(db0::InternalException) << "Corrupted SGB_CompressedLookupTree node found at " << node.getAddress(); } - // node will be sorted if needed (only if opened as READ/WRITE) - if (this->m_access_type == AccessType::READ_WRITE) { - this->onNodeLookup(node); - } - // within the node look up by compressed key if (node->header().canFit(key)) { auto item_ptr = node->lower_equal_bound(node->header().compress(key), this->m_heap_comp); @@ -356,10 +421,6 @@ DB0_PACKED_END --node; } - // node will be sorted if needed (only if opened as READ/WRITE) - if (this->m_access_type == AccessType::READ_WRITE) { - this->onNodeLookup(node); - } // within the node 
look up by compressed key const CompressedItemT *item_ptr = nullptr; if (node->header().canFit(key)) { @@ -388,11 +449,6 @@ DB0_PACKED_END return nullptr; } - // node will be sorted if needed (only if opened as READ/WRITE) - if (this->m_access_type == AccessType::READ_WRITE) { - this->onNodeLookup(node); - } - if (node->header().canFit(key)) { // within the node look up by compressed key return node->lower_equal_bound(node->header().compress(key), this->m_heap_comp); @@ -421,6 +477,22 @@ DB0_PACKED_END } } + void forRange(const ItemT &first, const ItemT &end, + const std::function &callback) const + { + if (!m_raw_item_comp(first, end)) { + return; + } + + for (auto item = sortedBeginFrom(first); !item.is_end(); ++item) { + auto uncompressed = *item; + if (!m_raw_item_comp(uncompressed, end)) { + return; + } + callback(uncompressed); + } + } + void detach() const { super_t::detach(); } @@ -458,6 +530,99 @@ DB0_PACKED_END return super_t::cbegin(); } + class ConstSortedIterator + { + public: + ConstSortedIterator(const SGB_CompressedLookupTree &tree) + : m_node(tree.cbegin_nodes()) + , m_node_end(tree.cend_nodes()) + , m_heap_comp(tree.m_heap_comp) + { + advance_to_first(); + } + + ConstSortedIterator(const SGB_CompressedLookupTree &tree, const ItemT &first) + : m_node(tree.base_t::lower_equal_bound(first)) + , m_node_end(tree.cend_nodes()) + , m_heap_comp(tree.m_heap_comp) + { + if (m_node == m_node_end) { + m_node = tree.cbegin_nodes(); + } + advance_to_first(first, tree.m_raw_item_comp); + } + + ConstSortedIterator &operator++() + { + assert(!is_end()); + ++m_item; + advance_to_next(); + return *this; + } + + bool is_end() const { + return m_node == m_node_end || m_item.is_end(); + } + + ItemT operator*() const + { + assert(!is_end()); + return m_node->header().uncompress(*m_item); + } + + private: + void advance_to_first() + { + if (m_node != m_node_end) { + m_item = m_node->cbegin_sorted(m_heap_comp); + advance_to_next(); + } + } + + void advance_to_first(const 
ItemT &first, const ItemCompT &raw_item_comp) + { + while (m_node != m_node_end) { + auto header = m_node->header(); + auto max_item_ptr = m_node->find_max(m_heap_comp); + assert(max_item_ptr); + if (raw_item_comp(header.uncompress(*max_item_ptr), first)) { + ++m_node; + continue; + } + + m_item = m_node->cbegin_sorted(m_heap_comp); + while (!m_item.is_end() && raw_item_comp(header.uncompress(*m_item), first)) { + ++m_item; + } + advance_to_next(); + return; + } + } + + void advance_to_next() + { + while (m_node != m_node_end && m_item.is_end()) { + ++m_node; + if (m_node != m_node_end) { + m_item = m_node->cbegin_sorted(m_heap_comp); + } + } + } + + sg_tree_const_iterator m_node; + sg_tree_const_iterator m_node_end; + typename super_t::sgb_node_const_sorting_iterator m_item; + HeapCompT m_heap_comp; + }; + + ConstSortedIterator sortedBegin() const { + return ConstSortedIterator(*this); + } + + ConstSortedIterator sortedBeginFrom(const ItemT &first) const { + return ConstSortedIterator(*this, first); + } + private: ItemCompT m_raw_item_comp; @@ -487,4 +652,4 @@ DB0_PACKED_END }; -} \ No newline at end of file +} diff --git a/src/dbzero/core/collections/SGB_Tree/SGB_LookupTree.hpp b/src/dbzero/core/collections/SGB_Tree/SGB_LookupTree.hpp index f48d43af..026b89ca 100644 --- a/src/dbzero/core/collections/SGB_Tree/SGB_LookupTree.hpp +++ b/src/dbzero/core/collections/SGB_Tree/SGB_LookupTree.hpp @@ -321,6 +321,27 @@ DB0_PACKED_BEGIN bool erase_existing(unsigned int at, const HeapCompT &comp) { return this->erase_existing(this->itemAt(at), comp); } + + template std::size_t erase_if(PredicateT predicate, const HeapCompT &comp) + { + std::size_t removed = 0; + auto step_ = this->step(); + auto it = this->begin(); + auto end_ = this->end(); + while (it != end_) { + if (predicate(*it)) { + this->erase_existing(it, comp); + ++removed; + end_ -= step_; + if (this->empty()) { + break; + } + } else { + it += step_; + } + } + return removed; + } class const_sorting_iterator { @@ 
-478,8 +499,8 @@ DB0_PACKED_END SGB_LookupTreeBase(Memspace &memspace, std::size_t node_capacity, AccessType access_type, const CompT &comp = {}, const NodeItemCompT item_cmp = {}, const NodeItemEqualT item_eq = {}, - unsigned int sort_thr = DEFAULT_SORT_THRESHOLD) - : super_t(memspace, node_capacity, comp, item_cmp, item_eq) + unsigned int sort_thr = DEFAULT_SORT_THRESHOLD, Allocator::SlotId slot_num = 0) + : super_t(memspace, node_capacity, comp, item_cmp, item_eq, slot_num) , m_sort_threshold(sort_thr) , m_access_type(access_type) { @@ -487,8 +508,8 @@ DB0_PACKED_END SGB_LookupTreeBase(mptr ptr, std::size_t node_capacity, AccessType access_type, const CompT &comp = {}, const NodeItemCompT item_cmp = {}, const NodeItemEqualT item_eq = {}, - unsigned int sort_thr = DEFAULT_SORT_THRESHOLD) - : super_t(ptr, node_capacity, comp, item_cmp, item_eq) + unsigned int sort_thr = DEFAULT_SORT_THRESHOLD, Allocator::SlotId slot_num = 0) + : super_t(ptr, node_capacity, comp, item_cmp, item_eq, slot_num) , m_sort_threshold(sort_thr) , m_access_type(access_type) { @@ -513,10 +534,6 @@ DB0_PACKED_END return { nullptr, sg_tree_const_iterator() }; } - // node will be sorted if needed (only if in READ/WRITE mode) - if (m_access_type == AccessType::READ_WRITE) { - this->onNodeLookup(node); - } return { node->lower_equal_bound(key, this->m_heap_comp), node }; } @@ -569,17 +586,17 @@ DB0_PACKED_END SGB_LookupTree(Memspace &memspace, std::size_t node_capacity, AccessType access_type, const CompT &comp = {}, const ItemCompT &item_cmp = {}, const ItemEqualT &item_eq = {}, - unsigned int sort_thr = super_t::DEFAULT_SORT_THRESHOLD) - : super_t(memspace, node_capacity, access_type, comp, item_cmp, item_eq, sort_thr) + unsigned int sort_thr = super_t::DEFAULT_SORT_THRESHOLD, Allocator::SlotId slot_num = 0) + : super_t(memspace, node_capacity, access_type, comp, item_cmp, item_eq, sort_thr, slot_num) { } SGB_LookupTree(mptr ptr, std::size_t node_capacity, AccessType access_type, const CompT 
&comp = {}, const ItemCompT &item_cmp = {}, const ItemEqualT &item_eq = {}, - unsigned int sort_thr = super_t::DEFAULT_SORT_THRESHOLD) - : super_t(ptr, node_capacity, access_type, comp, item_cmp, item_eq, sort_thr) + unsigned int sort_thr = super_t::DEFAULT_SORT_THRESHOLD, Allocator::SlotId slot_num = 0) + : super_t(ptr, node_capacity, access_type, comp, item_cmp, item_eq, sort_thr, slot_num) { } }; -} \ No newline at end of file +} diff --git a/src/dbzero/core/collections/SGB_Tree/SGB_Tree.hpp b/src/dbzero/core/collections/SGB_Tree/SGB_Tree.hpp index 04bd5a03..452b392d 100644 --- a/src/dbzero/core/collections/SGB_Tree/SGB_Tree.hpp +++ b/src/dbzero/core/collections/SGB_Tree/SGB_Tree.hpp @@ -103,8 +103,9 @@ namespace db0 * @tparam args optional arguments for the header's constructor */ SGB_TreeBase(Memspace &memspace, std::size_t node_capacity, - const CompT &comp = {}, const NodeItemCompT &item_cmp = {}, const NodeItemEqualT &item_eq = {}) - : super_t(memspace, comp, node_capacity) + const CompT &comp = {}, const NodeItemCompT &item_cmp = {}, const NodeItemEqualT &item_eq = {}, + Allocator::SlotId slot_num = 0) + : super_t(typename super_t::tag_runtime_slot(), memspace, slot_num, comp, node_capacity) , m_node_capacity(node_capacity) , m_item_comp(item_cmp) , m_heap_comp(item_cmp, item_eq) @@ -112,8 +113,9 @@ namespace db0 } SGB_TreeBase(mptr ptr, std::size_t node_capacity, - const CompT &comp = {}, const NodeItemCompT &item_cmp = {}, const NodeItemEqualT &item_eq = {}) - : super_t(ptr, comp) + const CompT &comp = {}, const NodeItemCompT &item_cmp = {}, const NodeItemEqualT &item_eq = {}, + Allocator::SlotId slot_num = 0) + : super_t(ptr, comp, slot_num) , m_node_capacity(node_capacity) , m_item_comp(item_cmp) , m_heap_comp(item_cmp, item_eq) @@ -604,16 +606,16 @@ namespace db0 using CompT = typename super_t::CompT; SGB_Tree(Memspace &memspace, std::size_t node_capacity, const CompT &comp = {}, const ItemCompT &item_comp = {}, - const ItemEqualT &item_eq = {}) - : 
super_t(memspace, node_capacity, comp, item_comp, item_eq) + const ItemEqualT &item_eq = {}, Allocator::SlotId slot_num = 0) + : super_t(memspace, node_capacity, comp, item_comp, item_eq, slot_num) { } SGB_Tree(mptr ptr, std::size_t node_capacity, const CompT &comp = {}, const ItemCompT &item_comp = {}, - const ItemEqualT &item_eq = {}) - : super_t(ptr, node_capacity, comp, item_comp, item_eq) + const ItemEqualT &item_eq = {}, Allocator::SlotId slot_num = 0) + : super_t(ptr, node_capacity, comp, item_comp, item_eq, slot_num) { } }; -} \ No newline at end of file +} diff --git a/src/dbzero/core/collections/sgtree/v_sgtree.hpp b/src/dbzero/core/collections/sgtree/v_sgtree.hpp index 4df76666..2ec0ea8b 100644 --- a/src/dbzero/core/collections/sgtree/v_sgtree.hpp +++ b/src/dbzero/core/collections/sgtree/v_sgtree.hpp @@ -12,6 +12,8 @@ #include #include #include +#include +#include namespace db0 @@ -131,6 +133,8 @@ DB0_PACKED_END v_sgtree() = default; + struct tag_runtime_slot {}; + /// The SG tree instance object is the 'head' node (created with either default or user arguments) template v_sgtree(db0::Memspace &memspace, comp_t cmp = comp_t(), Args&&... args) : super(memspace, std::forward(args)...) @@ -141,6 +145,17 @@ DB0_PACKED_END this->modify().ptr_set.right = this->getAddress(); } + template v_sgtree(tag_runtime_slot, db0::Memspace &memspace, Allocator::SlotId slot_num, + comp_t cmp = comp_t(), Args&&... args) + : super(memspace, tag_dynamic_slot(), slot_num, std::forward(args)...) 
+ , _comp(cmp) + , m_slot_num(slot_num) + { + // link to self + this->modify().ptr_set.left = this->getAddress(); + this->modify().ptr_set.right = this->getAddress(); + } + v_sgtree(const ptr_t &ptr, comp_t cmp = comp_t()) : super(ptr) , _comp(cmp) @@ -152,6 +167,13 @@ DB0_PACKED_END , _comp(cmp) { } + + v_sgtree(db0::mptr _ptr, comp_t cmp, Allocator::SlotId slot_num) + : super(_ptr) + , _comp(cmp) + , m_slot_num(slot_num) + { + } v_sgtree(db0::Memspace &memspace, const v_sgtree &other) : v_sgtree(memspace) @@ -277,7 +299,7 @@ DB0_PACKED_END SG_Tree::link_equal_upper_bound( this->head(), key, this->_comp, ld, depth ); - node_t new_node(this->getMemspace(), key, std::forward(args)...); + auto new_node = makeNewNode(key, std::forward(args)...); SG_Tree::link(this->head(), new_node, ld); SG_Tree::rebalance_after_insertion(new_node, depth, this->modify().size++, _alpha); this->updateMaxTreeSize(); @@ -295,7 +317,7 @@ DB0_PACKED_END SG_Tree::link_equal ( this->head(), hint, key, this->_comp, ld, depth ); - node_t new_node(this->getMemspace(), key, std::forward(args)...); + auto new_node = makeNewNode(key, std::forward(args)...); SG_Tree::link(this->head(), new_node, ld); SG_Tree::rebalance_after_insertion(new_node, depth, ++this->modify().size, _alpha); this->updateMaxTreeSize(); @@ -329,7 +351,7 @@ DB0_PACKED_END return result; } // allocate / initialize new SG-Tree node - node_t new_node(this->getMemspace(), key, std::forward(args)...); + auto new_node = makeNewNode(key, std::forward(args)...); SG_Tree::insert_unique_commit( this->head(), new_node, commit_data, this->modify().size++, _alpha ); @@ -460,6 +482,22 @@ DB0_PACKED_END alpha_t _alpha; // node comparer comp_t _comp; + Allocator::SlotId m_slot_num = 0; + + template + node_t makeNewNode(const KeyInitializer &key, Args&&... 
args) + { + if constexpr ( + std::is_same< + typename std::decay::type>::type, + MappedAddress + >::value) + { + return node_t(this->getMemspace(), key, std::forward(args)...); + } else { + return node_t(this->getMemspace(), tag_dynamic_slot(), m_slot_num, key, std::forward(args)...); + } + } #ifdef __linux__ #pragma GCC diagnostic push diff --git a/src/dbzero/core/dram/DRAM_Allocator.cpp b/src/dbzero/core/dram/DRAM_Allocator.cpp index a02a8613..57995153 100644 --- a/src/dbzero/core/dram/DRAM_Allocator.cpp +++ b/src/dbzero/core/dram/DRAM_Allocator.cpp @@ -20,6 +20,46 @@ namespace db0 update(allocs); } + DRAM_Allocator::Updater::Updater(DRAM_Allocator &allocator) + : m_allocator(&allocator) + , m_page_size(allocator.m_page_size) + { + } + + DRAM_Allocator::Updater::~Updater() + { + // finalize updates + if (m_allocator) { + m_allocator->m_next_page_id = m_max_page_id; + } + } + + void DRAM_Allocator::Updater::operator()(std::size_t addr) + { + assert(m_allocator); + if (addr % m_page_size != 0) { + THROWF(db0::InternalException) << "DRAM_Allocator: invalid alloc address (" << addr << ")" << THROWF_END; + } + + auto page_id = addr / m_page_size; + for (;m_max_page_id <= page_id; ++m_max_page_id) { + if (m_max_page_id != page_id) { + m_allocator->m_free_pages.insert(m_max_page_id); + } + } + m_allocator->m_free_pages.erase(page_id); + } + + bool DRAM_Allocator::Updater::operator!() const + { + return m_allocator == nullptr; + } + + DRAM_Allocator::Updater DRAM_Allocator::beginUpdate() + { + return Updater { *this }; + } + void DRAM_Allocator::update(const std::unordered_set &allocs) { if (allocs.empty()) { @@ -38,7 +78,7 @@ namespace db0 } auto page_id = addr / m_page_size; for (;max_page_id <= page_id; ++max_page_id) { - if ((max_page_id != page_id) && allocs.find(max_page_id * m_page_size) == allocs.end()) { + if (max_page_id != page_id && allocs.find(max_page_id * m_page_size) == allocs.end()) { m_free_pages.insert(max_page_id); } } @@ -46,7 +86,13 @@ namespace db0 
m_next_page_id = max_page_id; } - std::optional
DRAM_Allocator::tryAlloc(std::size_t size, std::uint32_t slot_num, + void DRAM_Allocator::reset() + { + m_next_page_id = FIRST_PAGE_ID; + m_free_pages.clear(); + } + + std::optional
DRAM_Allocator::tryAlloc(std::size_t size, SlotId slot_num, bool aligned, unsigned char realm_id, unsigned char) { assert(slot_num == 0); @@ -126,8 +172,8 @@ namespace db0 } return AllocationInfo { Address::fromOffset(pageId * m_page_size), m_page_size }; } - - Address DRAM_Allocator::firstAlloc() const { + + Address DRAM_Allocator::firstAlloc(SlotId) const { return Address::fromOffset(FIRST_PAGE_ID * m_page_size); } @@ -139,4 +185,9 @@ namespace db0 { } + bool DRAM_Allocator::empty() const + { + return m_next_page_id == FIRST_PAGE_ID && m_free_pages.empty(); + } + } diff --git a/src/dbzero/core/dram/DRAM_Allocator.hpp b/src/dbzero/core/dram/DRAM_Allocator.hpp index 8dbfa110..5a1745c4 100644 --- a/src/dbzero/core/dram/DRAM_Allocator.hpp +++ b/src/dbzero/core/dram/DRAM_Allocator.hpp @@ -23,12 +23,36 @@ namespace db0 */ DRAM_Allocator(const std::unordered_set &allocs, std::size_t page_size); + struct Updater + { + DRAM_Allocator *m_allocator = nullptr; + std::uint64_t m_max_page_id = FIRST_PAGE_ID; + const std::size_t m_page_size = 0; + + // no-op updater + Updater() = default; + Updater(DRAM_Allocator &); + // must be called after all updates to finalize the state + ~Updater(); + + // must be populated in address-ascending order + void operator()(std::size_t addr); + bool operator!() const; + }; + + // Allows populating the initial state, only allowed when the allocator is empty + // expecting a complete list of allocated addresses (e.g. from the underlying storage) + // and to be provided in ascending order + Updater beginUpdate(); + /** * Update with externally provided list of allocations (add new allocations) */ void update(const std::unordered_set &allocs); - std::optional
tryAlloc(std::size_t size, std::uint32_t slot_num = 0, + void reset(); + + std::optional
tryAlloc(std::size_t size, SlotId slot_num = 0, bool aligned = false, unsigned char realm_id = 0, unsigned char locality = 0) override; void free(Address) override; @@ -42,11 +66,13 @@ namespace db0 void commit() const override; void detach() const override; - + /** * Get address of the 1st allocation */ - Address firstAlloc() const; + virtual Address firstAlloc(SlotId = 0) const; + + bool empty() const; private: static constexpr std::size_t FIRST_PAGE_ID = 1; diff --git a/src/dbzero/core/dram/DRAM_Prefix.cpp b/src/dbzero/core/dram/DRAM_Prefix.cpp index eea9ec99..2c4409f3 100644 --- a/src/dbzero/core/dram/DRAM_Prefix.cpp +++ b/src/dbzero/core/dram/DRAM_Prefix.cpp @@ -4,8 +4,8 @@ #include #include #include -#include #include +#include namespace db0 @@ -72,9 +72,18 @@ namespace db0 #endif MemLock DRAM_Prefix::mapRange(std::uint64_t address, std::size_t size, FlagSet access_mode) + { + return mapRangeImpl(address, size, access_mode); + } + + MemLock DRAM_Prefix::mapRangeImpl(std::uint64_t address, std::size_t size, FlagSet access_mode, + bool *became_dirty) { auto page_num = address / m_page_size; auto offset = address % m_page_size; + if (became_dirty) { + *became_dirty = false; + } if (size + offset > m_page_size) { THROWF(db0::InternalException) << "DRAM_Prefix: invalid range requested (@" << address << ", size = " << size << ")" << THROWF_END; @@ -82,8 +91,12 @@ namespace db0 auto it = m_pages.find(page_num); if (it == m_pages.end()) { it = m_pages.emplace(page_num, MemoryPage(m_context, address - offset, m_page_size)).first; + m_max_page_num = std::max(m_max_page_num, page_num); } else if (access_mode[AccessOptions::write]) { - it->second.m_lock->setDirty(); + auto did_set_dirty = it->second.m_lock->setDirty(); + if (became_dirty) { + *became_dirty = did_set_dirty; + } } return { (std::byte*)it->second.m_buffer + offset, it->second.m_lock }; } @@ -103,12 +116,50 @@ namespace db0 void DRAM_Prefix::flushDirty(SinkFunction sink) const { 
m_dirty_cache.flushDirty(sink); } + + void DRAM_Prefix::forEachDirtyPage(DirtyPageFunction f) const + { + m_dirty_cache.forAll([&](const ResourceLock &lock) { + if (lock.isDirty()) { + f(lock.getAddress() / m_page_size, lock.getBuffer()); + } + }); + } + + bool DRAM_Prefix::isDirty() const { + return !m_dirty_cache.empty(); + } + + bool DRAM_Prefix::hasPage(std::uint64_t page_num) const + { + return m_pages.find(page_num) != m_pages.end(); + } + + bool DRAM_Prefix::evictPageRange(std::uint64_t first_page_num, std::uint64_t end_page_num) + { + // this is to reduce scan to existing pages + end_page_num = std::min(end_page_num, m_max_page_num + 1); + for (auto page_num = first_page_num; page_num < end_page_num; ++page_num) { + auto it = m_pages.find(page_num); + if (it == m_pages.end()) { + continue; + } + auto &lock = it->second.m_lock; + if (!lock || lock->isDirty() || lock.use_count() != 1) { + THROWF(db0::InternalException) << "DRAM_Prefix: unable to evict page " << page_num + << " (dirty = " << (lock ? lock->isDirty() : false) << ", ref_count = " << (lock ? 
lock.use_count() : 0) << ")" << THROWF_END; + } + m_pages.erase(it); + } + return true; + } - void *DRAM_Prefix::update(std::size_t page_num, bool mark_dirty) + void *DRAM_Prefix::update(std::uint64_t page_num, bool mark_dirty) { auto it = m_pages.find(page_num); if (it == m_pages.end()) { it = m_pages.emplace(page_num, MemoryPage(m_context, page_num * m_page_size, m_page_size)).first; + m_max_page_num = std::max(m_max_page_num, page_num); } if (mark_dirty) { it->second.m_lock->setDirty(); @@ -150,7 +201,8 @@ namespace db0 } else { ++it; } - } + } + m_max_page_num = other.m_max_page_num; } std::uint64_t DRAM_Prefix::getLastUpdated() const { @@ -192,14 +244,17 @@ namespace db0 std::size_t DRAM_Prefix::getDirtySize() const { - assert(false); - throw std::runtime_error("DRAM_Prefix::getDirtySize operation not supported"); + std::size_t result = 0; + forEachDirtyPage([&](std::uint64_t, const void *) { + result += getPageSize(); + }); + return result; } - + std::size_t DRAM_Prefix::flushDirty(std::size_t) { assert(false); throw std::runtime_error("DRAM_Prefix::flushDirty operation not supported"); } -} \ No newline at end of file +} diff --git a/src/dbzero/core/dram/DRAM_Prefix.hpp b/src/dbzero/core/dram/DRAM_Prefix.hpp index 0d816e61..32491476 100644 --- a/src/dbzero/core/dram/DRAM_Prefix.hpp +++ b/src/dbzero/core/dram/DRAM_Prefix.hpp @@ -26,6 +26,7 @@ namespace db0 public: // A function to consume a single resource (for serialization) using SinkFunction = DirtyCache::SinkFunction; + using DirtyPageFunction = std::function; // NOTE: page size for DRAM_Prefix may not be the power of 2 DRAM_Prefix(std::size_t page_size); @@ -78,6 +79,20 @@ namespace db0 // Total number of bytes occupied by all pages std::size_t size() const; + protected: + MemLock mapRangeImpl(std::uint64_t address, std::size_t size, FlagSet, + bool *became_dirty = nullptr); + + void forEachDirtyPage(DirtyPageFunction) const; + // Check if there are any dirty pages + bool isDirty() const; + + bool 
hasPage(std::uint64_t page_num) const; + + // Evict clean page range without users (the method must be called after detaching user objects) + // and flushing dirty pages (in case of read/write instance) + bool evictPageRange(std::uint64_t first_page_num, std::uint64_t end_page_num); + private: const std::size_t m_page_size; mutable Storage0 m_dev_null; @@ -105,14 +120,16 @@ namespace db0 void resetDirtyFlag(); }; - mutable std::unordered_map m_pages; + mutable std::unordered_map m_pages; + // high-water mark of page numbers allocated so far, used for eviction heuristics + mutable std::uint64_t m_max_page_num = 0; public: #ifndef NDEBUG // get total memory usage across all instances of DRAM_Prefix static std::pair getTotalMemoryUsage(); - const std::unordered_map &getPages() const { + const std::unordered_map &getPages() const { return m_pages; } @@ -121,4 +138,4 @@ namespace db0 #endif }; -} \ No newline at end of file +} diff --git a/src/dbzero/core/dram/MS_Address.hpp b/src/dbzero/core/dram/MS_Address.hpp new file mode 100644 index 00000000..6889b60a --- /dev/null +++ b/src/dbzero/core/dram/MS_Address.hpp @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +// Copyright (c) 2025 DBZero Software sp. z o.o. 
+ +#pragma once + +#include +#include +#include + +namespace db0 + +{ + + class MS_Address + { + public: + static constexpr std::uint64_t LOCAL_ADDRESS_BITS = 24; + static constexpr std::uint64_t SLOT_ID_BITS = 40; + static constexpr std::uint64_t LOCAL_ADDRESS_MASK = (1ull << LOCAL_ADDRESS_BITS) - 1; + // the last valid slot ID is SLOT_ID_COUNT - 1, the slot ID of all 1s is reserved for invalid address + static constexpr std::uint64_t SLOT_ID_COUNT = 1ull << SLOT_ID_BITS; + + static MS_Address &from(std::uint64_t &address); + + static const MS_Address &from(const std::uint64_t &address); + + // Encode as external address + static std::uint64_t encode(Allocator::SlotId slot_id, std::uint64_t local_address); + + Allocator::SlotId slot_id() const; + + std::uint64_t local_address() const; + private: + std::uint64_t m_address; + }; + + inline MS_Address &MS_Address::from(std::uint64_t &address) + { + return reinterpret_cast(address); + } + + inline const MS_Address &MS_Address::from(const std::uint64_t &address) + { + return reinterpret_cast(address); + } + + inline std::uint64_t MS_Address::encode(Allocator::SlotId slot_id, std::uint64_t local_address) + { + assert(slot_id < SLOT_ID_COUNT); + assert((local_address & LOCAL_ADDRESS_MASK) == local_address); + return (static_cast(slot_id) << LOCAL_ADDRESS_BITS) | local_address; + } + + inline Allocator::SlotId MS_Address::slot_id() const + { + return m_address >> LOCAL_ADDRESS_BITS; + } + + inline std::uint64_t MS_Address::local_address() const + { + return m_address & LOCAL_ADDRESS_MASK; + } + +} diff --git a/src/dbzero/core/dram/MS_MetaAllocator.cpp b/src/dbzero/core/dram/MS_MetaAllocator.cpp new file mode 100644 index 00000000..b68c4142 --- /dev/null +++ b/src/dbzero/core/dram/MS_MetaAllocator.cpp @@ -0,0 +1,239 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +// Copyright (c) 2025 DBZero Software sp. z o.o. 
+ +#include "MS_MetaAllocator.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace db0 + +{ + + static_assert(sizeof(MS_Address) == sizeof(std::uint64_t)); + static_assert(alignof(MS_Address) == alignof(std::uint64_t)); + static_assert(std::is_standard_layout_v); + + inline Address ms_external_address(Allocator::SlotId slot_id, Address local_addr) + { + // external address = slot ID + local address + return Address::fromOffset(MS_Address::encode(slot_id, local_addr)); + } + + MS_MetaAllocator::MS_MetaAllocator(SparsePair &parent_index, std::size_t page_size) + : DRAM_Allocator(page_size) + , m_parent_index(parent_index) + , m_page_size(page_size) + , m_ps_shift(db0::getPageShift(page_size)) + { + initializeAllocators(); + } + + void MS_MetaAllocator::initializeAllocators() + { + std::optional current_slot_id; + // Current slot-local assigned addresses + std::unordered_set local_allocs; + + auto create_slot_allocator = [&]() { + if (!current_slot_id) { + return; + } + auto allocator = std::make_shared(local_allocs, m_page_size); + m_allocators.emplace(*current_slot_id, std::move(allocator)); + local_allocs.clear(); + }; + + // NOTE: sorted iteration exposes slot-ordered page number + std::uint64_t last_addr = 0; + for (auto it = m_parent_index.getSparseIndex().cbegin(); !it.is_end(); ++it) { + auto item = *it; + if (!item || item.m_page_num == 0) { + continue; + } + + // page-shift to obtain actual address + auto ext_addr = item.m_page_num << m_ps_shift; + auto &address = MS_Address::from(ext_addr); + auto local_addr = address.local_address(); + if (local_addr == 0) { + continue; + } + + auto slot_id = address.slot_id(); + if (current_slot_id && slot_id != *current_slot_id) { + // next slot ID encountered + create_slot_allocator(); + last_addr = 0; + } + current_slot_id = slot_id; + // NOTE: the same address will be repeated with multiple different state numbers + if (local_addr != last_addr) 
{ + local_allocs.insert(local_addr); + last_addr = local_addr; + } + } + create_slot_allocator(); + } + + DRAM_Allocator &MS_MetaAllocator::ensureAllocator(Allocator::SlotId slot_id, bool *is_newly_created) + { + auto it = m_allocators.find(slot_id); + if (it != m_allocators.end()) { + if (is_newly_created) { + *is_newly_created = false; + } + return *it->second; + } + + auto allocator = std::make_shared(m_page_size); + // initialize allocator with the updater + { + auto updater = allocator->beginUpdate(); + + auto first_addr = MS_Address::encode(slot_id, 0); + auto end_addr = slot_id + 1 == MS_Address::SLOT_ID_COUNT + ? std::numeric_limits::max() + : MS_Address::encode(slot_id + 1, 0); + + // scan SparseIndex as the source of truth + m_parent_index.getSparseIndex().forUniquePageRange(first_addr >> m_ps_shift, end_addr >> m_ps_shift, [&](const SI_Item &item) { + auto ext_addr = item.m_page_num << m_ps_shift; + updater(MS_Address::from(ext_addr).local_address()); + }); + } + + if (is_newly_created) { + *is_newly_created = true; + } + + auto [new_it, inserted] = m_allocators.emplace(slot_id, std::move(allocator)); + (void)inserted; + return *new_it->second; + } + + const DRAM_Allocator *MS_MetaAllocator::tryFindAllocator(Allocator::SlotId slot_id) const + { + auto it = m_allocators.find(slot_id); + if (it == m_allocators.end()) { + return nullptr; + } + return it->second.get(); + } + + std::optional
MS_MetaAllocator::tryAlloc(std::size_t size, Allocator::SlotId slot_num, + bool aligned, unsigned char realm_id, unsigned char locality) + { + auto local_addr = ensureAllocator(slot_num).tryAlloc(size, 0, aligned, realm_id, locality); + if (!local_addr) { + return std::nullopt; + } + return ms_external_address(slot_num, *local_addr); + } + + void MS_MetaAllocator::free(Address address) + { + auto offset = address.getOffset(); + auto &ms_addr = MS_Address::from(offset); + ensureAllocator(ms_addr.slot_id()).free(Address::fromOffset(ms_addr.local_address())); + } + + std::size_t MS_MetaAllocator::getAllocSize(Address address) const + { + auto offset = address.getOffset(); + auto &ms_addr = MS_Address::from(offset); + auto allocator = tryFindAllocator(ms_addr.slot_id()); + if (!allocator) { + THROWF(db0::BadAddressException) << "Invalid MS_MetaSpace slot address: " << address; + } + return allocator->getAllocSize(Address::fromOffset(ms_addr.local_address())); + } + + bool MS_MetaAllocator::isAllocated(Address address, std::size_t *size_of_result) const + { + auto offset = address.getOffset(); + auto &ms_addr = MS_Address::from(offset); + auto allocator = tryFindAllocator(ms_addr.slot_id()); + if (!allocator) { + return false; + } + return allocator->isAllocated(Address::fromOffset(ms_addr.local_address()), size_of_result); + } + + Allocator::AllocationInfo MS_MetaAllocator::findAllocation(Address address) const + { + auto offset = address.getOffset(); + auto &ms_addr = MS_Address::from(offset); + auto allocator = tryFindAllocator(ms_addr.slot_id()); + if (!allocator) { + THROWF(db0::BadAddressException) << "Invalid MS_MetaSpace slot address: " << address; + } + auto local_info = allocator->findAllocation(Address::fromOffset(ms_addr.local_address())); + return { + ms_external_address(ms_addr.slot_id(), local_info.address), + local_info.size + }; + } + + std::optional
MS_MetaAllocator::tryFirstAlloc(Allocator::SlotId slot_id) + { + auto allocator = tryFindAllocator(slot_id); + if (!allocator || allocator->empty()) { + return std::nullopt; + } + return ms_external_address(slot_id, allocator->firstAlloc()); + } + + Address MS_MetaAllocator::firstAlloc(SlotId slot_id) const + { + auto allocator = tryFindAllocator(slot_id); + if (!allocator) { + THROWF(db0::BadAddressException) << "Invalid MS_MetaSpace slot ID: " << slot_id; + } + return ms_external_address(slot_id, allocator->firstAlloc()); + } + + void MS_MetaAllocator::evictSlot(Allocator::SlotId slot_id) + { + m_allocators.erase(slot_id); + } + + DRAM_Allocator::Updater MS_MetaAllocator::tryBeginUpdate(Allocator::SlotId slot_id) + { + bool is_newly_created = false; + auto &allocator = ensureAllocator(slot_id, &is_newly_created); + if (is_newly_created) { + // no-op updater since the allocator is newly created and has no state to refresh + return {}; + } + return allocator.beginUpdate(); + } + + DRAM_Allocator::Updater MS_MetaAllocator::beginUpdate(Allocator::SlotId slot_id) + { + auto it = m_allocators.find(slot_id); + if (it == m_allocators.end()) { + auto [new_it, inserted] = m_allocators.emplace(slot_id, std::make_shared(m_page_size)); + (void)inserted; + it = new_it; + } + return it->second->beginUpdate(); + } + + void MS_MetaAllocator::commit() const + { + } + + void MS_MetaAllocator::detach() const + { + } + +} diff --git a/src/dbzero/core/dram/MS_MetaAllocator.hpp b/src/dbzero/core/dram/MS_MetaAllocator.hpp new file mode 100644 index 00000000..754d2736 --- /dev/null +++ b/src/dbzero/core/dram/MS_MetaAllocator.hpp @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +// Copyright (c) 2025 DBZero Software sp. z o.o. 
+ +#pragma once + +#include "MS_Address.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace db0 + +{ + + struct MS_MetaSpace; + + // MS_MetaAllocator organizes allocations into independently managed slots + // Slot ID is encoded in the high bits of the returned address (with 40 / 24 bit split) + // this leaves ~16M slot capacity which is sufficient for meta-data (e.g. single SLAB metadata) + // but needs to be monitored to avoid unexpected exhaustion. + class MS_MetaAllocator: public DRAM_Allocator + { + public: + using SlotId = Allocator::SlotId; + MS_MetaAllocator(SparsePair &parent_index, std::size_t page_size); + + std::optional
tryAlloc(std::size_t size, Allocator::SlotId slot_num = 0, + bool aligned = false, unsigned char realm_id = 0, unsigned char locality = 0) override; + + void free(Address address) override; + + std::size_t getAllocSize(Address address) const override; + + bool isAllocated(Address address, std::size_t *size_of_result = nullptr) const override; + + AllocationInfo findAllocation(Address address) const override; + + void commit() const override; + + void detach() const override; + + std::optional
tryFirstAlloc(SlotId); + Address firstAlloc(SlotId) const override; + + void evictSlot(SlotId); + + // For scoped refresh / updates of the allocator state + // NOTE: the no-op updater will be returned if the slot was restored and fully initialized + DRAM_Allocator::Updater tryBeginUpdate(SlotId); + + // This version will expose a non-initialized allocator's updater if not found + DRAM_Allocator::Updater beginUpdate(SlotId); + + private: + SparsePair &m_parent_index; + const std::size_t m_page_size; + const std::uint32_t m_ps_shift; + std::unordered_map > m_allocators; + + void initializeAllocators(); + + DRAM_Allocator &ensureAllocator(SlotId, bool *is_newly_created = nullptr); + + const DRAM_Allocator *tryFindAllocator(SlotId) const; + }; + +} diff --git a/src/dbzero/core/dram/MS_MetaPrefix.cpp b/src/dbzero/core/dram/MS_MetaPrefix.cpp new file mode 100644 index 00000000..6ced24f6 --- /dev/null +++ b/src/dbzero/core/dram/MS_MetaPrefix.cpp @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +// Copyright (c) 2025 DBZero Software sp. z o.o. 
+ +#include "MS_MetaPrefix.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace db0 + +{ + + static_assert(sizeof(MS_Address) == sizeof(std::uint64_t)); + static_assert(alignof(MS_Address) == alignof(std::uint64_t)); + static_assert(std::is_standard_layout_v); + + MS_MetaPrefix::MS_MetaPrefix( + std::size_t page_size, SparsePair &parent_index, RandomIO_Stream &page_io) + : MetaPrefix(page_size, parent_index) + , m_ps_shift(db0::getPageShift(page_size)) + , m_page_io(page_io) + { + } + + std::pair MS_MetaPrefix::getPageRange(Allocator::SlotId slot_id) const + { + assert(slot_id < MS_Address::SLOT_ID_COUNT); + auto first_addr = MS_Address::encode(slot_id, 0); + auto end_addr = MS_Address::encode(slot_id + 1, 0); + return { first_addr >> m_ps_shift, end_addr >> m_ps_shift }; + } + + bool MS_MetaPrefix::evictSlot(Allocator::SlotId slot_id) + { + if (m_slot_ids.erase(slot_id) == 0) { + return false; + } + auto [first_page_num, end_page_num] = getPageRange(slot_id); + // NOTE: this is sufficiently fast because DRAM_Prefix prunes the range internally + evictPageRange(first_page_num, end_page_num); + return true; + } + + bool MS_MetaPrefix::tryLoadSlot(SlotId slot_id, MS_MetaAllocator &allocator) + { + auto [first_page_num, end_page_num] = getPageRange(slot_id); + // Collect slot-specific storage (logical) page numbers first + std::vector slot_page_nums; + m_parent_index.forUniquePageRange(first_page_num, end_page_num, [&](std::uint64_t page_num) { + slot_page_nums.push_back(page_num); + }); + // the slot does not exist + if (slot_page_nums.empty()) { + return false; + } + auto updater = allocator.beginUpdate(slot_id); + db0::load(*this, slot_page_nums.data(), slot_page_nums.data() + slot_page_nums.size(), std::move(updater)); + m_slot_ids.insert(slot_id); + return true; + } + + void load(MS_MetaPrefix &prefix, const std::uint64_t *page_num, const std::uint64_t *end, + DRAM_Allocator::Updater
&&updater) + { + db0::load(prefix, prefix.m_page_io, page_num, end); + if (!updater) { + return; + } + for (; page_num != end; ++page_num) { + updater(MS_Address::from(*page_num << prefix.m_ps_shift).local_address()); + } + } + + void load(MS_MetaPrefix &prefix, MS_MetaAllocator &allocator) + { + std::vector page_nums; + Allocator::SlotId current_slot_id = 0; + + auto load_current_slot = [&]() { + if (!page_nums.empty()) { + auto updater = allocator.beginUpdate(current_slot_id); + db0::load(prefix, prefix.m_page_io, page_nums.data(), page_nums.data() + page_nums.size(), std::move(updater)); + prefix.m_slot_ids.insert(current_slot_id); + } + }; + + // Iterate all known pages and load on a per-slot basis + prefix.m_parent_index.forUniquePageRange([&](std::uint64_t page_num) { + auto slot_id = MS_Address::from(page_num << prefix.m_ps_shift).slot_id(); + if (slot_id != current_slot_id) { + load_current_slot(); + page_nums.clear(); + current_slot_id = slot_id; + } + page_nums.push_back(page_num); + }); + + load_current_slot(); + } + +} diff --git a/src/dbzero/core/dram/MS_MetaPrefix.hpp b/src/dbzero/core/dram/MS_MetaPrefix.hpp new file mode 100644 index 00000000..c3e35da2 --- /dev/null +++ b/src/dbzero/core/dram/MS_MetaPrefix.hpp @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +// Copyright (c) 2025 DBZero Software sp. z o.o. + +#pragma once + +#include "MS_Address.hpp" +#include "MS_MetaAllocator.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace db0 + +{ + + struct MS_MetaSpace; + + // NOTE: access to MS_MetaPrefix requires managing slots via (loadSlot / evictSlot) + // Use SparsePairManager to safely manage slots with a chosen policy + class MS_MetaPrefix: public MetaPrefix + { + public: + using SlotId = Allocator::SlotId; + + /** + * Creates a metadata prefix over the shared sparse mapping. 
+ * page_io reference is required for lazy / mixed slot loading policy + */ + MS_MetaPrefix(std::size_t page_size, SparsePair &parent_index, RandomIO_Stream &); + + // Evict dirty and unused slot (must be flushed and detached) + bool evictSlot(SlotId); + + // Get slot associated desc-io logical begin / end page pair + std::pair getPageRange(SlotId) const; + + // Load or refresh an entire slot and initialize or update the associated allocator's state + // @return true if the slot was loaded, false if the slot has no data yet + bool tryLoadSlot(SlotId, MS_MetaAllocator &); + + private: + friend struct MS_MetaSpace; + + const std::uint32_t m_ps_shift; + RandomIO_Stream &m_page_io; + // the loaded slot IDs + std::unordered_set m_slot_ids; + + friend void load(MS_MetaPrefix &, const std::uint64_t *, const std::uint64_t *, + DRAM_Allocator::Updater &&); + + friend void load(MS_MetaPrefix &, MS_MetaAllocator &); + }; + + // Load the entire prefix and initialize the associated allocator's state + void load(MS_MetaPrefix &, MS_MetaAllocator &); + + // Load or refresh pages from a single specific slot only + void load(MS_MetaPrefix &, const std::uint64_t *page_num, const std::uint64_t *end, + DRAM_Allocator::Updater &&updater = {}); + +} diff --git a/src/dbzero/core/dram/MetaPrefix.cpp b/src/dbzero/core/dram/MetaPrefix.cpp new file mode 100644 index 00000000..850f8844 --- /dev/null +++ b/src/dbzero/core/dram/MetaPrefix.cpp @@ -0,0 +1,397 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +// Copyright (c) 2025 DBZero Software sp. z o.o.
+ +#include "MetaPrefix.hpp" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace db0 + +{ + + namespace + { + std::vector collectReusableFullPageNums(const SparsePair &sparse_pair, StateNumType state_num) + { + std::vector reusable_full_pages; + + bool have_page = false; + std::uint64_t current_page_num = 0; + std::size_t retained_count = 0; + SI_Item oldest_retained; + SI_Item newest_retained; + + auto retain = [&](const SI_Item &item) { + if (!have_page || item.m_page_num != current_page_num) { + have_page = true; + current_page_num = item.m_page_num; + retained_count = 0; + } + + if (retained_count == 0) { + oldest_retained = item; + retained_count = 1; + return; + } + if (retained_count == 1) { + newest_retained = item; + retained_count = 2; + return; + } + + reusable_full_pages.push_back(oldest_retained.m_storage_page_num); + oldest_retained = newest_retained; + newest_retained = item; + }; + + for (auto it = sparse_pair.getSparseIndex().cbegin(); !it.is_end(); ++it) { + auto item = *it; + if (!!item && item.m_page_num != 0 && item.m_storage_page_num != 0 && item.m_state_num <= state_num) { + retain(item); + } + } + return reusable_full_pages; + } + } + + MetaPrefix::MetaPrefix(std::size_t page_size, SparsePair &parent_index) + : DRAM_Prefix(page_size) + , m_parent_index(parent_index) + { + } + + void load(MetaPrefix &prefix, RandomIO_Stream &page_io) + { + // Collect unique page numbers first (there might be more than one state number available per page) + std::uint64_t last_page_num = 0; + std::vector page_nums; + for (auto it = prefix.m_parent_index.getSparseIndex().cbegin(); !it.is_end(); ++it) { + auto item = *it; + if (!!item && item.m_page_num != 0 && item.m_page_num != last_page_num) { + page_nums.push_back(item.m_page_num); + last_page_num = item.m_page_num; + } + } + db0::load(prefix, page_io, page_nums); + } + + struct Load_OP + { + std::uint64_t m_storage_page_num; + // target buffer + void *m_buffer; + }; + +
struct LoadDiff_OP + { + std::uint64_t m_storage_page_num; + std::uint64_t m_page_num; + StateNumType m_diff_state_num; + // target buffer + void *m_buffer; + }; + + // fetch a single page from storage + bool fetchPage(MetaPrefix &prefix, RandomIO_Stream &page_io, std::uint64_t page_num, StateNumType state_num, + void *buffer) + { + SparseIndexQuery query(prefix.m_parent_index.getSparseIndex(), prefix.m_parent_index.getDiffIndex(), + page_num, state_num); + if (query.empty()) { + return false; + } + + auto storage_page_num = query.first(); + if (storage_page_num) { + page_io.readRandom(storage_page_num, buffer); + } else { + std::memset(buffer, 0, prefix.getPageSize()); + } + + StateNumType diff_state_num = 0; + while (query.next(diff_state_num, storage_page_num)) { + page_io.applyFrom(storage_page_num, buffer, { page_num, diff_state_num }); + } + return true; + } + + void load(MetaPrefix &prefix, RandomIO_Stream &page_io, const std::vector &page_nums, + DRAM_Allocator::Updater &&updater) + { + load(prefix, page_io, page_nums.data(), page_nums.data() + page_nums.size(), std::move(updater)); + } + + void load(MetaPrefix &prefix, RandomIO_Stream &page_io, const std::uint64_t *page_num, + const std::uint64_t *end, DRAM_Allocator::Updater &&updater) + { + auto state_num = prefix.getStateNum(false); + // For I/O performance we first determine the operations and then execute ordered for better locality + std::vector load_ops; + std::vector load_diff_ops; + + auto &sparse_index = prefix.m_parent_index.getSparseIndex(); + auto &diff_index = prefix.m_parent_index.getDiffIndex(); + for (;page_num != end; ++page_num) { + SparseIndexQuery query(sparse_index, diff_index, *page_num, state_num); + if (query.empty()) { + continue; + } + + auto page_buf = prefix.update(*page_num, false); + if (!!updater) { + updater(*page_num * prefix.getPageSize()); + } + auto storage_page_num = query.first(); + if (storage_page_num) { + load_ops.push_back(Load_OP { storage_page_num, page_buf }); +
} else { + std::memset(page_buf, 0, prefix.getPageSize()); + } + + StateNumType diff_state_num = 0; + while (query.next(diff_state_num, storage_page_num)) { + load_diff_ops.push_back(LoadDiff_OP { storage_page_num, *page_num, diff_state_num, page_buf }); + } + } + + // sort both ops-buffers by storage page number for better locality + std::sort(load_ops.begin(), load_ops.end(), [](const Load_OP &a, const Load_OP &b) { + return a.m_storage_page_num < b.m_storage_page_num; + }); + + // Load full pages first + for (const auto &op: load_ops) { + page_io.readRandom(op.m_storage_page_num, op.m_buffer); + } + + // Apply diffs next + std::sort(load_diff_ops.begin(), load_diff_ops.end(), [](const LoadDiff_OP &a, const LoadDiff_OP &b) { + return a.m_storage_page_num < b.m_storage_page_num; + }); + for (const auto &op: load_diff_ops) { + page_io.applyFrom(op.m_storage_page_num, op.m_buffer, { op.m_page_num, op.m_diff_state_num }); + } + } + + MemLock MetaPrefix::mapRange(std::uint64_t address, std::size_t size, FlagSet access_mode) + { + bool became_dirty = false; + auto lock = mapRangeImpl(address, size, access_mode, &became_dirty); + if (became_dirty) { + auto page_num = address / getPageSize(); + // copy for diff generation on flush + captureCoWPage(page_num, lock); + } + return lock; + } + + void MetaPrefix::captureCoWPage(std::uint64_t page_num, const MemLock &lock) + { + // Avoid SparseIndexQuery here; a loaded DRAM page is enough to decide + // whether keeping an in-memory previous version is useful for diff flush. 
+ if (!hasPage(page_num)) { + return; + } + + auto resource_lock = lock.lock(); + if (!resource_lock) { + THROWF(db0::InternalException) << "MetaPrefix: missing page lock for previous page capture"; + } + auto &cow_page = m_cow_pages[page_num]; + cow_page.resize(getPageSize()); + std::memcpy(cow_page.data(), resource_lock->getBuffer(), cow_page.size()); + } + + std::uint64_t MetaPrefix::commit(ProcessTimer *) + { + // MetaPrefix dirty pages must already be persisted by flush(MetaPrefix &, RandomIO_Stream &). + // Commit is only the post-flush transaction boundary; accepting dirty pages here + // would hide a missed detach/cache-commit preparation step in the owner. + if (isDirty()) { + THROWF(db0::InternalException) + << "MetaPrefix::commit requires flush(MetaPrefix &, RandomIO_Stream &) for dirty pages"; + } + + // The sparse pair belongs to this MetaPrefix and may still have pending + // sparse/diff index write-backs. Commit it before dirty-page detection so + // the flush scans the final metadata image for this transaction. + m_parent_index.commit(); + m_cow_pages.clear(); + return getStateNum(false); + } + + bool flush(MetaPrefix &prefix, RandomIO_Stream &page_io, ProcessTimer *) + { + // The owner must complete metadata detach/cache-commit preparation before + // this scan. Flush only persists an already registered application state; + // it must not advance state or perform hidden write-back preparation. 
+ bool was_dirty = false; + auto state_num = prefix.getStateNum(false); + prefix.flushDirty([&](std::uint64_t page_num, const void *buffer) { + was_dirty |= prefix.flushPage(page_io, page_num, buffer, state_num); + }); + + if (!was_dirty) { + return false; + } + + page_io.flush(); + prefix.commit(); + return true; + } + + bool MetaPrefix::flushPage(RandomIO_Stream &page_io, std::uint64_t page_num, const void *buffer, StateNumType state_num) + { + auto cow_page = m_cow_pages.find(page_num); + if (cow_page != m_cow_pages.end()) { + std::vector diffs; + if (getDiffs(cow_page->second.data(), buffer, getPageSize(), diffs) && !diffs.empty()) { + bool is_first_page = false; + auto [storage_page_num, overflow] = page_io.appendDiff( + buffer, { page_num, state_num }, diffs, &is_first_page + ); + m_parent_index.getDiffIndex().insert(page_num, state_num, storage_page_num, overflow); + return true; + } + if (diffs.empty()) { + return false; + } + } + + auto storage_page_num = page_io.appendRandom(buffer); + if (storage_page_num == 0) { + THROWF(db0::InternalException) << "MetaPrefix: storage page 0 is reserved as an empty full-DP sentinel"; + } + m_parent_index.getSparseIndex().emplace(page_num, state_num, storage_page_num); + return true; + } + + std::uint64_t MetaPrefix::writeFullPage(RandomIO_Stream &page_io, const void *buffer, + std::uint64_t reusable_storage_page_num) + { + if (reusable_storage_page_num != 0) { + page_io.writeRandom(reusable_storage_page_num, buffer); + return reusable_storage_page_num; + } + + auto storage_page_num = page_io.appendRandom(buffer); + if (storage_page_num == 0) { + THROWF(db0::InternalException) << "MetaPrefix: storage page 0 is reserved as an empty full-DP sentinel"; + } + return storage_page_num; + } + + void MetaPrefix::publishCompactedState(StateNumType state_num) + { + m_parent_index.recordMaxStateNum(state_num); + m_parent_index.commit(); + m_cow_pages.clear(); + flushDirty([&](std::uint64_t, const void *) {}); + } + + bool 
compact(MetaPrefix &prefix, RandomIO_Stream &page_io, ProcessTimer *) + { + std::map dirty_pages; + prefix.forEachDirtyPage([&](std::uint64_t page_num, const void *buffer) { + dirty_pages[page_num] = buffer; + }); + + std::vector sparse_page_nums; + sparse_page_nums.reserve(prefix.m_parent_index.getSparseIndex().size()); + std::uint64_t previous_page_num = 0; + for (auto it = prefix.m_parent_index.getSparseIndex().cbegin(); !it.is_end(); ++it) { + auto item = *it; + if (!!item && item.m_page_num != 0 && item.m_page_num != previous_page_num) { + sparse_page_nums.push_back(item.m_page_num); + previous_page_num = item.m_page_num; + } + } + + std::vector page_nums; + page_nums.reserve(sparse_page_nums.size() + dirty_pages.size()); + auto sparse_it = sparse_page_nums.begin(); + auto dirty_it = dirty_pages.begin(); + while (sparse_it != sparse_page_nums.end() || dirty_it != dirty_pages.end()) { + if (dirty_it == dirty_pages.end() + || (sparse_it != sparse_page_nums.end() && *sparse_it < dirty_it->first)) { + page_nums.push_back(*sparse_it); + ++sparse_it; + } else if (sparse_it == sparse_page_nums.end() || dirty_it->first < *sparse_it) { + if (dirty_it->first != 0) { + page_nums.push_back(dirty_it->first); + } + ++dirty_it; + } else { + page_nums.push_back(*sparse_it); + ++sparse_it; + ++dirty_it; + } + } + + if (page_nums.empty()) { + return false; + } + + auto before_state_num = prefix.getStateNum(false); + auto new_state_num = before_state_num + 1; + auto reusable_full_pages = collectReusableFullPageNums(prefix.m_parent_index, before_state_num); + std::size_t next_reusable_page = 0; + std::vector page_buffer(prefix.getPageSize()); + + for (auto page_num: page_nums) { + auto dirty_it = dirty_pages.find(page_num); + if (dirty_it != dirty_pages.end()) { + std::memcpy(page_buffer.data(), dirty_it->second, page_buffer.size()); + } else if (prefix.hasPage(page_num)) { + auto lock = prefix.mapRange(page_num * prefix.getPageSize(), prefix.getPageSize(), { AccessOptions::read 
}); + std::memcpy(page_buffer.data(), static_cast(lock), page_buffer.size()); + } else if (!fetchPage(prefix, page_io, page_num, before_state_num, page_buffer.data())) { + continue; + } + + auto reusable_storage_page_num = next_reusable_page < reusable_full_pages.size() + ? reusable_full_pages[next_reusable_page++] + : 0; + auto storage_page_num = prefix.writeFullPage(page_io, page_buffer.data(), reusable_storage_page_num); + prefix.m_parent_index.getSparseIndex().update(page_num, new_state_num, storage_page_num); + } + + prefix.publishCompactedState(new_state_num); + return true; + } + + StateNumType MetaPrefix::getStateNum() const + { + return m_parent_index.getMaxStateNum(); + } + + StateNumType MetaPrefix::getStateNum(bool) const + { + return m_parent_index.getMaxStateNum(); + } + + std::size_t MetaPrefix::flushDirty(std::size_t) + { + THROWF(db0::InternalException) << "MetaPrefix::flushDirty(std::size_t) is unsupported; use flush(MetaPrefix &, RandomIO_Stream &)"; + return 0; + } + + void MetaPrefix::forAllocatedAddresses(std::function sink) const + { + std::uint64_t last_page_num = 0; + for (auto it = m_parent_index.getSparseIndex().cbegin(); !it.is_end(); ++it) { + auto item = *it; + if (!!item && item.m_page_num != 0 && item.m_page_num != last_page_num) { + sink(item.m_page_num * getPageSize()); + last_page_num = item.m_page_num; + } + } + } + +} diff --git a/src/dbzero/core/dram/MetaPrefix.hpp b/src/dbzero/core/dram/MetaPrefix.hpp new file mode 100644 index 00000000..49f8c98e --- /dev/null +++ b/src/dbzero/core/dram/MetaPrefix.hpp @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +// Copyright (c) 2025 DBZero Software sp. z o.o. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace db0 + +{ + + class RandomIO_Stream; + + class MetaPrefix: public DRAM_Prefix + { + public: + using DRAM_Prefix::flushDirty; + + /// @brief Create a MetaPrefix instance over the shared sparse mapping. 
+ /// @param page_size + /// @param sparse_pair maintains storage locations of the managed metadata pages + MetaPrefix(std::size_t page_size, SparsePair &parent_index); + + MemLock mapRange(std::uint64_t address, std::size_t size, FlagSet = {}) override; + + std::uint64_t commit(ProcessTimer * = nullptr) override; + + StateNumType getStateNum(bool finalized = false) const override; + + std::size_t flushDirty(std::size_t limit) override; + + void forAllocatedAddresses(std::function sink) const; + + // Get current head state number + StateNumType getStateNum() const; + + protected: + SparsePair &m_parent_index; + + private: + std::unordered_map > m_cow_pages; + + bool flushPage(RandomIO_Stream &page_io, std::uint64_t page_num, const void *buffer, StateNumType state_num); + + std::uint64_t writeFullPage(RandomIO_Stream &page_io, const void *buffer, + std::uint64_t reusable_storage_page_num = 0); + + void publishCompactedState(StateNumType state_num); + + void captureCoWPage(std::uint64_t page_num, const MemLock &lock); + + friend void load(MetaPrefix &prefix, RandomIO_Stream &page_io); + friend bool fetchPage(MetaPrefix &prefix, RandomIO_Stream &page_io, std::uint64_t page_num, + StateNumType state_num, void *buffer); + friend void load(MetaPrefix &prefix, RandomIO_Stream &page_io, const std::uint64_t *page_num, + const std::uint64_t *end, DRAM_Allocator::Updater &&updater); + + friend bool flush(MetaPrefix &prefix, RandomIO_Stream &page_io, ProcessTimer *timer); + + friend bool compact(MetaPrefix &prefix, RandomIO_Stream &page_io, ProcessTimer *timer); + }; + + // Load or refresh ALL pages from the current head state + void load(MetaPrefix &, RandomIO_Stream &); + + // Load or refresh specific pages from the current head state + // this operation is optimized for large page batches + // @param page_nums sorted page numbers to load + // @param updater optional updater to initialize or refresh the associated allocator's state + void load(MetaPrefix &, RandomIO_Stream 
&, const std::vector &page_nums, + DRAM_Allocator::Updater &&updater = {}); + void load(MetaPrefix &, RandomIO_Stream &, const std::uint64_t *page_num, const std::uint64_t *end, + DRAM_Allocator::Updater &&updater = {}); + + bool flush(MetaPrefix &prefix, RandomIO_Stream &page_io, ProcessTimer *timer = nullptr); + + /** + * Manually compact MetaSpace page storage. + * + * Stages the current head state of all persisted and dirty metadata pages + * as full DPs at the next state number. Disk writes preserve storage pages + * needed to read the current head state, prefer reusing stale full-DP pages + * from the previous state when safe, and do not flush or clear the diff + * stream. Obsolete diff storage must be reclaimed by a later external step + * after the compacted head is durably published. + * + * @return true if a compacted state was published, false when there are no + * metadata pages to compact. + */ + bool compact(MetaPrefix &prefix, RandomIO_Stream &page_io, ProcessTimer *timer = nullptr); + +} diff --git a/src/dbzero/core/dram/MetaSpace.cpp b/src/dbzero/core/dram/MetaSpace.cpp new file mode 100644 index 00000000..7bcafc2b --- /dev/null +++ b/src/dbzero/core/dram/MetaSpace.cpp @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +// Copyright (c) 2025 DBZero Software sp. z o.o. 
+ +#include "MetaSpace.hpp" +#include "MetaPrefix.hpp" +#include +#include +#include +#include +#include + +namespace db0 + +{ + + Memspace MetaSpace::create(std::size_t page_size, SparsePair &sparse_pair, RandomIO_Stream &page_io) + { + auto prefix = std::make_shared(page_size, sparse_pair); + load(*prefix, page_io); + auto allocator = std::make_shared(page_size); + auto updater = allocator->beginUpdate(); + prefix->forAllocatedAddresses([&](std::uint64_t address) { + updater(address); + }); + return { prefix, allocator }; + } + + MS_MetaSpace::MS_MetaSpace(std::shared_ptr prefix, std::shared_ptr allocator) + : Memspace(std::move(prefix), std::move(allocator)) + { + } + + MS_MetaSpace MS_MetaSpace::create(std::size_t page_size, SparsePair &sparse_pair, RandomIO_Stream &page_io) + { + auto prefix = std::make_shared(page_size, sparse_pair, page_io); + auto allocator = std::make_shared(sparse_pair, page_size); + return { prefix, allocator }; + } + + std::shared_ptr MS_MetaSpace::getMSPrefixPtr() const + { + return std::static_pointer_cast(m_prefix); + } + + std::shared_ptr MS_MetaSpace::getMSAllocatorPtr() const + { + return std::static_pointer_cast(m_allocator); + } + +} diff --git a/src/dbzero/core/dram/MetaSpace.hpp b/src/dbzero/core/dram/MetaSpace.hpp new file mode 100644 index 00000000..ce2be043 --- /dev/null +++ b/src/dbzero/core/dram/MetaSpace.hpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +// Copyright (c) 2025 DBZero Software sp. z o.o. 
+ +#pragma once + +#include +#include +#include +namespace db0 + +{ + + class RandomIO_Stream; + + struct MetaSpace: public DRAMSpace + { + static Memspace create(std::size_t page_size, SparsePair &sparse_pair, RandomIO_Stream &page_io); + }; + + class MS_MetaSpace: public Memspace + { + public: + static MS_MetaSpace create(std::size_t page_size, SparsePair &sparse_pair, RandomIO_Stream &); + + std::shared_ptr getMSPrefixPtr() const; + + std::shared_ptr getMSAllocatorPtr() const; + + private: + MS_MetaSpace(std::shared_ptr prefix, std::shared_ptr allocator); + }; + +} diff --git a/src/dbzero/core/memory/AlgoAllocator.cpp b/src/dbzero/core/memory/AlgoAllocator.cpp index 40be0a00..84692dd1 100644 --- a/src/dbzero/core/memory/AlgoAllocator.cpp +++ b/src/dbzero/core/memory/AlgoAllocator.cpp @@ -16,7 +16,7 @@ namespace db0 { } - std::optional
AlgoAllocator::tryAlloc(std::size_t size, std::uint32_t slot_num, + std::optional
AlgoAllocator::tryAlloc(std::size_t size, SlotId slot_num, bool aligned, unsigned char, unsigned char) { assert(slot_num == 0); diff --git a/src/dbzero/core/memory/AlgoAllocator.hpp b/src/dbzero/core/memory/AlgoAllocator.hpp index 170d2f4a..30b8a475 100644 --- a/src/dbzero/core/memory/AlgoAllocator.hpp +++ b/src/dbzero/core/memory/AlgoAllocator.hpp @@ -20,7 +20,7 @@ namespace db0 AlgoAllocator(AddressPoolF f, ReverseAddressPoolF rf, std::size_t alloc_size); - std::optional
tryAlloc(std::size_t size, std::uint32_t slot_num = 0, + std::optional
tryAlloc(std::size_t size, SlotId slot_num = 0, bool aligned = false, unsigned char realm_id = 0, unsigned char locality = 0) override; void free(Address) override; diff --git a/src/dbzero/core/memory/Allocator.cpp b/src/dbzero/core/memory/Allocator.cpp index 3a2b781f..81fb7328 100644 --- a/src/dbzero/core/memory/Allocator.cpp +++ b/src/dbzero/core/memory/Allocator.cpp @@ -9,13 +9,13 @@ namespace db0 { std::optional Allocator::tryAllocUnique( - std::size_t, std::uint32_t, bool, unsigned char, unsigned char) + std::size_t, SlotId, bool, unsigned char, unsigned char) { THROWF(InternalException) << "Allocator: unique allocation not supported by: " << typeid(*this).name() << THROWF_END; } - Address Allocator::alloc(std::size_t size, std::uint32_t slot_num, bool aligned, + Address Allocator::alloc(std::size_t size, SlotId slot_num, bool aligned, unsigned char realm_id, unsigned char locality) { auto result = tryAlloc(size, slot_num, aligned, realm_id, locality); @@ -25,7 +25,7 @@ namespace db0 return *result; } - UniqueAddress Allocator::allocUnique(std::size_t size, std::uint32_t slot_num, bool aligned, + UniqueAddress Allocator::allocUnique(std::size_t size, SlotId slot_num, bool aligned, unsigned char realm_id, unsigned char locality) { auto result = tryAllocUnique(size, slot_num, aligned, realm_id, locality); @@ -60,7 +60,7 @@ namespace db0 return findAllocation(address); } - std::pair > Allocator::getRange(std::uint32_t slot_num) const + std::pair > Allocator::getRange(SlotId slot_num) const { if (slot_num != 0) { THROWF(InternalException) << "Invalid / unsupported slot number"; diff --git a/src/dbzero/core/memory/Allocator.hpp b/src/dbzero/core/memory/Allocator.hpp index 4246ecc4..18724eff 100644 --- a/src/dbzero/core/memory/Allocator.hpp +++ b/src/dbzero/core/memory/Allocator.hpp @@ -21,6 +21,8 @@ namespace db0 class Allocator { public: + using SlotId = std::uint64_t; + struct AllocationInfo { Address address; @@ -37,13 +39,13 @@ namespace db0 * Note that slot 
functionality is implementation specific and may not be supported by all allocators. * We use slots in special cases where objects needs to be allocated from a limited narrow address range */ - virtual std::optional
tryAlloc(std::size_t size, std::uint32_t slot_num = 0, + virtual std::optional
tryAlloc(std::size_t size, SlotId slot_num = 0, bool aligned = false, unsigned char realm_id = 0, unsigned char locality = 0) = 0; // Try allocating a unique, never repeating address // NOTE: this functionality is only supported by some allocators // The default throwing implementation is provided - virtual std::optional tryAllocUnique(std::size_t size, std::uint32_t slot_num = 0, + virtual std::optional tryAllocUnique(std::size_t size, SlotId slot_num = 0, bool aligned = false, unsigned char realm_id = 0, unsigned char locality = 0); /** @@ -97,10 +99,10 @@ namespace db0 * @param slot_num optional slot number to allocate from (slot_num = 0 means any slot). * @return the address of the range */ - Address alloc(std::size_t size, std::uint32_t slot_num = 0, bool aligned = false, + Address alloc(std::size_t size, SlotId slot_num = 0, bool aligned = false, unsigned char realm_id = 0, unsigned char locality = 0); - UniqueAddress allocUnique(std::size_t size, std::uint32_t slot_num = 0, bool aligned = false, + UniqueAddress allocUnique(std::size_t size, SlotId slot_num = 0, bool aligned = false, unsigned char realm_id = 0, unsigned char locality = 0); // Check if the address is within the range managed by the allocator @@ -109,7 +111,7 @@ namespace db0 // Get range covered by the allocator or a specific slot // @return begin / end (which might be undefined for unlimited allocators) - virtual std::pair > getRange(std::uint32_t slot_num = 0) const; + virtual std::pair > getRange(SlotId slot_num = 0) const; // To be implemented where it makes sense virtual void close(); diff --git a/src/dbzero/core/memory/BitsetAllocator.hpp b/src/dbzero/core/memory/BitsetAllocator.hpp index 93e99a56..1d65f6eb 100644 --- a/src/dbzero/core/memory/BitsetAllocator.hpp +++ b/src/dbzero/core/memory/BitsetAllocator.hpp @@ -26,7 +26,7 @@ namespace db0 */ BitsetAllocator(BitSetT &&bitset, Address base_addr, std::size_t alloc_size, int direction); - std::optional
tryAlloc(std::size_t size, std::uint32_t slot_num = 0, + std::optional
tryAlloc(std::size_t size, SlotId slot_num = 0, bool aligned = false, unsigned char realm_id = 0, unsigned char locality = 0) override; void free(Address) override; @@ -100,7 +100,7 @@ namespace db0 } template std::optional
- BitsetAllocator::tryAlloc(std::size_t size, std::uint32_t slot_num, bool aligned, unsigned char, unsigned char) + BitsetAllocator::tryAlloc(std::size_t size, SlotId slot_num, bool aligned, unsigned char, unsigned char) { assert(slot_num == 0); // all BitSetAllocator allocations are aligned diff --git a/src/dbzero/core/memory/DP_Lock.cpp b/src/dbzero/core/memory/DP_Lock.cpp index 8443861f..1b0c1d6e 100644 --- a/src/dbzero/core/memory/DP_Lock.cpp +++ b/src/dbzero/core/memory/DP_Lock.cpp @@ -23,6 +23,9 @@ namespace db0 assert(addrPageAligned(m_context.m_storage_ref.get())); // initialzie the local buffer if (access_mode[AccessOptions::read]) { + if (read_state_num == 0) { + return; + } assert(read_state_num > 0); // read into the local buffer m_context.m_storage_ref.get().read( @@ -150,4 +153,4 @@ namespace db0 } #endif -} \ No newline at end of file +} diff --git a/src/dbzero/core/memory/DirtyCache.cpp b/src/dbzero/core/memory/DirtyCache.cpp index a3ec589f..d1bdf43e 100644 --- a/src/dbzero/core/memory/DirtyCache.cpp +++ b/src/dbzero/core/memory/DirtyCache.cpp @@ -88,6 +88,17 @@ namespace db0 return flushed; } + bool DirtyCache::empty() const + { + std::unique_lock lock(m_mutex); + for (auto &res_lock : m_locks) { + if (res_lock->isDirty()) { + return false; + } + } + return true; + } + void DirtyCache::flushDirty(SinkFunction sink) { std::unique_lock lock(m_mutex); diff --git a/src/dbzero/core/memory/DirtyCache.hpp b/src/dbzero/core/memory/DirtyCache.hpp index aa1f6821..2549d44c 100644 --- a/src/dbzero/core/memory/DirtyCache.hpp +++ b/src/dbzero/core/memory/DirtyCache.hpp @@ -40,6 +40,9 @@ namespace db0 * The flush order is undefined */ void flushDirty(SinkFunction); + + // Check if there are any dirty locks + bool empty() const; // NOTE: size only works for a metered cache (i.e. 
initialized with the dirty_meter) std::size_t size() const; diff --git a/src/dbzero/core/memory/Memspace.cpp b/src/dbzero/core/memory/Memspace.cpp index 26a76491..7c3d1667 100644 --- a/src/dbzero/core/memory/Memspace.cpp +++ b/src/dbzero/core/memory/Memspace.cpp @@ -159,12 +159,12 @@ namespace db0 return canceled_modified; } - Address Memspace::alloc(std::size_t size, std::uint32_t slot_num, unsigned char realm_id, unsigned char locality) { + Address Memspace::alloc(std::size_t size, Allocator::SlotId slot_num, unsigned char realm_id, unsigned char locality) { // align if the alloc size > page size return getAllocatorForUpdate().alloc(size, slot_num, size > m_page_size, realm_id, locality); } - UniqueAddress Memspace::allocUnique(std::size_t size, std::uint32_t slot_num, unsigned char realm_id, unsigned char locality) { + UniqueAddress Memspace::allocUnique(std::size_t size, Allocator::SlotId slot_num, unsigned char realm_id, unsigned char locality) { return getAllocatorForUpdate().allocUnique(size, slot_num, size > m_page_size, realm_id, locality); } diff --git a/src/dbzero/core/memory/Memspace.hpp b/src/dbzero/core/memory/Memspace.hpp index 16714a5d..b409f169 100644 --- a/src/dbzero/core/memory/Memspace.hpp +++ b/src/dbzero/core/memory/Memspace.hpp @@ -49,9 +49,9 @@ namespace db0 } // Memspace::alloc implements the auto-align logic - Address alloc(std::size_t size, std::uint32_t slot_num = 0, unsigned char realm_id = 0, + Address alloc(std::size_t size, Allocator::SlotId slot_num = 0, unsigned char realm_id = 0, unsigned char locality = 0); - UniqueAddress allocUnique(std::size_t size, std::uint32_t slot_num = 0, unsigned char realm_id = 0, + UniqueAddress allocUnique(std::size_t size, Allocator::SlotId slot_num = 0, unsigned char realm_id = 0, unsigned char locality = 0); void free(Address); diff --git a/src/dbzero/core/memory/MetaAllocator.cpp b/src/dbzero/core/memory/MetaAllocator.cpp index 60e158c6..71989a9e 100644 --- 
a/src/dbzero/core/memory/MetaAllocator.cpp +++ b/src/dbzero/core/memory/MetaAllocator.cpp @@ -70,6 +70,22 @@ namespace db0 }; } + MetaAllocator::StorageSlabBucketingFunction MetaAllocator::getStorageSlabBucketingFunction( + std::size_t page_size, std::size_t slab_size) + { + return getStorageSlabBucketingFunction(0, page_size, slab_size); + } + + MetaAllocator::StorageSlabBucketingFunction MetaAllocator::getStorageSlabBucketingFunction( + std::size_t offset, std::size_t page_size, std::size_t slab_size) + { + (void)page_size; + return { + static_cast(offset), + static_cast(slab_size) + }; + } + std::function MetaAllocator::getSlabIdFunction(std::size_t offset, std::size_t page_size, std::size_t slab_size) { @@ -204,14 +220,14 @@ namespace db0 return meta_header.const_ref(); } - std::optional
MetaAllocator::tryAlloc(std::size_t size, std::uint32_t slot_num, + std::optional
MetaAllocator::tryAlloc(std::size_t size, SlotId slot_num, bool aligned, unsigned char realm_id, unsigned char locality) { std::uint16_t instance_id; return tryAllocImpl(size, slot_num, aligned, false, instance_id, realm_id, locality); } - std::optional MetaAllocator::tryAllocUnique(std::size_t size, std::uint32_t slot_num, + std::optional MetaAllocator::tryAllocUnique(std::size_t size, SlotId slot_num, bool aligned, unsigned char realm_id, unsigned char locality) { std::uint16_t instance_id; @@ -222,7 +238,7 @@ namespace db0 return {}; } - std::optional
MetaAllocator::tryAllocImpl(std::size_t size, std::uint32_t slot_num, bool aligned, bool unique, + std::optional
MetaAllocator::tryAllocImpl(std::size_t size, SlotId slot_num, bool aligned, bool unique, std::uint16_t &instance_id, unsigned char realm_id, unsigned char locality) { assert(slot_num == 0); diff --git a/src/dbzero/core/memory/MetaAllocator.hpp b/src/dbzero/core/memory/MetaAllocator.hpp index 76d3ca9a..cf28f32e 100644 --- a/src/dbzero/core/memory/MetaAllocator.hpp +++ b/src/dbzero/core/memory/MetaAllocator.hpp @@ -74,10 +74,10 @@ DB0_PACKED_END using CapacityTreeT = SGB_Tree; using SlabTreeT = SGB_Tree; - std::optional
tryAlloc(std::size_t size, std::uint32_t slot_num = 0, + std::optional
tryAlloc(std::size_t size, SlotId slot_num = 0, bool aligned = false, unsigned char realm_id = 0, unsigned char locality = 0) override; - std::optional tryAllocUnique(std::size_t size, std::uint32_t slot_num = 0, + std::optional tryAllocUnique(std::size_t size, SlotId slot_num = 0, bool aligned = false, unsigned char realm_id = 0, unsigned char locality = 0) override; void free(Address) override; @@ -109,6 +109,68 @@ DB0_PACKED_END static std::function getReverseAddressPool(std::size_t offset, std::size_t page_size, std::size_t slab_size); + /** + * Fast bucketing function for raw BDevStorage byte addresses. + * + * This deliberately ignores MetaAllocator's internal metadata/slab layout and divides + * the storage address space into equal-size buckets: bucket_id = (address - offset) / slab_size. + * Use getSlabIdFunction() for real allocator slab lookup. + */ + struct StorageSlabBucketingFunction + { + /** + * Describes the storage bucket containing a raw BDevStorage byte address. + * + * m_slot_id is the meta-space slot used to store sparse-pair metadata for + * pages in the bucket. m_begin_page_num and m_end_page_num form a half-open + * logical page range [begin, end) covered by that slot. + */ + struct Bucket + { + std::uint32_t m_slot_id = 0; + std::uint64_t m_begin_page_num = 0; + std::uint64_t m_end_page_num = 0; + }; + + std::uint64_t m_offset = 0; + std::uint64_t m_slab_size = 0; + + std::uint32_t operator()(std::uint64_t address) const + { + auto rel_address = address > m_offset ? address - m_offset : 0; + return static_cast(rel_address / m_slab_size); + } + + std::uint32_t operator()(Address address) const + { + return (*this)(address.getOffset()); + } + + /** + * Return the bucket id plus logical page span for the bucket containing address. + * + * The returned page range is half-open and may be wider than the exact byte + * range when m_offset or m_slab_size are not page-aligned. 
+ */ + Bucket getBucket(std::uint64_t address, std::uint32_t page_size) const + { + auto slot_id = (*this)(address); + auto begin_address = m_offset + static_cast(slot_id) * m_slab_size; + auto end_address = begin_address + m_slab_size; + return { + slot_id, + begin_address / page_size, + (end_address + page_size - 1) / page_size + }; + } + }; + + static StorageSlabBucketingFunction getStorageSlabBucketingFunction( + std::size_t page_size, std::size_t slab_size); + + static StorageSlabBucketingFunction getStorageSlabBucketingFunction( + std::size_t offset, std::size_t page_size, std::size_t slab_size); + static std::function getSlabIdFunction(std::size_t offset, std::size_t page_size, std::size_t slab_size); @@ -237,7 +299,7 @@ DB0_PACKED_END std::shared_ptr getSlabAllocator(std::size_t min_capacity); // NOTE: instance ID will only be populated when unique = true - std::optional
tryAllocImpl(std::size_t size, std::uint32_t slot_num, bool aligned, bool unique, + std::optional
tryAllocImpl(std::size_t size, SlotId slot_num, bool aligned, bool unique, std::uint16_t &instance_id, unsigned char realm_id, unsigned char locality); }; diff --git a/src/dbzero/core/memory/OneShotAllocator.cpp b/src/dbzero/core/memory/OneShotAllocator.cpp index c9c8b131..810ae717 100644 --- a/src/dbzero/core/memory/OneShotAllocator.cpp +++ b/src/dbzero/core/memory/OneShotAllocator.cpp @@ -15,7 +15,7 @@ namespace db0 { } - std::optional
OneShotAllocator::tryAlloc(std::size_t size, std::uint32_t slot_num, + std::optional
OneShotAllocator::tryAlloc(std::size_t size, SlotId slot_num, bool aligned, unsigned char, unsigned char) { assert(slot_num == 0); diff --git a/src/dbzero/core/memory/OneShotAllocator.hpp b/src/dbzero/core/memory/OneShotAllocator.hpp index d08aa3ff..fbdac90f 100644 --- a/src/dbzero/core/memory/OneShotAllocator.hpp +++ b/src/dbzero/core/memory/OneShotAllocator.hpp @@ -17,7 +17,7 @@ namespace db0 public: OneShotAllocator(Address addr, std::size_t size); - std::optional
tryAlloc(std::size_t size, std::uint32_t slot_num = 0, + std::optional
tryAlloc(std::size_t size, SlotId slot_num = 0, bool aligned = false, unsigned char realm_id = 0, unsigned char locality = 0) override; void free(Address) override; diff --git a/src/dbzero/core/memory/ResourceLock.cpp b/src/dbzero/core/memory/ResourceLock.cpp index d9c9d0f8..898e8d89 100644 --- a/src/dbzero/core/memory/ResourceLock.cpp +++ b/src/dbzero/core/memory/ResourceLock.cpp @@ -121,7 +121,7 @@ namespace db0 other.discard(); } - void ResourceLock::setDirty() + bool ResourceLock::setDirty() { if (atomicCheckAndSetFlags(m_resource_flags, db0::RESOURCE_DIRTY)) { // register lock with the dirty cache @@ -130,7 +130,9 @@ namespace db0 // register with the dirty cache m_context.m_cache_ref.get().append(shared_from_this()); } + return true; } + return false; } void ResourceLock::freeze() { @@ -242,4 +244,4 @@ namespace db0 return m_context.m_storage_ref.get().getPageSize(); } -} \ No newline at end of file +} diff --git a/src/dbzero/core/memory/ResourceLock.hpp b/src/dbzero/core/memory/ResourceLock.hpp index 5b2ee459..b63fdca4 100644 --- a/src/dbzero/core/memory/ResourceLock.hpp +++ b/src/dbzero/core/memory/ResourceLock.hpp @@ -117,8 +117,13 @@ namespace db0 return !m_access_mode[AccessOptions::no_cache]; } - // Mark lock as dirty without range specification - void setDirty(); + /** + * Mark the whole lock as dirty without recording a specific dirty range. + * + * @return true if this call transitioned the lock from clean to dirty; + * false if the lock was already dirty. 
+ */ + bool setDirty(); // Mark a specific range as forced-dirty // it will be assumed dirty even if the data is not changed @@ -231,4 +236,4 @@ namespace db0 std::ostream &showBytes(std::ostream &, const std::byte *, std::size_t); -} \ No newline at end of file +} diff --git a/src/dbzero/core/memory/SlabAllocator.cpp b/src/dbzero/core/memory/SlabAllocator.cpp index 5896ff80..c93965e4 100644 --- a/src/dbzero/core/memory/SlabAllocator.cpp +++ b/src/dbzero/core/memory/SlabAllocator.cpp @@ -63,7 +63,7 @@ namespace db0 { } - std::optional
SlabAllocator::tryAlloc(std::size_t size, std::uint32_t slot_num, + std::optional
SlabAllocator::tryAlloc(std::size_t size, SlotId slot_num, bool aligned, unsigned char, unsigned char) { assert(slot_num == 0); @@ -292,7 +292,7 @@ namespace db0 (address.getOffset() < m_begin_addr.getOffset() + m_slab_size); } - std::pair > SlabAllocator::getRange(std::uint32_t slot_num) const + std::pair > SlabAllocator::getRange(SlotId slot_num) const { assert(!slot_num && "SlabAllocator does not support slots"); return { m_begin_addr, m_begin_addr + static_cast(m_slab_size) }; diff --git a/src/dbzero/core/memory/SlabAllocator.hpp b/src/dbzero/core/memory/SlabAllocator.hpp index 611e083a..9ae952c7 100644 --- a/src/dbzero/core/memory/SlabAllocator.hpp +++ b/src/dbzero/core/memory/SlabAllocator.hpp @@ -69,7 +69,7 @@ DB0_PACKED_END virtual ~SlabAllocator(); - std::optional
tryAlloc(std::size_t size, std::uint32_t slot_num = 0, + std::optional
tryAlloc(std::size_t size, SlotId slot_num = 0, bool aligned = false, unsigned char realm_id = 0, unsigned char locality = 0) override; void free(Address) override; @@ -152,7 +152,7 @@ DB0_PACKED_END UniqueAddress tryMakeAddressUnique(Address); // Get address range of the entire slab (begin, end), not the actually allocated space - std::pair > getRange(std::uint32_t slot_num = 0) const override; + std::pair > getRange(SlotId slot_num = 0) const override; private: using AllocSetT = db0::CRDT_Allocator::AllocSetT; diff --git a/src/dbzero/core/memory/SlabManager.cpp b/src/dbzero/core/memory/SlabManager.cpp index 41370903..d05bf63c 100644 --- a/src/dbzero/core/memory/SlabManager.cpp +++ b/src/dbzero/core/memory/SlabManager.cpp @@ -631,7 +631,7 @@ namespace db0 } } - std::optional
SlabManager::tryAlloc(std::size_t size, std::uint32_t slot_num, bool aligned, + std::optional
SlabManager::tryAlloc(std::size_t size, Allocator::SlotId slot_num, bool aligned, bool unique, std::uint16_t &instance_id, unsigned char locality) { auto slab = tryGetActiveSlab(locality); diff --git a/src/dbzero/core/memory/SlabManager.hpp b/src/dbzero/core/memory/SlabManager.hpp index b348c483..71067ee3 100644 --- a/src/dbzero/core/memory/SlabManager.hpp +++ b/src/dbzero/core/memory/SlabManager.hpp @@ -42,7 +42,7 @@ namespace db0 std::function address_func, std::function slab_id_func, unsigned char realm_id, bool deferred_free); - std::optional
tryAlloc(std::size_t size, std::uint32_t slot_num, bool aligned, bool unique, + std::optional
tryAlloc(std::size_t size, Allocator::SlotId slot_num, bool aligned, bool unique, std::uint16_t &instance_id, unsigned char locality); void free(Address address); diff --git a/src/dbzero/core/memory/SlotAllocator.cpp b/src/dbzero/core/memory/SlotAllocator.cpp index c97824c7..8be420ea 100644 --- a/src/dbzero/core/memory/SlotAllocator.cpp +++ b/src/dbzero/core/memory/SlotAllocator.cpp @@ -16,15 +16,19 @@ namespace db0 { } - void SlotAllocator::setSlot(std::uint32_t slot_num, std::shared_ptr slot_allocator) + void SlotAllocator::setSlot(SlotId slot_num, std::shared_ptr slot_allocator) { if (slot_num == 0) { THROWF(db0::InternalException) << "slot 0 is reserved for the general allocator"; } - if (slot_num >= m_slots.size()) { - m_slots.resize(slot_num + 1); + if (slot_num >= static_cast(m_slots.max_size())) { + THROWF(db0::InternalException) << "slot " << slot_num << " exceeds slot allocator range"; } - m_slots[slot_num] = slot_allocator; + auto index = static_cast(slot_num); + if (index >= m_slots.size()) { + m_slots.resize(index + 1); + } + m_slots[index] = slot_allocator; } struct ScopedAllocBuf @@ -49,7 +53,7 @@ namespace db0 } }; - std::optional
SlotAllocator::tryAlloc(std::size_t size, std::uint32_t slot_num, + std::optional
SlotAllocator::tryAlloc(std::size_t size, SlotId slot_num, bool aligned, unsigned char realm_id, unsigned char locality) { if (!slot_num) { @@ -59,7 +63,7 @@ namespace db0 return select(slot_num).tryAlloc(size, 0, aligned, realm_id, locality); } - std::optional SlotAllocator::tryAllocUnique(std::size_t size, std::uint32_t slot_num, + std::optional SlotAllocator::tryAllocUnique(std::size_t size, SlotId slot_num, bool aligned, unsigned char realm_id, unsigned char locality) { if (!slot_num) { @@ -100,7 +104,7 @@ namespace db0 return m_allocator_ptr->findAllocation(address, realm_id); } - Allocator::AllocationInfo SlotAllocator::findAllocation(Address address, std::uint32_t slot_num) const + Allocator::AllocationInfo SlotAllocator::findAllocation(Address address, SlotId slot_num) const { if (slot_num == 0) { return findAllocation(address); @@ -108,7 +112,7 @@ namespace db0 return getSlot(slot_num).findAllocation(address); } - Allocator::AllocationInfo SlotAllocator::findAllocation(Address address, std::uint32_t slot_num, unsigned char realm_id) const + Allocator::AllocationInfo SlotAllocator::findAllocation(Address address, SlotId slot_num, unsigned char realm_id) const { if (slot_num == 0) { return findAllocation(address, realm_id); @@ -136,28 +140,29 @@ namespace db0 } } - Allocator &SlotAllocator::select(std::uint32_t slot_num) + Allocator &SlotAllocator::select(SlotId slot_num) { if (slot_num == 0) { return *m_allocator_ptr; } - assert(slot_num < m_slots.size() && m_slots[slot_num]); - return *m_slots[slot_num]; + assert(slot_num < static_cast(m_slots.size()) && m_slots[static_cast(slot_num)]); + return *m_slots[static_cast(slot_num)]; } - SlabAllocator &SlotAllocator::getSlot(std::uint32_t slot_num) const + SlabAllocator &SlotAllocator::getSlot(SlotId slot_num) const { - if (!slot_num || slot_num >= m_slots.size() || !m_slots[slot_num]) { + if (!slot_num || slot_num >= static_cast(m_slots.size()) + || !m_slots[static_cast(slot_num)]) { 
THROWF(db0::InternalException) << "slot " << slot_num << " not found"; } - return *m_slots[slot_num]; + return *m_slots[static_cast(slot_num)]; } bool SlotAllocator::inRange(Address address) const { return m_allocator_ptr->inRange(address); } - std::pair > SlotAllocator::getRange(std::uint32_t slot_num) const + std::pair > SlotAllocator::getRange(SlotId slot_num) const { if (slot_num == 0) { return m_allocator_ptr->getRange(0); diff --git a/src/dbzero/core/memory/SlotAllocator.hpp b/src/dbzero/core/memory/SlotAllocator.hpp index 2c9a242c..25bc3c1a 100644 --- a/src/dbzero/core/memory/SlotAllocator.hpp +++ b/src/dbzero/core/memory/SlotAllocator.hpp @@ -23,13 +23,13 @@ namespace db0 SlotAllocator(std::shared_ptr allocator); // initialize slot-specific allocator - void setSlot(std::uint32_t slot_num, std::shared_ptr slot_allocator); + void setSlot(SlotId slot_num, std::shared_ptr slot_allocator); - std::optional
tryAlloc(std::size_t size, std::uint32_t slot_num = 0, + std::optional
tryAlloc(std::size_t size, SlotId slot_num = 0, bool aligned = false, unsigned char realm_id = 0, unsigned char locality = 0) override; // Unique allocations are not supported because of the limited slot's address space - std::optional tryAllocUnique(std::size_t size, std::uint32_t slot_num = 0, + std::optional tryAllocUnique(std::size_t size, SlotId slot_num = 0, bool aligned = false, unsigned char realm_id = 0, unsigned char locality = 0) override; void free(Address) override; @@ -42,8 +42,8 @@ namespace db0 AllocationInfo findAllocation(Address) const override; AllocationInfo findAllocation(Address, unsigned char realm_id) const override; - AllocationInfo findAllocation(Address, std::uint32_t slot_num) const; - AllocationInfo findAllocation(Address, std::uint32_t slot_num, unsigned char realm_id) const; + AllocationInfo findAllocation(Address, SlotId slot_num) const; + AllocationInfo findAllocation(Address, SlotId slot_num, unsigned char realm_id) const; void commit() const override; @@ -53,16 +53,16 @@ namespace db0 std::shared_ptr getAllocator() const { return m_allocator; } - SlabAllocator &getSlot(std::uint32_t slot_num) const; + SlabAllocator &getSlot(SlotId slot_num) const; - std::pair > getRange(std::uint32_t slot_num = 0) const override; + std::pair > getRange(SlotId slot_num = 0) const override; private: std::shared_ptr m_allocator; Allocator *m_allocator_ptr; std::vector > m_slots; - Allocator &select(std::uint32_t slot_num); + Allocator &select(SlotId slot_num); }; } diff --git a/src/dbzero/core/memory/WideLock.cpp b/src/dbzero/core/memory/WideLock.cpp index ee7537ae..b4870106 100644 --- a/src/dbzero/core/memory/WideLock.cpp +++ b/src/dbzero/core/memory/WideLock.cpp @@ -232,4 +232,4 @@ namespace db0 } } -} \ No newline at end of file +} diff --git a/src/dbzero/core/memory/WideLock.hpp b/src/dbzero/core/memory/WideLock.hpp index 27fab45d..2e359764 100644 --- a/src/dbzero/core/memory/WideLock.hpp +++ b/src/dbzero/core/memory/WideLock.hpp @@ -60,4 +60,4 
@@ namespace db0 void resLockFlush(); }; -} \ No newline at end of file +} diff --git a/src/dbzero/core/storage/BDevStorage.cpp b/src/dbzero/core/storage/BDevStorage.cpp index 6fc8835d..e82a0ab5 100644 --- a/src/dbzero/core/storage/BDevStorage.cpp +++ b/src/dbzero/core/storage/BDevStorage.cpp @@ -3,26 +3,34 @@ #include "BDevStorage.hpp" #include "SparseIndexQuery.hpp" +#include "SparsePairQuery.hpp" #include #include +#include #include #include #include #include +#include #include #include #include "copy_prefix.hpp" +#include +#include namespace db0 { o_prefix_config::o_prefix_config(std::uint32_t block_size, std::uint32_t page_size, - std::uint32_t dram_page_size, std::uint32_t page_io_step_size) + std::uint32_t dram_page_size, std::uint32_t page_io_step_size, + std::uint32_t descriptor_page_size, std::uint64_t desc_io_head) : m_block_size(block_size) , m_page_size(page_size) , m_dram_page_size(dram_page_size) , m_page_io_step_size(page_io_step_size) + , m_descriptor_page_size(descriptor_page_size) + , m_desc_io_head(desc_io_head) { std::memset(m_reserved.data(), 0, sizeof(m_reserved)); } @@ -34,15 +42,81 @@ namespace db0 } return dram_io_ptr->getDRAMPair(); } + + Address getRootSparsePairAddress(const DRAM_IOStream &dram_io) + { + return dram_io.getDRAMPair().second->firstAlloc(); + } + + StorageOptions normalizeOptions(StorageOptions options, const o_prefix_config &config) + { + if (!options.m_storage_slab_bucketing) { + auto slot_size = static_cast(config.m_page_size) * (1ull << 24); + auto bucketing = MetaAllocator::getStorageSlabBucketingFunction(0, config.m_page_size, slot_size); + options.m_storage_slab_bucketing = [bucketing](std::uint64_t address) { + return bucketing(address); + }; + options.m_storage_slab_bucket = [bucketing, page_size = config.m_page_size](std::uint64_t address) { + return bucketing.getBucket(address, page_size); + }; + } + return options; + } + + MappingPolicy getOpenMetaMappingPolicy(const StorageOptions &options, StorageFlags flags) 
+ { + return flags[StorageFlagOption::NO_LOAD] + ? MappingPolicy::lazy + : options.m_meta_mapping_policy; + } + void appendSparsePairManagerChangeLog(BDevStorage::DRAM_ChangeLogStreamT &changelog_io, + std::vector &&page_nums, StateNumType state_num) + { + std::sort(page_nums.begin(), page_nums.end()); + ChangeLogData cl_data; + for (auto page_num: page_nums) { + cl_data.m_rle_builder.append(page_num, false); + } + changelog_io.appendChangeLog(std::move(cl_data), state_num); + } + + template + void scanChangeLogs(BDevStorage::DRAM_ChangeLogStreamT &changelog_io, CallbackT callback, + StateNumType begin_state = 0, std::optional end_state = std::nullopt) + { + auto reader = changelog_io.getStreamReader(); + while (auto change_log = reader.readChangeLogChunk()) { + auto state_num = change_log->m_state_num; + if (end_state && state_num >= *end_state) { + break; + } + if (state_num < begin_state) { + continue; + } + callback(*change_log); + } + } + + template + void setChangeLogTail(ChangeLogStreamT &changelog_io) + { + auto reader = changelog_io.getStreamReader(); + while (reader.readChangeLogChunk()) { + } + } + BDevStorage::BDevStorage(const std::string &file_name, AccessType access_type, LockFlags lock_flags, - std::optional meta_io_step_size, StorageFlags flags) + std::optional meta_io_step_size, StorageFlags flags, StorageOptions options) : BaseStorage(access_type, flags) , m_file(file_name, access_type, lock_flags) , m_config(readConfig()) , m_dram_changelog_io(getChangeLogIOStream( m_config.m_dram_changelog_io_offset, access_type) ) + , m_desc_changelog_io(getChangeLogIOStream( + m_config.m_desc_changelog_io_offset, access_type) + ) , m_dp_changelog_io(getChangeLogIOStream( m_config.m_dp_changelog_io_offset, access_type) ) @@ -52,9 +126,7 @@ namespace db0 , m_dram_io(init(getDRAMIOStream( m_config.m_dram_io_offset, m_config.m_dram_page_size, access_type), m_dram_changelog_io, flags) ) - , m_sparse_pair(m_dram_io.getDRAMPair(), access_type, flags) - , 
m_sparse_index(m_sparse_pair.getSparseIndex()) - , m_diff_index(m_sparse_pair.getDiffIndex()) + , m_root_sparse_pair(m_dram_io.getDRAMPair(), access_type, getRootSparsePairAddress(m_dram_io), flags) , m_ext_dram_changelog_io(tryGetChangeLogIOStream( m_config.m_ext_dram_changelog_io_offset, access_type) ) @@ -62,36 +134,43 @@ namespace db0 m_config.m_ext_dram_io_offset, m_config.m_ext_dram_page_size, access_type), m_ext_dram_changelog_io.get(), // NOTE: the NO_LOAD flag is not applicable to ext DRAM IO since it's created on-demand - flags & ~ StorageFlags {StorageOptions::NO_LOAD }, + flags & ~ StorageFlags {StorageFlagOption::NO_LOAD }, // NOTE: we synchronize up to the maximum state number from DRAM IO (in read/write mode) this->getMaxExtStateNum()) ) , m_ext_space(tryGetDRAMPair(m_ext_dram_io.get()), access_type) + , m_options(normalizeOptions(std::move(options), m_config)) , m_page_io(getPage_IO(getNextStoragePageNum(), m_config.m_page_io_step_size)) + , m_desc_io(getDesc_IO()) + , m_meta_space(MS_MetaSpace::create( + m_config.m_descriptor_page_size, m_root_sparse_pair, m_desc_io)) + , m_sparse_pair_manager(m_meta_space, access_type, flags, getOpenMetaMappingPolicy(m_options, flags)) #ifndef NDEBUG , m_data_mirror(m_config.m_page_size) #endif { - if (m_access_type == AccessType::READ_WRITE && m_flags.test(StorageOptions::NO_LOAD)) { + if (m_access_type == AccessType::READ_WRITE && m_flags.test(StorageFlagOption::NO_LOAD)) { THROWF(db0::IOException) << "Cannot open prefix in READ_WRITE mode with NO_LOAD option"; } // in read-only mode need to refresh in order to retrieve a consitent DRAM state // since other process might be actively modifying the underlying file - if (m_access_type == AccessType::READ_ONLY && !m_flags.test(StorageOptions::NO_LOAD)) { - refresh(); - } - - // Validate state consistency - // The state number reported by DRAM IO must NOT superseed the last state number recorded in DP changelog - if (auto chunk_ptr = 
m_dp_changelog_io.getLastChangeLogChunk()) { - auto dp_state_num = chunk_ptr->m_state_num; - auto dram_state_num = m_sparse_pair.getMaxStateNum(); - if (dram_state_num > dp_state_num) { - THROWF(db0::IOException) << "Inconsistent state: DRAM state number " << dram_state_num - << " exceeds DP changelog state number " << dp_state_num; + if (m_access_type == AccessType::READ_ONLY) { + if (m_flags.test(StorageFlagOption::NO_LOAD)) { + setChangeLogTail(m_dram_changelog_io); + setChangeLogTail(m_dp_changelog_io); + if (m_ext_dram_changelog_io) { + setChangeLogTail(*m_ext_dram_changelog_io); + } + } else { + refresh(); } } + + // NOTE: since the desc-changelog is not required for the initial load + // (descriptor pages are loaded based on the index from the root_sparse_pair) + // we need to advance its position to tail without any initial processing + setChangeLogTail(m_desc_changelog_io); } BDevStorage::~BDevStorage() @@ -100,7 +179,7 @@ namespace db0 DRAM_IOStream BDevStorage::init(DRAM_IOStream &&dram_io, DRAM_ChangeLogStreamT &dram_change_log, StorageFlags flags) { - if (!flags[StorageOptions::NO_LOAD]) { + if (!flags[StorageFlagOption::NO_LOAD]) { dram_io.load(dram_change_log); } return std::move(dram_io); @@ -109,7 +188,7 @@ namespace db0 std::unique_ptr BDevStorage::initExt(std::unique_ptr &&dram_io, DRAM_ChangeLogStreamT *dram_change_log, StorageFlags flags, std::optional max_state_num) { - if (dram_io && !flags[StorageOptions::NO_LOAD]) { + if (dram_io && !flags[StorageFlagOption::NO_LOAD]) { assert(dram_change_log); dram_io->load(*dram_change_log, max_state_num); } @@ -118,7 +197,7 @@ namespace db0 MetaIOStream BDevStorage::init(MetaIOStream &&io, StorageFlags flags) { - if (!flags[StorageOptions::NO_LOAD]) { + if (!flags[StorageFlagOption::NO_LOAD]) { // exhaust the meta-log stream (position at the last item) and all managed streams io.setTailAll(); } @@ -133,6 +212,10 @@ namespace db0 if (config.m_magic != o_prefix_config::DB0_MAGIC) {
THROWF(db0::IOException) << "Not a dbzero file: " << m_file.getName(); } + if (config.m_version != o_prefix_config::DB0_VERSION) { + THROWF(db0::IOException) << "Unsupported dbzero file version: " << config.m_version + << ", expected: " << o_prefix_config::DB0_VERSION; + } return config; } @@ -146,27 +229,102 @@ namespace db0 return 1u; } } + + std::uint32_t getDiffIOStepSize(std::uint32_t block_size, std::uint32_t page_size, + std::optional step_size_hint) + { + auto step_size = getPageIOStepSize(block_size, step_size_hint); + auto block_capacity = block_size / page_size; + if (block_capacity * step_size < 2) { + step_size = (2 + block_capacity - 1) / block_capacity; + } + return step_size; + } + + std::uint64_t alignStorageAddress(std::uint64_t address, std::uint32_t page_size, std::uint64_t header_size) + { + if (address <= header_size) { + return header_size; + } + auto rel_address = address - header_size; + auto rel_pages = (rel_address + page_size - 1) / page_size; + return header_size + rel_pages * page_size; + } + + std::uint32_t getDesc_IOStride(std::uint32_t page_size, std::uint32_t descriptor_page_size, + std::uint32_t stride_hint = 64) + { + if (descriptor_page_size < page_size || descriptor_page_size % page_size != 0) { + THROWF(db0::InternalException) + << "Descriptor page size must be a multiple of storage page size"; + } + auto page_ratio = descriptor_page_size / page_size; + if (stride_hint <= 1) { + return page_ratio + 1; + } + auto descriptor_capacity = (stride_hint - 1 + page_ratio - 1) / page_ratio; + return descriptor_capacity * page_ratio + 1; + } + + void createDesc_IO(CFile &file, o_prefix_config &config, std::vector &config_buffer, + std::function tail_function, std::uint32_t descriptor_stream_stride, + std::uint64_t config_block_size) + { + auto block_capacity = config.m_block_size / config.m_page_size; + auto initial_page_io_address = alignStorageAddress(tail_function(), config.m_page_size, config_block_size); + + // Open a temporary 
page_io cursor after the block-backed streams that + // were already materialized during prefix creation. + auto page_io = Diff_IO( + config_block_size, file, config.m_page_size, config.m_block_size, + initial_page_io_address, block_capacity, config.m_page_io_step_size, + tail_function, config.m_page_io_step_size - 1); + + // Reserve the descriptor RandomIO stream control chunk on top of + // page_io and write the initial empty stream control page. + RandomIO_Stream desc_io(page_io, descriptor_stream_stride, config.m_descriptor_page_size); + desc_io.flush(); + + // Flush the backing page_io so the descriptor stream control page is + // present before the config starts pointing at it. + page_io.flush(); + + // Persist the fixed descriptor stream head in the prefix config. + config.m_desc_io_head = desc_io.getHeadPageNum(); + file.write(0, config_block_size, config_buffer.data()); + } void BDevStorage::create(const std::string &file_name, std::optional page_size, - std::uint32_t dram_page_size_hint, std::optional step_size_hint) + std::uint32_t dram_page_size_hint, std::optional step_size_hint, + std::optional descriptor_page_size) { if (!page_size) { page_size = DEFAULT_PAGE_SIZE; } + if (!descriptor_page_size) { + descriptor_page_size = 16u << 10; + } + auto descriptor_stream_stride = getDesc_IOStride(*page_size, *descriptor_page_size); std::vector buffer(CONFIG_BLOCK_SIZE); // calculate block size to be page aligned and sufficient to fit a single sparse index node auto min_block_size = dram_page_size_hint + BlockIOStream::sizeOfHeaders(DRAM_IOStream::ENABLE_CHECKSUMS) + DRAM_IOStream::sizeOfHeader(); // page-align block size - auto block_size = (min_block_size + *page_size - 1) / (*page_size) * (*page_size); + auto page_alignment = std::lcm(*page_size, *descriptor_page_size); + auto block_size = (min_block_size + page_alignment - 1) / page_alignment * page_alignment; // adjust DRAM page size to fit the block auto dram_page_size = block_size - 
BlockIOStream::sizeOfHeaders(DRAM_IOStream::ENABLE_CHECKSUMS) - DRAM_IOStream::sizeOfHeader(); + auto page_io_step_size = getDiffIOStepSize(block_size, *page_size, step_size_hint); + auto block_capacity = block_size / *page_size; + auto min_descriptor_step_size = static_cast( + (descriptor_stream_stride + block_capacity - 1) / block_capacity); + page_io_step_size = std::max(page_io_step_size, min_descriptor_step_size); // create a new config using placement new auto config = new (buffer.data()) o_prefix_config( - block_size, *page_size, dram_page_size, getPageIOStepSize(block_size, step_size_hint) + block_size, *page_size, dram_page_size, page_io_step_size, *descriptor_page_size ); std::uint64_t offset = CONFIG_BLOCK_SIZE; @@ -180,6 +338,7 @@ namespace db0 // cofigure offsets for all inner streams (even though they have not been materialized yet) config->m_dram_io_offset = next_block_offset(); config->m_dram_changelog_io_offset = next_block_offset(); + config->m_desc_changelog_io_offset = next_block_offset(); config->m_dp_changelog_io_offset = next_block_offset(); config->m_meta_io_offset = next_block_offset(); @@ -198,6 +357,7 @@ namespace db0 { CFile file(file_name, AccessType::READ_WRITE); DRAM_ChangeLogStreamT *dram_changelog_io_ptr = nullptr; + DRAM_ChangeLogStreamT *desc_changelog_io_ptr = nullptr; DRAM_IOStream *dram_io_ptr = nullptr; std::unique_ptr ext_dram_changelog_io_ptr = nullptr; std::unique_ptr ext_dram_io_ptr = nullptr; @@ -206,7 +366,10 @@ namespace db0 { assert(dram_io_ptr && dram_changelog_io_ptr); // take max from the underlying I/O streams - auto result = std::max(offset, std::max(dram_io_ptr->tail(), dram_changelog_io_ptr->tail())); + assert(desc_changelog_io_ptr); + auto result = std::max(offset, std::max( + std::max(dram_io_ptr->tail(), dram_changelog_io_ptr->tail()), + desc_changelog_io_ptr->tail())); if (ext_dram_io_ptr && ext_dram_changelog_io_ptr) { result = std::max(result, std::max(ext_dram_io_ptr->tail(), 
ext_dram_changelog_io_ptr->tail())); } @@ -216,6 +379,9 @@ namespace db0 auto dram_changelog_io = DRAM_ChangeLogStreamT(file, config->m_dram_changelog_io_offset, config->m_block_size, tail_function, AccessType::READ_WRITE); dram_changelog_io_ptr = &dram_changelog_io; + auto desc_changelog_io = DRAM_ChangeLogStreamT(file, config->m_desc_changelog_io_offset, config->m_block_size, + tail_function, AccessType::READ_WRITE); + desc_changelog_io_ptr = &desc_changelog_io; auto dram_io = DRAM_IOStream(file, config->m_dram_io_offset, config->m_block_size, tail_function, AccessType::READ_WRITE, config->m_dram_page_size); dram_io_ptr = &dram_io; @@ -239,6 +405,7 @@ namespace db0 dram_changelog_io.flush(); dram_io.close(); dram_changelog_io.close(); + desc_changelog_io.close(); // create then flush the extension space if (has_ext_dram_io) { @@ -249,23 +416,44 @@ namespace db0 ext_dram_io_ptr->close(); ext_dram_changelog_io_ptr->close(); } + + createDesc_IO(file, *config, buffer, tail_function, descriptor_stream_stride, + CONFIG_BLOCK_SIZE); + file.flush(); file.close(); } } - + + Allocator::SlotId BDevStorage::getMetaSlotId(std::uint64_t page_num) const + { + auto address = page_num * static_cast(m_config.m_page_size); + return m_options.m_storage_slab_bucketing(address); + } + bool BDevStorage::tryFindMutation(std::uint64_t page_num, StateNumType state_num, StateNumType &mutation_id) const { std::shared_lock lock(m_mutex); - return db0::tryFindMutation(m_sparse_index, m_diff_index, page_num, state_num, mutation_id); + auto *sparse_pair = m_sparse_pair_manager.tryGetExisting(getMetaSlotId(page_num)); + if (!sparse_pair) { + return false; + } + return db0::tryFindMutation( + sparse_pair->getSparseIndex(), sparse_pair->getDiffIndex(), page_num, state_num, mutation_id); } StateNumType BDevStorage::findMutation(std::uint64_t page_num, StateNumType state_num) const { + if (state_num == 0) { + return 0; + } + StateNumType result; std::shared_lock lock(m_mutex); - if 
(!db0::tryFindMutation(m_sparse_index, m_diff_index, page_num, state_num, result)) { + auto *sparse_pair = m_sparse_pair_manager.tryGetExisting(getMetaSlotId(page_num)); + if (!sparse_pair || !db0::tryFindMutation( + sparse_pair->getSparseIndex(), sparse_pair->getDiffIndex(), page_num, state_num, result)) { assert(false && "BDevStorage::findMutation: page not found"); THROWF(db0::IOException) << "BDevStorage::findMutation: page_num " << page_num << " not found, state: " << state_num; @@ -297,15 +485,28 @@ namespace db0 if (chain_len) { *chain_len = 0; } + auto &manager = const_cast(m_sparse_pair_manager); + SparsePairQuery sparse_pair_query(m_options, m_config.m_page_size, begin_page, end_page, manager); std::byte *read_buf = reinterpret_cast(buffer); // lookup sparse index and read physical pages - for (auto page_num = begin_page; page_num != end_page; ++page_num, read_buf += m_config.m_page_size) { + for (; sparse_pair_query.hasNext(); ++sparse_pair_query, read_buf += m_config.m_page_size) { // query sparse index + diff index - SparseIndexQuery query(m_sparse_index, m_diff_index, page_num, state_num); + auto *sparse_pair = sparse_pair_query.currentSparsePair(); + if (!sparse_pair) { + if (flags[AccessOptions::read]) { + THROWF(db0::IOException) << "BDevStorage::read: page not found: " + << sparse_pair_query.pageNum() << ", state: " << state_num; + } + std::memset(read_buf, 0, m_config.m_page_size); + continue; + } + SparseIndexQuery query( + sparse_pair->getSparseIndex(), sparse_pair->getDiffIndex(), sparse_pair_query.pageNum(), state_num); if (query.empty()) { if (flags[AccessOptions::read]) { - THROWF(db0::IOException) << "BDevStorage::read: page not found: " << page_num << ", state: " << state_num; + THROWF(db0::IOException) << "BDevStorage::read: page not found: " + << sparse_pair_query.pageNum() << ", state: " << state_num; } // if requested access is write-only then simply fill the misssing (new) page with 0 std::memset(read_buf, 0, m_config.m_page_size); 
@@ -334,7 +535,7 @@ namespace db0 page_io_id = m_ext_space.getAbsolute(page_io_id); } // apply all diff-updates on top of the full-DP - m_page_io.applyFrom(page_io_id, read_buf, { page_num, diff_state_num }); + m_page_io.applyFrom(page_io_id, read_buf, { sparse_pair_query.pageNum(), diff_state_num }); // collect chain-len statistics if (chain_len) { ++(*chain_len); @@ -373,10 +574,14 @@ namespace db0 std::byte *write_buf = reinterpret_cast(buffer); std::unique_lock lock(m_mutex); + SparsePairQuery sparse_pair_query( + m_options, m_config.m_page_size, begin_page, end_page, m_sparse_pair_manager); // write as physical pages and register with the sparse index - for (auto page_num = begin_page; page_num != end_page; ++page_num, write_buf += m_config.m_page_size) { + for (; sparse_pair_query.hasNext(); ++sparse_pair_query, write_buf += m_config.m_page_size) { + auto &sparse_pair = sparse_pair_query.currentOrCreateSparsePair(); + auto &sparse_index = sparse_pair.getSparseIndex(); // look up if page has already been added in current transaction - auto item = m_sparse_index.lookup(page_num, state_num); + auto item = sparse_index.lookup(sparse_pair_query.pageNum(), state_num); if (item && item.m_state_num == state_num) { // page already added in current transaction / update in the stream // this may happen due to cache overflow and later modification of the same page @@ -395,7 +600,8 @@ namespace db0 // assign a relative page number page_io_id = m_ext_space.assignRelative(page_io_id, is_first_page); } - m_sparse_index.emplace(page_num, state_num, page_io_id); + sparse_index.emplace(sparse_pair_query.pageNum(), state_num, page_io_id); + m_root_sparse_pair.recordMaxStateNum(state_num); #ifndef NDEBUG m_page_io_raw_bytes += m_config.m_page_size; checkPoisonedOp(Settings::__write_poison); @@ -414,20 +620,23 @@ namespace db0 auto page_num = address / m_config.m_page_size; std::unique_lock lock(m_mutex); + auto slot_id = getMetaSlotId(page_num); + auto &sparse_pair = 
m_sparse_pair_manager.getOrCreate(slot_id); // Use SparseIndexQuery to determine the current sequence length & check limits - SparseIndexQuery query(m_sparse_index, m_diff_index, page_num, state_num); + SparseIndexQuery query(sparse_pair.getSparseIndex(), sparse_pair.getDiffIndex(), page_num, state_num); // if a page has already been written as full-DP in the current transaction then // we cannot append as diff but need to overwrite the full page instead if (state_num != query.firstStateNum() && query.leftLessThan(max_len)) { bool is_first_page; // append as diff-page (NOTE: diff-writes are only appended) - auto [page_io_id, overflow] = m_page_io.appendDiff(buffer, { page_num, state_num }, diff_data, &is_first_page); + auto [page_io_id, overflow] = m_page_io.appendDiff( + buffer, { page_num, state_num }, diff_data, &is_first_page + ); if (!!m_ext_space) { - // NOTE: first page (of each step) must be registered with REL_Index if it's maintained - // assign a relative page number page_io_id = m_ext_space.assignRelative(page_io_id, is_first_page); } - m_diff_index.insert(page_num, state_num, page_io_id, overflow); + sparse_pair.getDiffIndex().insert(page_num, state_num, page_io_id, overflow); + m_root_sparse_pair.recordMaxStateNum(state_num); } else { // Unable to write as diff // this mey be due to either: @@ -456,6 +665,10 @@ namespace db0 std::size_t BDevStorage::getDRAMPageSize() const { return m_config.m_dram_page_size; } + + std::size_t BDevStorage::getDescriptorPageSize() const { + return m_config.m_descriptor_page_size; + } bool BDevStorage::flushExt(StateNumType max_state_num) { @@ -480,20 +693,53 @@ namespace db0 if (m_access_type == AccessType::READ_ONLY) { THROWF(db0::IOException) << "BDevStorage::flush error: read-only stream"; } - - // check if there're any modifications to be flushed - if (m_sparse_pair.getChangeLogSize() == 0) { + + auto descriptor_io_was_modified = m_desc_io.modified(); + auto application_changed = m_sparse_pair_manager.commit(); + 
auto &meta_prefix = *m_meta_space.getMSPrefixPtr(); + auto state_num = m_root_sparse_pair.getMaxStateNum(); + auto meta_space_dirty = meta_prefix.getDirtySize() != 0; + if (meta_space_dirty && state_num < meta_prefix.getStateNum(false)) { + THROWF(db0::InternalException) + << "BDevStorage::flush requires caller to register state high watermark before flushing dirty metadata" + << "; root max state: " << state_num + << "; metadata state: " << meta_prefix.getStateNum(false) + << "; sparse pair manager changelog size: " << m_sparse_pair_manager.getChangeLogSize(); + } + auto meta_space_flushed = db0::flush(meta_prefix, m_desc_io, timer.get()); + if (meta_space_flushed) { + m_meta_space.commit(timer.get()); + } + auto descriptor_io_modified = descriptor_io_was_modified || m_desc_io.modified() || meta_space_flushed; + bool root_metadata_changed = false; + if (descriptor_io_modified) { + if (state_num == 0) { + THROWF(db0::InternalException) + << "BDevStorage::flush requires registered state high watermark before flushing descriptor metadata"; + } + } + auto root_change_log_size = m_root_sparse_pair.getChangeLogSize(); + + // check if there're any modifications to be flushed + if (!application_changed && !root_metadata_changed && root_change_log_size == 0) { + if (descriptor_io_modified) { + m_desc_io.flush(); + m_page_io.flush(); + m_file.fsync(); + return false; + } // no modifications to be flushed return false; } // save metadata checkpoints before making any updates to the managed streams // NOTE: the checkpoint is only saved after exceeding specific threshold of updates in the managed streams - auto state_num = m_sparse_pair.getMaxStateNum(); - - m_meta_io.checkAndAppend(state_num); - m_meta_io.flush(); + if (application_changed) { + m_meta_io.checkAndAppend(state_num); + m_meta_io.flush(); + } + m_desc_io.flush(); m_page_io.flush(); // Extract & flush sparse index change log first (on condition of any updates) // we also need to collect the end storage page 
number, possibly relative (sentinel) @@ -504,25 +750,33 @@ namespace db0 end_page_io_page_num = m_ext_space.assignRelative(end_page_io_page_num, is_first); } - m_sparse_pair.extractChangeLog(m_dp_changelog_io, end_page_io_page_num); + if (application_changed) { + auto changed_page_nums = m_sparse_pair_manager.extractChangeLogPages(); + appendSparsePairManagerChangeLog(m_desc_changelog_io, std::move(changed_page_nums), state_num); + m_root_sparse_pair.recordMaxStateNum(state_num); + m_root_sparse_pair.recordNextStoragePageNum(end_page_io_page_num); + } m_dram_io.flushUpdates(state_num, m_dram_changelog_io); - m_dp_changelog_io.flush(); + m_desc_io.flush(); + m_page_io.flush(); // Flush ext streams (if existing) flushExt(state_num); // NOTE: fsync has stronger guarantees than flush in a multi-process environments m_file.fsync(); // flush changelog AFTER all updates from all other streams have been flushed + m_desc_changelog_io.flush(); m_dram_changelog_io.flush(); // the last fsync finalizes the commit m_file.fsync(); // commit to collect future updates correctly - m_sparse_pair.commit(); - return true; + m_sparse_pair_manager.commit(); + m_root_sparse_pair.commit(); + return application_changed; } void BDevStorage::close() - { + { if (m_access_type == AccessType::READ_WRITE) { flush(); } @@ -536,8 +790,10 @@ namespace db0 m_dram_io.close(); m_dram_changelog_io.close(); + m_desc_changelog_io.close(); m_dp_changelog_io.close(); m_meta_io.close(); + m_desc_io.close(); m_file.close(); } @@ -564,7 +820,8 @@ namespace db0 if (!first_block_pos) { return nullptr; } - return std::make_unique(m_file, first_block_pos, m_config.m_block_size, + auto block_size = m_config.m_block_size; + return std::make_unique(m_file, first_block_pos, block_size, getTailFunction(), access_type, dram_page_size); } @@ -573,6 +830,7 @@ namespace db0 // take max from the 4 underlying I/O streams auto result = std::max(m_dram_io.tail(), m_meta_io.tail()); result = std::max(result, 
m_dram_changelog_io.tail()); + result = std::max(result, m_desc_changelog_io.tail()); result = std::max(result, m_dp_changelog_io.tail()); result = std::max(result, m_page_io.tail()); @@ -586,15 +844,46 @@ namespace db0 } Diff_IO BDevStorage::getPage_IO(std::optional next_page_hint, std::uint32_t step_size) - { - auto block_capacity = m_config.m_block_size / m_config.m_page_size; + { + auto descriptor_end_page_num = m_config.m_desc_io_head + + getDesc_IOStride(m_config.m_page_size, m_config.m_descriptor_page_size); + if (!next_page_hint && m_flags[StorageFlagOption::NO_LOAD]) { + next_page_hint = (m_file.size() - CONFIG_BLOCK_SIZE) / m_config.m_page_size; + } + if (!next_page_hint || *next_page_hint < descriptor_end_page_num) { + next_page_hint = descriptor_end_page_num; + } + auto tail_function = getPageIOTailFunction(); + auto block_tail_address = alignStorageAddress(m_file.size(), m_config.m_page_size, CONFIG_BLOCK_SIZE); + auto block_tail_page_num = (block_tail_address - CONFIG_BLOCK_SIZE) / m_config.m_page_size; + if (!next_page_hint || *next_page_hint < block_tail_page_num) { + next_page_hint = block_tail_page_num; + } + auto initial_tail_address = next_page_hint ? 
0 : tail_function(); + return getDiff_IO( + next_page_hint, m_config.m_page_size, step_size, tail_function, initial_tail_address); + } + + RandomIO_Stream BDevStorage::getDesc_IO() + { + return { + m_page_io, m_config.m_desc_io_head, + getDesc_IOStride(m_config.m_page_size, m_config.m_descriptor_page_size), + m_access_type, m_config.m_descriptor_page_size + }; + } + + Diff_IO BDevStorage::getDiff_IO(std::optional next_page_hint, std::uint32_t page_size, + std::uint32_t step_size, std::function tail_function, std::uint64_t initial_tail_address) + { + auto block_capacity = m_config.m_block_size / page_size; std::optional block_num; std::uint64_t address = 0; std::uint32_t page_count = 0; if (next_page_hint) { - auto block_id = (*next_page_hint * m_config.m_page_size) / m_config.m_block_size; + auto block_id = (*next_page_hint * page_size) / m_config.m_block_size; address = CONFIG_BLOCK_SIZE + block_id * m_config.m_block_size; page_count = static_cast(*next_page_hint % block_capacity); @@ -602,30 +891,26 @@ namespace db0 if (page_count == 0) { address -= m_config.m_block_size; page_count = block_capacity; + --block_id; } - } else { - // assign first page - address = std::max(m_dram_io.tail(), m_meta_io.tail()); - address = std::max(address, m_dram_changelog_io.tail()); - address = std::max(address, m_dp_changelog_io.tail()); - if (m_ext_dram_io) { - assert(m_ext_dram_changelog_io); - address = std::max(address, m_ext_dram_io->tail()); - address = std::max(address, m_ext_dram_changelog_io->tail()); - } - - // NOTE: initialize with a known block num = 0 (first block of the first step) - block_num = 0; + block_num = static_cast(block_id % step_size); + } else { + address = alignStorageAddress(initial_tail_address, page_size, CONFIG_BLOCK_SIZE); + // Seed the cursor as a full previous block. The first real write + // will call allocateNextBlock(), which consults tail_function() + // after all BDevStorage streams have been constructed. 
+ page_count = block_capacity; + block_num = step_size - 1; } // NOTE: block num is unknown in this case - return { CONFIG_BLOCK_SIZE, m_file, m_config.m_page_size, m_config.m_block_size, address, page_count, - step_size, getBlockIOTailFunction(), block_num + return { CONFIG_BLOCK_SIZE, m_file, page_size, m_config.m_block_size, address, page_count, + step_size, tail_function, block_num }; } std::uint32_t BDevStorage::getMaxStateNum() const { - return m_sparse_pair.getMaxStateNum(); + return m_root_sparse_pair.getMaxStateNum(); } std::function BDevStorage::getTailFunction() const @@ -635,19 +920,24 @@ namespace db0 }; } - std::function BDevStorage::getBlockIOTailFunction() const + std::uint64_t BDevStorage::blockIOTail() const + { + auto result = std::max(m_dram_io.tail(), m_meta_io.tail()); + result = std::max(result, m_dram_changelog_io.tail()); + result = std::max(result, m_desc_changelog_io.tail()); + result = std::max(result, m_dp_changelog_io.tail()); + if (m_ext_dram_io) { + assert(m_ext_dram_changelog_io); + result = std::max(result, m_ext_dram_io->tail()); + result = std::max(result, m_ext_dram_changelog_io->tail()); + } + return result; + } + + std::function BDevStorage::getPageIOTailFunction() const { - // get tail from BlockIOStreams return [this]() -> std::uint64_t { - auto result = std::max(m_dram_io.tail(), m_meta_io.tail()); - result = std::max(result, m_dram_changelog_io.tail()); - result = std::max(result, m_dp_changelog_io.tail()); - if (m_ext_dram_io) { - assert(m_ext_dram_changelog_io); - result = std::max(result, m_ext_dram_io->tail()); - result = std::max(result, m_ext_dram_changelog_io->tail()); - } - return result; + return blockIOTail(); }; } @@ -657,9 +947,10 @@ namespace db0 THROWF(db0::IOException) << "BDevStorage::refresh allowed only in read-only mode"; } if (!m_refresh_pending) { + // The main DRAM changelog is the transaction marker. 
Descriptor and + // extension changelogs are refreshed after observing a new DRAM + // transaction in completeRefresh(). m_refresh_pending = m_dram_changelog_io.refresh(); - // NOTE: inclusion of ext-space is not necessary here since DRAM changelog - // is sufficient to determine if there're any updates } return m_refresh_pending; } @@ -678,6 +969,7 @@ namespace db0 do { // safe stream positions for rollback on file read failure auto dram_changelog_io_pos = m_dram_changelog_io.getStreamPos(); + auto desc_changelog_io_pos = m_desc_changelog_io.getStreamPos(); std::pair ext_dram_changelog_io_pos; if (!!m_ext_space) { assert(m_ext_dram_changelog_io); @@ -687,6 +979,7 @@ namespace db0 // reverts streams to previous positions auto revert_streams = [&]() { m_dram_changelog_io.setStreamPos(dram_changelog_io_pos); + m_desc_changelog_io.setStreamPos(desc_changelog_io_pos); m_dp_changelog_io.setStreamPos(dp_changelog_io_pos); if (!!m_ext_space) { assert(m_ext_dram_changelog_io); @@ -698,6 +991,7 @@ namespace db0 auto dram_state_num = m_dram_io.beginApplyChanges(m_dram_changelog_io); if (!dram_state_num) { // no updates to process + m_refresh_pending = false; break; } dram_changelog_io_pos = m_dram_changelog_io.getStreamPos(); @@ -717,6 +1011,9 @@ namespace db0 is_consistent &= m_ext_dram_io->completeApplyChanges(*ext_dram_state_num); m_ext_space.refresh(); } + if (!!m_ext_space && (!ext_dram_state_num || *ext_dram_state_num != *dram_state_num)) { + is_consistent = false; + } if (!is_consistent) { // must continue with the refresh until getting a consistent state @@ -724,58 +1021,46 @@ namespace db0 continue; } - // refresh underlying sparse index / diff index after DRAM update - m_sparse_pair.refresh(); - - // this is the state number to sync-up to (which must be identical as dram_state_num) - auto max_state_num = m_sparse_pair.getMaxStateNum(); - if (dram_state_num != max_state_num) { - // NOTE: this critical and irrecoverable error indicates corruption of the DRAM 
changelog stream - THROWF(db0::InternalException) << "Inconsistent state: DRAM changelog state number " - << *dram_state_num << " does not match max known state number " << max_state_num; - } - - // send all page-update notifications to the provided handler - if (on_page_updated) { - StateNumType updated_state_num = 0; - m_dp_changelog_io.refresh(); - // NOTE: readers allow reading the same contents multiple times - auto reader = m_dp_changelog_io.getStreamReader(); - // feed the reader with all available chunks, in case of IOException the stream is getting reverted - // this is to make the operation atomic - while (auto chunk_ptr = reader.readChangeLogChunk()) { - if (chunk_ptr->m_state_num == max_state_num) { - // stop at the max known state number - break; + m_desc_changelog_io.refresh(); + auto desc_changelog_io_end = m_desc_changelog_io.getStreamPos(); + + // Descriptor changelog entries are stored separately from DRAM IO. + // Reload pages after restoring stream position because page reload + // may consult stream tails. 
+ std::vector updated_desc_pages; + m_desc_changelog_io.setStreamPos(desc_changelog_io_pos); + auto desc_state_is_consistent = true; + // NOTE: descriptor pages don't report to the on_page_updated callback + scanChangeLogs(m_desc_changelog_io, + [&](const DRAM_ChangeLogT &change_log) { + if (change_log.m_state_num > *dram_state_num) { + desc_state_is_consistent = false; + return; } - if (chunk_ptr->m_state_num > max_state_num) { - // NOTE: this critical and irrecoverable error indicates corruption of the DP changelog stream - THROWF(db0::InternalException) << "Inconsistent state: DP changelog state number " - << chunk_ptr->m_state_num << " exceeds max known state number " << max_state_num; + for (auto entry: change_log) { + updated_desc_pages.push_back(entry); } } - - // reset to read all updates again - reader.reset(); - for (;;) { - auto dp_change_log_ptr = reader.readChangeLogChunk(); - if (!dp_change_log_ptr || dp_change_log_ptr->m_state_num > max_state_num) { - // end of the stream or the max known state number reached - break; - } - - assert(dp_change_log_ptr->m_state_num != updated_state_num); - updated_state_num = dp_change_log_ptr->m_state_num; - // Elements are logical page numbers (mutated in that transaction) - for (auto page_num: *dp_change_log_ptr) { - on_page_updated(page_num, updated_state_num); - } - } + ); + if (!desc_state_is_consistent) { + m_desc_changelog_io.setStreamPos(desc_changelog_io_pos); + continue; } - + m_desc_changelog_io.setStreamPos(desc_changelog_io_end); + + // Root metadata is part of DRAM IO. Refresh it before applying + // sparse-pair-manager changelog entries so slot detaches see + // the latest MetaSpace allocator state. 
+ m_flags = m_flags & ~StorageFlags { StorageFlagOption::NO_LOAD }; + m_root_sparse_pair.refresh(); + // refresh the updated descriptor pages + m_sparse_pair_manager.refreshPages(updated_desc_pages); + + m_dp_changelog_io.refresh(); } catch (db0::IOException &) { revert_streams(); // NOTE: this may be a temporary problem, refresh needs repeating + m_refresh_pending = false; break; } @@ -807,7 +1092,12 @@ namespace db0 callback("file_bytes_read", file_io_bytes.first); callback("file_bytes_written", file_io_bytes.second); // total size of data pages - callback("dp_size_total", m_sparse_pair.size() * m_page_io.getPageSize()); + std::uint64_t sparse_pair_size = 0; + auto &manager = const_cast(m_sparse_pair_manager); + manager.forCachedPairs([&](Allocator::SlotId, PlainSparsePair &sparse_pair) { + sparse_pair_size += sparse_pair.size(); + }); + callback("dp_size_total", sparse_pair_size * m_page_io.getPageSize()); callback("prefix_size", m_file.size()); auto page_io_stats = m_page_io.getStats(); callback("page_io_total_bytes", page_io_stats.first); @@ -837,12 +1127,9 @@ namespace db0 #endif void BDevStorage::fetchDP_ChangeLogs(StateNumType begin_state, std::optional end_state, - std::function f) const + std::function callback) const { std::unique_lock lock(m_mutex); - if (m_dp_changelog_io.modified()) { - THROWF(db0::IOException) << "BDevStorage::fetchChangeLogs: dp-changelog is modified and needs to be flushed first"; - } auto &dp_changelog_io = const_cast(m_dp_changelog_io); DP_ChangeLogStreamT::State dp_state; dp_changelog_io.saveState(dp_state); @@ -852,7 +1139,7 @@ namespace db0 // try locating the nearest meta-log entry to position the dp-changelog auto meta_log_ptr = m_meta_io.lowerBound(begin_state, buf); if (meta_log_ptr) { - // the 1st meta-item is associated with tha dp_change_log + // the 1st meta-item is associated with the dp_change_log auto &item = *meta_log_ptr->getMetaItems().begin(); dp_changelog_io.setStreamPos(item.m_address, item.m_stream_pos); } 
else { @@ -862,6 +1149,7 @@ namespace db0 } try { + std::map > change_log_pages; for (;;) { auto change_log = dp_changelog_io.readChangeLogChunk(); if (!change_log) { @@ -873,14 +1161,30 @@ namespace db0 // end of the range reached break; } - if (state_num >= begin_state) { - f(*change_log); + if (state_num >= begin_state && change_log->begin() != change_log->end()) { + auto &page_nums = change_log_pages[state_num]; + for (auto page_num: *change_log) { + page_nums.push_back(page_num); + } + } + } + + std::vector buffer; + for (auto &[state_num, page_nums]: change_log_pages) { + if (page_nums.empty()) { + continue; } + std::sort(page_nums.begin(), page_nums.end()); + ChangeLogData data(page_nums, false, false, true); + auto size_of = DP_ChangeLogT::measure(data, state_num, 0); + buffer.resize(size_of); + auto &dp_change_log = DP_ChangeLogT::__new(buffer.data(), data, state_num, 0); + callback(dp_change_log); } - } catch (...) { + } catch (...) { dp_changelog_io.restoreState(dp_state); throw; - } + } dp_changelog_io.restoreState(dp_state); } @@ -902,42 +1206,70 @@ namespace db0 m_file.fsync(); } + void loadRootSparsePairForNoLoadCopy(DRAM_IOStream &dram_io, + BDevStorage::DRAM_ChangeLogStreamT &dram_changelog_io, SparsePair &root_sparse_pair, + AccessType access_type, StorageFlags flags) + { + dram_changelog_io.setStreamPosHead(); + dram_io.setStreamPosHead(); + dram_io.load(dram_changelog_io); + root_sparse_pair.refresh(); + } + void BDevStorage::copyTo(BDevStorage &out) { if (!out.m_ext_space) { THROWF(db0::IOException) << "BDevStorage::copyTo: destination storage must have ext-space initialized"; } + if (m_flags[StorageFlagOption::NO_LOAD]) { + auto dram_changelog_io = getChangeLogIOStream( + m_config.m_dram_changelog_io_offset, m_access_type); + auto dram_io = getDRAMIOStream( + m_config.m_dram_io_offset, m_config.m_dram_page_size, m_access_type); + loadRootSparsePairForNoLoadCopy( + dram_io, dram_changelog_io, m_root_sparse_pair, m_access_type, m_flags); + } + 
auto copy_state_num = m_root_sparse_pair.getMaxStateNum(); auto writer = out.m_dram_changelog_io.getStreamWriter(); - auto maybe_max_state_num = copyDRAM_IO(m_dram_io, m_dram_changelog_io, out.m_dram_io, writer); + auto maybe_max_state_num = copyDRAM_IO( + m_dram_io, m_dram_changelog_io, out.m_dram_io, writer, copy_state_num); if (!maybe_max_state_num) { // nothing to copy return; } auto max_state_num = *maybe_max_state_num; + auto src_page_tail = getNextStoragePageNum(); // copy up to the max_state_num (inclusive) - auto dp_header = copyDPStream(m_dp_changelog_io, out.m_dp_changelog_io, max_state_num); - if (!dp_header) { - THROWF(db0::IOException) << "BDevStorage::copyTo: failed to copy DP changelog"; - } + auto dp_header = copyDPStream(m_dp_changelog_io, out.m_dp_changelog_io, max_state_num); + writer.flush(); - // assure copied streams are consistent - if (dp_header->m_state_num != max_state_num) { - THROWF(db0::IOException) - << "BDevStorage::copyTo: inconsistent max_state_num in DP changelog: " - << (StateNumType)(dp_header->m_state_num) << " != " << max_state_num; + out.m_dram_changelog_io.setStreamPosHead(); + out.m_dram_io.setStreamPosHead(); + out.m_dram_io.load(out.m_dram_changelog_io, max_state_num); + out.m_root_sparse_pair.refresh(); + + out.m_ext_space.refresh(); + out.m_ext_space.clearMappings(); + + out.m_page_io.setAtTail(); + std::uint64_t end_page_num = 0; + if (dp_header) { + end_page_num = dp_header->m_end_storage_page_num; } - std::uint64_t end_page_num = dp_header->m_end_storage_page_num; - // NOTE: end_page_num may be relative, need to translate to absolute - if (!!m_ext_space) { + if (!!m_ext_space && end_page_num != 0) { end_page_num = m_ext_space.getAbsolute(end_page_num); } + if (src_page_tail) { + end_page_num = std::max(end_page_num, *src_page_tail); + } copyPageIO(m_page_io, m_ext_space, out.m_page_io, end_page_num, out.m_ext_space); - + // NOTE: meta_is stream can't be copied since it's structure depends on the managed streams // 
NOTE: for simplicity we don't generate the entire meta-io, just save the last checkpoint out.m_meta_io.checkAndAppend(max_state_num); + out.m_meta_io.flush(); // flush ext-space only, the other streams are already flushed by copy operators // NOTE: we need to use max state num from the source storage since the desination @@ -955,13 +1287,7 @@ namespace db0 std::optional BDevStorage::getNextStoragePageNum() const { - // NOTE: in no-load mode we cannot use sparse_pair - // therefore will calculate end page bound from the file size (absolute page number) - if (m_flags[StorageOptions::NO_LOAD]) { - return (m_file.size() - CONFIG_BLOCK_SIZE) / m_config.m_page_size; - } - - auto page_io_id = m_sparse_pair.getNextStoragePageNum(); + auto page_io_id = m_root_sparse_pair.getNextStoragePageNum(); if (!!m_ext_space && page_io_id) { // convert to absolute page number page_io_id = m_ext_space.getAbsolute(*page_io_id); @@ -975,8 +1301,9 @@ namespace db0 // no synchronization required in read-only mode return std::nullopt; } - // synchronize to the same state as the DRAM IO - return getMaxStateNum(); + // Synchronize ext-space to the root metadata state. DP changelog entries + // may be absent when a transaction only updates sparse-pair metadata. 
+ return m_root_sparse_pair.getMaxStateNum(); } -} \ No newline at end of file +} diff --git a/src/dbzero/core/storage/BDevStorage.hpp b/src/dbzero/core/storage/BDevStorage.hpp index 94a1cac9..a39464de 100644 --- a/src/dbzero/core/storage/BDevStorage.hpp +++ b/src/dbzero/core/storage/BDevStorage.hpp @@ -11,9 +11,13 @@ #include "BlockIOStream.hpp" #include "Page_IO.hpp" #include "Diff_IO.hpp" +#include "RandomIO_Stream.hpp" +#include "StorageOptions.hpp" #include -#include +#include +#include #include +#include #include "BaseStorage.hpp" #include "DRAM_IOStream.hpp" #include "ChangeLogIOStream.hpp" @@ -23,6 +27,7 @@ #include #include "ExtSpace.hpp" #include "MemBaseStorage.hpp" +#include "SparsePairManager.hpp" namespace db0 @@ -35,9 +40,10 @@ DB0_PACKED_BEGIN { // magic number for the .db0 file static constexpr std::uint64_t DB0_MAGIC = 0x0DB0DB0DB0DB0DB0; + static constexpr std::uint32_t DB0_VERSION = 2; std::uint64_t m_magic = DB0_MAGIC; - std::uint32_t m_version = 1; + std::uint32_t m_version = DB0_VERSION; std::uint32_t m_block_size; // the prefix page size std::uint32_t m_page_size; @@ -51,14 +57,18 @@ DB0_PACKED_BEGIN // a a single indivisible "step". 
// This value (entire step) corresponts to a single entry in the REL_Index (if it's used) std::uint32_t m_page_io_step_size; + std::uint32_t m_descriptor_page_size = 0; + std::uint64_t m_desc_io_head = 0; std::uint64_t m_ext_dram_io_offset = 0; std::uint32_t m_ext_dram_page_size = 0; std::uint64_t m_ext_dram_changelog_io_offset = 0; + std::uint64_t m_desc_changelog_io_offset = 0; // reserved for future use (0-filled) - std::array m_reserved; + std::array m_reserved; o_prefix_config(std::uint32_t block_size, std::uint32_t page_size, std::uint32_t dram_page_size, - std::uint32_t page_io_step_size); + std::uint32_t page_io_step_size, std::uint32_t descriptor_page_size, + std::uint64_t desc_io_head = 0); }; DB0_PACKED_END @@ -79,7 +89,7 @@ DB0_PACKED_END * @param meta_io_step_size - the size of the step in the MetaIOStream (16MB by default) */ BDevStorage(const std::string &file_name, AccessType = AccessType::READ_WRITE, LockFlags lock_flags = {}, - std::optional meta_io_step_size = {}, StorageFlags = {}); + std::optional meta_io_step_size = {}, StorageFlags = {}, StorageOptions = {}); ~BDevStorage(); /** @@ -87,7 +97,8 @@ DB0_PACKED_END * @param step_size_hint defines requested Page IO step size in bytes */ static void create(const std::string &file_name, std::optional page_size = {}, - std::uint32_t dram_page_size_hint = (16u << 10) - 256, std::optional step_size_hint = {}); + std::uint32_t dram_page_size_hint = (16u << 10) - 256, std::optional step_size_hint = {}, + std::optional descriptor_page_size = {}); void read(std::uint64_t address, StateNumType state_num, std::size_t size, void *buffer, FlagSet = { AccessOptions::read, AccessOptions::write }) const override; @@ -117,6 +128,7 @@ DB0_PACKED_END std::size_t getPageSize() const override; std::size_t getDRAMPageSize() const; + std::size_t getDescriptorPageSize() const; StateNumType getMaxStateNum() const override; @@ -166,11 +178,13 @@ DB0_PACKED_END // all prefix configuration must fit into this block static 
constexpr unsigned int CONFIG_BLOCK_SIZE = 4096; CFile m_file; - const o_prefix_config m_config; + o_prefix_config m_config; // DRAM-changelog stream stores the sequence of updates to DRAM pages // DRAM-changelog must be initialized before DRAM_IOStream DRAM_ChangeLogStreamT m_dram_changelog_io; + // Descriptor-IO pages change log + DRAM_ChangeLogStreamT m_desc_changelog_io; // data-page change log, each chunk corresponds to a separate data transaction // holds logical data page numbers mutated in that transaction DP_ChangeLogStreamT m_dp_changelog_io; @@ -178,17 +192,21 @@ DB0_PACKED_END MetaIOStream m_meta_io; // memory-mapped file I/O DRAM_IOStream m_dram_io; - // SparseIndex + DiffIndex (based over the dram_io) - SparsePair m_sparse_pair; - // DRAM-backed sparse index tree - SparseIndex &m_sparse_index; - DiffIndex &m_diff_index; + // Root SparsePair maps MS_MetaSpace's own metadata pages. + SparsePair m_root_sparse_pair; // extension DRAM IO (only initialized when holding extension indexes e.g. REL_Index) std::unique_ptr m_ext_dram_changelog_io; std::unique_ptr m_ext_dram_io; ExtSpace m_ext_space; + StorageOptions m_options; // the stream for storing & reading full-DPs and diff-encoded DPs Diff_IO m_page_io; + // the stream for descriptor-backed metadata, stored on top of m_page_io + // this is not a separate stream, rather a view over m_page_io + RandomIO_Stream m_desc_io; + // Multi-slot metadata space hosts application data-page sparse pairs. 
+ MS_MetaSpace m_meta_space; + SparsePairManager m_sparse_pair_manager; #ifndef NDEBUG MemBaseStorage m_data_mirror; #endif @@ -233,8 +251,9 @@ DB0_PACKED_END std::unique_ptr tryGetChangeLogIOStream(std::uint64_t first_block_pos, AccessType access_type) { if (first_block_pos) { + auto block_size = m_config.m_block_size; return std::make_unique( - m_file, first_block_pos, m_config.m_block_size, getTailFunction(), access_type + m_file, first_block_pos, block_size, getTailFunction(), access_type ); } else { // stream does not exist @@ -244,9 +263,16 @@ DB0_PACKED_END MetaIOStream getMetaIOStream(std::uint64_t first_block_pos, std::size_t step_size, AccessType); - Diff_IO getPage_IO(std::optional next_page_hint, std::uint32_t step_size); + Diff_IO getPage_IO(std::optional next_page_hint, std::uint32_t step_size); + Diff_IO getDiff_IO(std::optional next_page_hint, std::uint32_t page_size, + std::uint32_t step_size, std::function tail_function, + std::uint64_t initial_tail_address); + // Create the descriptor stream on top of the page I/O stream + RandomIO_Stream getDesc_IO(); o_prefix_config readConfig() const; + + Allocator::SlotId getMetaSlotId(std::uint64_t page_num) const; /** * Get the first available address (i.e. 
end of the file) @@ -255,7 +281,9 @@ DB0_PACKED_END std::function getTailFunction() const; - std::function getBlockIOTailFunction() const; + std::uint64_t blockIOTail() const; + + std::function getPageIOTailFunction() const; // non-virtual version of tryFindMutation bool tryFindMutationImpl(std::uint64_t page_num, StateNumType state_num, @@ -273,4 +301,4 @@ DB0_PACKED_END std::optional getMaxExtStateNum() const; }; -} \ No newline at end of file +} diff --git a/src/dbzero/core/storage/BaseStorage.hpp b/src/dbzero/core/storage/BaseStorage.hpp index 05e1fcb0..978375fa 100644 --- a/src/dbzero/core/storage/BaseStorage.hpp +++ b/src/dbzero/core/storage/BaseStorage.hpp @@ -128,12 +128,13 @@ namespace db0 virtual void endCommit(); // Retrieve the complete change log (i.e. DP updates) for each transaction from the given range + // this function is required for change capture API // @param begin_state the first state number to be included in the change log // @param end_state the first state number past the last state number to be included // in the change log (or up to the last state number if not specified) - // @param f function to be called for each transaction's change log + // @param callback function to be called for each transaction's change log virtual void fetchDP_ChangeLogs(StateNumType begin_state, std::optional end_state, - std::function f) const; + std::function callback) const; // Throws where this conversion is not possible virtual BDevStorage &asFile(); diff --git a/src/dbzero/core/storage/BlockIOStream.cpp b/src/dbzero/core/storage/BlockIOStream.cpp index 37a26248..3807f0a4 100644 --- a/src/dbzero/core/storage/BlockIOStream.cpp +++ b/src/dbzero/core/storage/BlockIOStream.cpp @@ -224,6 +224,8 @@ namespace db0 // end-of-stream reached, need to call refresh to be able data appended in meantime return 0; } + State state; + saveState(state); o_block_io_chunk_header chunk_header; if (!peek(&chunk_header, o_block_io_chunk_header::sizeOf(), address)) { // end of 
stream (maybe process crashed when flushing?) @@ -236,14 +238,19 @@ namespace db0 return 0; } if (expected_size && chunk_header.m_chunk_size != expected_size) { - THROWF(db0::InternalException) << "BlockIOStream::readChunk: chunk size mismatch"; + THROWF(db0::IOException) << "Unexpected chunk size"; + } + if (!skip(chunk_header.sizeOf())) { + restoreState(state); + m_eos = true; + return 0; } - skip(chunk_header.sizeOf()); assert(chunk_header.isValid()); if (buffer.size() < chunk_header.m_chunk_size) { buffer.resize(chunk_header.m_chunk_size); } if (!read(buffer.data(), chunk_header.m_chunk_size)) { + restoreState(state); m_eos = true; return 0; } @@ -351,6 +358,22 @@ namespace db0 // contents might've changed without file size change m_file.refresh(); + if (m_block_header.hasNext()) { + auto next_block_address = m_block_header.m_next_block_address; + if (m_file.size() >= next_block_address + m_block_size) { + std::vector buffer(m_block_size); + if (readBlock(next_block_address, buffer.data())) { + m_eos = false; + if (m_block_pos == m_block_end) { + memcpy(m_block_begin, buffer.data(), buffer.size()); + m_address = next_block_address; + m_block_pos = m_block_begin; + ++m_block_num; + } + return true; + } + } + } if (m_address + m_block_size <= m_file.size()) { std::vector buffer(m_block_size); @@ -556,7 +579,10 @@ namespace db0 m_chunk_left_bytes = 0; // try reading a full block if ((m_address + m_block_size > m_file.size()) || !readBlock(m_address, m_block_begin)) { - THROWF(db0::InternalException) << "BlockIOStream unable to restore state"; + m_file.refresh(); + if ((m_address + m_block_size > m_file.size()) || !readBlock(m_address, m_block_begin)) { + THROWF(db0::IOException) << "BlockIOStream unable to restore state"; + } } } @@ -564,4 +590,4 @@ namespace db0 THROWF(db0::InternalException) << "BlockIOStream::readChunk() operation not supported" << THROWF_END; } -} \ No newline at end of file +} diff --git a/src/dbzero/core/storage/CFile.cpp 
b/src/dbzero/core/storage/CFile.cpp index 09141918..d8e7f43a 100644 --- a/src/dbzero/core/storage/CFile.cpp +++ b/src/dbzero/core/storage/CFile.cpp @@ -213,7 +213,9 @@ namespace db0 assert(m_file_pos == (std::uint64_t)ftell(m_file)); assert(!overlap(m_protected, { address, size })); if (fwrite(buffer, size, 1, m_file) != 1) { - THROWF(db0::IOException) << "CFile::write: fwrite failed"; + int err = errno; + THROWF(db0::IOException) << "CFile::write: fwrite failed at address " << address + << ", size " << size << ", error: " << strerror(err); } m_file_pos += size; m_file_size = std::max(m_file_size, m_file_pos); @@ -275,4 +277,4 @@ namespace db0 } #endif -} \ No newline at end of file +} diff --git a/src/dbzero/core/storage/ChangeLogTypes.hpp b/src/dbzero/core/storage/ChangeLogTypes.hpp index b210d8aa..27e20362 100644 --- a/src/dbzero/core/storage/ChangeLogTypes.hpp +++ b/src/dbzero/core/storage/ChangeLogTypes.hpp @@ -32,14 +32,13 @@ DB0_PACKED_BEGIN struct DB0_PACKED_ATTR o_dram_changelog_header: o_fixed { // state number this change log corresponds to - StateNumType m_state_num; - // reserved for future use + StateNumType m_state_num; std::array m_reserved = { 0, 0 }; o_dram_changelog_header(StateNumType state_num) : m_state_num(state_num) { - } + } }; DB0_PACKED_END diff --git a/src/dbzero/core/storage/DRAM_IOStream.cpp b/src/dbzero/core/storage/DRAM_IOStream.cpp index 48e70651..13853901 100644 --- a/src/dbzero/core/storage/DRAM_IOStream.cpp +++ b/src/dbzero/core/storage/DRAM_IOStream.cpp @@ -114,23 +114,30 @@ namespace db0 void DRAM_IOStream::load(DRAM_ChangeLogStreamT &changelog_io, std::optional max_state_num) { - // Exhaust the change-log stream first and retrieve the last valid state number + m_prefix->close(); + m_allocator->reset(); + m_reusable_chunks.clear(); + m_page_map.clear(); + + // Exhaust the change-log stream first and retrieve the last valid DRAM IO state number. 
// its position marks the synchronization point - while (changelog_io.readChangeLogChunk()); + std::optional last_dram_state_num; + while (auto change_log_ptr = changelog_io.readChangeLogChunk()) { + last_dram_state_num = change_log_ptr->m_state_num; + } std::vector buffer(m_chunk_size, 0); const auto &header = o_dram_chunk_header::__ref(buffer.data()); auto bytes = buffer.data() + header.sizeOf(); - auto last_chunk_ptr = changelog_io.getLastChangeLogChunk(); - if (!last_chunk_ptr) { + if (!last_dram_state_num) { // no data to load return; } // The last known consistent state number (unless explicitly provided) if (!max_state_num) { - max_state_num = last_chunk_ptr->m_state_num; + max_state_num = *last_dram_state_num; } std::unordered_set allocs; for (;;) { @@ -191,23 +198,12 @@ namespace db0 auto &reusable_header = o_dram_chunk_header::__new(buffer, state_num); buffer += reusable_header.sizeOf(); - std::unordered_set last_changelog; - if (dram_changelog_io.getLastChangeLogChunk()) { - for (auto addr: *dram_changelog_io.getLastChangeLogChunk()) { - last_changelog.insert(addr); - } - } - - // Finds reusable block, note that blocks from the last change log are not reused - // otherwise the reader process might not be able to access the last transaction + // Do not overwrite old DRAM chunks while publishing a new transaction. + // A reader may deterministically select the previous DRAM changelog state + // while the writer has already flushed newer DRAM pages but has not yet + // finalized the newer changelog. Reusing chunks can destroy pages needed + // by that previous root state and make the root sparse pair open at 0. 
auto find_reusable = [&, this]() -> std::optional { - for (auto it = m_reusable_chunks.begin(); it != m_reusable_chunks.end(); ++it) { - if (last_changelog.find(*it) == last_changelog.end()) { - auto result = *it; - m_reusable_chunks.erase(it); - return result; - } - } return std::nullopt; }; @@ -508,4 +504,4 @@ namespace db0 return m_state_num == 0 && m_page_num == 0 && m_hash == 0; } -} \ No newline at end of file +} diff --git a/src/dbzero/core/storage/DiffIndex.cpp b/src/dbzero/core/storage/DiffIndex.cpp index 80b09be1..8ed289f5 100644 --- a/src/dbzero/core/storage/DiffIndex.cpp +++ b/src/dbzero/core/storage/DiffIndex.cpp @@ -132,21 +132,20 @@ namespace db0 void DI_CompressedItem::append(std::uint32_t state_num, std::uint64_t storage_page_num) { DiffArrayT::__ref(m_diff_data.data()).emplaceBack(state_num, storage_page_num); } - - DiffIndex::DiffIndex(std::size_t node_size, std::vector *change_log_ptr) - : SparseIndexBase(node_size, change_log_ptr) - { - } DiffIndex::DiffIndex(DRAM_Pair dram_pair, AccessType access_type, Address address, - std::vector *change_log_ptr, StorageFlags flags) - : SparseIndexBase(dram_pair, access_type, address, change_log_ptr, flags) + std::vector *change_log_ptr, StorageFlags flags, SlotId slot_num, + bool encode_change_log_entries) + : SparseIndexBase(dram_pair, access_type, address, change_log_ptr, flags, slot_num) { + (void)encode_change_log_entries; } - DiffIndex::DiffIndex(tag_create, DRAM_Pair dram_pair, std::vector *change_log_ptr) - : SparseIndexBase(typename super_t::tag_create{}, dram_pair, change_log_ptr) + DiffIndex::DiffIndex(tag_create, DRAM_Pair dram_pair, std::vector *change_log_ptr, + Allocator::SlotId slot_num, bool encode_change_log_entries) + : SparseIndexBase(typename super_t::tag_create{}, dram_pair, change_log_ptr, slot_num) { + (void)encode_change_log_entries; } bool DiffIndex::empty() const { @@ -156,6 +155,36 @@ namespace db0 std::size_t DiffIndex::size() const { return super_t::size(); } + + void 
DiffIndex::refresh() { + super_t::refresh(); + } + + void DiffIndex::detach() const { + super_t::detach(); + } + + void DiffIndex::commit() const { + super_t::commit(); + } + + bool DiffIndex::operator!() const { + return super_t::operator!(); + } + + Address DiffIndex::getIndexAddress() const { + return super_t::getIndexAddress(); + } + + void DiffIndex::clear() { + super_t::clear(); + } + + void DiffIndex::forPageRange(PageNumT first_page_num, PageNumT last_page_num, + std::function callback) const + { + super_t::forPageRange(first_page_num, last_page_num, std::move(callback)); + } void DiffIndex::insert(PageNumT page_num, StateNumT state_num, PageNumT storage_page_num, bool overflow) { @@ -167,15 +196,10 @@ namespace db0 if (item_ptr && node->header().getPageNum(*item_ptr) == page_num && item_ptr->beginAppend(relative_state_num, relative_storage_page_num)) { // NOTE: relative_state_num & relative_storage_page_num get converted from absolute to relative values db0::modifyMember(node, *item_ptr).append(relative_state_num, relative_storage_page_num); - // collect the change-log - this->update(page_num, state_num, storage_page_num + (overflow ? 
1 : 0)); + this->recordChange(page_num); } else { // create new item (with no history of updates) super_t::emplace(page_num, state_num, storage_page_num); - // we also need to account for the overflow - if (overflow) { - this->update(storage_page_num + 1); - } } } @@ -190,34 +214,6 @@ namespace db0 } return super_t::findUpper(page_num, state_num); } - - Address DiffIndex::getIndexAddress() const { - return super_t::getIndexAddress(); - } - - std::optional DiffIndex::getNextStoragePageNum() const { - return super_t::getNextStoragePageNum(); - } - - typename DiffIndex::StateNumT DiffIndex::getMaxStateNum() const { - return super_t::getMaxStateNum(); - } - - void DiffIndex::refresh() { - super_t::refresh(); - } - - void DiffIndex::reopen(Address address) { - super_t::reopen(address); - } - - bool DiffIndex::isOpen() const { - return super_t::isOpen(); - } - - void DiffIndex::commit() { - super_t::commit(); - } DiffIndex::StateNumT DiffIndex::findLower(PageNumT page_num, StateNumT state_num) const { diff --git a/src/dbzero/core/storage/DiffIndex.hpp b/src/dbzero/core/storage/DiffIndex.hpp index bb42d22b..7856ddc6 100644 --- a/src/dbzero/core/storage/DiffIndex.hpp +++ b/src/dbzero/core/storage/DiffIndex.hpp @@ -4,6 +4,7 @@ #pragma once #include +#include #include #include #include "SparseIndex.hpp" @@ -115,18 +116,20 @@ DB0_PACKED_BEGIN }; DB0_PACKED_END - class DiffIndex: protected SparseIndexBase + class DiffIndex: protected SparseIndexBase { public: - using super_t = SparseIndexBase; + using super_t = SparseIndexBase; using PageNumT = typename super_t::PageNumT; using StateNumT = typename super_t::StateNumT; + using SlotId = typename super_t::SlotId; - DiffIndex(std::size_t node_size, std::vector *change_log_ptr = nullptr); - DiffIndex(DRAM_Pair, AccessType, Address, std::vector *change_log_ptr = nullptr, StorageFlags = {}); + DiffIndex(DRAM_Pair, AccessType, Address, std::vector *change_log_ptr = nullptr, StorageFlags = {}, + SlotId slot_num = 0, bool 
encode_change_log_entries = false); struct tag_create {}; - DiffIndex(tag_create, DRAM_Pair, std::vector *change_log_ptr = nullptr); + DiffIndex(tag_create, DRAM_Pair, std::vector *change_log_ptr = nullptr, + SlotId slot_num = 0, bool encode_change_log_entries = false); // Either insert into a new item or extend the existing one // @param overflow flag indicating if the stored page has @@ -135,25 +138,36 @@ DB0_PACKED_END bool empty() const; std::size_t size() const; - - // Find mutation of page_num where state >= state_num - DI_Item findUpper(PageNumT page_num, StateNumT state_num) const; - // Find mutation ID of page_num where state <= state_num - StateNumT findLower(PageNumT page_num, StateNumT state_num) const; - - std::optional getNextStoragePageNum() const; - - StateNumT getMaxStateNum() const; + // SparseIndexBase is a protected implementation detail; republish only + // the operations SparsePair needs to manage the paired index lifecycle. + void refresh(); + void detach() const; + void commit() const; + bool operator!() const; + Address getIndexAddress() const; - void commit(); - - void refresh(); + /** + * Erase all diff descriptors while preserving tree-header mix-in data. 
+ */ + void clear(); + + void forPageRange(PageNumT first_page_num, PageNumT last_page_num, + std::function callback) const; - void reopen(Address); + auto sortedBeginFrom(const DI_Item &first) const { + return super_t::sortedBeginFrom(first); + } - bool isOpen() const; + auto sortedBegin() const { + return super_t::sortedBegin(); + } + + // Find mutation of page_num where state >= state_num + DI_Item findUpper(PageNumT page_num, StateNumT state_num) const; + // Find mutation ID of page_num where state <= state_num + StateNumT findLower(PageNumT page_num, StateNumT state_num) const; }; } diff --git a/src/dbzero/core/storage/Diff_IO.cpp b/src/dbzero/core/storage/Diff_IO.cpp index 644a8b15..98dd07be 100644 --- a/src/dbzero/core/storage/Diff_IO.cpp +++ b/src/dbzero/core/storage/Diff_IO.cpp @@ -2,250 +2,28 @@ // Copyright (c) 2025 DBZero Software sp. z o.o. #include "Diff_IO.hpp" -#include #include -#include -#include namespace db0 { - -DB0_PACKED_BEGIN - struct DB0_PACKED_ATTR o_diff_header: public o_fixed - { - // the number of objects contained - std::uint16_t m_size = 0; - // offset of the first valid object - // (bytes before offset can be taken by remnants of the object from the previous page) - std::uint16_t m_offset = 0; - }; -DB0_PACKED_END - - class DiffWriter - { - public: - // buffer is 2 pages long - DiffWriter(Page_IO &, std::byte *begin, std::byte *end); - - // Append as o_diff_buffer object, if overflow occurs then - // remainig contents needs to be written to the next (+1) storage page - // @return false if append unsuccessful (must be appended to next page) - bool append(const std::byte *dp_data, std::pair page_and_state, - const std::vector &diff_data, bool &overflow); - - // Flush all buffered contents - // @return the number of bytes written - std::size_t flush(); - - // Flush current page with the Page_IO and handle overflow data if such exists - // only flushed if there's been contents written - // @return the number of bytes written - 
std::size_t flushDP(); - - // Revert the last append operation - void revert(); - - // check if a full-page worth of data has been written - bool isFull() const; - - bool empty() const; - - private: - Page_IO &m_page_io; - std::byte * const m_begin; - std::byte *m_current; - std::byte const *m_end; - const std::uint32_t m_page_size; - // current page's header - o_diff_header &m_header; - std::uint32_t m_last_size = 0; - }; - - class DiffReader - { - public: - // buffer is 2 pages long - DiffReader(Page_IO &, std::uint64_t page_num, std::byte *begin, std::byte *end); - - // appy diffs from a specific page / state number into a provided data buffer - // if underflow occurs then next page needs to be fetched and apply repeated - bool apply(std::byte *dp_data, std::pair page_and_state, - bool &underflow); - - // Load continued data from the next page - void loadNext(); - - private: - Page_IO &m_page_io; - const std::uint32_t m_page_size; - const std::uint64_t m_page_num; - std::byte * const m_begin; - const std::byte *m_current; - std::byte const *m_end; - // the number of objects remaining to be read - unsigned int m_size = 0; - }; - - DiffWriter::DiffWriter(Page_IO &page_io, std::byte *begin, std::byte *end) - : m_page_io(page_io) - , m_begin(begin) - , m_current(begin) - , m_end(end) - , m_page_size(page_io.getPageSize()) - , m_header(o_diff_header::__new(m_current)) - { - m_current += m_header.sizeOf(); - } - - bool DiffWriter::append(const std::byte *dp_data, std::pair page_and_state, - const std::vector &diff_data, bool &overflow) - { - using PairT = o_packed_int_pair; - assert(m_current + o_diff_buffer::measure(dp_data, diff_data) + PairT::measure(page_and_state) <= m_end); - auto begin = m_current; - PairT::write(m_current, page_and_state); - if (m_current + o_diff_buffer::sizeOfHeader() > m_begin + m_page_size) { - // unable to fit headers onto current page, revert - m_current = begin; - return false; - } - auto &diff_buf = o_diff_buffer::__new(m_current, 
dp_data, diff_data); - m_current += diff_buf.sizeOf(); - assert(m_current <= m_end); - m_last_size = m_current - begin; - ++m_header.m_size; - // overflows a single DP - overflow = m_current > (m_begin + m_page_size); - return true; - } - - std::size_t DiffWriter::flush() - { - std::size_t result = 0; - while (!empty()) { - result += flushDP(); - } - return result; - } - - std::size_t DiffWriter::flushDP() - { - if (empty()) { - return 0; - } - - m_page_io.append(m_begin); - m_header.m_size = 0; - // handle overflowed contents if such exists - if (m_current > (m_begin + m_page_size)) { - // offset is equal number of overflowed bytes - m_header.m_offset = m_current - m_begin - m_page_size; - m_current = m_begin + m_header.sizeOf(); - std::memcpy(m_current, m_begin + m_page_size, m_header.m_offset); - m_current += m_header.m_offset; - } else { - m_header.m_offset = 0; - m_current = m_begin + m_header.sizeOf(); - } - return m_page_size; - } - - void DiffWriter::revert() - { - assert(m_header.m_size > 0); - assert(m_current - m_last_size >= m_begin); - --m_header.m_size; - m_current -= m_last_size; - } - - bool DiffWriter::isFull() const { - return m_current >= (m_begin + m_page_size); - } - - bool DiffWriter::empty() const { - return m_header.m_size == 0 && m_header.m_offset == 0; - } - - DiffReader::DiffReader(Page_IO &page_io, std::uint64_t page_num, std::byte *begin, std::byte *end) - : m_page_io(page_io) - , m_page_size(page_io.getPageSize()) - , m_page_num(page_num) - , m_begin(begin) - , m_current(begin + m_page_size) - , m_end(end) - { - page_io.read(page_num, m_begin + m_page_size); - m_size = o_diff_header::__const_ref(m_current).m_size; - // position at the first diff block - m_current += o_diff_header::sizeOf() + o_diff_header::__const_ref(m_current).m_offset; - if (m_current > m_end) { - Settings::m_decode_error(); - } - } - - bool DiffReader::apply(std::byte *dp_data, std::pair page_and_state, - bool &underflow) - { - using PairT = o_packed_int_pair; - 
while (m_size > 0) { - auto revert_to = m_current; - auto revert_to_size = m_size; - auto next_page_and_state = PairT::read(m_current); - auto diff_buf_size = o_diff_buffer::safeSizeOf(m_current); - if (next_page_and_state == page_and_state) { - if (m_current + diff_buf_size > m_end) { - m_current = revert_to; - m_size = revert_to_size; - // need to handle the underflow - underflow = true; - return false; - } - - auto &diff_buf = o_diff_buffer::__safe_const_ref( - const_bounded_buf_t(Settings::m_decode_error, m_current, m_end) - ); - diff_buf.apply(dp_data, dp_data + m_page_size); - m_current += diff_buf_size; - --m_size; - return true; - } - m_current += diff_buf_size; - --m_size; - } - // unable to locate the diff block - return false; - } - - void DiffReader::loadNext() - { - assert(m_current >= (m_begin + m_page_size)); - // move underflown contents - auto offset = m_current - (m_begin + m_page_size); - auto size = m_end - m_current; - std::memcpy(m_begin + offset, m_current, size); - m_current = m_begin + offset; - // read the next page - m_page_io.read(m_page_num + 1, m_begin + m_page_size); - // and merge neighboring parts of the diff block (note that header gets overwritten) - std::memmove((void*)(m_current + o_diff_header::sizeOf()), m_current, size); - m_current += o_diff_header::sizeOf(); - } Diff_IO::Diff_IO(std::size_t header_size, CFile &file, std::uint32_t page_size, std::uint32_t block_size, std::uint64_t address, std::uint32_t page_count, std::uint32_t step_size, std::function tail_function, std::optional block_num) : Page_IO(header_size, file, page_size, block_size, address, page_count, step_size, tail_function, block_num) + , m_codec_access(reinterpret_cast(*this)) , m_write_buf(page_size * 2) , m_read_buf(page_size * 2) - , m_writer(std::make_unique( - reinterpret_cast(*this), m_write_buf.data(), m_write_buf.data() + m_write_buf.size()) + , m_writer(std::make_unique>( + m_codec_access, m_write_buf.data(), m_write_buf.data() + m_write_buf.size()) 
) { } Diff_IO::Diff_IO(std::size_t header_size, CFile &file, std::uint32_t page_size) : Page_IO(header_size, file, page_size) + , m_codec_access(reinterpret_cast(*this)) , m_read_buf(page_size * 2) { } @@ -261,39 +39,10 @@ DB0_PACKED_END // must lock because the write-buffer is shared std::unique_lock lock(m_mx_write); assert(m_writer); - for (;;) { - if (m_writer->isFull()) { - m_diff_bytes_written += m_writer->flushDP(); - } - bool overflow = false; - auto next_page_num = Page_IO::getNextPageNum(is_first_page); - assert(next_page_num.second > 0); - if (is_first_page) { - // Must be first write into the first page (of the step) - // to report result as the is_first_page = true - *is_first_page &= m_writer->empty(); - } - if (m_writer->append((const std::byte*)dp_data, page_and_state, diff_data, overflow)) { - if (overflow) { - // on overflow we can either append remnants to the next storage page (+1) - // if such is available or revert the append and try again with a fresh buffer - if (next_page_num.second > 1) { - // flush with the Page_IO - m_diff_bytes_written += m_writer->flushDP(); - } else { - m_writer->revert(); - m_diff_bytes_written += m_writer->flushDP(); - // continue with a fresh buffer - continue; - } - } - return { next_page_num.first, overflow }; - } else { - // continue with a fresh buffer - m_diff_bytes_written += m_writer->flushDP(); - continue; - } - } + auto result = detail::appendDiff(m_codec_access, *m_writer, dp_data, page_and_state, diff_data, + is_first_page, &m_diff_bytes_written); + m_modified = true; + return result; } void Diff_IO::applyFrom(std::uint64_t page_num, void *buffer, @@ -301,19 +50,8 @@ DB0_PACKED_END { // must lock because the read-buffer is shared std::unique_lock lock(m_mx_read); - DiffReader reader((Page_IO&)*this, page_num, m_read_buf.data(), m_read_buf.data() + m_read_buf.size()); - for (;;) { - bool underflow = false; - if (reader.apply((std::byte*)buffer, page_and_state, underflow)) { - return; - } - if (underflow) 
{ - // repeat after fetching the next page - reader.loadNext(); - continue; - } - THROWF(db0::InternalException) << "Diff block not found"; - } + detail::applyFrom(m_codec_access, page_num, buffer, page_and_state, "Diff block not found", + m_read_buf.data(), m_read_buf.data() + m_read_buf.size()); } void Diff_IO::flush() @@ -322,6 +60,12 @@ DB0_PACKED_END if (m_writer) { m_diff_bytes_written += m_writer->flush(); } + m_modified = false; + } + + bool Diff_IO::modified() const + { + return m_modified; } void Diff_IO::write(std::uint64_t page_num, void *buffer) @@ -332,6 +76,7 @@ DB0_PACKED_END m_diff_bytes_written += m_writer->flush(); } Page_IO::write(page_num, buffer); + m_modified = true; } void Diff_IO::read(std::uint64_t page_num, void *buffer) const @@ -348,11 +93,23 @@ DB0_PACKED_END m_diff_bytes_written += m_writer->flush(); } m_full_dp_bytes_written += m_page_size; + m_modified = true; return Page_IO::append(buffer, is_first_page_ptr); } + + std::uint64_t Diff_IO::reserve(std::uint32_t page_count, bool *is_first_page) + { + // Reservations advance the shared Page_IO cursor. Flush pending diff + // pages first so page numbers already returned by appendDiff remain valid. 
+ std::unique_lock lock(m_mx_write); + if (m_writer) { + m_diff_bytes_written += m_writer->flush(); + } + return Page_IO::reserve(page_count, is_first_page); + } std::pair Diff_IO::getStats() const { return { m_full_dp_bytes_written + m_diff_bytes_written, m_diff_bytes_written }; } -} \ No newline at end of file +} diff --git a/src/dbzero/core/storage/Diff_IO.hpp b/src/dbzero/core/storage/Diff_IO.hpp index 6b47843e..17907b43 100644 --- a/src/dbzero/core/storage/Diff_IO.hpp +++ b/src/dbzero/core/storage/Diff_IO.hpp @@ -3,16 +3,14 @@ #pragma once +#include "Diff_IOCodec.hpp" #include "Page_IO.hpp" -#include "diff_buffer.hpp" #include namespace db0 { - class DiffWriter; - // Diff_IO is a Page_IO extension specialized in // storage & retrieval of diff sequences class Diff_IO: public Page_IO @@ -45,28 +43,55 @@ namespace db0 // Flush needs to be called before closing the stream // and after each transaction void flush(); + + bool modified() const; // Write as full-DP void write(std::uint64_t page_num, void *buffer); std::uint64_t append(const void *buffer, bool *is_first_page = nullptr); + std::uint64_t reserve(std::uint32_t page_count, bool *is_first_page = nullptr); + void read(std::uint64_t page_num, void *buffer) const; // @return total bytes written/ diff bytes written std::pair getStats() const; protected: + class CodecAccess + { + public: + explicit CodecAccess(Page_IO &page_io) + : m_page_io(page_io) + { + } + + std::uint32_t getPageSize() const { return m_page_io.getPageSize(); } + std::pair getNextPageNum(bool *is_first_page) + { + return m_page_io.getNextPageNum(is_first_page); + } + std::uint64_t append(const void *buffer) { return m_page_io.append(buffer); } + void read(std::uint64_t page_num, void *buffer) const { m_page_io.read(page_num, buffer); } + std::uint64_t nextPageNum(std::uint64_t page_num) const { return page_num + 1; } + + private: + Page_IO &m_page_io; + }; + mutable std::mutex m_mx_write; + CodecAccess m_codec_access; // the data buffer to 
hold up to 2 data pages std::vector m_write_buf; mutable std::mutex m_mx_read; mutable std::vector m_read_buf; - std::unique_ptr m_writer; + std::unique_ptr> m_writer; // total bytes written to the stream (since class creation) using full-DP method std::size_t m_full_dp_bytes_written = 0; // total bytes written using the diff mechanism std::size_t m_diff_bytes_written = 0; + bool m_modified = false; }; -} \ No newline at end of file +} diff --git a/src/dbzero/core/storage/Diff_IOCodec.hpp b/src/dbzero/core/storage/Diff_IOCodec.hpp new file mode 100644 index 00000000..4d66271e --- /dev/null +++ b/src/dbzero/core/storage/Diff_IOCodec.hpp @@ -0,0 +1,249 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +// Copyright (c) 2026 DBZero Software sp. z o.o. + +#pragma once + +#include "diff_buffer.hpp" +#include +#include +#include +#include +#include +#include +#include + +namespace db0::detail + +{ + +DB0_PACKED_BEGIN + struct DB0_PACKED_ATTR o_diff_io_codec_header: public o_fixed + { + std::uint16_t m_size = 0; + std::uint16_t m_offset = 0; + }; +DB0_PACKED_END + + template + class DiffIOCodecWriter + { + public: + DiffIOCodecWriter(AccessT &access, std::byte *begin, std::byte *end) + : m_access(access) + , m_begin(begin) + , m_current(begin) + , m_end(end) + , m_page_size(access.getPageSize()) + , m_header(o_diff_io_codec_header::__new(m_current)) + { + m_current += m_header.sizeOf(); + } + + bool append(const std::byte *dp_data, std::pair page_and_state, + const std::vector &diff_data, bool &overflow) + { + using PairT = o_packed_int_pair; + assert(m_current + o_diff_buffer::measure(dp_data, diff_data) + PairT::measure(page_and_state) <= m_end); + auto begin = m_current; + PairT::write(m_current, page_and_state); + if (m_current + o_diff_buffer::sizeOfHeader() > m_begin + m_page_size) { + m_current = begin; + return false; + } + auto &diff_buf = o_diff_buffer::__new(m_current, dp_data, diff_data); + m_current += diff_buf.sizeOf(); + assert(m_current <= m_end); + 
m_last_size = m_current - begin; + ++m_header.m_size; + overflow = m_current > (m_begin + m_page_size); + return true; + } + + std::size_t flush() + { + std::size_t result = 0; + while (!empty()) { + result += flushDP(); + } + return result; + } + + std::size_t flushDP() + { + if (empty()) { + return 0; + } + + m_access.append(m_begin); + m_header.m_size = 0; + if (m_current > (m_begin + m_page_size)) { + m_header.m_offset = m_current - m_begin - m_page_size; + m_current = m_begin + m_header.sizeOf(); + std::memcpy(m_current, m_begin + m_page_size, m_header.m_offset); + m_current += m_header.m_offset; + } else { + m_header.m_offset = 0; + m_current = m_begin + m_header.sizeOf(); + } + return m_page_size; + } + + void revert() + { + assert(m_header.m_size > 0); + assert(m_current - m_last_size >= m_begin); + --m_header.m_size; + m_current -= m_last_size; + } + + bool isFull() const + { + return m_current >= (m_begin + m_page_size); + } + + bool empty() const + { + return m_header.m_size == 0 && m_header.m_offset == 0; + } + + private: + AccessT &m_access; + std::byte * const m_begin; + std::byte *m_current; + std::byte const *m_end; + const std::uint32_t m_page_size; + o_diff_io_codec_header &m_header; + std::uint32_t m_last_size = 0; + }; + + template + class DiffIOCodecReader + { + public: + DiffIOCodecReader(const AccessT &access, std::uint64_t page_num, std::byte *begin, std::byte *end) + : m_access(access) + , m_page_size(access.getPageSize()) + , m_page_num(page_num) + , m_begin(begin) + , m_current(begin + m_page_size) + , m_end(end) + { + m_access.read(page_num, m_begin + m_page_size); + m_size = o_diff_io_codec_header::__const_ref(m_current).m_size; + m_current += o_diff_io_codec_header::sizeOf() + + o_diff_io_codec_header::__const_ref(m_current).m_offset; + if (m_current > m_end) { + Settings::m_decode_error(); + } + } + + bool apply(std::byte *dp_data, std::pair page_and_state, + bool &underflow) + { + using PairT = o_packed_int_pair; + while (m_size > 0) 
{ + auto revert_to = m_current; + auto revert_to_size = m_size; + auto next_page_and_state = PairT::read(m_current); + auto diff_buf_size = o_diff_buffer::safeSizeOf(m_current); + if (next_page_and_state == page_and_state) { + if (m_current + diff_buf_size > m_end) { + m_current = revert_to; + m_size = revert_to_size; + underflow = true; + return false; + } + + auto &diff_buf = o_diff_buffer::__safe_const_ref( + const_bounded_buf_t(Settings::m_decode_error, m_current, m_end) + ); + diff_buf.apply(dp_data, dp_data + m_page_size); + m_current += diff_buf_size; + --m_size; + return true; + } + m_current += diff_buf_size; + --m_size; + } + return false; + } + + void loadNext() + { + assert(m_current >= (m_begin + m_page_size)); + auto offset = m_current - (m_begin + m_page_size); + auto size = m_end - m_current; + std::memcpy(m_begin + offset, m_current, size); + m_current = m_begin + offset; + m_access.read(m_access.nextPageNum(m_page_num), m_begin + m_page_size); + std::memmove((void*)(m_current + o_diff_io_codec_header::sizeOf()), m_current, size); + m_current += o_diff_io_codec_header::sizeOf(); + } + + private: + const AccessT &m_access; + const std::uint32_t m_page_size; + const std::uint64_t m_page_num; + std::byte * const m_begin; + const std::byte *m_current; + std::byte const *m_end; + unsigned int m_size = 0; + }; + + template + std::pair appendDiff(AccessT &access, DiffIOCodecWriter &writer, + const void *dp_data, std::pair page_and_state, + const std::vector &diff_data, bool *is_first_page, std::size_t *bytes_written = nullptr) + { + auto add_bytes = [bytes_written](std::size_t bytes) { + if (bytes_written) { + *bytes_written += bytes; + } + }; + for (;;) { + if (writer.isFull()) { + add_bytes(writer.flushDP()); + } + bool overflow = false; + auto next_page_num = access.getNextPageNum(is_first_page); + assert(next_page_num.second > 0); + if (is_first_page) { + *is_first_page &= writer.empty(); + } + if (writer.append((const std::byte*)dp_data, 
page_and_state, diff_data, overflow)) { + if (overflow) { + if (next_page_num.second > 1) { + add_bytes(writer.flushDP()); + } else { + writer.revert(); + add_bytes(writer.flushDP()); + continue; + } + } + return { next_page_num.first, overflow }; + } + add_bytes(writer.flushDP()); + } + } + + template + void applyFrom(const AccessT &access, std::uint64_t page_num, void *buffer, + std::pair page_and_state, const char *error_context, + std::byte *read_begin, std::byte *read_end) + { + DiffIOCodecReader reader(access, page_num, read_begin, read_end); + for (;;) { + bool underflow = false; + if (reader.apply((std::byte*)buffer, page_and_state, underflow)) { + return; + } + if (underflow) { + reader.loadNext(); + continue; + } + THROWF(db0::IOException) << error_context << ": storage_page_num=" << page_num + << ", page_num=" << page_and_state.first << ", state_num=" << page_and_state.second; + } + } + +} diff --git a/src/dbzero/core/storage/ExtSpace.cpp b/src/dbzero/core/storage/ExtSpace.cpp index 15732af8..fc937c7a 100644 --- a/src/dbzero/core/storage/ExtSpace.cpp +++ b/src/dbzero/core/storage/ExtSpace.cpp @@ -66,6 +66,13 @@ namespace db0 m_rel_index->commit(); } } + + void ExtSpace::clearMappings() + { + if (m_rel_index) { + m_rel_index->clearMappings(); + } + } db0::v_object ExtSpace::tryOpenRoot() const { @@ -95,4 +102,4 @@ namespace db0 return std::make_unique(m_rel_index->cbegin()); } -} \ No newline at end of file +} diff --git a/src/dbzero/core/storage/ExtSpace.hpp b/src/dbzero/core/storage/ExtSpace.hpp index 186d8874..26940831 100644 --- a/src/dbzero/core/storage/ExtSpace.hpp +++ b/src/dbzero/core/storage/ExtSpace.hpp @@ -82,6 +82,8 @@ DB0_PACKED_END assert(m_rel_index); m_rel_index->addMapping(storage_page_num, rel_page_num, count); } + + void clearMappings(); // Begins the iterator over sorted elements (on condition that ExtSpace is valid) std::unique_ptr tryBegin() const; @@ -102,4 +104,4 @@ DB0_PACKED_END std::unique_ptr tryOpenPrimaryREL_Index(AccessType) 
const; }; -} \ No newline at end of file +} diff --git a/src/dbzero/core/storage/Page_IO.cpp b/src/dbzero/core/storage/Page_IO.cpp index e9f533ed..5fda10c5 100644 --- a/src/dbzero/core/storage/Page_IO.cpp +++ b/src/dbzero/core/storage/Page_IO.cpp @@ -8,6 +8,51 @@ namespace db0 { + void ReservePool::add(std::uint64_t page_num, std::uint32_t page_count) + { + if (page_count == 0) { + return; + } + if (!m_strides.empty() && m_strides.back().m_page_num + m_strides.back().m_page_count == page_num) { + m_strides.back().m_page_count += page_count; + } else { + m_strides.push_back({ page_num, page_count }); + } + } + + bool ReservePool::empty() const + { + return m_strides.empty(); + } + + std::pair ReservePool::next() const + { + assert(!m_strides.empty()); + return { m_strides.front().m_page_num, m_strides.front().m_page_count }; + } + + std::uint64_t ReservePool::pop() + { + auto result = tryPop(1); + assert(result); + return *result; + } + + std::optional ReservePool::tryPop(std::uint32_t page_count) + { + assert(page_count > 0); + if (m_strides.empty() || m_strides.front().m_page_count < page_count) { + return {}; + } + + auto result = m_strides.front().m_page_num; + m_strides.front().m_page_num += page_count; + m_strides.front().m_page_count -= page_count; + if (m_strides.front().m_page_count == 0) { + m_strides.pop_front(); + } + return result; + } Page_IO::Page_IO(std::size_t header_size, CFile &file, std::uint32_t page_size, std::uint32_t block_size, std::uint64_t address, std::uint32_t page_count, std::uint32_t step_size, std::function tail_function, @@ -44,13 +89,22 @@ namespace db0 std::uint64_t Page_IO::append(const void *buffer, bool *is_first_page_ptr) { assert(m_access_type == AccessType::READ_WRITE); + if (!m_reserve_pool.empty()) { + auto page_num = m_reserve_pool.pop(); + if (is_first_page_ptr) { + *is_first_page_ptr = isFirstPageInStep(page_num); + } + write(page_num, buffer); + return page_num; + } + if (m_page_count == m_block_capacity) { 
allocateNextBlock(); } if (is_first_page_ptr) { // first page of the first block in the step - *is_first_page_ptr = (m_page_count == 0) && (m_block_num && *m_block_num == 0); + *is_first_page_ptr = isFirstPageInStep(m_first_page_num + m_page_count); } m_file.write(m_address + m_page_count * m_page_size, m_page_size, buffer); @@ -60,25 +114,70 @@ namespace db0 std::uint64_t Page_IO::append(const void *buffer, std::uint64_t page_count) { assert(m_access_type == AccessType::READ_WRITE); - auto result = getNextPageNum().first; + if (page_count == 1) { + return append(buffer); + } + + if (auto available_page_num = m_reserve_pool.tryPop(page_count)) { + auto result = *available_page_num; + m_file.write(m_header_size + result * m_page_size, page_count * m_page_size, buffer); + return result; + } + + if (m_page_count == m_block_capacity) { + allocateNextBlock(); + } + + auto result = m_first_page_num + m_page_count; + auto step_remaining = getCurrentStepRemainingPages(); + if (page_count > step_remaining) { + THROWF(db0::InternalException) + << "Page_IO::append: multi-page append must fit in the current consecutive step"; + } const std::byte *byte_buffer = static_cast(buffer); - while (page_count > 0) { - // allocate next block or step - if (page_count > 0 && m_page_count == m_block_capacity) { + auto to_write_bytes = page_count * m_page_size; + m_file.write(m_address + m_page_count * m_page_size, to_write_bytes, byte_buffer); + byte_buffer += to_write_bytes; + moveBy(page_count); + return result; + } + + std::uint64_t Page_IO::reserve(std::uint32_t page_count, bool *is_first_page_ptr) + { + assert(m_access_type == AccessType::READ_WRITE); + if (page_count == 0) { + THROWF(db0::InternalException) + << "Page_IO::reserve: page count must be greater than zero"; + } + + if (m_page_count == m_block_capacity) { + allocateNextBlock(); + } + + if (m_block_num) { + if (page_count > m_step_size * m_block_capacity) { + THROWF(db0::InternalException) + << "Page_IO::reserve: unable to 
reserve more pages than fit in a step"; + } + while (getCurrentStepRemainingPages() < page_count) { + auto step_remaining = getCurrentStepRemainingPages(); + collectReservePool(step_remaining); + moveBy(step_remaining); allocateNextBlock(); } + } else if (page_count > (m_block_capacity - m_page_count)) { + THROWF(db0::InternalException) + << "Page_IO::reserve: unable to reserve a contiguous range without step access"; + } - // the number of pages remaining in the current step - auto step_remaining = getCurrentStepRemainingPages(); - if (step_remaining > 0) { - auto to_write_pages = std::min(static_cast(page_count), step_remaining); - auto to_write_bytes = to_write_pages * m_page_size; - m_file.write(m_address + m_page_count * m_page_size, to_write_bytes, byte_buffer); - byte_buffer += to_write_bytes; - // position at the new address (within the current step) - moveBy(to_write_pages); - page_count -= to_write_pages; - } + if (is_first_page_ptr) { + *is_first_page_ptr = isFirstPageInStep(m_first_page_num + m_page_count); + } + auto result = m_first_page_num + m_page_count; + if (m_block_num) { + moveBy(page_count); + } else { + m_page_count += page_count; } return result; } @@ -102,6 +201,12 @@ namespace db0 m_block_num = 0; } } + + void Page_IO::collectReservePool(std::uint32_t page_count) + { + auto page_num = m_first_page_num + m_page_count; + m_reserve_pool.add(page_num, page_count); + } void Page_IO::read(std::uint64_t page_num, void *buffer) const { m_file.read(m_header_size + page_num * m_page_size, m_page_size, buffer); @@ -111,14 +216,14 @@ namespace db0 m_file.read(m_header_size + page_num * m_page_size, page_count * m_page_size, buffer); } - void Page_IO::write(std::uint64_t page_num, void *buffer) { + void Page_IO::write(std::uint64_t page_num, const void *buffer) { m_file.write(m_header_size + page_num * m_page_size, m_page_size, buffer); } std::uint64_t Page_IO::getPageNum(std::uint64_t address) const { return (address - m_header_size) / m_page_size; } 
- + std::uint64_t Page_IO::tail() const { assert(m_access_type == AccessType::READ_WRITE); @@ -138,12 +243,20 @@ namespace db0 std::pair Page_IO::getNextPageNum(bool *is_first_page_ptr) { assert(m_access_type == AccessType::READ_WRITE); + if (!m_reserve_pool.empty()) { + auto result = m_reserve_pool.next(); + if (is_first_page_ptr) { + *is_first_page_ptr = isFirstPageInStep(result.first); + } + return result; + } + if (m_page_count == m_block_capacity) { allocateNextBlock(); } if (is_first_page_ptr) { // first page of the first block in the step - *is_first_page_ptr = (m_page_count == 0) && (m_block_num && *m_block_num == 0); + *is_first_page_ptr = isFirstPageInStep(m_first_page_num + m_page_count); } return { m_first_page_num + m_page_count, m_block_capacity - m_page_count }; } @@ -153,10 +266,45 @@ namespace db0 assert(m_access_type == AccessType::READ_WRITE); if (is_first_page_ptr) { // first page of the first block in the step - *is_first_page_ptr = (m_page_count == 0) && (m_block_num && *m_block_num == 0); + *is_first_page_ptr = isFirstPageInStep(m_first_page_num + m_page_count); } return m_first_page_num + m_page_count; } + + void Page_IO::setAtTail() + { + assert(m_access_type == AccessType::READ_WRITE); + auto address = m_tail_function(); + if (address <= m_header_size) { + address = m_header_size; + } else { + auto rel_address = address - m_header_size; + auto rel_pages = (rel_address + m_page_size - 1) / m_page_size; + address = m_header_size + rel_pages * m_page_size; + } + + setAtPageNum(getPageNum(address)); + } + + void Page_IO::setAtPageNum(std::uint64_t page_num) + { + assert(m_access_type == AccessType::READ_WRITE); + auto current_next_page_num = m_first_page_num + m_page_count; + if (page_num <= current_next_page_num) { + return; + } + + auto block_id = (page_num * m_page_size) / m_block_size; + m_address = m_header_size + block_id * m_block_size; + m_page_count = static_cast(page_num % m_block_capacity); + if (m_page_count == 0) { + m_address -= 
m_block_size; + m_page_count = m_block_capacity; + --block_id; + } + m_first_page_num = getPageNum(m_address); + m_block_num = static_cast(block_id % m_step_size); + } Page_IO::StepIterator::StepIterator(const ExtSpace &ext_space) : m_next_it(ext_space.tryBegin()) @@ -283,29 +431,26 @@ namespace db0 THROWF(db0::InternalException) << "Page_IO::moveBy: step access not initialized"; } - // move by the end of the current block - auto count = std::min(page_count, m_block_capacity - m_page_count); - auto new_block_num = *m_block_num + (page_count - count) / m_block_capacity + 1; - if (new_block_num > m_step_size) { + auto old_block_num = *m_block_num; + auto step_offset = old_block_num * m_block_capacity + m_page_count + page_count; + auto step_capacity = m_step_size * m_block_capacity; + if (step_offset > step_capacity) { THROWF(db0::InternalException) << "Page_IO::moveBy: attempt to move beyond the current step"; } - // positioned at the end of the step + + auto new_block_num = step_offset / m_block_capacity; + auto new_page_count = step_offset % m_block_capacity; if (new_block_num == m_step_size) { - --new_block_num; - } - - auto page_diff = count + (new_block_num - *m_block_num - 1) * m_block_capacity; - page_count -= page_diff; - if (page_count > m_block_capacity) { - THROWF(db0::InternalException) << "Page_IO::moveBy: attempt to move beyond the current step"; + new_block_num = m_step_size - 1; + new_page_count = m_block_capacity; } - // set new position variables (might be end of the block / step) - m_first_page_num += page_diff; - m_address += page_diff * m_page_size; + auto block_diff = new_block_num - old_block_num; + m_first_page_num += block_diff * m_block_capacity; + m_address += block_diff * m_block_size; assert(m_address == m_header_size + m_first_page_num * m_page_size); m_block_num = new_block_num; - m_page_count = page_count; + m_page_count = new_page_count; } std::uint32_t Page_IO::getCurrentStepRemainingPages() const @@ -327,4 +472,4 @@ namespace db0 
return blocks_remaining * m_block_capacity + pages_remaining_in_block; } -} \ No newline at end of file +} diff --git a/src/dbzero/core/storage/Page_IO.hpp b/src/dbzero/core/storage/Page_IO.hpp index 78ffa520..e239c793 100644 --- a/src/dbzero/core/storage/Page_IO.hpp +++ b/src/dbzero/core/storage/Page_IO.hpp @@ -5,11 +5,33 @@ #include "CFile.hpp" #include "ExtSpace.hpp" +#include #include +#include +#include namespace db0 { + + class ReservePool + { + public: + void add(std::uint64_t page_num, std::uint32_t page_count); + bool empty() const; + std::pair next() const; + std::uint64_t pop(); + std::optional tryPop(std::uint32_t page_count); + + private: + struct Stride + { + std::uint64_t m_page_num = 0; + std::uint32_t m_page_count = 0; + }; + + std::deque m_strides; + }; /** * Page_IO organizes file's data into blocks of pages @@ -43,19 +65,23 @@ namespace db0 // NOTE: first block (on first page) must be registered with REL_Index if it's maintained std::uint64_t append(const void *buffer, bool *is_first_page = nullptr); - // Appends one or more pages to the stream + // Appends one or more consecutive pages to the stream. // @return first appended page number (aka storage page number) std::uint64_t append(const void *buffer, std::uint64_t page_count); + + // Reserves one or more contiguous pages in the stream without writing page payloads. 
+ // @return first reserved page number (aka storage page number) + std::uint64_t reserve(std::uint32_t page_count, bool *is_first_page = nullptr); void read(std::uint64_t page_num, void *buffer) const; // Read multiple consecutive pages void read(std::uint64_t page_num, void *buffer, std::uint32_t page_count) const; - + /** * Overwrite existing page */ - void write(std::uint64_t page_num, void *buffer); + void write(std::uint64_t page_num, const void *buffer); std::uint64_t tail() const; @@ -72,6 +98,12 @@ namespace db0 // Get the number of pages remaining in the current step (for append) std::uint32_t getCurrentStepRemainingPages() const; + + // Move the append cursor forward if another stream has extended the file tail. + void setAtTail(); + + // Move the append cursor forward to the given next page number. + void setAtPageNum(std::uint64_t page_num); // @return step size in number of blocks std::size_t getStepSize() const { @@ -155,13 +187,20 @@ namespace db0 const AccessType m_access_type; // block number within the step std::optional m_block_num; + // Pool of pages skipped to satisfy a larger contiguous reserve in a later step. 
+ ReservePool m_reserve_pool; std::uint64_t getPageNum(std::uint64_t address) const; + bool isFirstPageInStep(std::uint64_t page_num) const { + return m_step_size > 0 && m_block_capacity > 0 + && (page_num % (m_step_size * m_block_capacity)) == 0; + } void allocateNextBlock(); + void collectReservePool(std::uint32_t page_count); // Update the stream's current location within the current step // @param page_count number of pages to move by within the current step void moveBy(std::uint32_t page_count); }; -} \ No newline at end of file +} diff --git a/src/dbzero/core/storage/REL_Index.cpp b/src/dbzero/core/storage/REL_Index.cpp index e36184a1..09d35b3b 100644 --- a/src/dbzero/core/storage/REL_Index.cpp +++ b/src/dbzero/core/storage/REL_Index.cpp @@ -188,9 +188,18 @@ namespace db0 std::uint64_t REL_Index::assignRelative(std::uint64_t storage_page_num, bool is_first_in_step) { + auto current_range_end = m_storage_page_num + (m_max_rel_page_num - m_rel_page_num); + if (storage_page_num < m_storage_page_num) { + super_t::insert({ ++m_max_rel_page_num, storage_page_num }); + m_storage_page_num = storage_page_num; + m_rel_page_num = m_max_rel_page_num; + return m_rel_page_num; + } + assert(storage_page_num >= m_storage_page_num); + auto starts_new_range = storage_page_num > current_range_end + 1; // prevent adding a duplicate mapping (e.g. 
might be called multiple times after appendDiff) - if (is_first_in_step && (storage_page_num != m_storage_page_num)) { + if ((is_first_in_step || starts_new_range) && (storage_page_num != m_storage_page_num)) { super_t::insert({ ++m_max_rel_page_num, storage_page_num }); assert(storage_page_num > m_storage_page_num); m_storage_page_num = storage_page_num; @@ -233,6 +242,14 @@ namespace db0 m_rel_page_num = this->treeHeader().m_rel_page_num; m_max_rel_page_num = this->treeHeader().m_max_rel_page_num; } + + void REL_Index::clearMappings() + { + super_t::clear(); + m_storage_page_num = 0; + m_rel_page_num = 0; + m_max_rel_page_num = 0; + } std::uint64_t REL_Index::getAbsolute(std::uint64_t rel_page_num) const { @@ -274,4 +291,4 @@ namespace std return os << item.toString(); } -} \ No newline at end of file +} diff --git a/src/dbzero/core/storage/REL_Index.hpp b/src/dbzero/core/storage/REL_Index.hpp index 73cea49f..63af90fd 100644 --- a/src/dbzero/core/storage/REL_Index.hpp +++ b/src/dbzero/core/storage/REL_Index.hpp @@ -212,6 +212,8 @@ DB0_PACKED_END void refresh(); + void clearMappings(); + std::uint64_t size() const; const_iterator cbegin() const; @@ -231,4 +233,4 @@ namespace std ostream &operator<<(ostream &, const db0::REL_Item &); -} \ No newline at end of file +} diff --git a/src/dbzero/core/storage/RandomIO_Stream.cpp b/src/dbzero/core/storage/RandomIO_Stream.cpp new file mode 100644 index 00000000..010a62c9 --- /dev/null +++ b/src/dbzero/core/storage/RandomIO_Stream.cpp @@ -0,0 +1,427 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +// Copyright (c) 2026 DBZero Software sp. z o.o. 
+ +#include "RandomIO_Stream.hpp" +#include "Diff_IOCodec.hpp" +#include +#include +#include +#include + +namespace db0 + +{ + + namespace + { + + struct RandomIOStreamControlPage + { + static constexpr std::uint64_t MAGIC = 0x44423052494f5354ULL; // "DB0RIOST" + static constexpr std::uint32_t VERSION = 1; + + std::uint64_t m_magic; + std::uint32_t m_version; + std::uint32_t m_generation; + std::uint32_t m_type; + std::uint32_t m_control_index; + std::uint32_t m_first_data_is_first_page; + std::uint64_t m_next_chunk_page_num; + }; + + constexpr std::uint32_t CONTROL_END = 1; + constexpr std::uint32_t CONTROL_LINK = 2; + + std::uint32_t calcPageRatio(std::uint32_t page_size, std::uint32_t underlying_page_size) + { + if (page_size < underlying_page_size || page_size % underlying_page_size != 0) { + THROWF(db0::InternalException) + << "RandomIO_Stream page size must be a multiple of the underlying page size"; + } + return page_size / underlying_page_size; + } + + std::uint32_t getDataPagesPerChunk(std::uint32_t stride, std::uint32_t page_ratio) + { + if (stride < page_ratio + 1) { + THROWF(db0::InternalException) + << "RandomIO_Stream stride must fit at least one data page and one control page"; + } + return (stride - 1) / page_ratio; + } + + bool isControlPage(const RandomIOStreamControlPage &control, std::uint32_t generation, + std::uint32_t max_control_index) + { + if (control.m_magic != RandomIOStreamControlPage::MAGIC + || control.m_version != RandomIOStreamControlPage::VERSION) { + return false; + } + if (control.m_generation != generation) { + return false; + } + if (control.m_control_index > max_control_index) { + return false; + } + return control.m_type == CONTROL_END || control.m_type == CONTROL_LINK; + } + + } + + class RandomIO_Stream::CodecAccess + { + public: + explicit CodecAccess(RandomIO_Stream &stream) + : m_stream(stream) + { + } + + std::uint32_t getPageSize() const { return m_stream.getPageSize(); } + std::pair getNextPageNum(bool *is_first_page) 
+ { + return m_stream.getNextPageNum(is_first_page); + } + std::uint64_t append(const void *buffer) { return m_stream.append(buffer); } + void read(std::uint64_t page_num, void *buffer) const { m_stream.readRandom(page_num, buffer); } + std::uint64_t nextPageNum(std::uint64_t page_num) const + { + return page_num + m_stream.m_page_ratio; + } + + private: + RandomIO_Stream &m_stream; + }; + + class RandomIO_Stream::ConstCodecAccess + { + public: + explicit ConstCodecAccess(const RandomIO_Stream &stream) + : m_stream(stream) + { + } + + std::uint32_t getPageSize() const { return m_stream.getPageSize(); } + void read(std::uint64_t page_num, void *buffer) const { m_stream.readRandom(page_num, buffer); } + std::uint64_t nextPageNum(std::uint64_t page_num) const + { + return page_num + m_stream.m_page_ratio; + } + + private: + const RandomIO_Stream &m_stream; + }; + + RandomIO_Stream::RandomIO_Stream(Diff_IO &page_io, std::uint32_t stride, std::uint32_t page_size) + : m_page_io(page_io) + , m_access_type(AccessType::READ_WRITE) + , m_stride(stride) + , m_page_size(page_size ? page_size : page_io.getPageSize()) + , m_page_ratio(calcPageRatio(m_page_size, page_io.getPageSize())) + , m_data_pages_per_chunk(getDataPagesPerChunk(stride, m_page_ratio)) + , m_write_buf(m_page_size * 2) + , m_read_buf(m_page_size * 2) + , m_control_buf(page_io.getPageSize()) + { + if (sizeof(RandomIOStreamControlPage) > m_page_io.getPageSize()) { + THROWF(db0::InternalException) << "RandomIO_Stream control page does not fit into a page"; + } + + allocateFirstChunk(); + } + + RandomIO_Stream::RandomIO_Stream(Diff_IO &page_io, std::uint64_t page_num, std::uint32_t stride, + AccessType access_type, std::uint32_t page_size) + : m_page_io(page_io) + , m_access_type(access_type) + , m_stride(stride) + , m_page_size(page_size ? 
page_size : page_io.getPageSize()) + , m_page_ratio(calcPageRatio(m_page_size, page_io.getPageSize())) + , m_data_pages_per_chunk(getDataPagesPerChunk(stride, m_page_ratio)) + , m_write_buf(m_page_size * 2) + , m_read_buf(m_page_size * 2) + , m_control_buf(page_io.getPageSize()) + { + if (sizeof(RandomIOStreamControlPage) > m_page_io.getPageSize()) { + THROWF(db0::InternalException) << "RandomIO_Stream control page does not fit into a page"; + } + + openExisting(page_num); + } + + void RandomIO_Stream::openExisting(std::uint64_t page_num) + { + m_head_page_num = page_num; + // in read-only mode we don't allow stream access, just the random one + if (m_access_type == AccessType::READ_ONLY) { + return; + } + + if (page_num >= m_page_io.getEndPageNum()) { + THROWF(db0::InternalException) << "RandomIO_Stream does not exist"; + } + + std::uint64_t chunk_page_num = page_num; + while (true) { + std::uint32_t type = 0; + std::uint32_t control_index = 0; + std::uint64_t next_chunk_page_num = 0; + bool first_data_is_first_page = false; + if (!findControl(chunk_page_num, m_generation, type, control_index, next_chunk_page_num, + first_data_is_first_page)) { + THROWF(db0::InternalException) << "RandomIO_Stream control page not found"; + } + + m_current_chunk_page_num = chunk_page_num; + m_current_used_pages = control_index; + m_current_next_chunk_page_num = 0; + m_current_first_data_is_first_page = first_data_is_first_page; + + if (type != CONTROL_LINK) { + break; + } + + m_current_next_chunk_page_num = next_chunk_page_num; + chunk_page_num = next_chunk_page_num; + } + } + + std::pair RandomIO_Stream::appendDiff( + const void *dp_data, std::pair page_and_state, + const std::vector &diff_data, bool *is_first_page) + { + if (m_access_type == AccessType::READ_ONLY) { + THROWF(db0::AccessTypeException) << "RandomIO_Stream::appendDiff not allowed in read-only mode"; + } + CodecAccess access(*this); + detail::DiffIOCodecWriter writer( + access, m_write_buf.data(), m_write_buf.data() 
+ m_write_buf.size()); + auto result = detail::appendDiff(access, writer, dp_data, page_and_state, diff_data, is_first_page); + writer.flush(); + m_modified = true; + return result; + } + + void RandomIO_Stream::applyFrom(std::uint64_t page_num, void *buffer, + std::pair page_and_state) const + { + ConstCodecAccess access(*this); + detail::applyFrom(access, page_num, buffer, page_and_state, "RandomIO_Stream diff block not found", + m_read_buf.data(), m_read_buf.data() + m_read_buf.size()); + } + + std::uint64_t RandomIO_Stream::append(const void *buffer, bool *is_first_page) + { + if (m_access_type == AccessType::READ_ONLY) { + THROWF(db0::AccessTypeException) << "RandomIO_Stream::append not allowed in read-only mode"; + } + auto [page_num, remaining_pages] = getNextPageNum(is_first_page); + assert(remaining_pages > 0); + + writeRandom(page_num, buffer); + ++m_current_used_pages; + m_modified = true; + return page_num; + } + + void RandomIO_Stream::readRandom(std::uint64_t page_num, void *buffer) const + { + static_cast(m_page_io).read(page_num, buffer, m_page_ratio); + } + + std::uint64_t RandomIO_Stream::appendRandom(const void *buffer) + { + if (m_access_type == AccessType::READ_ONLY) { + THROWF(db0::AccessTypeException) << "RandomIO_Stream::appendRandom not allowed in read-only mode"; + } + m_modified = true; + auto page_num = m_page_io.reserve(m_page_ratio); + writeRandom(page_num, buffer); + return page_num; + } + + void RandomIO_Stream::writeRandom(std::uint64_t page_num, const void *buffer) + { + if (m_access_type == AccessType::READ_ONLY) { + THROWF(db0::AccessTypeException) << "RandomIO_Stream::writeRandom not allowed in read-only mode"; + } + const std::byte *byte_buffer = static_cast(buffer); + auto underlying_page_size = m_page_io.getPageSize(); + for (std::uint32_t i = 0; i < m_page_ratio; ++i) { + static_cast(m_page_io).write(page_num + i, byte_buffer + i * underlying_page_size); + } + m_modified = true; + } + + void RandomIO_Stream::flush() + { + if 
(m_access_type == AccessType::READ_ONLY) { + return; + } + if (!m_modified) { + return; + } + writeCurrentControl(CONTROL_END, m_current_used_pages); + m_modified = false; + } + + void RandomIO_Stream::close() + { + flush(); + } + + void RandomIO_Stream::clear() + { + if (m_access_type == AccessType::READ_ONLY) { + THROWF(db0::AccessTypeException) << "RandomIO_Stream::clear not allowed in read-only mode"; + } + ++m_generation; + loadNextChunk(m_head_page_num); + m_modified = true; + flush(); + } + + std::pair RandomIO_Stream::getNextPageNum(bool *is_first_page) + { + while (m_current_used_pages == m_data_pages_per_chunk) { + advanceChunk(); + } + + if (is_first_page) { + *is_first_page = m_current_used_pages == 0 && m_current_first_data_is_first_page; + } + + return { + dataPageNum(m_current_chunk_page_num, m_current_used_pages), + m_data_pages_per_chunk - m_current_used_pages + }; + } + + std::uint64_t RandomIO_Stream::getPageNum() const + { + return m_head_page_num; + } + + std::uint32_t RandomIO_Stream::getPageSize() const + { + return m_page_size; + } + + std::uint64_t RandomIO_Stream::getHeadPageNum() const + { + return m_head_page_num; + } + + bool RandomIO_Stream::modified() const + { + return m_modified; + } + + void RandomIO_Stream::advanceChunk() + { + if (!m_current_next_chunk_page_num) { + allocateNextChunk(); + } else { + writeCurrentControl(CONTROL_LINK, m_current_used_pages, m_current_next_chunk_page_num); + loadNextChunk(m_current_next_chunk_page_num); + } + } + + void RandomIO_Stream::allocateFirstChunk() + { + bool is_first_page = false; + m_current_chunk_page_num = m_page_io.reserve(m_stride, &is_first_page); + m_head_page_num = m_current_chunk_page_num; + m_current_next_chunk_page_num = 0; + m_current_used_pages = 0; + m_current_first_data_is_first_page = is_first_page; + m_modified = true; + } + + void RandomIO_Stream::allocateNextChunk() + { + bool is_first_page = false; + auto next_chunk_page_num = m_page_io.reserve(m_stride, &is_first_page); 
+ + m_current_next_chunk_page_num = next_chunk_page_num; + writeCurrentControl(CONTROL_LINK, m_current_used_pages, next_chunk_page_num); + + m_current_chunk_page_num = next_chunk_page_num; + m_current_next_chunk_page_num = 0; + m_current_used_pages = 0; + m_current_first_data_is_first_page = is_first_page; + } + + void RandomIO_Stream::loadNextChunk(std::uint64_t page_num) + { + m_current_chunk_page_num = page_num; + m_current_next_chunk_page_num = 0; + m_current_used_pages = 0; + m_current_first_data_is_first_page = false; + + std::uint32_t old_type = 0; + std::uint32_t old_control_index = 0; + std::uint64_t old_next_chunk_page_num = 0; + bool old_first_data_is_first_page = false; + if (!findControl(page_num, m_generation - 1, old_type, old_control_index, old_next_chunk_page_num, + old_first_data_is_first_page)) { + return; + } + + m_current_first_data_is_first_page = old_first_data_is_first_page; + if (old_type == CONTROL_LINK) { + m_current_next_chunk_page_num = old_next_chunk_page_num; + } + } + + std::uint64_t RandomIO_Stream::controlPageNum(std::uint64_t chunk_page_num, + std::uint32_t control_index) const + { + return chunk_page_num + control_index * m_page_ratio; + } + + std::uint64_t RandomIO_Stream::dataPageNum(std::uint64_t chunk_page_num, std::uint32_t page_index) const + { + return chunk_page_num + page_index * m_page_ratio; + } + + void RandomIO_Stream::writeCurrentControl(std::uint32_t type, std::uint32_t control_index, + std::uint64_t next_chunk_page_num) + { + assert(m_access_type == AccessType::READ_WRITE); + assert(control_index <= m_data_pages_per_chunk); + RandomIOStreamControlPage control = { + RandomIOStreamControlPage::MAGIC, + RandomIOStreamControlPage::VERSION, + m_generation, + type, + control_index, + m_current_first_data_is_first_page ? 
1u : 0u, + next_chunk_page_num + }; + std::fill(m_control_buf.begin(), m_control_buf.end(), std::byte{0}); + std::memcpy(m_control_buf.data(), &control, sizeof(control)); + static_cast(m_page_io).write(controlPageNum(m_current_chunk_page_num, control_index), + m_control_buf.data()); + } + + bool RandomIO_Stream::findControl(std::uint64_t chunk_page_num, std::uint32_t generation, + std::uint32_t &type, std::uint32_t &control_index, std::uint64_t &next_chunk_page_num, + bool &first_data_is_first_page) const + { + RandomIOStreamControlPage control = {}; + for (std::uint32_t index = 0; index <= m_data_pages_per_chunk; ++index) { + static_cast(m_page_io).read(controlPageNum(chunk_page_num, index), + m_control_buf.data()); + std::memcpy(&control, m_control_buf.data(), sizeof(control)); + if (isControlPage(control, generation, m_data_pages_per_chunk)) { + type = control.m_type; + control_index = control.m_control_index; + next_chunk_page_num = control.m_next_chunk_page_num; + first_data_is_first_page = control.m_first_data_is_first_page != 0; + return true; + } + } + return false; + } + +} diff --git a/src/dbzero/core/storage/RandomIO_Stream.hpp b/src/dbzero/core/storage/RandomIO_Stream.hpp new file mode 100644 index 00000000..07cd763d --- /dev/null +++ b/src/dbzero/core/storage/RandomIO_Stream.hpp @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +// Copyright (c) 2026 DBZero Software sp. z o.o. + +#pragma once + +#include "Diff_IO.hpp" +#include "diff_buffer.hpp" +#include +#include +#include + +namespace db0 + +{ + + /** + * RandomIO_Stream exposes stream-style append/read iteration and random page + * access on top of a shared Diff_IO/Page_IO store. + * + * The stream is identified by an externally stored head page number and + * stride. Data pages use absolute underlying Page_IO page numbers, so + * multiple streams can coexist in one Diff_IO without changing address + * semantics. 
/**
 * (continuation of the class comment that starts before this span)
 *
 * The logical stream page size may be larger than the underlying
 * Page_IO page size; in that case one logical page is translated to a
 * contiguous group of underlying pages.
 *
 * clear() marks the stream empty by writing a new control sentinel and keeps
 * previously allocated chunks linked so later appends can reuse them.
 *
 * NOTE(review): several declarations below appear without template arguments
 * (std::pair, std::vector) in this listing. They are kept exactly as found
 * and have to be restored from the original header.
 */
class RandomIO_Stream
{
public:
    /**
     * Create a new read/write stream.
     *
     * The constructor reserves the first stream chunk immediately and
     * initializes the append cursor. Use this only when allocating a new
     * managed stream; reopening an existing stream must use the constructor
     * that takes page_num and an explicit access_type.
     *
     * @param page_io shared underlying page store used for all reads/writes
     * @param stride number of underlying Page_IO pages reserved per stream chunk
     * @param page_size logical stream page size in bytes; defaults to the
     * underlying Page_IO page size and must be its exact multiple
     */
    RandomIO_Stream(Diff_IO &page_io, std::uint32_t stride, std::uint32_t page_size = 0);

    /**
     * Open an existing stream from a known head page.
     *
     * In READ_ONLY mode, this does not scan stream control pages. Read-only
     * users only need random access to pages that are indexed elsewhere,
     * so the constructor records page_num and leaves cursor state unopened.
     *
     * In READ_WRITE mode, this scans the control chain once to position the
     * append cursor at the stream tail. Because there is a single-writer
     * guarantee, malformed or missing control pages are definitive errors.
     *
     * @param page_io shared underlying page store used for all reads/writes
     * @param page_num head page number of the existing stream
     * @param stride number of underlying Page_IO pages reserved per stream chunk
     * @param access_type READ_ONLY for random page reads, READ_WRITE to append
     * @param page_size logical stream page size in bytes; defaults to the
     * underlying Page_IO page size and must be its exact multiple
     */
    RandomIO_Stream(Diff_IO &page_io, std::uint64_t page_num, std::uint32_t stride,
        AccessType access_type, std::uint32_t page_size = 0);

    /**
     * Append/read data through the managed RandomIO stream.
     *
     * append() stores a full logical page at the stream cursor, advances the
     * stream, and makes the page visible to Reader. appendDiff() does the
     * same for a diff block encoded against page_and_state. applyFrom()
     * resolves a diff block by walking this stream's managed page chain.
     *
     * These methods differ from appendRandom/readRandom/writeRandom: random
     * access methods operate on absolute underlying Diff_IO page numbers and
     * do not update stream membership or cursor state.
     */
    std::pair appendDiff(const void *dp_data,
        std::pair page_and_state,
        const std::vector &diff_data, bool *is_first_page = nullptr);
    void applyFrom(std::uint64_t page_num, void *buffer,
        std::pair page_and_state) const;
    std::uint64_t append(const void *buffer, bool *is_first_page = nullptr);

    /**
     * Append/read/write absolute page locations in the underlying Diff_IO store.
     *
     * These methods do not consult or update the managed stream cursor and
     * do not make the page visible to Reader. They are intentionally random
     * access operations over the shared backing store; clear() only changes
     * stream membership and does not invalidate unrelated random locations.
     * readRandom() can also read an absolute page number returned by stream
     * append operations such as append() or appendDiff().
     */
    std::uint64_t appendRandom(const void *buffer);
    void readRandom(std::uint64_t page_num, void *buffer) const;
    void writeRandom(std::uint64_t page_num, const void *buffer);

    // Logical stream page size in bytes
    std::uint32_t getPageSize() const;

    // Page number of the first chunk; identifies the stream
    std::uint64_t getHeadPageNum() const;

    // True while control state is pending, i.e. not yet written by flush()
    bool modified() const;

    // Write the pending end-of-stream control record; returns early in
    // read-only mode or when nothing has been modified
    void flush();
    // Equivalent to flush()
    void close();

    // Clear the stream part only
    // (starts a new generation and rewinds to the head chunk; not allowed in
    // read-only mode)
    void clear();

protected:
    // Returns the head page number (same value as getHeadPageNum())
    std::uint64_t getPageNum() const;

private:
    class CodecAccess;
    class ConstCodecAccess;

    // shared backing store; chunks are reserved from it and pages read/written through it
    Diff_IO &m_page_io;
    const AccessType m_access_type;
    // number of underlying pages reserved per chunk
    const std::uint32_t m_stride;
    // logical page size in bytes
    const std::uint32_t m_page_size;
    // multiplier turning a slot index into an underlying page offset within a chunk
    // (presumably logical page size / underlying page size - TODO confirm)
    const std::uint32_t m_page_ratio;
    // number of data slots per chunk; control slot indexes range up to and
    // including this value
    const std::uint32_t m_data_pages_per_chunk;
    std::vector m_write_buf;
    mutable std::vector m_read_buf;
    // scratch page for control records; mutable because findControl() is const
    mutable std::vector m_control_buf;
    std::uint64_t m_head_page_num = 0;
    // first page of the chunk the cursor is in
    std::uint64_t m_current_chunk_page_num = 0;
    // first page of the follow-up chunk, 0 when none is known
    std::uint64_t m_current_next_chunk_page_num = 0;
    // number of data slots already used in the current chunk
    std::uint32_t m_current_used_pages = 0;
    // stamped into every control record; incremented by clear()
    std::uint32_t m_generation = 1;
    bool m_current_first_data_is_first_page = false;
    bool m_modified = false;

    // Next free data slot and the number of free slots left in the current chunk
    std::pair getNextPageNum(bool *is_first_page = nullptr);
    void advanceChunk();
    void allocateFirstChunk();
    void allocateNextChunk();
    // (definition is not part of this span)
    void openExisting(std::uint64_t page_num);
    void loadNextChunk(std::uint64_t page_num);
    std::uint64_t controlPageNum(std::uint64_t chunk_page_num, std::uint32_t control_index) const;
    std::uint64_t dataPageNum(std::uint64_t chunk_page_num, std::uint32_t page_index) const;
    void writeCurrentControl(std::uint32_t type, std::uint32_t control_index,
        std::uint64_t next_chunk_page_num = 0);
    // Scans the slots of a chunk for a control record of the given generation
    bool findControl(std::uint64_t chunk_page_num, std::uint32_t generation,
        std::uint32_t &type, std::uint32_t &control_index, std::uint64_t &next_chunk_page_num,
        bool &first_data_is_first_page) const;
};
a/src/dbzero/core/storage/SparseIndex.hpp +++ b/src/dbzero/core/storage/SparseIndex.hpp @@ -129,7 +129,9 @@ DB0_PACKED_BEGIN }; DB0_PACKED_END - using SparseIndex = SparseIndexBase; + using RootSparseIndex = SparseIndexBase; + using PlainSparseIndex = SparseIndexBase; + using SparseIndex = RootSparseIndex; } @@ -139,4 +141,4 @@ namespace std ostream &operator<<(ostream &, const db0::SI_Item &); -} \ No newline at end of file +} diff --git a/src/dbzero/core/storage/SparseIndexBase.hpp b/src/dbzero/core/storage/SparseIndexBase.hpp index 6c3a8ddb..cdffec03 100644 --- a/src/dbzero/core/storage/SparseIndexBase.hpp +++ b/src/dbzero/core/storage/SparseIndexBase.hpp @@ -3,24 +3,26 @@ #pragma once -#include #include +#include "StorageRootMetadata.hpp" namespace db0 { // Forward declarations for operator<< to be used in SGB_LookupTree.hpp - template class SparseIndexBase; + template class SparseIndexBase; - template - std::ostream &operator<<(std::ostream &os, const typename db0::SparseIndexBase::BlockHeader &header); + template + std::ostream &operator<<(std::ostream &os, const typename db0::SparseIndexBase::BlockHeader &header); } #include #include #include #include -#include +#include +#include #include +#include namespace db0 @@ -28,7 +30,7 @@ namespace db0 class DRAM_Prefix; class DRAM_Allocator; - + /** * The in-memory sparse index implementation * it utilizes DRAMSpace (in-memory) for storage and SGB_Tree as the data structure @@ -36,41 +38,83 @@ namespace db0 * @tparam ItemT the (uncompressed item type) for operations * @tparam CompressedItemT the compressed item type for storage */ - template class SparseIndexBase + template + class SparseIndexBase { public: using SI_ItemT = ItemT; using SI_CompressedItemT = CompressedItemT; + using MixInT = SparseIndexMixinT; + using TreeHeaderMixinT = typename SparseIndexMixinT::OverlayT; + using MixInAPIT = typename SparseIndexMixinT::template ApiT; using PageNumT = std::uint64_t; using StateNumT = std::uint32_t; using ItemCompT 
= typename ItemT::CompT; using ItemEqualT = typename ItemT::EqualT; using CompressedItemCompT = typename CompressedItemT::CompT; using CompressedItemEqualT = typename CompressedItemT::EqualT; - - /** - * Create empty as read/write - * @param node_size size of a single in-memory data block / node - */ - SparseIndexBase(std::size_t node_size, std::vector *change_log_ptr = nullptr); + using SlotId = Allocator::SlotId; + + // Create a new empty sparse index + struct tag_create {}; + SparseIndexBase(tag_create, DRAM_Pair, std::vector *change_log_ptr = nullptr, + SlotId slot_num = 0); /** * Create pre-populated with existing data (e.g. after reading from disk) * open either for read or read/write * @param address pass 0 to use the first assigned address */ - SparseIndexBase(DRAM_Pair, AccessType, Address address = {}, - std::vector *change_log_ptr = nullptr, StorageFlags= {}); - - // Create a new empty sparse index - struct tag_create {}; - SparseIndexBase(tag_create, DRAM_Pair, std::vector *change_log_ptr = nullptr); + SparseIndexBase(DRAM_Pair, AccessType, Address, std::vector *change_log_ptr = nullptr, + StorageFlags= {}, SlotId slot_num = 0); void insert(const ItemT &item); template void emplace(Args&&... args) { insert(ItemT(std::forward(args)...)); } + + /** + * Replace older descriptors for a page with a descriptor for the supplied state. + * + * This is intended for compaction-style rewrites that publish a new full-DP + * as the only remaining descriptor for a logical page. + */ + void update(PageNumT page_num, StateNumT state_num, std::uint64_t storage_page_num); + + /** + * Erase a single descriptor identified by an exact key. 
+ * + * @param page_num logical page number of the descriptor to erase + * @param state_num state number of the descriptor to erase + * @return true if a descriptor was erased, false if no exact descriptor exists + */ + bool erase(PageNumT page_num, StateNumT state_num); + + /** + * Erase descriptors for a page in the half-open state range [first_state_num, last_state_num). + * + * @param page_num logical page number whose descriptors should be erased + * @param first_state_num optional inclusive lower state bound; if empty, erase from the first state on page_num + * @param last_state_num optional exclusive upper state bound; if empty, erase through the last state on page_num + * @return number of descriptors erased + */ + std::size_t eraseRange(PageNumT page_num, std::optional first_state_num = {}, + std::optional last_state_num = {}); + + /** + * Erase descriptors for a page with state numbers below state_num. + * + * @param page_num logical page number whose descriptors should be erased + * @param state_num exclusive upper state bound + * @return number of descriptors erased + */ + std::size_t eraseBelow(PageNumT page_num, StateNumT state_num); + + /** + * Erase all descriptors while preserving tree-header mix-in data. 
+ */ + void clear(); /** * Note that 'lookup' may fail in presence of duplicate items, the behavior is undefined @@ -87,31 +131,43 @@ namespace db0 const DRAM_Prefix &getDRAMPrefix() const; - /** - * Get next storage page number expected to be assigned - */ - std::optional getNextStoragePageNum() const; - - /** - * Get the maximum used state number - */ - StateNumT getMaxStateNum() const; - /** * Refresh cache after underlying DRAM has been updated */ void refresh(); + + void open(Address address = {}); + + void detach() const; void forAll(std::function callback) const { m_index.forAll(callback); } + + void forPageRange(PageNumT first_page_num, PageNumT last_page_num, + std::function callback) const; + // Iterate over unique pages only (ignoring entries for different state numbers) + void forUniquePageRange(PageNumT first_page_num, PageNumT last_page_num, + std::function callback) const; + + auto cbegin() const { + return m_index.cbegin(); + } + + auto sortedBeginFrom(const ItemT &first) const { + return m_index.sortedBeginFrom(first); + } + + auto sortedBegin() const { + return m_index.sortedBegin(); + } bool empty() const; // Get the total number of data page descriptors stored in the index std::size_t size() const; - void commit(); + void commit() const; bool operator!() const; @@ -140,27 +196,35 @@ namespace db0 Address getIndexAddress() const; - protected: - friend class SparsePair; + /** + * Access metadata colocated with the index root page. + * + * The mix-in is intentionally embedded in the sparse-index tree header + * rather than stored in a separate object: small collections and limited + * updates often dirty the root page anyway, so colocating tiny metadata + * avoids forcing an additional dirty metadata page. 
+ */ + const MixInAPIT &mixIn() const; -DB0_PACKED_BEGIN - // tree-level header type - struct DB0_PACKED_ATTR o_sparse_index_header: o_fixed_versioned - { - PageNumT m_next_page_num = 0; - StateNumT m_max_state_num = 0; - // the extra-data slot currently used to store reference to the dff-index - std::uint64_t m_extra_data = 0; - // reserved space for future use - std::array m_reserved = {0, 0, 0, 0}; - }; -DB0_PACKED_END + /** + * Mutating access to the colocated metadata API. + * + * Use this when the storage owner updates root-level metadata that is + * logically separate from sparse-index descriptor operations but shares + * the root page to reduce write amplification for small updates. + */ + MixInAPIT &modifyMixIn(); + + protected: + template friend class SparsePairBase; + template friend class MetadataAPI; + template friend class StorageRootMetadataAPI; // DRAM space deployed sparse index (in-memory) using IndexT = SGB_CompressedLookupTree< ItemT, CompressedItemT, BlockHeader, ItemCompT, CompressedItemCompT, ItemEqualT, CompressedItemEqualT, - o_sparse_index_header>; + TreeHeaderMixinT>; using ConstNodeIterator = typename IndexT::sg_tree_const_iterator; using ConstItemIterator = typename IndexT::ConstItemIterator; @@ -169,184 +233,230 @@ DB0_PACKED_END ConstItemIterator findLower(PageNumT, StateNumT) const; - void setExtraData(std::uint64_t); - - std::uint64_t getExtraData() const; - - void update(std::uint64_t max_storage_page_num); - void update(PageNumT page_num, StateNumT state_num, std::uint64_t max_storage_page_num); - void reopen(Address address = {}); - bool isOpen() const; + void recordChange(PageNumT page_num); private: std::shared_ptr m_dram_prefix; std::shared_ptr m_dram_allocator; Memspace m_dram_space; const AccessType m_access_type; + // slot ID is required to properly allocate SparseIndex nodes + SlotId m_slot_num = 0; // the actual index IndexT m_index; - // copied from tree header (cached) - PageNumT m_next_page_num = 0; - StateNumT 
m_max_state_num = 0; - // change log contains the list of updates (modified items / page numbers)qweqwe - // first element is the state number + MixInAPIT m_mixin_api; + // change log contains the list of updates (modified items / page numbers) std::vector *m_change_log_ptr = nullptr; IndexT openIndex(Address, AccessType access_type, StorageFlags); IndexT createIndex(); }; - template - SparseIndexBase::SparseIndexBase(std::size_t node_size, std::vector *change_log_ptr) - : m_dram_space(DRAMSpace::create(node_size, [this](DRAM_Pair dram_pair) { - this->m_dram_prefix = dram_pair.first; - this->m_dram_allocator = dram_pair.second; - })) - , m_access_type(AccessType::READ_WRITE) - , m_index(m_dram_space, node_size, AccessType::READ_WRITE) - , m_change_log_ptr(change_log_ptr) - { - } - - template - SparseIndexBase::SparseIndexBase(DRAM_Pair dram_pair, AccessType access_type, Address address, - std::vector *change_log_ptr, StorageFlags flags) + template + SparseIndexBase::SparseIndexBase(DRAM_Pair dram_pair, AccessType access_type, Address address, + std::vector *change_log_ptr, StorageFlags flags, SlotId slot_num) : m_dram_prefix(dram_pair.first) , m_dram_allocator(dram_pair.second) , m_dram_space(DRAMSpace::create(dram_pair)) , m_access_type(access_type) + , m_slot_num(slot_num) , m_index(openIndex(address, access_type, flags)) - // NOTE: index may NOT be loaded - , m_next_page_num(!!m_index ? m_index.treeHeader().m_next_page_num : 0) - , m_max_state_num(!!m_index ? 
m_index.treeHeader().m_max_state_num : 0) + , m_mixin_api(*this) , m_change_log_ptr(change_log_ptr) { } - template - SparseIndexBase::SparseIndexBase(tag_create, DRAM_Pair dram_pair, std::vector *change_log_ptr) + template + SparseIndexBase::SparseIndexBase(tag_create, DRAM_Pair dram_pair, + std::vector *change_log_ptr, SlotId slot_num) : m_dram_prefix(dram_pair.first) , m_dram_allocator(dram_pair.second) , m_dram_space(DRAMSpace::create(dram_pair)) , m_access_type(AccessType::READ_WRITE) + , m_slot_num(slot_num) , m_index(createIndex()) - , m_next_page_num(m_index.treeHeader().m_next_page_num) - , m_max_state_num(m_index.treeHeader().m_max_state_num) + , m_mixin_api(*this) , m_change_log_ptr(change_log_ptr) { } - - template - void SparseIndexBase::update(std::uint64_t max_storage_page_num) - { - // update tree header if necessary - if (max_storage_page_num >= m_next_page_num) { - m_next_page_num = max_storage_page_num + 1; - m_index.modifyTreeHeader().m_next_page_num = m_next_page_num; - } - } - template - void SparseIndexBase::update(PageNumT page_num, StateNumT state_num, std::uint64_t max_storage_page_num) + template + void SparseIndexBase::recordChange(PageNumT page_num) { - // update tree header if necessary - this->update(max_storage_page_num); - if (state_num > m_max_state_num) { - m_max_state_num = state_num; - m_index.modifyTreeHeader().m_max_state_num = state_num; - } - // put the currently generated state number as the first element in the change-log if (m_change_log_ptr) { - m_change_log_ptr->push_back(page_num); + m_change_log_ptr->push_back(MS_Address::encode(m_slot_num, page_num)); } } + + template + void SparseIndexBase::update(PageNumT page_num, StateNumT state_num, + std::uint64_t storage_page_num) + { + this->eraseBelow(page_num, state_num); + m_index.insert(ItemT(page_num, state_num, storage_page_num)); + this->recordChange(page_num); + } - template - void SparseIndexBase::insert(const ItemT &item) + template + void 
SparseIndexBase::insert(const ItemT &item) { m_index.insert(item); - this->update(item.m_page_num, item.m_state_num, item.m_storage_page_num); + this->recordChange(item.m_page_num); + } + + template + void SparseIndexBase::forPageRange(PageNumT first_page_num, PageNumT end_page_num, + std::function callback) const + { + m_index.forRange( + ItemT(first_page_num, 0), + ItemT(end_page_num, 0), + std::move(callback) + ); } - template - typename SparseIndexBase::IndexT - SparseIndexBase::openIndex(Address address, AccessType access_type, StorageFlags flags) + template + void SparseIndexBase::forUniquePageRange(PageNumT first_page_num, PageNumT end_page_num, + std::function callback) const { - assert((!m_dram_prefix->empty() || flags[StorageOptions::NO_LOAD]) + std::optional last_page_num; + // NOTE: since forRange iterates in ascending order we can de-duplicate pages + // on the fly by tracking the last seen page number + m_index.forRange( + ItemT(first_page_num, 0), + ItemT(end_page_num, 0), + [&](const ItemT &item) { + if (!last_page_num || item.m_page_num != *last_page_num) { + callback(item); + last_page_num = item.m_page_num; + } + } + ); + } + + template + bool SparseIndexBase::erase(PageNumT page_num, StateNumT state_num) + { + if (!m_index.erase_equal(std::make_pair(page_num, state_num))) { + return false; + } + return true; + } + + template + std::size_t SparseIndexBase::eraseBelow(PageNumT page_num, StateNumT state_num) + { + return eraseRange(page_num, {}, state_num); + } + + template + std::size_t SparseIndexBase::eraseRange(PageNumT page_num, + std::optional first_state_num, std::optional last_state_num) + { + auto first = ItemT(page_num, first_state_num.value_or(0)); + if (last_state_num) { + return m_index.erase_range(first, ItemT(page_num, *last_state_num)); + } + if (page_num != std::numeric_limits::max()) { + return m_index.erase_range(first, ItemT(page_num + 1, 0)); + } + + auto removed = m_index.erase_range(first, ItemT(page_num, 
std::numeric_limits::max())); + removed += m_index.erase_equal(std::make_pair(page_num, std::numeric_limits::max())) ? 1 : 0; + return removed; + } + + template + void SparseIndexBase::clear() + { + m_index.clear(); + } + + template + typename SparseIndexBase::IndexT + SparseIndexBase::openIndex(Address address, AccessType access_type, StorageFlags flags) + { + assert((!m_dram_prefix->empty() || flags[StorageFlagOption::NO_LOAD]) && "SparseIndexBase::openIndex: DRAM prefix is empty" ); // NOTE: Index NOT opened if NO_LOAD flag is set - if (flags[StorageOptions::NO_LOAD]) { + if (flags[StorageFlagOption::NO_LOAD]) { return {}; } else { - if (!address.isValid()) { - address = m_dram_allocator->firstAlloc(); - } - return IndexT(m_dram_space.myPtr(address), m_dram_prefix->getPageSize(), access_type); + // Use the first address if no specified + // this is the default address where the SparseIndex is located + if (!address) { + address = m_dram_allocator->firstAlloc(m_slot_num); + } + return IndexT(m_dram_space.myPtr(address), m_dram_prefix->getPageSize(), access_type, + {}, {}, {}, IndexT::DEFAULT_SORT_THRESHOLD, m_slot_num); } } - template - typename SparseIndexBase::IndexT - SparseIndexBase::createIndex() { - return IndexT(m_dram_space, m_dram_prefix->getPageSize(), AccessType::READ_WRITE); + template + typename SparseIndexBase::IndexT + SparseIndexBase::createIndex() + { + // Sparse Index is created at the root address (or the slot's first address) + return IndexT(m_dram_space, m_dram_prefix->getPageSize(), AccessType::READ_WRITE, + {}, {}, {}, IndexT::DEFAULT_SORT_THRESHOLD, m_slot_num); } - template - const DRAM_Prefix &SparseIndexBase::getDRAMPrefix() const { + template + const DRAM_Prefix &SparseIndexBase::getDRAMPrefix() const { return *m_dram_prefix; } - template - CompressedItemT SparseIndexBase::BlockHeader::compressFirst(const ItemT &item) + template + CompressedItemT SparseIndexBase::BlockHeader::compressFirst(const ItemT &item) { m_first_page_num = 
item.m_page_num >> 24; return CompressedItemT(m_first_page_num, item); } - template - CompressedItemT SparseIndexBase::BlockHeader::compress(const ItemT &item) const + template + CompressedItemT SparseIndexBase::BlockHeader::compress(const ItemT &item) const { assert(m_first_page_num == (item.m_page_num >> 24)); return CompressedItemT(m_first_page_num, item); } - template - CompressedItemT SparseIndexBase::BlockHeader::compress(std::pair item) const + template + CompressedItemT SparseIndexBase::BlockHeader::compress(std::pair item) const { assert(m_first_page_num == (item.first >> 24)); return CompressedItemT(m_first_page_num, item.first, item.second); } - template - ItemT SparseIndexBase::BlockHeader::uncompress(const CompressedItemT &item) const { + template + ItemT SparseIndexBase::BlockHeader::uncompress(const CompressedItemT &item) const { return item.uncompress(this->m_first_page_num); } - template - typename SparseIndexBase::PageNumT - SparseIndexBase::BlockHeader::getPageNum(const CompressedItemT &item) const { + template + typename SparseIndexBase::PageNumT + SparseIndexBase::BlockHeader::getPageNum(const CompressedItemT &item) const { return item.getPageNum(this->m_first_page_num); } - template - bool SparseIndexBase::BlockHeader::canFit(const ItemT &item) const { + template + bool SparseIndexBase::BlockHeader::canFit(const ItemT &item) const { return this->m_first_page_num == (item.m_page_num >> 24); } - template - bool SparseIndexBase::BlockHeader::canFit(std::pair item) const + template + bool SparseIndexBase::BlockHeader::canFit(std::pair item) const { return this->m_first_page_num == (item.first >> 24); } - template - ItemT SparseIndexBase::lookup(PageNumT page_num, StateNumT state_num) const { + template + ItemT SparseIndexBase::lookup(PageNumT page_num, StateNumT state_num) const { return lookup(std::make_pair(page_num, state_num)); } - template - ItemT SparseIndexBase::lookup(std::pair page_and_state) const + template + ItemT 
SparseIndexBase::lookup(std::pair page_and_state) const { auto result = m_index.lower_equal_bound(page_and_state); if (!result || result->m_page_num != page_and_state.first) { @@ -355,8 +465,8 @@ DB0_PACKED_END return *result; } - template - ItemT SparseIndexBase::lookup(const ItemT &item) const + template + ItemT SparseIndexBase::lookup(const ItemT &item) const { auto result = m_index.lower_equal_bound(item); if (!result || result->m_page_num != item.m_page_num) { @@ -365,93 +475,64 @@ DB0_PACKED_END return *result; } - template - std::optional::PageNumT> - SparseIndexBase::getNextStoragePageNum() const - { - if (this->empty() ) { - return std::nullopt; - } - return m_next_page_num; - } - - template - typename SparseIndexBase::StateNumT - SparseIndexBase::getMaxStateNum() const { - return m_max_state_num; - } - - template - void SparseIndexBase::refresh() + template + void SparseIndexBase::refresh() { if (!m_index) { - this->reopen(); + open(); return; } - m_index.detach(); - m_next_page_num = m_index.treeHeader().m_next_page_num; - m_max_state_num = m_index.treeHeader().m_max_state_num; + m_mixin_api.refresh(); } - template - void SparseIndexBase::reopen(Address address) + template + void SparseIndexBase::open(Address address) { - if (m_dram_prefix->empty()) { - return; - } - - if (!address.isValid()) { - address = m_dram_allocator->firstAlloc(); - } - if (!address.isValid()) { - return; - } - + assert(!m_index && "SparseIndexBase::open: index is already open"); m_index.~IndexT(); - new (&m_index) IndexT(m_dram_space.myPtr(address), m_dram_prefix->getPageSize(), m_access_type); - m_next_page_num = m_index.treeHeader().m_next_page_num; - m_max_state_num = m_index.treeHeader().m_max_state_num; + new (&m_index) IndexT(openIndex(address, m_access_type, {})); + m_mixin_api.refresh(); } - template - bool SparseIndexBase::isOpen() const + template + void SparseIndexBase::detach() const { - return !!m_index; + m_index.detach(); } - - template - std::string 
SparseIndexBase::BlockHeader::toString(const CompressedItemT &item) const { + + template + std::string SparseIndexBase::BlockHeader::toString(const CompressedItemT &item) const { return item.toString(); } - template - std::string SparseIndexBase::BlockHeader::toString() const + template + std::string SparseIndexBase::BlockHeader::toString() const { std::stringstream _str; _str << "BlockHeader { first_page_num: " << m_first_page_num << " }"; return _str.str(); } - template - bool SparseIndexBase::empty() const { + template + bool SparseIndexBase::empty() const { return m_index.empty(); } - template - std::size_t SparseIndexBase::size() const { + template + std::size_t SparseIndexBase::size() const { return m_index.size(); } - template - const CompressedItemT *SparseIndexBase::lowerEqualBound( + template + const CompressedItemT *SparseIndexBase::lowerEqualBound( PageNumT page_num, StateNumT state_num, ConstNodeIterator &node) const { return m_index.lower_equal_bound(std::make_pair(page_num, state_num), node); } - template - ItemT SparseIndexBase::findUpper(PageNumT page_num, StateNumT state_num) const + template + ItemT SparseIndexBase::findUpper(PageNumT page_num, StateNumT state_num) const { auto result = m_index.upper_equal_bound(std::make_pair(page_num, state_num)); if (!result || result->m_page_num != page_num) { @@ -460,35 +541,37 @@ DB0_PACKED_END return *result; } - template - void SparseIndexBase::setExtraData(std::uint64_t data) { - m_index.modifyTreeHeader().m_extra_data = data; + template + Address SparseIndexBase::getIndexAddress() const { + return m_index.getAddress(); } - template - std::uint64_t SparseIndexBase::getExtraData() const { - return m_index.treeHeader().m_extra_data; + template + const typename SparseIndexBase::MixInAPIT & + SparseIndexBase::mixIn() const { + return m_mixin_api; } - - template - Address SparseIndexBase::getIndexAddress() const { - return m_index.getAddress(); + + template + typename SparseIndexBase::MixInAPIT & + 
SparseIndexBase::modifyMixIn() { + return m_mixin_api; } - template - typename SparseIndexBase::ConstItemIterator - SparseIndexBase::findLower(PageNumT page_num, StateNumT state_num) const { + template + typename SparseIndexBase::ConstItemIterator + SparseIndexBase::findLower(PageNumT page_num, StateNumT state_num) const { return m_index.findLower(std::make_pair(page_num, state_num)); } - template - void SparseIndexBase::commit() { + template + void SparseIndexBase::commit() const { m_index.commit(); } - template - bool SparseIndexBase::operator!() const { + template + bool SparseIndexBase::operator!() const { return !m_index; } - + } diff --git a/src/dbzero/core/storage/SparseIndexQuery.cpp b/src/dbzero/core/storage/SparseIndexQuery.cpp index 132eabe5..5dae2e2a 100644 --- a/src/dbzero/core/storage/SparseIndexQuery.cpp +++ b/src/dbzero/core/storage/SparseIndexQuery.cpp @@ -7,7 +7,8 @@ namespace db0 { - SparseIndexQuery::SparseIndexQuery(const SparseIndex &sparse_index, const DiffIndex &diff_index, + template + SparseIndexQuery::SparseIndexQuery(const SparseIndexT &sparse_index, const DiffIndex &diff_index, std::uint64_t page_num, StateNumType state_num) : m_query_page_num(page_num) , m_query_state_num(state_num) @@ -27,11 +28,13 @@ namespace db0 } } - bool SparseIndexQuery::empty() const { + template + bool SparseIndexQuery::empty() const { return !m_non_empty || lessThan(1); } - bool SparseIndexQuery::next(StateNumType &state_num, std::uint64_t &storage_page_num) + template + bool SparseIndexQuery::next(StateNumType &state_num, std::uint64_t &storage_page_num) { // unable to iterate past the queried state number if (m_state_num >= m_query_state_num) { @@ -77,7 +80,8 @@ namespace db0 } } - bool SparseIndexQuery::lessThan(unsigned int size) const + template + bool SparseIndexQuery::lessThan(unsigned int size) const { assert(size > 0 && "SparseIndexQuery::lessThan: size must be > 0"); if (m_full_dp) { @@ -101,7 +105,8 @@ namespace db0 return lessThanFrom(size, 
diff_dp, diff_it, last_state_num); } - bool SparseIndexQuery::leftLessThan(unsigned int size) const + template + bool SparseIndexQuery::leftLessThan(unsigned int size) const { assert(size > 0 && "SparseIndexQuery::lessThan: size must be > 0"); auto diff_dp = m_diff_dp; @@ -110,7 +115,8 @@ namespace db0 return lessThanFrom(size, diff_dp, diff_it, last_state_num); } - bool SparseIndexQuery::lessThanFrom(unsigned int size, DI_Item &diff_dp, typename DI_Item::ConstIterator &diff_it, + template + bool SparseIndexQuery::lessThanFrom(unsigned int size, DI_Item &diff_dp, typename DI_Item::ConstIterator &diff_it, StateNumType &last_state_num) const { assert(size > 0 && "SparseIndexQuery::lessThan: size must be > 0"); @@ -161,7 +167,8 @@ namespace db0 return false; } - bool tryFindMutation(const SparseIndex &sparse_index, const DiffIndex &diff_index, std::uint64_t page_num, + template + bool tryFindMutation(const SparseIndexT &sparse_index, const DiffIndex &diff_index, std::uint64_t page_num, StateNumType state_num, StateNumType &mutation_id) { // query the diff index first @@ -175,5 +182,12 @@ namespace db0 mutation_id = std::max((StateNumType)item.m_state_num, mutation_id); return true; } - + + template class SparseIndexQuery; + template class SparseIndexQuery; + template bool tryFindMutation(const RootSparseIndex &, const DiffIndex &, + std::uint64_t, StateNumType, StateNumType &); + template bool tryFindMutation(const PlainSparseIndex &, const DiffIndex &, + std::uint64_t, StateNumType, StateNumType &); + } \ No newline at end of file diff --git a/src/dbzero/core/storage/SparseIndexQuery.hpp b/src/dbzero/core/storage/SparseIndexQuery.hpp index 210f2f14..4bd1192b 100644 --- a/src/dbzero/core/storage/SparseIndexQuery.hpp +++ b/src/dbzero/core/storage/SparseIndexQuery.hpp @@ -14,33 +14,33 @@ namespace db0 // The SparseIndexQuery allows retrieving a DP location // as a combination of full-DP + optional multiple diff-DPs // it combines the use of SparseIndex and DiffIndex - 
class SparseIndexQuery + template class SparseIndexQuery { public: - SparseIndexQuery(const SparseIndex &, const DiffIndex &, std::uint64_t page_num, StateNumType state_num); - + SparseIndexQuery(const SparseIndexT &, const DiffIndex &, std::uint64_t page_num, StateNumType state_num); + inline StateNumType firstStateNum() const { return m_full_dp.m_state_num; } // NOTE: the first returned storage page num will be full-DP // @return 0 if no associated DP found - inline std::uint64_t first() const + inline std::uint64_t first() const { m_state_num = m_full_dp.m_state_num; return m_full_dp.m_storage_page_num; } - + inline std::uint64_t first(StateNumType &state_num) const { state_num = m_full_dp.m_state_num; m_state_num = state_num; return m_full_dp.m_storage_page_num; } - + // and the subsequent ones - diff-DPs until false is returned bool next(StateNumType &state_num, std::uint64_t &storage_page_num); - + // Check if the total number of query results (first + next) is less than the given value bool lessThan(unsigned int) const; @@ -60,14 +60,26 @@ namespace db0 DI_Item m_diff_dp; typename DI_Item::ConstIterator m_diff_it; bool m_non_empty = true; - + // Common implemetation part for lessThan and leftLessThan - bool lessThanFrom(unsigned int size, DI_Item &, typename DI_Item::ConstIterator &, + bool lessThanFrom(unsigned int size, DI_Item &, typename DI_Item::ConstIterator &, StateNumType &last_state_num) const; }; - - // Try identifying the state number (but not larger than state_num) swhen a specific page was modified - bool tryFindMutation(const SparseIndex &, const DiffIndex &, std::uint64_t page_num, StateNumType state_num, + + template + SparseIndexQuery(const SparseIndexT &, const DiffIndex &, std::uint64_t, StateNumType) + -> SparseIndexQuery; + + // Try identifying the state number (but not larger than state_num) when a specific page was modified. 
+ template + bool tryFindMutation(const SparseIndexT &, const DiffIndex &, std::uint64_t page_num, StateNumType state_num, StateNumType &mutation_id); - -} + + extern template class SparseIndexQuery; + extern template class SparseIndexQuery; + extern template bool tryFindMutation(const RootSparseIndex &, const DiffIndex &, + std::uint64_t page_num, StateNumType state_num, StateNumType &mutation_id); + extern template bool tryFindMutation(const PlainSparseIndex &, const DiffIndex &, + std::uint64_t page_num, StateNumType state_num, StateNumType &mutation_id); + +} diff --git a/src/dbzero/core/storage/SparsePair.cpp b/src/dbzero/core/storage/SparsePair.cpp index 1e09b3e5..d31b882c 100644 --- a/src/dbzero/core/storage/SparsePair.cpp +++ b/src/dbzero/core/storage/SparsePair.cpp @@ -2,114 +2,287 @@ // Copyright (c) 2025 DBZero Software sp. z o.o. #include "SparsePair.hpp" +#include +#include #include namespace db0 { + namespace + { + template + class SparsePairUniquePageRangeIterator + { + public: + using PageNumT = typename SparseIndexT::PageNumT; + using SparseIteratorT = decltype(std::declval().sortedBegin()); + using DiffIteratorT = decltype(std::declval().sortedBegin()); + + SparsePairUniquePageRangeIterator(const SparseIndexT &sparse_index, const DiffIndex &diff_index) + : m_sparse_it(sparse_index.sortedBegin()) + , m_diff_it(diff_index.sortedBegin()) + { + m_sparse_page_num = currentPageFrom(m_sparse_it); + m_diff_page_num = currentPageFrom(m_diff_it); + m_current = fromRange(selectCurrent()); + } + + SparsePairUniquePageRangeIterator(const SparseIndexT &sparse_index, const DiffIndex &diff_index, + PageNumT first_page_num, PageNumT end_page_num) + : m_sparse_it(sparse_index.sortedBeginFrom(SI_Item(first_page_num, 0))) + , m_diff_it(diff_index.sortedBeginFrom(DI_Item(first_page_num, 0))) + , m_end_page_num(end_page_num) + { + m_sparse_page_num = currentPageFrom(m_sparse_it); + m_diff_page_num = currentPageFrom(m_diff_it); + m_current = fromRange(selectCurrent()); 
+ } + + bool is_end() const { + return !m_current; + } + + PageNumT operator*() const { + assert(m_current); + return *m_current; + } + + SparsePairUniquePageRangeIterator &operator++() + { + assert(m_current); + advancePast(*m_current); + m_current = fromRange(selectCurrent()); + return *this; + } + + private: + SparseIteratorT m_sparse_it; + DiffIteratorT m_diff_it; + std::optional m_end_page_num; + std::optional m_sparse_page_num; + std::optional m_diff_page_num; + std::optional m_current; + + template + std::optional currentPageFrom(const IteratorT &it) const + { + if (it.is_end()) { + return std::nullopt; + } + + auto item = *it; + PageNumT page_num = item.m_page_num; + return page_num; + } + + void advancePast(PageNumT page_num) + { + if (m_sparse_page_num && *m_sparse_page_num <= page_num) { + m_sparse_page_num = detail::advancePageIteratorPast(m_sparse_it, page_num); + } + if (m_diff_page_num && *m_diff_page_num <= page_num) { + m_diff_page_num = detail::advancePageIteratorPast(m_diff_it, page_num); + } + } + + std::optional fromRange(std::optional page_num) const + { + if (page_num && (!m_end_page_num || *page_num < *m_end_page_num)) { + return page_num; + } + return std::nullopt; + } + + std::optional selectCurrent() const + { + if (!m_diff_page_num) { + return m_sparse_page_num; + } else if (!m_sparse_page_num) { + return m_diff_page_num; + } + // both available, return the smaller one + return *m_sparse_page_num < *m_diff_page_num ? m_sparse_page_num : m_diff_page_num; + } + }; + } - SparsePair::SparsePair(std::size_t node_size) - : m_sparse_index(node_size, &m_change_log) - , m_diff_index(node_size, &m_change_log) + template + SparsePairBase::SparsePairBase(DRAM_Pair dram_pair, AccessType access_type, Address root_address, + StorageFlags flags, Allocator::SlotId slot_num, ChangeLogT *change_log) + : m_change_log(change_log ? 
change_log : &m_owned_change_log) + , m_dram_space(DRAMSpace::create(dram_pair)) + // sparse index locate at the slot's root address + , m_sparse_index(dram_pair, access_type, root_address, + m_change_log, flags, slot_num) + , m_diff_index(dram_pair, access_type, getDiffIndexAddress(m_sparse_index), + m_change_log, flags, slot_num) { } - SparsePair::SparsePair(DRAM_Pair dram_pair, AccessType access_type, StorageFlags flags) - : m_sparse_index(dram_pair, access_type, {}, &m_change_log, flags) - , m_diff_index(dram_pair, access_type, getDiffIndexAddress(m_sparse_index, flags), &m_change_log, flags) + template + SparsePairBase::SparsePairBase(tag_create, DRAM_Pair dram_pair, Allocator::SlotId slot_num, + ChangeLogT *change_log) + : m_change_log(change_log ? change_log : &m_owned_change_log) + , m_dram_space(DRAMSpace::create(dram_pair)) + , m_sparse_index(typename SparseIndexT::tag_create(), dram_pair, m_change_log, slot_num) + , m_diff_index(DiffIndex::tag_create(), dram_pair, m_change_log, slot_num) { + // validate SparseIndex address + assert(m_sparse_index.getIndexAddress() == dram_pair.second->firstAlloc(slot_num)); + // write in the Sparse Index header + storeDiffIndexAddresses(); } - SparsePair::SparsePair(tag_create, DRAM_Pair dram_pair) - : m_sparse_index(SparseIndex::tag_create(), dram_pair, &m_change_log) - , m_diff_index(DiffIndex::tag_create(), dram_pair, &m_change_log) + template + std::optional::PageNumT> SparsePairBase::getNextStoragePageNum() const { - // store the diff-index's address as extra data in the sparse index - m_sparse_index.setExtraData(m_diff_index.getIndexAddress().getOffset()); + if constexpr (ConfigT::has_storage_root_metadata) { + return m_sparse_index.mixIn().getNextStoragePageNum(); + } else { + return std::nullopt; + } } - SparsePair::~SparsePair() + template + typename SparsePairBase::StateNumT SparsePairBase::getMaxStateNum() const { + if constexpr (ConfigT::has_storage_root_metadata) { + return 
m_sparse_index.mixIn().getMaxStateNum(); + } else { + return 0; + } } - - std::optional SparsePair::getNextStoragePageNum() const { - return optional_max(m_sparse_index.getNextStoragePageNum(), m_diff_index.getNextStoragePageNum()); + + template + Address SparsePairBase::getAddress() const + { + return m_sparse_index.getIndexAddress(); } - typename SparsePair::StateNumT SparsePair::getMaxStateNum() const { - return std::max(m_sparse_index.getMaxStateNum(), m_diff_index.getMaxStateNum()); + template + void SparsePairBase::recordMaxStateNum(StateNumT state_num) + { + if constexpr (ConfigT::has_storage_root_metadata) { + m_sparse_index.modifyMixIn().recordMaxStateNum(state_num); + } else { + (void)state_num; + } } - - void SparsePair::refresh() + + template + void SparsePairBase::recordNextStoragePageNum(PageNumT next_page_num) { - m_sparse_index.refresh(); - // A read-only storage may be opened before the writer's DRAM changelog - // update is visible, leaving SparsePair with unopened indexes. Refreshing - // later can apply the DRAM pages that contain the sparse index, but the - // diff index address is only available from the freshly opened sparse - // index header. Without reopening the diff index from that address, - // BDevStorage::completeRefresh() can see a DRAM changelog state ahead of - // getMaxStateNum() and report a false inconsistency. - // - // Reproduced by BDevStorageTest.testNoLoadReaderCanRefreshAfterWriterCommit - // and observed as intermittent Python failures in - // test_refreshing_group_by_results on concurrent read-only open. 
- if (!!m_sparse_index.m_index) { - auto diffIndexAddress = Address::fromOffset(m_sparse_index.getExtraData()); - if (!m_diff_index.isOpen() || m_diff_index.getIndexAddress() != diffIndexAddress) { - m_diff_index.reopen(diffIndexAddress); - } else { - m_diff_index.refresh(); - } + if constexpr (ConfigT::has_storage_root_metadata) { + m_sparse_index.modifyMixIn().recordNextStoragePageNum(next_page_num); } else { + (void)next_page_num; + } + } + + template + void SparsePairBase::refresh() + { + m_sparse_index.refresh(); + if (!!m_diff_index) { m_diff_index.refresh(); } } - std::size_t SparsePair::size() const { + template + void SparsePairBase::detach() const + { + m_sparse_index.detach(); + m_diff_index.detach(); + } + + template + std::size_t SparsePairBase::size() const + { return m_sparse_index.size() + m_diff_index.size(); } - - bool SparsePair::empty() const { + + template + bool SparsePairBase::empty() const + { return m_sparse_index.empty() && m_diff_index.empty(); } - - const SparsePair::DP_ChangeLogT &SparsePair::extractChangeLog(DP_ChangeLogStreamT &changelog_io, - std::uint64_t end_storage_page_num) - { - std::sort(m_change_log.begin(), m_change_log.end()); - ChangeLogData cl_data; - // add page numbers (logical) with deduplication - for (auto page_num : m_change_log) { - cl_data.m_rle_builder.append(page_num, false); + + template + void SparsePairBase::commit() const + { + m_sparse_index.commit(); + m_diff_index.commit(); + } + + template + void SparsePairBase::forUniquePageRange(PageNumT first_page_num, PageNumT end_page_num, + std::function callback) const + { + if (first_page_num >= end_page_num) { + return; + } + + if (!m_diff_index) { + m_sparse_index.forUniquePageRange(first_page_num, end_page_num, + [&](const auto &item) { + callback(item.m_page_num); + }); + return; + } + + SparsePairUniquePageRangeIterator it( + m_sparse_index, m_diff_index, first_page_num, end_page_num); + while (!it.is_end()) { + callback(*it); + ++it; + } + } + + template + 
void SparsePairBase::forUniquePageRange(std::function callback) const + { + SparsePairUniquePageRangeIterator it(m_sparse_index, m_diff_index); + while (!it.is_end()) { + callback(*it); + ++it; } - - // RLE encode, no duplicates - auto &result = changelog_io.appendChangeLog( - std::move(cl_data), this->getMaxStateNum(), end_storage_page_num - ); - m_change_log.clear(); - return result; } - std::size_t SparsePair::getChangeLogSize() const { - return m_change_log.size(); + template + std::size_t SparsePairBase::getChangeLogSize() const + { + return m_change_log ? m_change_log->size() : 0; + } + + template + Address SparsePairBase::getDiffIndexAddress( + const SparseIndexT &sparse_index) + { + if (!sparse_index) { + return {}; + } + return Address::fromOffset(sparse_index.mixIn().getExtraData()); } - void SparsePair::commit() + template + void SparsePairBase::storeDiffIndexAddresses() { - m_sparse_index.commit(); - m_diff_index.commit(); + m_sparse_index.modifyMixIn().setExtraData(m_diff_index.getIndexAddress().getOffset()); } - Address SparsePair::getDiffIndexAddress(const SparseIndex &sparse_index, StorageFlags flags) - { - assert(!!sparse_index || flags[StorageOptions::NO_LOAD]); - if (!!sparse_index) { - return Address::fromOffset(sparse_index.getExtraData()); + template + typename SparsePairBase::ChangeLogT SparsePairBase::extractChangeLogPages() + { + if (m_change_log != &m_owned_change_log) { + THROWF(db0::InternalException) << "extractChangeLogPages is only supported for SparsePair instances with owned change log"; } - // NOTE: address may not be available if NO_LOAD flag is set - return {}; + ChangeLogT page_nums; + page_nums.swap(m_owned_change_log); + return page_nums; } + template class SparsePairBase; + template class SparsePairBase; + } diff --git a/src/dbzero/core/storage/SparsePair.hpp b/src/dbzero/core/storage/SparsePair.hpp index cbf1650b..981d75a2 100644 --- a/src/dbzero/core/storage/SparsePair.hpp +++ b/src/dbzero/core/storage/SparsePair.hpp @@ 
-4,76 +4,136 @@ #pragma once #include +#include "SparsePairFwd.hpp" #include "SparseIndex.hpp" #include "DiffIndex.hpp" #include "BaseStorage.hpp" #include "ChangeLogIOStream.hpp" #include "StorageFlags.hpp" +#include +#include +#include +#include +#include +#include +#include namespace db0 { - - // The SparsePair combines SparseIndex and DiffIndex - class SparsePair + namespace detail + { + template + std::optional advancePageIteratorPast(IteratorT &it, PageNumT page_num) + { + while (!it.is_end()) { + auto item = *it; + PageNumT item_page_num = item.m_page_num; + if (item_page_num > page_num) { + return item_page_num; + } + ++it; + } + return std::nullopt; + } + } + + struct RootSparsePairConfig + { + using SparseIndexT = RootSparseIndex; + static constexpr bool has_storage_root_metadata = true; + }; + + struct PlainSparsePairConfig + { + using SparseIndexT = PlainSparseIndex; + static constexpr bool has_storage_root_metadata = false; + }; + + /** + * Combines SparseIndex and DiffIndex. + * + * The root configuration stores storage-level high-water metadata in the + * sparse-index root mix-in. The plain configuration is used by + * SparsePairManager and keeps that sparse-index mix-in empty; it only adds a + * tiny pair header so the paired sparse/diff index addresses can be opened. 
+ */ + template class SparsePairBase { public: - using PageNumT = SparseIndex::PageNumT; - using StateNumT = SparseIndex::StateNumT; - using tag_create = SparseIndex::tag_create; - using DP_ChangeLogT = BaseStorage::DP_ChangeLogT; - using DP_ChangeLogStreamT = db0::ChangeLogIOStream; - - SparsePair(std::size_t node_size); - SparsePair(DRAM_Pair, AccessType, StorageFlags = {}); - SparsePair(tag_create, DRAM_Pair); - - ~SparsePair(); + using Config = ConfigT; + using SlotId = Allocator::SlotId; + using SparseIndexT = typename ConfigT::SparseIndexT; + using PageNumT = typename SparseIndexT::PageNumT; + using StateNumT = typename SparseIndexT::StateNumT; + using tag_create = typename SparseIndexT::tag_create; + + using ChangeLogT = std::vector; + using ChangeLogEntryT = std::uint64_t; - inline SparseIndex &getSparseIndex() { + SparsePairBase(DRAM_Pair, AccessType, Address, StorageFlags = {}, SlotId slot_num = 0, + ChangeLogT *change_log = nullptr); + SparsePairBase(tag_create, DRAM_Pair, SlotId slot_num = 0, ChangeLogT *change_log = nullptr); + + inline SparseIndexT &getSparseIndex() { return m_sparse_index; } - - inline const SparseIndex &getSparseIndex() const { + + inline const SparseIndexT &getSparseIndex() const { return m_sparse_index; } - + inline DiffIndex &getDiffIndex() { return m_diff_index; } - + inline const DiffIndex &getDiffIndex() const { return m_diff_index; - } + } - // combine from both underlyig indexes std::optional getNextStoragePageNum() const; - // combine from both underlyig indexes StateNumT getMaxStateNum() const; - + + Address getAddress() const; + + void recordMaxStateNum(StateNumT state_num); + + void recordNextStoragePageNum(PageNumT); + bool empty() const; std::size_t size() const; void refresh(); + + void detach() const; - /** - * Write internally managed change log into a specific stream - * and then clean the internal change log - */ - const DP_ChangeLogT &extractChangeLog(DP_ChangeLogStreamT &, std::uint64_t end_storage_page_num); + 
void commit() const; + + void forUniquePageRange(PageNumT first_page_num, PageNumT end_page_num, + std::function callback) const; + void forUniquePageRange(std::function callback) const; std::size_t getChangeLogSize() const; - - void commit(); - + + // only supported with owned change log + ChangeLogT extractChangeLogPages(); + private: - // Change log contains the list of updates (modified items / page numbers) - std::vector m_change_log; - SparseIndex m_sparse_index; + // owned change log used only for non-managed root instances + ChangeLogT m_owned_change_log; + ChangeLogT *m_change_log; + Memspace m_dram_space; + // Sparse Index is created at the root address (or the slot's first address) + // and in its header it stores the address of the diff index + SparseIndexT m_sparse_index; DiffIndex m_diff_index; - static Address getDiffIndexAddress(const SparseIndex &, StorageFlags); + static Address getDiffIndexAddress(const SparseIndexT &); + void storeDiffIndexAddresses(); }; - -} \ No newline at end of file + + extern template class SparsePairBase; + extern template class SparsePairBase; + +} diff --git a/src/dbzero/core/storage/SparsePairFwd.hpp b/src/dbzero/core/storage/SparsePairFwd.hpp new file mode 100644 index 00000000..6dac47e4 --- /dev/null +++ b/src/dbzero/core/storage/SparsePairFwd.hpp @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +// Copyright (c) 2025 DBZero Software sp. z o.o. 
+ +#pragma once + +namespace db0 + +{ + + struct RootSparsePairConfig; + struct PlainSparsePairConfig; + template class SparsePairBase; + + using RootSparsePair = SparsePairBase; + using PlainSparsePair = SparsePairBase; + using SparsePair = RootSparsePair; + +} diff --git a/src/dbzero/core/storage/SparsePairManager.cpp b/src/dbzero/core/storage/SparsePairManager.cpp new file mode 100644 index 00000000..9941defb --- /dev/null +++ b/src/dbzero/core/storage/SparsePairManager.cpp @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +// Copyright (c) 2025 DBZero Software sp. z o.o. + +#include "SparsePairManager.hpp" +#include +#include +#include +#include +#include + +namespace db0 + +{ + + SparsePairManager::SparsePairManager(MS_MetaSpace &metaspace, AccessType access_type, + StorageFlags flags, MappingPolicy mapping_policy) + : m_prefix(metaspace.getMSPrefixPtr()) + , m_allocator(metaspace.getMSAllocatorPtr()) + , m_ps_shift(db0::getPageShift(m_prefix->getPageSize())) + , m_mapping_policy(mapping_policy) + , m_access_type(access_type) + , m_flags(flags) + { + if (mapping_policy == MappingPolicy::eager) { + // fully initialize with "eager" mapping policy + db0::load(*m_prefix, *m_allocator); + } + } + + PlainSparsePair *SparsePairManager::tryGetCached(Allocator::SlotId slot_id) const noexcept + { + if (m_hot_pair && m_hot_slot_id == slot_id) { + return m_hot_pair; + } + + auto it = m_pairs.find(slot_id); + if (it == m_pairs.end()) { + return nullptr; + } + cacheHotPair(slot_id, *it->second); + return it->second.get(); + } + + PlainSparsePair &SparsePairManager::getOrCreate(Allocator::SlotId slot_id) + { + if (auto *existing = tryGetExisting(slot_id)) { + return *existing; + } + + // Create new sparse pair over a newly created slot + auto dram_pair = createDRAMPair(slot_id); + auto sparse_pair = std::make_unique( + PlainSparsePair::tag_create(), dram_pair, slot_id, &m_change_log); + // assert it was allocated at the expected address (1st alloc of the slot) 
+ assert(sparse_pair->getAddress() == m_allocator->firstAlloc(slot_id)); + auto *result = sparse_pair.get(); + m_pairs.insert_or_assign(slot_id, std::move(sparse_pair)); + cacheHotPair(slot_id, *result); + return *result; + } + + // Returns the sparse pair of an already existing slot, opening and caching it on first use. + // Returns nullptr when the slot cannot be loaded or exposes no root allocation. + // When is_new_slot is given, it is set to true only if the slot had to be loaded by this call. + PlainSparsePair *SparsePairManager::tryGetExisting(Allocator::SlotId slot_id, bool *is_new_slot) const + { + auto cached_it = m_pairs.find(slot_id); + if (cached_it != m_pairs.end()) { + cacheHotPair(slot_id, *cached_it->second); + if (is_new_slot) { + *is_new_slot = false; + } + return cached_it->second.get(); + } + + // Try opening an existing slot if not cached + auto root_address = m_allocator->tryFirstAlloc(slot_id); + if (!root_address) { + if (!m_prefix->tryLoadSlot(slot_id, *m_allocator)) { + // slot has no data yet, cannot be loaded + return nullptr; + } + root_address = m_allocator->tryFirstAlloc(slot_id); + if (!root_address) { + // the slot was loaded but still exposes no root allocation: nothing to open, + // and dereferencing root_address below would be invalid + return nullptr; + } + if (is_new_slot) { + *is_new_slot = true; + } + } else if (is_new_slot) { + *is_new_slot = false; + } + + // sparse pair is located at the slot's root address + // Open existing sparse pair over an already existing slot + auto dram_pair = createDRAMPair(slot_id); + auto flags = m_flags & ~StorageFlags { StorageFlagOption::NO_LOAD }; + auto sparse_pair = std::make_unique( + dram_pair, m_access_type, *root_address, flags, slot_id, &m_change_log); + auto *result = sparse_pair.get(); + m_pairs.insert_or_assign(slot_id, std::move(sparse_pair)); + cacheHotPair(slot_id, *result); + return result; + } + + void SparsePairManager::evictSlot(Allocator::SlotId slot_id) + { + auto pair_it = m_pairs.find(slot_id); + if (pair_it == m_pairs.end()) { + return; + } + if (m_hot_pair == pair_it->second.get()) { + m_hot_pair = nullptr; + } + pair_it->second->detach(); + m_pairs.erase(pair_it); + } + + void SparsePairManager::refreshPages(const std::vector &page_nums) + { + if (page_nums.empty()) { + return; + } + + // Refresh pages from a single specific slot only + auto refresh_slot = [&](std::uint64_t slot_id, const std::uint64_t *begin, const 
std::uint64_t *end) + { + if (begin == end) { + // no pages to refresh, just return + return; + } + + // Use different paths depending on mapping policy + PlainSparsePair *sparse_pair = nullptr; + if (m_mapping_policy == MappingPolicy::eager) { + bool is_new_slot = false; + sparse_pair = tryGetExisting(slot_id, &is_new_slot); + if (is_new_slot) { + // no need for refreshing since the slot is newly loaded + return; + } + } else { + sparse_pair = tryGetCached(slot_id); + } + + if (!sparse_pair) { + return; + } + + // detach before reloading / refreshing + sparse_pair->detach(); + db0::load(*m_prefix, begin, end); + sparse_pair->refresh(); + + // also update the allocator if it's needed + auto updater = m_allocator->tryBeginUpdate(slot_id); + // NOTE: updater may not be available if the update not needed + if (!!updater) { + for (;begin != end; ++begin) { + // update with the local address + updater(MS_Address::from(*begin << m_ps_shift).local_address()); + } + } + }; + + // page_nums are sorted + // we can scan them refreshing slot by slot, only existing slots need refreshing + // but newly added slots should be loaded when the mapping policy == eager + const std::uint64_t *current = page_nums.data(); + const std::uint64_t *end = current; + std::uint64_t last_slot_id = 0; + for (auto page_num: page_nums) { + auto slot_id = MS_Address::from(page_num << m_ps_shift).slot_id(); + if (slot_id != last_slot_id) { + assert(slot_id > last_slot_id); + refresh_slot(last_slot_id, current, end); + // move on to the next slot + last_slot_id = slot_id; + current = end; + } + ++end; + } + + refresh_slot(last_slot_id, current, end); + m_prefix->refresh(); + } + + void SparsePairManager::forCachedPairs(std::function callback) + { + for (auto &item: m_pairs) { + callback(item.first, *item.second); + } + } + + std::size_t SparsePairManager::getChangeLogSize() const + { + return m_change_log.size(); + } + + SparsePairManager::ChangeLogT SparsePairManager::extractChangeLogPages() + { + 
ChangeLogT page_nums; + page_nums.swap(m_change_log); + return page_nums; + } + + bool SparsePairManager::commit() + { + if (m_change_log.empty()) { + return false; + } + + // Identify dirty slots from the change log and commit them (once) + std::unordered_set committed_slots; + for (auto entry: m_change_log) { + auto slot_id = MS_Address::from(entry << m_ps_shift).slot_id(); + if (!committed_slots.insert(slot_id).second) { + continue; + } + + auto pair_it = m_pairs.find(slot_id); + if (pair_it != m_pairs.end()) { + pair_it->second->commit(); + } + } + return true; + } + + DRAM_Pair SparsePairManager::createDRAMPair(Allocator::SlotId slot_id) const + { + (void)slot_id; + return { m_prefix, m_allocator }; + } + + void SparsePairManager::cacheHotPair(Allocator::SlotId slot_id, PlainSparsePair &sparse_pair) const noexcept + { + m_hot_slot_id = slot_id; + m_hot_pair = &sparse_pair; + } + +} diff --git a/src/dbzero/core/storage/SparsePairManager.hpp b/src/dbzero/core/storage/SparsePairManager.hpp new file mode 100644 index 00000000..47b8aea2 --- /dev/null +++ b/src/dbzero/core/storage/SparsePairManager.hpp @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +// Copyright (c) 2025 DBZero Software sp. z o.o. + +#pragma once + +#include "SparsePair.hpp" +#include +#include +#include +#include +#include +#include +#include + +namespace db0 + +{ + + class DRAM_Prefix; + class MS_MetaAllocator; + + enum class MappingPolicy + { + eager, + lazy + }; + + /** + * Owns per-slot SparsePair instances stored inside one MS_MetaSpace. + * + * Each managed SparsePair uses the shared MS_MetaSpace prefix, but all of + * its internal sparse/diff index allocations are forced into the requested + * MS_MetaSpace slot. This lets callers keep independent sparse mappings for + * sparse slot ids while preserving the MetaSpace-level persistence and flush + * behavior. 
+ * + * The manager requires a typed MS_MetaSpace, not a generic Memspace, because + * it needs access to MS_MetaAllocator slot metadata to open an existing + * SparsePair root allocation without scanning unrelated slots. Repeated + * lookups are optimized for the common same-slot case with a last-hit + * pointer before falling back to the slot-id map. + * + * SparsePairManager is scoped to one MS_MetaSpace instance and does not add + * synchronization; callers must provide external locking if they share it + * across threads. + */ + class SparsePairManager + { + public: + using ChangeLogT = PlainSparsePair::ChangeLogT; + using SlotId = Allocator::SlotId; + + SparsePairManager(MS_MetaSpace &metaspace, AccessType access_type = AccessType::READ_WRITE, + StorageFlags flags = {}, MappingPolicy = MappingPolicy::eager); + + PlainSparsePair &getOrCreate(SlotId slot_id); + + PlainSparsePair *tryGetExisting(SlotId, bool *is_new_slot = nullptr) const; + + PlainSparsePair *tryGetCached(SlotId) const noexcept; + + void evictSlot(SlotId slot_id); + + void refreshPages(const std::vector &page_nums); + + void forCachedPairs(std::function callback); + + std::size_t getChangeLogSize() const; + + ChangeLogT extractChangeLogPages(); + + bool commit(); + + private: + std::shared_ptr m_prefix; + std::shared_ptr m_allocator; + const std::uint32_t m_ps_shift; + const MappingPolicy m_mapping_policy; + AccessType m_access_type; + StorageFlags m_flags; + // shared change log for all managed pairs; commit() only reads it, extractChangeLogPages() drains it + // it contains page numbers which after translating to MS_Address also reveal slot IDs + mutable ChangeLogT m_change_log; + + mutable std::unordered_map > m_pairs; + mutable SlotId m_hot_slot_id = 0; + mutable PlainSparsePair *m_hot_pair = nullptr; + + DRAM_Pair createDRAMPair(SlotId) const; + + void cacheHotPair(SlotId, PlainSparsePair &) const noexcept; + }; + +} diff --git a/src/dbzero/core/storage/SparsePairQuery.cpp b/src/dbzero/core/storage/SparsePairQuery.cpp new file 
mode 100644 index 00000000..c68a1a0c --- /dev/null +++ b/src/dbzero/core/storage/SparsePairQuery.cpp @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +// Copyright (c) 2025 DBZero Software sp. z o.o. + +#include "SparsePairQuery.hpp" +#include + +namespace db0 + +{ + + template + SparsePairQuery::SparsePairQuery(const StorageOptions &options, std::uint32_t page_size, + std::uint64_t begin_page_num, std::uint64_t end_page_num, + SparsePairManager &sparse_pair_manager) + : m_options(options) + , m_page_size(page_size) + , m_page_num(begin_page_num) + , m_end_page_num(end_page_num) + , m_sparse_pair_manager(sparse_pair_manager) + , m_use_bucket_mapping(end_page_num - begin_page_num >= 2 && !!m_options.m_storage_slab_bucket) + { + if (m_use_bucket_mapping) { + if constexpr (read_only) { + initSparsePair(begin_page_num); + } else { + initOrCreateSparsePair(begin_page_num); + } + } + } + + template + SparsePairQuery &SparsePairQuery::operator++() + { + assert(hasNext() && "SparsePairQuery page range exhausted"); + ++m_page_num; + return *this; + } + + template + Allocator::SlotId SparsePairQuery::slotId() const + { + assert(m_slot_initialized && "SparsePairQuery slot requested before current lookup"); + assert((!m_use_bucket_mapping || m_page_num < m_bucket_end_page_num) + && "SparsePairQuery slot requested past current bucket"); + return m_slot_id; + } + + template + PlainSparsePair *SparsePairQuery::currentSparsePair() + { + if (!m_use_bucket_mapping) { + m_slot_id = getMetaSlotId(m_page_num); + m_slot_initialized = true; + return m_sparse_pair_manager.tryGetExisting(m_slot_id); + } + if (m_page_num >= m_bucket_end_page_num) { + initSparsePair(m_page_num); + } + return m_sparse_pair; + } + + template + PlainSparsePair &SparsePairQuery::currentOrCreateSparsePair() + { + if (!m_use_bucket_mapping) { + m_slot_id = getMetaSlotId(m_page_num); + m_slot_initialized = true; + return m_sparse_pair_manager.getOrCreate(m_slot_id); + } + if (m_page_num >= 
m_bucket_end_page_num) { + initOrCreateSparsePair(m_page_num); + } + assert(m_sparse_pair && "SparsePairQuery get-or-create lookup returned null"); + return *m_sparse_pair; + } + + template + Allocator::SlotId SparsePairQuery::getMetaSlotId(std::uint64_t page_num) const + { + auto address = page_num * static_cast(m_page_size); + return m_options.m_storage_slab_bucketing(address); + } + + template + StorageOptions::StorageSlabBucket SparsePairQuery::getBucket(std::uint64_t page_num) const + { + auto page_address = page_num * static_cast(m_page_size); + return m_options.m_storage_slab_bucket(page_address); + } + + template + void SparsePairQuery::setBucketEndPageNum( + const StorageOptions::StorageSlabBucket &bucket, std::uint64_t page_num) + { + assert(page_num >= bucket.m_begin_page_num && "SparsePairQuery bucket does not cover begin page"); + assert(page_num < bucket.m_end_page_num && "SparsePairQuery bucket does not cover begin page"); + m_bucket_end_page_num = bucket.m_end_page_num; + } + + template + void SparsePairQuery::initSparsePair(std::uint64_t page_num) + { + auto bucket = getBucket(page_num); + setBucketEndPageNum(bucket, page_num); + m_slot_id = bucket.m_slot_id; + m_slot_initialized = true; + m_sparse_pair = m_sparse_pair_manager.tryGetExisting(m_slot_id); + } + + template + void SparsePairQuery::initOrCreateSparsePair(std::uint64_t page_num) + { + auto bucket = getBucket(page_num); + setBucketEndPageNum(bucket, page_num); + m_slot_id = bucket.m_slot_id; + m_slot_initialized = true; + m_sparse_pair = &m_sparse_pair_manager.getOrCreate(m_slot_id); + } + + template class SparsePairQuery; + template class SparsePairQuery; + +} diff --git a/src/dbzero/core/storage/SparsePairQuery.hpp b/src/dbzero/core/storage/SparsePairQuery.hpp new file mode 100644 index 00000000..bcde1488 --- /dev/null +++ b/src/dbzero/core/storage/SparsePairQuery.hpp @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +// Copyright (c) 2025 DBZero Software sp. z o.o. 
+ +#pragma once + +#include "SparsePairManager.hpp" +#include "StorageOptions.hpp" +#include + +namespace db0 + +{ + + // Retrieve the managed sparse pairs corresponding to the storage logical page numbers + template + class SparsePairQuery + { + public: + // NOTE: begin_page_num / end_page_num are logical main storage page numbers + // the slot_num is assigned by bucketing them into slabs (slab ID -> slot num) + SparsePairQuery(const StorageOptions &options, std::uint32_t page_size, + std::uint64_t begin_page_num, std::uint64_t end_page_num, + SparsePairManager &sparse_pair_manager); + + std::uint64_t pageNum() const + { + return m_page_num; + } + + bool hasNext() const + { + return m_page_num < m_end_page_num; + } + + Allocator::SlotId slotId() const; + + SparsePairQuery &operator++(); + + PlainSparsePair *currentSparsePair(); + PlainSparsePair ¤tOrCreateSparsePair(); + + private: + Allocator::SlotId getMetaSlotId(std::uint64_t page_num) const; + StorageOptions::StorageSlabBucket getBucket(std::uint64_t page_num) const; + void setBucketEndPageNum(const StorageOptions::StorageSlabBucket &bucket, std::uint64_t page_num); + void initSparsePair(std::uint64_t page_num); + void initOrCreateSparsePair(std::uint64_t page_num); + + const StorageOptions &m_options; + const std::uint32_t m_page_size; + std::uint64_t m_page_num; + const std::uint64_t m_end_page_num; + SparsePairManager &m_sparse_pair_manager; + const bool m_use_bucket_mapping; + Allocator::SlotId m_slot_id = 0; + bool m_slot_initialized = false; + std::uint64_t m_bucket_end_page_num = 0; + PlainSparsePair *m_sparse_pair = nullptr; + }; + + extern template class SparsePairQuery; + extern template class SparsePairQuery; + +} diff --git a/src/dbzero/core/storage/StorageFlags.hpp b/src/dbzero/core/storage/StorageFlags.hpp index 46dd2dbf..74192f75 100644 --- a/src/dbzero/core/storage/StorageFlags.hpp +++ b/src/dbzero/core/storage/StorageFlags.hpp @@ -3,18 +3,19 @@ #pragma once +#include #include namespace db0 { - 
enum class StorageOptions : std::uint16_t + enum class StorageFlagOption : std::uint16_t { // Prevents loading any data into memory (e.g. when opening for copying) NO_LOAD = 0x0001, }; - using StorageFlags = FlagSet; + using StorageFlags = FlagSet; } diff --git a/src/dbzero/core/storage/StorageOptions.hpp b/src/dbzero/core/storage/StorageOptions.hpp new file mode 100644 index 00000000..958984e7 --- /dev/null +++ b/src/dbzero/core/storage/StorageOptions.hpp @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +// Copyright (c) 2025 DBZero Software sp. z o.o. + +#pragma once + +#include +#include +#include +#include +#include +#include "SparsePairManager.hpp" + +namespace db0 + +{ + + struct StorageOptions + { + using StorageSlabBucket = MetaAllocator::StorageSlabBucketingFunction::Bucket; + + MappingPolicy m_meta_mapping_policy = MappingPolicy::eager; + + /** + * Maps a raw application storage byte address to the meta-space slot that + * hosts the SparsePair metadata for pages in that address bucket. + */ + std::function m_storage_slab_bucketing; + + /** + * Extended storage bucketing API. + * + * Returns the same meta-space slot id as m_storage_slab_bucketing plus the + * half-open logical page span covered by the slot. This is populated by + * defaults and is used for multi-page read/write lookups. + */ + std::function m_storage_slab_bucket; + }; + +} diff --git a/src/dbzero/core/storage/StorageRootMetadata.hpp b/src/dbzero/core/storage/StorageRootMetadata.hpp new file mode 100644 index 00000000..acaf2cdd --- /dev/null +++ b/src/dbzero/core/storage/StorageRootMetadata.hpp @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +// Copyright (c) 2025 DBZero Software sp. z o.o. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace db0 +{ + +DB0_PACKED_BEGIN + // Persisted tree-level metadata for the root storage index. 
+ struct DB0_PACKED_ATTR o_storage_root_metadata: o_fixed_versioned + { + // page_io stream positioning variable + std::uint64_t m_next_page_num = 0; + std::uint32_t m_max_state_num = 0; + // The extra-data slot currently stores the paired diff-index address. + std::uint64_t m_extra_data = 0; + // reserved for future use + std::array m_reserved = {0, 0, 0}; + }; +DB0_PACKED_END + +DB0_PACKED_BEGIN + // Persisted tree-level metadata for the Plain Storage Index + struct DB0_PACKED_ATTR o_plain_metadata: o_fixed_versioned + { + // The extra-data slot currently stores the paired diff-index address. + std::uint64_t m_extra_data = 0; + // reserved for future use + std::array m_reserved = {0, 0}; + }; +DB0_PACKED_END + + template class MetadataAPI + { + public: + using PageNumT = typename BaseT::PageNumT; + using StateNumT = typename BaseT::StateNumT; + using SI_ItemT = typename BaseT::SI_ItemT; + using SI_CompressedItemT = typename BaseT::SI_CompressedItemT; + using tag_create = typename BaseT::tag_create; + + explicit MetadataAPI(BaseT &base) + : m_base(&base) + { + refresh(); + } + + void setExtraData(std::uint64_t data) + { + m_base->m_index.modifyTreeHeader().m_extra_data = data; + } + + std::uint64_t getExtraData() const + { + return m_base->m_index.treeHeader().m_extra_data; + } + + void refresh() {} + + protected: + BaseT *m_base; + }; + + template + class StorageRootMetadataAPI: public MetadataAPI + { + public: + using PageNumT = typename BaseT::PageNumT; + using StateNumT = typename BaseT::StateNumT; + using SI_ItemT = typename BaseT::SI_ItemT; + using SI_CompressedItemT = typename BaseT::SI_CompressedItemT; + using tag_create = typename BaseT::tag_create; + + explicit StorageRootMetadataAPI(BaseT &base) + : MetadataAPI(base) + { + if (!!base) { + this->refresh(); + } + } + + void refresh() + { + auto &header = this->m_base->m_index.treeHeader(); + m_next_page_num = header.m_next_page_num; + m_max_state_num = header.m_max_state_num; + } + + std::optional 
getNextStoragePageNum() const + { + if (m_next_page_num == 0) { + return std::nullopt; + } + return m_next_page_num; + } + + StateNumT getMaxStateNum() const + { + return m_max_state_num; + } + + void recordMaxStateNum(StateNumT state_num) + { + if (state_num >= m_max_state_num && state_num != 0) { + m_max_state_num = state_num; + this->m_base->m_index.modifyTreeHeader().m_max_state_num = state_num; + } + } + + void recordNextStoragePageNum(PageNumT next_page_num) + { + if (next_page_num > m_next_page_num) { + m_next_page_num = next_page_num; + this->m_base->m_index.modifyTreeHeader().m_next_page_num = next_page_num; + } + } + + private: + PageNumT m_next_page_num = 0; + StateNumT m_max_state_num = 0; + }; + + struct StorageRootMetadataMixin + { + using OverlayT = o_storage_root_metadata; + template using ApiT = StorageRootMetadataAPI; + }; + + struct PlainMetadataMixin + { + using OverlayT = o_plain_metadata; + template using ApiT = MetadataAPI; + }; + + struct EmptyMixin + { + using OverlayT = o_plain_metadata; + template using ApiT = MetadataAPI; + }; + +} diff --git a/src/dbzero/core/storage/copy_prefix.cpp b/src/dbzero/core/storage/copy_prefix.cpp index 586058eb..3fa78f2e 100644 --- a/src/dbzero/core/storage/copy_prefix.cpp +++ b/src/dbzero/core/storage/copy_prefix.cpp @@ -51,29 +51,32 @@ namespace db0 } std::optional copyDRAM_IO(DRAM_IOStream &input_io, DRAM_ChangeLogStreamT &input_dram_changelog, - DRAM_IOStream &output_io, DRAM_ChangeLogStreamT::Writer &output_dram_changelog) + DRAM_IOStream &output_io, DRAM_ChangeLogStreamT::Writer &output_dram_changelog, + std::optional max_state_num) { using DRAM_ChangeLogT = DRAM_IOStream::DRAM_ChangeLogT; // Exhaust the input_dram_changelog first // NOTE: we don't need to copy the changelog, just insert an empty item with the latest state number input_dram_changelog.setStreamPosHead(); + std::optional maybe_state_num; for (;;) { - while (input_dram_changelog.readChangeLogChunk()); + while (auto change_log = 
input_dram_changelog.readChangeLogChunk()) { + if (!max_state_num || change_log->m_state_num <= *max_state_num) { + maybe_state_num = change_log->m_state_num; + } + } // continue refreshing until reaching the most recent state if (!input_dram_changelog.refresh()) { break; } } - auto last_chunk_ptr = input_dram_changelog.getLastChangeLogChunk(); - if (!last_chunk_ptr) { + if (!maybe_state_num) { // looks like the DRAM IO is empty return {}; } - - // retrieve the state number candidate - auto state_num = last_chunk_ptr->m_state_num; + auto state_num = *maybe_state_num; // Copy the entire DRAM_IO stream next (possibly inconsistent state) // collecting the mapping of chunk addresses @@ -98,6 +101,13 @@ namespace db0 copyStream(input_io, output_io, &chunk_addr_map, chunk_filter); + if (max_state_num) { + output_dram_changelog.appendChangeLog({}, state_num); + output_io.addChunk(0); + output_io.BlockIOStream::flush(); + return state_num; + } + // NOTE: the operation might need to be repeated multiple times // if unable to reach a consistent state in one pass (this might be due to a very slow reader process) for (;;) { @@ -105,15 +115,12 @@ namespace db0 // NOTE: in this step we prefetch to memory to be able to catch up with changes std::unordered_map > chunk_buf; while (input_dram_changelog.refresh()) { - fetchDRAM_IOChanges(input_io, input_dram_changelog, chunk_buf); + if (auto maybe_state_num = fetchDRAM_IOChanges(input_io, input_dram_changelog, chunk_buf)) { + // this is the actually copied last consistent state number + state_num = *maybe_state_num; + } } - last_chunk_ptr = input_dram_changelog.getLastChangeLogChunk(); - assert(last_chunk_ptr); - - // this is the actually copied last consistent state number - state_num = last_chunk_ptr->m_state_num; - // NOTE: at this stage we might also encounter incomplete // or new chunks beyond the copied stream which needs to be discarded chunk_buf = filterDRAM_Chunks(std::move(chunk_buf), dram_filter); @@ -134,7 +141,8 @@ 
namespace db0 } } - output_io.close(); + output_io.addChunk(0); + output_io.BlockIOStream::flush(); return state_num; } @@ -186,33 +194,27 @@ namespace db0 std::optional copyDPStream(DP_ChangeLogStreamT &in, DP_ChangeLogStreamT &out, StateNumType max_state_num) { - using DP_ChangeLogT = DP_ChangeLogStreamT::ChangeLogT; - auto chunk_filter = [&](const std::vector &buffer, const void *data_end) -> bool - { - const auto &header = DP_ChangeLogT::__const_ref(buffer.data()); - // only include chunks up to max_state_num - if (header.m_state_num == max_state_num) { - // NOTE: this is the last chunk, we include it and stop further copying - auto chunk_size = (char*)data_end - buffer.data(); - out.addChunk(chunk_size); - out.appendToChunk(buffer.data(), chunk_size); - return false; + using o_change_log_t = DP_ChangeLogStreamT::ChangeLogT; + in.setStreamPosHead(); + std::vector buffer; + std::size_t chunk_size = 0; + std::optional result; + while ((chunk_size = in.readChunk(buffer)) > 0) { + const auto &header = o_change_log_t::__const_ref(buffer.data()); + if (header.m_state_num > max_state_num) { + break; } - return header.m_state_num < max_state_num; - }; - - // NOTE: we use copy_all = false to stop on the first non-matching chunk - // since chunks are ordered by state number - auto last_chunk_buf = copyStream(in, out, nullptr, chunk_filter, false); - // we can retrieve the end page number from the last appended chunk - if (last_chunk_buf.empty()) { - // nothing copied - return {}; + out.addChunk(chunk_size); + out.appendToChunk(buffer.data(), chunk_size); + result = header; + + if (header.m_state_num == max_state_num) { + break; + } } - - using o_change_log_t = DP_ChangeLogStreamT::ChangeLogT; - return o_change_log_t::__const_ref(last_chunk_buf.data()); + out.flush(); + return result; } // Debug & validation function - to compare pages of the 2 streams (e.g. 
source and copy) @@ -276,4 +278,4 @@ namespace db0 } } -} \ No newline at end of file +} diff --git a/src/dbzero/core/storage/copy_prefix.hpp b/src/dbzero/core/storage/copy_prefix.hpp index f53fcd99..bc683fc2 100644 --- a/src/dbzero/core/storage/copy_prefix.hpp +++ b/src/dbzero/core/storage/copy_prefix.hpp @@ -24,7 +24,8 @@ namespace db0 // NOTE: output_changelog is NOT flushed (see the design) // @return the finalal copied state number (unless nothing was copied - then std::nullopt) std::optional copyDRAM_IO(DRAM_IOStream &input_io, DRAM_ChangeLogStreamT &input_dram_changelog, - DRAM_IOStream &output_io, DRAM_ChangeLogStreamT::Writer &output_dram_changelog); + DRAM_IOStream &output_io, DRAM_ChangeLogStreamT::Writer &output_dram_changelog, + std::optional max_state_num = {}); using ChunkFilterT = std::function &chunk_buffer, const void *data_end)>; using DRAM_FilterT = std::function; @@ -56,4 +57,4 @@ namespace db0 void copyPageIO(const Page_IO &in, const ExtSpace &src_ext_space, Page_IO &out, std::uint64_t end_page_num, ExtSpace &ext_space); -} \ No newline at end of file +} diff --git a/src/dbzero/core/vspace/v_object.hpp b/src/dbzero/core/vspace/v_object.hpp index 6dba17df..ee1df838 100644 --- a/src/dbzero/core/vspace/v_object.hpp +++ b/src/dbzero/core/vspace/v_object.hpp @@ -12,6 +12,8 @@ namespace db0 { struct tag_verified {}; + + struct tag_dynamic_slot {}; /** * Base class for vspace-mapped objects @@ -64,7 +66,7 @@ namespace db0 ); ContainerT::__new(reinterpret_cast(&this->modify()), std::get(std::forward(t))...); } - + /// Pre-locked constructor struct tag_prelocked {}; template::value-1> @@ -121,9 +123,28 @@ namespace db0 : v_object(memspace, tag_prelocked(), std::forward_as_tuple(std::forward(args)...), make_int_seq_t()) { } + + /** + * Allocating constructor with runtime slot selection. + * Dynamic slot selection is only valid for types without a static SLOT_NUM + * override, because runtime slots must not override a type-owned static slot. 
+ */ + template + v_object(Memspace &memspace, tag_dynamic_slot, Allocator::SlotId slot_num, Args&&... args) + { + initNew( + memspace, + ContainerT::measure(std::forward(args)...), + {}, + slot_num + ); + ContainerT::__new(reinterpret_cast(&this->modify()), std::forward(args)...); + } // Standard allocating constructor - template, Args...>* = nullptr, last_type_is_not_t* = nullptr> + template, Args...>* = nullptr, + last_type_is_not_t* = nullptr, + first_type_is_not_t* = nullptr> v_object(Memspace &memspace, Args&&... args) : v_object(memspace, std::forward(args)..., FlagSet {}) { @@ -198,12 +219,14 @@ namespace db0 private: // Create a new instance - void initNew(Memspace &memspace, std::size_t size, FlagSet access_mode = {}) + void initNew(Memspace &memspace, std::size_t size, FlagSet access_mode = {}, + Allocator::SlotId slot_num = 0) { // read not allowed for instance creation assert(!access_mode[AccessOptions::read]); + assert((!slot_num || !SLOT_NUM) && "dynamic slot cannot override a static SLOT_NUM"); this->m_memspace_ptr = &memspace; - this->m_address = memspace.alloc(size, SLOT_NUM, REALM_ID, getLocality(access_mode)); + this->m_address = memspace.alloc(size, slot_num ? 
slot_num : SLOT_NUM, REALM_ID, getLocality(access_mode)); // lock for create & write // NOTE: must extract physical address for mapRange this->m_mem_lock = memspace.getPrefix().mapRange( @@ -277,4 +300,4 @@ namespace db0 return *(MemberT*)((std::byte*)(&obj.modify()) + offset); } -} \ No newline at end of file +} diff --git a/tests/unit_tests/BDevStorageTest.cpp b/tests/unit_tests/BDevStorageTest.cpp index 70f853e9..85509e3d 100644 --- a/tests/unit_tests/BDevStorageTest.cpp +++ b/tests/unit_tests/BDevStorageTest.cpp @@ -10,7 +10,9 @@ #include #include #include +#include #include +#include #include using namespace std; @@ -24,13 +26,16 @@ namespace tests class BDevStorageTest: public testing::Test { public: static constexpr const char *file_name = "my-test-prefix_1.db0"; + static constexpr const char *copy_file_name = "my-test-prefix-copy.db0"; virtual void SetUp() override { drop(file_name); + drop(copy_file_name); } virtual void TearDown() override { drop(file_name); + drop(copy_file_name); } }; @@ -38,22 +43,129 @@ namespace tests class BDevStorageWrapper: public BDevStorage { public: + struct DRAMChangeLogRecord + { + StateNumType m_state_num; + std::vector m_page_nums; + }; + + struct DPChangeLogRecord + { + StateNumType m_state_num; + std::vector m_page_nums; + }; + /** * Opens BDevStorage over an existing file */ - BDevStorageWrapper(const std::string &file_name, AccessType = AccessType::READ_WRITE) - : BDevStorage(file_name, AccessType::READ_WRITE) + BDevStorageWrapper(const std::string &file_name, AccessType access_type = AccessType::READ_WRITE, + LockFlags lock_flags = {}, std::optional meta_io_step_size = {}, + StorageFlags flags = {}, StorageOptions options = {}) + : BDevStorage(file_name, access_type, lock_flags, meta_io_step_size, flags, options) { } - SparseIndex &getSparseIndex() { - return m_sparse_index; + PlainSparseIndex &getSparseIndex() { + return getApplicationSparsePair(0).getSparseIndex(); + } + + PlainSparsePair 
&getApplicationSparsePair(std::uint64_t page_num) { + return m_sparse_pair_manager.getOrCreate(getMetaSlotId(page_num)); + } + + const SparsePair &getRootMetaSparsePair() const { + return m_root_sparse_pair; + } + + Allocator::SlotId metaSlotId(std::uint64_t page_num) const { + return getMetaSlotId(page_num); + } + + std::optional applicationStoragePageNum( + std::uint64_t logical_page_num, StateNumType state_num) + { + auto item = getApplicationSparsePair(logical_page_num) + .getSparseIndex().lookup(logical_page_num, state_num); + if (!item) { + return {}; + } + std::uint64_t storage_page_num = item.m_storage_page_num; + return storage_page_num; } const DRAM_IOStream &getDRAM_IOStream() const { return m_dram_io; } + std::vector readDRAMChangeLogRecords() + { + std::vector result; + DRAM_ChangeLogStreamT::State state; + m_dram_changelog_io.saveState(state); + m_dram_changelog_io.setStreamPosHead(); + while (auto change_log = m_dram_changelog_io.readChangeLogChunk()) { + DRAMChangeLogRecord record { change_log->m_state_num, {} }; + for (auto page_num: *change_log) { + record.m_page_nums.push_back(page_num); + } + result.push_back(std::move(record)); + } + m_dram_changelog_io.restoreState(state); + return result; + } + + std::vector readDPChangeLogRecords() + { + std::vector result; + DP_ChangeLogStreamT::State state; + m_dp_changelog_io.saveState(state); + m_dp_changelog_io.setStreamPosHead(); + while (auto change_log = m_dp_changelog_io.readChangeLogChunk()) { + DPChangeLogRecord record { change_log->m_state_num, {} }; + for (auto page_num: *change_log) { + record.m_page_nums.push_back(page_num); + } + result.push_back(std::move(record)); + } + m_dp_changelog_io.restoreState(state); + return result; + } + + std::uint32_t getConfigVersion() const { + return m_config.m_version; + } + + std::uint64_t appendDescriptorPage(const std::vector &page) { + return m_desc_io.appendRandom(page.data()); + } + + void readDescriptorPage(std::uint64_t page_num, std::vector &page) 
const { + m_desc_io.readRandom(page_num, page.data()); + } + + void dirtyMetaSpaceWithoutStateRegistration() { + auto address = m_meta_space.alloc(m_config.m_dram_page_size, 1); + auto lock = m_meta_space.getPrefix().mapRange( + address.getOffset(), m_config.m_dram_page_size, { AccessOptions::write }); + std::memset(lock.modify(), 0x5a, m_config.m_dram_page_size); + } + + void recordRootStateForTest(StateNumType state_num) { + m_root_sparse_pair.recordMaxStateNum(state_num); + } + + std::optional > descriptorPageRange() const { + return std::make_pair(m_config.m_desc_io_head, m_page_io.getEndPageNum()); + } + + std::uint64_t appendDataPage(const std::vector &page) { + return m_page_io.append(page.data()); + } + + static std::uint64_t physicalOffset(std::uint64_t page_num, std::uint32_t page_size) { + return CONFIG_BLOCK_SIZE + page_num * page_size; + } + void readMetered(std::uint64_t address, std::uint64_t state_num, std::size_t size, void *buffer, unsigned int &chain_len) const { @@ -67,6 +179,315 @@ namespace tests ASSERT_TRUE(file_exists(file_name)); } + TEST_F( BDevStorageTest , testApplicationSparsePairIsHostedInMetaSpace ) + { + std::size_t page_size = 4096; + BDevStorage::create(file_name, page_size); + std::vector page(page_size, 0x41); + + { + BDevStorageWrapper cut(file_name, AccessType::READ_WRITE); + cut.write(0, 1, page.size(), page.data()); + ASSERT_TRUE(cut.flush()); + + auto &app_pair = cut.getApplicationSparsePair(0); + ASSERT_TRUE(app_pair.getSparseIndex().lookup(0, 1)); + ASSERT_GT(cut.getRootMetaSparsePair().size(), 0u); + cut.close(); + } + + { + BDevStorageWrapper reopened(file_name, AccessType::READ_ONLY); + std::vector read_buffer(page_size); + reopened.read(0, 1, read_buffer.size(), read_buffer.data(), { AccessOptions::read }); + ASSERT_TRUE(equal(page, read_buffer)); + ASSERT_TRUE(reopened.getApplicationSparsePair(0).getSparseIndex().lookup(0, 1)); + reopened.close(); + } + } + + TEST_F( BDevStorageTest , 
testDescriptorRandomIODoesNotOverlapDramBlocksAcrossReopens ) + { + std::size_t page_size = 4096; + BDevStorage::create(file_name, page_size); + std::vector data_page(page_size, 0x41); + std::vector descriptor_page(page_size * 4, std::byte{0x5a}); + + for (int i = 0; i < 40; ++i) { + BDevStorageWrapper cut(file_name, AccessType::READ_WRITE); + data_page[0] = static_cast(i); + descriptor_page[0] = static_cast(i); + + cut.write(static_cast(i) * page_size, i + 1, data_page.size(), data_page.data()); + cut.appendDescriptorPage(descriptor_page); + ASSERT_TRUE(cut.flush()); + cut.close(); + } + + BDevStorageWrapper reopened(file_name, AccessType::READ_WRITE); + reopened.close(); + } + + TEST_F( BDevStorageTest , testApplicationSparsePairBucketingUsesConfiguredFunction ) + { + std::size_t page_size = 4096; + BDevStorage::create(file_name, page_size); + + StorageOptions options; + options.m_storage_slab_bucketing = [page_size](std::uint64_t address) { + return address < page_size * 10 ? 5u : 9u; + }; + + BDevStorageWrapper cut(file_name, AccessType::READ_WRITE, {}, {}, {}, options); + std::vector low_page(page_size, 0x15); + std::vector high_page(page_size, 0x19); + cut.write(0, 1, low_page.size(), low_page.data()); + cut.write(20 * page_size, 1, high_page.size(), high_page.data()); + + auto &low_pair = cut.getApplicationSparsePair(0); + auto &high_pair = cut.getApplicationSparsePair(20); + auto low_address = low_pair.getSparseIndex().getIndexAddress().getOffset(); + auto high_address = high_pair.getSparseIndex().getIndexAddress().getOffset(); + auto low_slot = MS_Address::from(low_address).slot_id(); + auto high_slot = MS_Address::from(high_address).slot_id(); + + ASSERT_EQ(cut.metaSlotId(0), 5u); + ASSERT_EQ(cut.metaSlotId(20), 9u); + ASSERT_EQ(low_slot, 5u); + ASSERT_EQ(high_slot, 9u); + ASSERT_NE(&low_pair, &high_pair); + cut.close(); + } + + TEST_F( BDevStorageTest , testSparsePairQueryUsesBucketSpanOnlyForMultiPageRanges ) + { + std::size_t page_size = 4096; + 
BDevStorage::create(file_name, page_size); + + unsigned int single_page_mapping_calls = 0; + unsigned int bucket_mapping_calls = 0; + + StorageOptions options; + options.m_storage_slab_bucketing = [&](std::uint64_t) { + ++single_page_mapping_calls; + return 0u; + }; + options.m_storage_slab_bucket = [&](std::uint64_t) { + ++bucket_mapping_calls; + return StorageOptions::StorageSlabBucket { 0u, 0u, 1024u }; + }; + + BDevStorageWrapper cut(file_name, AccessType::READ_WRITE, {}, {}, {}, options); + std::vector write_buffer(2 * page_size, 0x32); + single_page_mapping_calls = 0; + bucket_mapping_calls = 0; + cut.write(0, 1, write_buffer.size(), write_buffer.data()); + ASSERT_EQ(single_page_mapping_calls, 0u); + ASSERT_EQ(bucket_mapping_calls, 1u); + + single_page_mapping_calls = 0; + bucket_mapping_calls = 0; + std::vector single_page_read(page_size); + cut.read(0, 1, single_page_read.size(), single_page_read.data(), { AccessOptions::read }); + ASSERT_EQ(single_page_mapping_calls, 1u); + ASSERT_EQ(bucket_mapping_calls, 0u); + + single_page_mapping_calls = 0; + bucket_mapping_calls = 0; + std::vector multi_page_read(2 * page_size); + cut.read(0, 1, multi_page_read.size(), multi_page_read.data(), { AccessOptions::read }); + ASSERT_EQ(single_page_mapping_calls, 0u); + ASSERT_EQ(bucket_mapping_calls, 1u); + ASSERT_TRUE(equal(write_buffer, multi_page_read)); + + cut.close(); + } + + TEST_F( BDevStorageTest , testSparsePairQueryRefreshesAtBucketBoundaryWithoutSkippingFirstPage ) + { + std::size_t page_size = 4096; + BDevStorage::create(file_name, page_size); + + unsigned int single_page_mapping_calls = 0; + unsigned int bucket_mapping_calls = 0; + + StorageOptions options; + options.m_storage_slab_bucketing = [&](std::uint64_t address) { + ++single_page_mapping_calls; + return static_cast(address / page_size); + }; + options.m_storage_slab_bucket = [&](std::uint64_t address) { + ++bucket_mapping_calls; + auto page_num = address / page_size; + return 
StorageOptions::StorageSlabBucket { + static_cast(page_num), page_num, page_num + 1 + }; + }; + + BDevStorageWrapper cut(file_name, AccessType::READ_WRITE, {}, {}, {}, options); + std::vector write_buffer(2 * page_size); + std::fill(write_buffer.begin(), write_buffer.begin() + page_size, 0x31); + std::fill(write_buffer.begin() + page_size, write_buffer.end(), 0x42); + + cut.write(0, 1, write_buffer.size(), write_buffer.data()); + ASSERT_EQ(single_page_mapping_calls, 0u); + ASSERT_EQ(bucket_mapping_calls, 2u); + + single_page_mapping_calls = 0; + bucket_mapping_calls = 0; + std::vector read_buffer(2 * page_size); + cut.read(0, 1, read_buffer.size(), read_buffer.data(), { AccessOptions::read }); + ASSERT_EQ(single_page_mapping_calls, 0u); + ASSERT_EQ(bucket_mapping_calls, 2u); + ASSERT_TRUE(equal(write_buffer, read_buffer)); + + cut.close(); + } + + TEST_F( BDevStorageTest , testDescriptorIOUsesSeparatePageSizeAndDoesNotCollideWithPageIO ) + { + std::size_t page_size = 4096; + BDevStorage::create(file_name, page_size); + + std::uint64_t descriptor_page_num = 0; + std::uint64_t data_page_num = 0; + std::vector descriptor_page(16u << 10, std::byte{0x55}); + std::vector data_page(page_size, std::byte{0x2a}); + std::vector state_page(page_size, 0x15); + + { + BDevStorageWrapper cut(file_name, AccessType::READ_WRITE); + ASSERT_EQ(2u, cut.getConfigVersion()); + ASSERT_EQ(page_size, cut.getPageSize()); + ASSERT_EQ(16u << 10, cut.getDescriptorPageSize()); + + cut.write(0, 1, state_page.size(), state_page.data()); + descriptor_page_num = cut.appendDescriptorPage(descriptor_page); + data_page_num = cut.appendDataPage(data_page); + cut.close(); + } + + auto descriptor_begin = BDevStorageWrapper::physicalOffset(descriptor_page_num, 16u << 10); + auto descriptor_end = descriptor_begin + descriptor_page.size(); + auto data_begin = BDevStorageWrapper::physicalOffset(data_page_num, page_size); + auto data_end = data_begin + data_page.size(); + ASSERT_TRUE(descriptor_end <= 
data_begin || data_end <= descriptor_begin); + + { + BDevStorageWrapper cut(file_name, AccessType::READ_ONLY); + std::vector descriptor_read(descriptor_page.size()); + cut.readDescriptorPage(descriptor_page_num, descriptor_read); + ASSERT_EQ(descriptor_page, descriptor_read); + cut.close(); + } + } + + TEST_F( BDevStorageTest , testDescriptorIOCursorIsRestoredFromRootMetadata ) + { + BDevStorage::create(file_name); + + std::uint64_t first_page_num = 0; + std::uint64_t second_page_num = 0; + std::vector first_page(16u << 10, std::byte{0x11}); + std::vector second_page(16u << 10, std::byte{0x22}); + std::size_t page_size = 4096; + std::vector data_page(page_size, 0x33); + + { + BDevStorageWrapper cut(file_name, AccessType::READ_WRITE); + cut.write(0, 1, data_page.size(), data_page.data()); + first_page_num = cut.appendDescriptorPage(first_page); + cut.close(); + } + + { + BDevStorageWrapper cut(file_name, AccessType::READ_WRITE); + auto descriptor_page_range = cut.descriptorPageRange(); + ASSERT_TRUE(descriptor_page_range); + ASSERT_LE(descriptor_page_range->first, first_page_num); + ASSERT_GE(descriptor_page_range->second, first_page_num + 1); + cut.write(page_size, 2, data_page.size(), data_page.data()); + second_page_num = cut.appendDescriptorPage(second_page); + ASSERT_GT(second_page_num, first_page_num); + cut.close(); + } + + { + BDevStorageWrapper cut(file_name, AccessType::READ_ONLY); + std::vector first_read(first_page.size()); + std::vector second_read(second_page.size()); + cut.readDescriptorPage(first_page_num, first_read); + cut.readDescriptorPage(second_page_num, second_read); + ASSERT_EQ(first_page, first_read); + ASSERT_EQ(second_page, second_read); + auto descriptor_page_range = cut.descriptorPageRange(); + ASSERT_TRUE(descriptor_page_range); + ASSERT_LE(descriptor_page_range->first, first_page_num); + ASSERT_GE(descriptor_page_range->second, second_page_num + 1); + cut.close(); + } + } + + TEST_F( BDevStorageTest , 
testCopyToCopiesDescriptorIOExactly ) + { + std::size_t page_size = 4096; + BDevStorage::create(file_name, page_size, (16u << 10) - 256, 4u << 20); + + std::vector data_page(page_size, 0x11); + std::vector second_data_page(page_size, 0x22); + { + BDevStorageWrapper src(file_name, AccessType::READ_WRITE); + src.write(0, 1, data_page.size(), data_page.data()); + ASSERT_TRUE(src.flush()); + src.write(page_size, 2, second_data_page.size(), second_data_page.data()); + ASSERT_TRUE(src.flush()); + src.close(); + } + + std::pair descriptor_page_range; + { + BDevStorageWrapper src_before_copy(file_name, AccessType::READ_ONLY); + auto maybe_descriptor_page_range = src_before_copy.descriptorPageRange(); + ASSERT_TRUE(maybe_descriptor_page_range); + descriptor_page_range = *maybe_descriptor_page_range; + ASSERT_LT(descriptor_page_range.first, descriptor_page_range.second); + src_before_copy.close(); + } + + { + BDevStorageWrapper src(file_name, AccessType::READ_ONLY); + BDevStorage::create(copy_file_name, page_size, (16u << 10) - 256, 4u << 20); + BDevStorageWrapper out(copy_file_name, AccessType::READ_WRITE); + src.copyTo(out); + out.close(); + src.close(); + } + + BDevStorageWrapper src(file_name, AccessType::READ_ONLY); + BDevStorageWrapper out(copy_file_name, AccessType::READ_ONLY); + ASSERT_TRUE(src.descriptorPageRange()); + ASSERT_TRUE(out.descriptorPageRange()); + ASSERT_EQ(src.descriptorPageRange(), out.descriptorPageRange()); + ASSERT_EQ(descriptor_page_range, *out.descriptorPageRange()); + + for (auto descriptor_page_num = descriptor_page_range.first; + descriptor_page_num < descriptor_page_range.second; ++descriptor_page_num) { + std::vector src_descriptor_read(src.getDescriptorPageSize()); + std::vector out_descriptor_read(out.getDescriptorPageSize()); + src.readDescriptorPage(descriptor_page_num, src_descriptor_read); + out.readDescriptorPage(descriptor_page_num, out_descriptor_read); + ASSERT_EQ(src_descriptor_read, out_descriptor_read); + } + + std::vector 
data_read(page_size); + out.read(0, 1, data_read.size(), data_read.data(), { AccessOptions::read }); + ASSERT_TRUE(equal(data_page, data_read)); + out.read(page_size, 2, data_read.size(), data_read.data(), { AccessOptions::read }); + ASSERT_TRUE(equal(second_data_page, data_read)); + out.close(); + src.close(); + } + TEST_F( BDevStorageTest , testCanWriteThenReadFullPagesFromOneState ) { srand(9142424u); @@ -148,7 +569,7 @@ namespace tests } cut.close(); } - + TEST_F( BDevStorageTest , testBDevStorageThrowsIfReadingFromUninitializedSpace ) { srand(9142424u); @@ -231,6 +652,45 @@ namespace tests cut.close(); } + TEST_F( BDevStorageTest , testReopenedWriterAppendsUpdatedPagesAfterExistingData ) + { + BDevStorage::create(file_name); + std::size_t page_size = 0; + + { + BDevStorage cut(file_name); + page_size = cut.getPageSize(); + for (int i = 0; i < 3; ++i) { + std::vector page(page_size, static_cast('a' + i)); + cut.write(i * page_size, 1, page.size(), page.data()); + } + cut.close(); + } + + { + BDevStorage cut(file_name); + for (int i = 0; i < 3; ++i) { + std::vector page(page_size, static_cast('A' + i)); + cut.write(i * page_size, 2, page.size(), page.data()); + } + cut.close(); + } + + BDevStorageWrapper cut(file_name); + for (int i = 0; i < 3; ++i) { + std::vector read_buffer(page_size); + cut.read(i * page_size, 1, read_buffer.size(), read_buffer.data()); + ASSERT_EQ(read_buffer[0], static_cast('a' + i)); + + cut.read(i * page_size, 2, read_buffer.size(), read_buffer.data()); + ASSERT_EQ(read_buffer[0], static_cast('A' + i)) + << "logical_page=" << i + << " storage_page=" + << cut.applicationStoragePageNum(static_cast(i), 2).value_or(0); + } + cut.close(); + } + TEST_F( BDevStorageTest , testStateWiseWriteThenRead ) { // In this test scenario we simply perform a sequence of writes @@ -432,12 +892,42 @@ namespace tests reader.join(); } + TEST_F( BDevStorageTest , testReaderRefreshSeesRepeatedWritesInSameSparsePairSlot ) + { + std::size_t page_size = 4096; + 
BDevStorage::create(file_name, page_size); + + BDevStorage writer(file_name, AccessType::READ_WRITE); + BDevStorage reader(file_name, AccessType::READ_ONLY); + + std::vector first(page_size, 'a'); + writer.write(page_size, 1, first.size(), first.data()); + writer.flush(); + + reader.refresh(); + ASSERT_GE(reader.getMaxStateNum(), 1u); + std::vector buffer(page_size); + reader.read(page_size, 1, buffer.size(), buffer.data(), { AccessOptions::read }); + ASSERT_TRUE(equal(first, buffer)); + + std::vector second(page_size, 'b'); + writer.write(2 * page_size, 2, second.size(), second.data()); + writer.flush(); + + reader.refresh(); + ASSERT_GE(reader.getMaxStateNum(), 2u); + reader.read(2 * page_size, 2, buffer.size(), buffer.data(), { AccessOptions::read }); + ASSERT_TRUE(equal(second, buffer)); + writer.close(); + reader.close(); + } + TEST_F( BDevStorageTest , testNoLoadReaderCanRefreshAfterWriterCommit ) { std::size_t page_size = 4096; BDevStorage::create(file_name, page_size); - BDevStorage reader(file_name, AccessType::READ_ONLY, {}, {}, { StorageOptions::NO_LOAD }); + BDevStorage reader(file_name, AccessType::READ_ONLY, {}, {}, { StorageFlagOption::NO_LOAD }); std::vector data(page_size, 'r'); { @@ -455,6 +945,40 @@ namespace tests ASSERT_TRUE(equal(data, buffer)); reader.close(); } + + TEST_F( BDevStorageTest , testNoLoadReaderRootSparsePairSizeAfterRefresh ) + { + std::size_t page_size = 4096; + BDevStorage::create(file_name, page_size); + + BDevStorageWrapper reader(file_name, AccessType::READ_ONLY, {}, {}, { StorageFlagOption::NO_LOAD }); + + std::vector data(page_size, 's'); + { + BDevStorage writer(file_name, AccessType::READ_WRITE); + writer.write(0, 1, data.size(), data.data()); + writer.flush(); + writer.close(); + } + + reader.refresh(); + + ASSERT_GT(reader.getRootMetaSparsePair().size(), 0u); + reader.close(); + } + + TEST_F( BDevStorageTest , testFlushRejectsDirtyMetadataWithoutRegisteredStateHighWatermark ) + { + std::size_t page_size = 4096; + 
BDevStorage::create(file_name, page_size); + + BDevStorageWrapper cut(file_name, AccessType::READ_WRITE); + cut.dirtyMetaSpaceWithoutStateRegistration(); + + ASSERT_THROW(cut.flush(), db0::InternalException); + cut.recordRootStateForTest(1); + cut.close(); + } TEST_F( BDevStorageTest , testSparseIndexDurability ) { @@ -466,22 +990,23 @@ namespace tests std::optional last_state_num; for (int i = 0; i < count; ++i) { BDevStorageWrapper cut(file_name, AccessType::READ_WRITE); + auto state_num = static_cast(i + 1); if (last_state_num) { ASSERT_EQ(cut.getMaxStateNum(), *last_state_num); } - auto &sparse_index = cut.getSparseIndex(); + std::vector data(page_size); for (unsigned int page_num = 0; page_num < 1000; ++page_num) { - sparse_index.emplace(page_num, i, 999); + cut.write(page_num * page_size, state_num, data.size(), data.data()); cut.getSparseIndex().refresh(); - ASSERT_EQ(cut.getMaxStateNum(), (std::uint32_t)i); + ASSERT_EQ(cut.getRootMetaSparsePair().getMaxStateNum(), state_num); } cut.getSparseIndex().refresh(); - ASSERT_EQ(cut.getMaxStateNum(), (std::uint32_t)i); + ASSERT_EQ(cut.getRootMetaSparsePair().getMaxStateNum(), state_num); cut.close(); - last_state_num = i; + last_state_num = state_num; } } diff --git a/tests/unit_tests/BaseWorkspaceTest.cpp b/tests/unit_tests/BaseWorkspaceTest.cpp index d66eef04..0e6df2aa 100644 --- a/tests/unit_tests/BaseWorkspaceTest.cpp +++ b/tests/unit_tests/BaseWorkspaceTest.cpp @@ -107,9 +107,10 @@ namespace tests // need to open as read/write to be able to estimate allocated size auto file_name = m_workspace.getPrefixCatalog().getFileName(prefix_name).string(); BDevStorage storage(file_name, AccessType::READ_WRITE); - // make sure the DramIO (sparse index + diff index storage) streams have allocated < 4 blocks + // DRAM metadata is append-only to preserve the previous committed root + // state for concurrent readers while a writer publishes the next one. 
auto &io = storage.getDramIO(); - ASSERT_LE((int)(io.getAllocatedSize() / io.getBlockSize()), 4); + ASSERT_LE((int)(io.getAllocatedSize() / io.getBlockSize()), 256); storage.close(); } diff --git a/tests/unit_tests/BlockIOStreamTest.cpp b/tests/unit_tests/BlockIOStreamTest.cpp index c39cc9cc..f9f8ac46 100644 --- a/tests/unit_tests/BlockIOStreamTest.cpp +++ b/tests/unit_tests/BlockIOStreamTest.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include using namespace std; @@ -353,6 +354,72 @@ namespace tests testReaderCanAccessChunksWrittenInMultipleCycles(*this, true); } + TEST_F( BlockIOStreamTest, testRefreshCanAdvanceToLinkedBlockFlushedAfterEos ) + { + std::vector no_data; + CFile::create(file_name, no_data); + CFile file(file_name, AccessType::READ_WRITE); + + constexpr std::uint32_t block_size = 1024; + auto first_chunk_size = block_size + - o_block_io_cs_block_header::sizeOf() + - o_block_io_chunk_header::sizeOf(); + + BlockIOStream out(file, 0, block_size, {}, AccessType::READ_WRITE, true); + out.addChunk(first_chunk_size); + out.appendToChunk(std::vector(first_chunk_size, 'a').data(), first_chunk_size); + out.addChunk(1); + out.appendToChunk("b", 1); + file.flush(); + + CFile read_file(file_name, AccessType::READ_ONLY); + BlockIOStream in(read_file, 0, block_size, {}, AccessType::READ_ONLY, true); + std::vector buffer; + ASSERT_EQ(in.readChunk(buffer), first_chunk_size); + ASSERT_EQ(in.readChunk(buffer), 0); + ASSERT_TRUE(in.eos()); + + out.flush(); + ASSERT_TRUE(in.refresh()); + ASSERT_EQ(in.readChunk(buffer), 1); + ASSERT_EQ(buffer[0], 'b'); + + out.close(); + } + + TEST_F( BlockIOStreamTest, testIncompleteChunkReadCanBeRetriedAfterRefresh ) + { + std::vector no_data; + CFile::create(file_name, no_data); + CFile file(file_name, AccessType::READ_WRITE); + + constexpr std::uint32_t block_size = 1024; + auto chunk_size = block_size + - o_block_io_cs_block_header::sizeOf() + - o_block_io_chunk_header::sizeOf() + + 1; + + BlockIOStream out(file, 0, 
block_size, {}, AccessType::READ_WRITE, true); + out.addChunk(chunk_size); + out.appendToChunk(std::vector(chunk_size, 'c').data(), chunk_size); + file.flush(); + + CFile read_file(file_name, AccessType::READ_ONLY); + BlockIOStream in(read_file, 0, block_size, {}, AccessType::READ_ONLY, true); + std::vector buffer; + ASSERT_EQ(in.readChunk(buffer), 0); + ASSERT_TRUE(in.eos()); + + out.flush(); + ASSERT_TRUE(in.refresh()); + ASSERT_EQ(in.readChunk(buffer), chunk_size); + ASSERT_TRUE(std::all_of(buffer.begin(), buffer.begin() + chunk_size, [](char value) { + return value == 'c'; + })); + + out.close(); + } + TEST_F( BlockIOStreamTest, testCanSaveAndThenRestoreStateWhenAppending ) { std::vector no_data; @@ -421,4 +488,4 @@ namespace tests cut.close(); } -} \ No newline at end of file +} diff --git a/tests/unit_tests/ChangeLogTest.cpp b/tests/unit_tests/ChangeLogTest.cpp index 24dd0d02..eeba431a 100644 --- a/tests/unit_tests/ChangeLogTest.cpp +++ b/tests/unit_tests/ChangeLogTest.cpp @@ -90,6 +90,24 @@ namespace tests } ASSERT_EQ(count, 5u); } + + TEST_F( ChangeLogTest , testChangeLogRLEPreservesZero ) + { + std::vector buf; + using ChangeLogT = o_change_log; + + std::vector change_log = { 0, 1, 2, 5 }; + ChangeLogData data(std::move(change_log), true, false, false); + auto measured_size = ChangeLogT::measure(data); + buf.resize(measured_size); + auto &cut = ChangeLogT::__new(buf.data(), data); + + std::vector decoded; + for (auto value: cut) { + decoded.push_back(value); + } + ASSERT_EQ(decoded, (std::vector { 0, 1, 2, 5 })); + } TEST_F( ChangeLogTest , testChangeLogWithHeader ) { diff --git a/tests/unit_tests/ContentIndexTest.cpp b/tests/unit_tests/ContentIndexTest.cpp index df1e5906..e4837790 100644 --- a/tests/unit_tests/ContentIndexTest.cpp +++ b/tests/unit_tests/ContentIndexTest.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include namespace tests @@ -23,19 +24,16 @@ namespace tests class ContentIndexTest: public testing::Test { protected: - 
ContentIndexTest() - : m_workspace("", {}, {}, {}, {}, db0::object_model::initializer()) - { - } - void SetUp() override { - m_fixture = m_workspace.getFixture("content-index-test"); + m_workspace_fixture = std::make_unique("content-index-test"); + m_fixture = m_workspace_fixture->fixture(); } void TearDown() override { - m_workspace.close(); + m_fixture = nullptr; + m_workspace_fixture = nullptr; } std::shared_ptr makeClass(const char *name) @@ -80,7 +78,7 @@ namespace tests return *initializer; } - Workspace m_workspace; + std::unique_ptr m_workspace_fixture; db0::swine_ptr m_fixture; }; diff --git a/tests/unit_tests/DiffIndexTest.cpp b/tests/unit_tests/DiffIndexTest.cpp index 59651c38..cd620cc4 100644 --- a/tests/unit_tests/DiffIndexTest.cpp +++ b/tests/unit_tests/DiffIndexTest.cpp @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include #include #include @@ -37,17 +39,55 @@ namespace tests void TearDown() override { drop(file_name); } + + static DRAM_Pair createDramPair(std::size_t page_size) + { + return { + std::make_shared(page_size), + std::make_shared(page_size) + }; + } + + static SparseIndex createSparseIndex(std::size_t page_size) + { + return SparseIndex(SparseIndex::tag_create(), createDramPair(page_size)); + } + + template + static DiffIndexT createDiffIndex(std::size_t page_size) + { + return DiffIndexT(DiffIndex::tag_create(), createDramPair(page_size)); + } + }; + + class DiffIndexEraseTestAdapter: public DiffIndex + { + public: + using DiffIndex::DiffIndex; + + bool eraseDescriptor(PageNumT page_num, StateNumT state_num) { + return super_t::erase(page_num, state_num); + } + + std::size_t eraseDescriptorsBelow(PageNumT page_num, StateNumT state_num) { + return super_t::eraseBelow(page_num, state_num); + } + + std::size_t eraseDescriptorRange(PageNumT page_num, std::optional first_state_num = {}, + std::optional last_state_num = {}) { + return super_t::eraseRange(page_num, first_state_num, last_state_num); + } }; TEST_F( 
DiffIndexTest , testDiffIndexCanBeInstantiated ) { - DiffIndex cut(16 * 1024); + auto cut = createDiffIndex(16 * 1024); ASSERT_EQ(cut.size(), 0); } TEST_F( DiffIndexTest , testDiffIndexInsertNewItems ) { - DiffIndex cut(16 * 1024); + auto cut = createDiffIndex(16 * 1024); cut.insert(1, 1, 1); cut.insert(2, 1, 3); cut.insert(3, 1, 8); @@ -56,7 +96,7 @@ namespace tests TEST_F( DiffIndexTest , testDiffIndexExpandExistingItems ) { - DiffIndex cut(16 * 1024); + auto cut = createDiffIndex(16 * 1024); cut.insert(1, 1, 1); cut.insert(2, 1, 3); cut.insert(1, 3, 8); @@ -67,7 +107,7 @@ namespace tests TEST_F( DiffIndexTest , testDiffIndexFindLower ) { - DiffIndex cut(16 * 1024); + auto cut = createDiffIndex(16 * 1024); cut.insert(1, 1, 1); cut.insert(2, 1, 3); cut.insert(1, 3, 8); @@ -84,7 +124,7 @@ namespace tests TEST_F( DiffIndexTest , testDiffIndexFindUpper ) { - DiffIndex cut(16 * 1024); + auto cut = createDiffIndex(16 * 1024); cut.insert(1, 2, 3); cut.insert(1, 4, 4); cut.insert(1, 5, 11); @@ -99,7 +139,7 @@ namespace tests TEST_F( DiffIndexTest , testDiffIndexFindUpperIssue1 ) { - DiffIndex diff_index(16 * 1024); + auto diff_index = createDiffIndex(16 * 1024); for (auto [page, state, storage]: getDiffIndexData1()) { diff_index.insert(page, state, storage); } @@ -108,11 +148,148 @@ namespace tests ASSERT_EQ(item.m_page_num, 4); } + TEST_F( DiffIndexTest , testDiffIndexSparseIndexBaseCanEraseExactDescriptor ) + { + auto cut = createDiffIndex(512); + cut.insert(1, 1, 10); + cut.insert(2, 1, 20); + cut.insert(3, 1, 30); + + ASSERT_TRUE(cut.eraseDescriptor(2, 1)); + ASSERT_FALSE(cut.eraseDescriptor(2, 1)); + ASSERT_EQ(cut.size(), 2u); + ASSERT_EQ(cut.findLower(1, 1), 1u); + ASSERT_EQ(cut.findLower(2, 1), 0u); + ASSERT_EQ(cut.findLower(3, 1), 1u); + } + + TEST_F( DiffIndexTest , testDiffIndexSparseIndexBaseEraseBelowDescriptorEdgeCasesWithSmallNodes ) + { + auto cut = createDiffIndex(512); + constexpr std::uint64_t storage_step = 1ull << 32; + for (std::uint32_t state_num = 
1; state_num <= 40; ++state_num) { + cut.insert(1, state_num, storage_step * state_num); + cut.insert(2, state_num, storage_step * (100 + state_num)); + } + ASSERT_GT(cut.size(), 2u); + + auto original_size = cut.size(); + ASSERT_EQ(cut.eraseDescriptorsBelow(1, 0), 0u); + ASSERT_EQ(cut.eraseDescriptorsBelow(1, 1), 0u); + ASSERT_EQ(cut.eraseDescriptorsBelow(99, 20), 0u); + ASSERT_EQ(cut.size(), original_size); + + auto removed = cut.eraseDescriptorsBelow(1, std::numeric_limits::max()); + ASSERT_GT(removed, 0u); + ASSERT_EQ(cut.size(), original_size - removed); + ASSERT_EQ(cut.findLower(1, 40), 0u); + ASSERT_EQ(cut.findLower(2, 40), 40u); + + auto page_2_descriptor_count = cut.size(); + ASSERT_EQ(cut.eraseDescriptorsBelow(1, std::numeric_limits::max()), 0u); + ASSERT_EQ(cut.eraseDescriptorsBelow(2, std::numeric_limits::max()), page_2_descriptor_count); + ASSERT_TRUE(cut.empty()); + } + + TEST_F( DiffIndexTest , testDiffIndexSparseIndexBaseEraseRangeDescriptorOptionalBounds ) + { + auto cut = createDiffIndex(512); + constexpr std::uint64_t storage_step = 1ull << 32; + for (std::uint32_t state_num = 1; state_num <= 12; ++state_num) { + cut.insert(1, state_num, storage_step * state_num); + cut.insert(2, state_num, storage_step * (100 + state_num)); + } + auto original_size = cut.size(); + ASSERT_GT(original_size, 2u); + + auto removed_middle = cut.eraseDescriptorRange(1, 4, 8); + ASSERT_GT(removed_middle, 0u); + ASSERT_EQ(cut.size(), original_size - removed_middle); + ASSERT_EQ(cut.findLower(2, 12), 12u); + + auto removed_tail = cut.eraseDescriptorRange(1, 8, {}); + ASSERT_GT(removed_tail, 0u); + ASSERT_EQ(cut.size(), original_size - removed_middle - removed_tail); + ASSERT_EQ(cut.findLower(2, 12), 12u); + + auto removed_page_2 = cut.eraseDescriptorRange(2); + ASSERT_GT(removed_page_2, 0u); + ASSERT_EQ(cut.findLower(2, 12), 0u); + ASSERT_EQ(cut.size(), original_size - removed_middle - removed_tail - removed_page_2); + } + + TEST_F( DiffIndexTest , 
testDiffIndexClearRemovesAllDescriptors ) + { + auto cut = createDiffIndex(512); + constexpr std::uint64_t storage_step = 1ull << 32; + for (std::uint32_t state_num = 1; state_num <= 40; ++state_num) { + cut.insert(1, state_num, storage_step * state_num); + cut.insert(2, state_num, storage_step * (100 + state_num)); + } + ASSERT_GT(cut.size(), 2u); + + cut.clear(); + + ASSERT_TRUE(cut.empty()); + ASSERT_EQ(cut.size(), 0u); + ASSERT_EQ(cut.findLower(1, 40), 0u); + ASSERT_EQ(cut.findLower(2, 40), 0u); + ASSERT_FALSE(cut.findUpper(1, 1)); + ASSERT_FALSE(cut.findUpper(2, 1)); + + cut.insert(3, 41, 0); + ASSERT_EQ(cut.size(), 1u); + ASSERT_EQ(cut.findLower(3, 41), 41u); + } + + TEST_F( DiffIndexTest , testDiffIndexForPageRangeUsesHalfOpenBounds ) + { + auto cut = createDiffIndex(16 * 1024); + constexpr std::uint64_t slot_size = 1ull << 24; + constexpr std::uint64_t slot_1_first = slot_size; + constexpr std::uint64_t slot_2_first = slot_size * 2; + + cut.insert(slot_1_first - 1, 1, 10); + cut.insert(slot_1_first, 1, 20); + cut.insert(slot_1_first + 7, 2, 21); + cut.insert(slot_2_first, 1, 30); + + std::vector page_nums; + cut.forPageRange(slot_1_first, slot_2_first, [&](const DI_Item &item) { + page_nums.push_back(item.m_page_num); + }); + + ASSERT_EQ(page_nums, (std::vector { slot_1_first, slot_1_first + 7 })); + } + + TEST_F( DiffIndexTest , testDiffIndexForPageRangeReturnsDiffDescriptorsAcrossNodes ) + { + auto cut = createDiffIndex(512); + constexpr std::uint64_t storage_step = 1ull << 32; + for (std::uint64_t page_num = 0; page_num < 200; ++page_num) { + cut.insert(page_num, 1, storage_step * (page_num + 1)); + cut.insert(page_num, 2, storage_step * (page_num + 1) + 1); + } + + std::vector page_nums; + std::vector first_state_nums; + cut.forPageRange(40, 75, [&](const DI_Item &item) { + page_nums.push_back(item.m_page_num); + first_state_nums.push_back(item.m_state_num); + }); + + ASSERT_EQ(page_nums.size(), 35u); + ASSERT_EQ(page_nums.front(), 40u); + 
ASSERT_EQ(page_nums.back(), 74u); + ASSERT_EQ(first_state_nums.front(), 1u); + ASSERT_EQ(first_state_nums.back(), 1u); + } + TEST_F( DiffIndexTest , DISABLED_testDiffIndexInsertThenQuery ) { auto ops = loadArray("./tests/files/diff_index_ops.csv"); - SparseIndex sparse_index(512); - DiffIndex diff_index(512); + auto sparse_index = createSparseIndex(512); + auto diff_index = createDiffIndex(512); std::vector> queries; unsigned int count = 0; diff --git a/tests/unit_tests/Diff_IOTest.cpp b/tests/unit_tests/Diff_IOTest.cpp index 6a0a12d1..b0691694 100644 --- a/tests/unit_tests/Diff_IOTest.cpp +++ b/tests/unit_tests/Diff_IOTest.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include using namespace std; @@ -193,5 +194,29 @@ namespace tests } cut.flush(); } + + TEST_F( Diff_IOTest , testDiff_IOBufferedDiffSurvivesRandomIOReservation ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto tail_function = [&file]() -> std::uint64_t { + return file.size(); + }; + + Diff_IOProxy cut(0, file, page_size, page_size * 16, 0, 0, tail_function); + std::vector diff_buf; + db0::getDiffs(m_dp_0.data(), m_dp_1.data(), page_size, diff_buf); + + auto page_num = cut.appendDiff(m_dp_1.data(), {1, 1}, diff_buf).first; + + RandomIO_Stream random_stream(cut, 4); + random_stream.append(m_dp_2.data()); + random_stream.flush(); + cut.flush(); + + auto dp = m_dp_0; + cut.applyFrom(page_num, dp.data(), {1, 1}); + ASSERT_EQ(std::memcmp(m_dp_1.data(), dp.data(), page_size), 0); + } -} \ No newline at end of file +} diff --git a/tests/unit_tests/EmbeddedDictTest.cpp b/tests/unit_tests/EmbeddedDictTest.cpp index 56e27c0d..bffa1ca9 100644 --- a/tests/unit_tests/EmbeddedDictTest.cpp +++ b/tests/unit_tests/EmbeddedDictTest.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -506,8 +507,8 @@ namespace tests { Py_Initialize(); - Workspace workspace("", {}, {}, {}, {}, db0::object_model::initializer()); - auto fixture = 
workspace.getFixture("embedded-dict-nested-memo"); + ScopedWorkspaceFixture workspace_fixture("embedded-dict-nested-memo"); + auto fixture = workspace_fixture.fixture(); auto nestedClass = getTestClass(fixture); auto pyMemoType = makeMemoType(); ASSERT_TRUE(pyMemoType.get()); @@ -571,7 +572,7 @@ namespace tests ASSERT_TRUE(sawEmbeddedKey); ASSERT_TRUE(sawEmbeddedValue); - workspace.close(); + workspace_fixture.close(); } } diff --git a/tests/unit_tests/EmbeddedObjectTest.cpp b/tests/unit_tests/EmbeddedObjectTest.cpp index a92c5f82..001847f4 100644 --- a/tests/unit_tests/EmbeddedObjectTest.cpp +++ b/tests/unit_tests/EmbeddedObjectTest.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -756,8 +757,8 @@ namespace tests { Py_Initialize(); - Workspace workspace("", {}, {}, {}, {}, db0::object_model::initializer()); - auto fixture = workspace.getFixture("embedded-object-nested-memo"); + ScopedWorkspaceFixture workspace_fixture("embedded-object-nested-memo"); + auto fixture = workspace_fixture.fixture(); auto nestedClass = getTestClass(fixture); auto pyMemoType = makeMemoType(); ASSERT_TRUE(pyMemoType.get()); @@ -794,7 +795,7 @@ namespace tests ASSERT_EQ(fixedValue->m_kind, StorageClass::INT64); ASSERT_EQ(fixedValue->m_value, 17u); - workspace.close(); + workspace_fixture.close(); } TEST_F( EmbeddedObjectTest , testEmbeddedObjectMeasureSizeOfAndSafeSizeOf ) diff --git a/tests/unit_tests/EmbeddedTupleTest.cpp b/tests/unit_tests/EmbeddedTupleTest.cpp index ebdcf5fc..7feb5c71 100644 --- a/tests/unit_tests/EmbeddedTupleTest.cpp +++ b/tests/unit_tests/EmbeddedTupleTest.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -403,8 +404,8 @@ namespace tests { Py_Initialize(); - Workspace workspace("", {}, {}, {}, {}, db0::object_model::initializer()); - auto fixture = workspace.getFixture("embedded-tuple-nested-memo"); + ScopedWorkspaceFixture workspace_fixture("embedded-tuple-nested-memo"); + auto fixture = 
workspace_fixture.fixture(); auto nestedClass = getTestClass(fixture); auto pyMemoType = makeMemoType(); ASSERT_TRUE(pyMemoType.get()); @@ -435,7 +436,7 @@ namespace tests ASSERT_EQ(fixedValue->m_kind, StorageClass::INT64); ASSERT_EQ(fixedValue->m_value, 23u); - workspace.close(); + workspace_fixture.close(); } TEST_F( EmbeddedTupleTest , testPyTupleConstructsDeeplyNestedCollections ) diff --git a/tests/unit_tests/FT_DetachTest.cpp b/tests/unit_tests/FT_DetachTest.cpp index 9bf906a6..8b6d1aa6 100644 --- a/tests/unit_tests/FT_DetachTest.cpp +++ b/tests/unit_tests/FT_DetachTest.cpp @@ -2,6 +2,7 @@ // Copyright (c) 2026 DBZero Software sp. z o.o. #include +#include #include #include #include @@ -350,8 +351,8 @@ namespace tests TEST_F(ObjectIteratorDetachTest, testObjectIteratorDetachDelegatesToUnderlyingIterator) { - Workspace workspace("", {}, {}, {}, {}, db0::object_model::initializer()); - auto fixture = workspace.getFixture("object-iterator-detach-test"); + ScopedWorkspaceFixture workspace_fixture("object-iterator-detach-test"); + auto fixture = workspace_fixture.fixture(); auto query = std::make_unique(); auto *query_ptr = query.get(); @@ -359,7 +360,7 @@ namespace tests iterator.detach(); ASSERT_TRUE(query_ptr->m_detached); - workspace.close(); + workspace_fixture.close(); } } diff --git a/tests/unit_tests/MetaAllocatorTest.cpp b/tests/unit_tests/MetaAllocatorTest.cpp index e15365df..45e3ca5e 100644 --- a/tests/unit_tests/MetaAllocatorTest.cpp +++ b/tests/unit_tests/MetaAllocatorTest.cpp @@ -126,6 +126,51 @@ namespace tests } } + TEST_F( MetaAllocatorTests , testStorageSlabBucketingFunctionMapsInnerBucketAddresses ) + { + auto page_size = 4096; + auto slab_size = 16 * 4096; + auto f = MetaAllocator::getStorageSlabBucketingFunction(page_size, slab_size); + + ASSERT_EQ(f(0), 0u); + ASSERT_EQ(f(1), 0u); + ASSERT_EQ(f(page_size), 0u); + ASSERT_EQ(f(slab_size / 2), 0u); + ASSERT_EQ(f(slab_size - 1), 0u); + + ASSERT_EQ(f(slab_size), 1u); + ASSERT_EQ(f(slab_size + 
123), 1u); + ASSERT_EQ(f(2 * slab_size - 1), 1u); + ASSERT_EQ(f(2 * slab_size), 2u); + } + + TEST_F( MetaAllocatorTests , testStorageSlabBucketingFunctionSupportsOffset ) + { + auto page_size = 4096; + auto slab_size = 16 * 4096; + auto offset = page_size + 123; + auto f = MetaAllocator::getStorageSlabBucketingFunction(offset, page_size, slab_size); + + ASSERT_EQ(f(0), 0u); + ASSERT_EQ(f(offset), 0u); + ASSERT_EQ(f(offset + slab_size - 1), 0u); + ASSERT_EQ(f(offset + slab_size), 1u); + ASSERT_EQ(f(offset + 2 * slab_size), 2u); + } + + TEST_F( MetaAllocatorTests , testStorageSlabBucketingFunctionReportsBucketPageSpan ) + { + auto page_size = 4096; + auto slab_size = 16 * page_size; + auto f = MetaAllocator::getStorageSlabBucketingFunction(page_size, slab_size); + + auto bucket = f.getBucket(slab_size + page_size, page_size); + + ASSERT_EQ(bucket.m_slot_id, 1u); + ASSERT_EQ(bucket.m_begin_page_num, 16u); + ASSERT_EQ(bucket.m_end_page_num, 32u); + } + TEST_F( MetaAllocatorTests , testMetaAllocatorCanBeInitialized ) { // prepare prefix before first use diff --git a/tests/unit_tests/MetaSpaceTest.cpp b/tests/unit_tests/MetaSpaceTest.cpp new file mode 100644 index 00000000..8f3b4623 --- /dev/null +++ b/tests/unit_tests/MetaSpaceTest.cpp @@ -0,0 +1,975 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +// Copyright (c) 2025 DBZero Software sp. z o.o. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace db0; +using namespace db0::tests; + +namespace tests +{ + + class MetaSpaceTest: public testing::Test + { + public: + static constexpr const char *file_name = "my-test-metaspace.io"; + static constexpr std::size_t page_size = 4096; + + void SetUp() override { + drop(file_name); + } + + void TearDown() override { + drop(file_name); + } + + static Diff_IO createIO(CFile &file) + { + return createIO(file, page_size); + } + + static Diff_IO createIO(CFile &file, std::size_t page_size) + { + auto tail_function = [&file]() -> std::uint64_t { + return file.size(); + }; + return Diff_IO(0, file, page_size, page_size * 16, page_size, 0, 1, tail_function, 0); + } + + static RandomIO_Stream createStream(Diff_IO &io) + { + return RandomIO_Stream(io, 2); + } + + static DRAM_Pair createMappingPair() + { + return createMappingPair(page_size); + } + + static DRAM_Pair createMappingPair(std::size_t page_size) + { + return { + std::make_shared(page_size), + std::make_shared(page_size) + }; + } + + static void fillPage(Memspace &memspace, Address address, unsigned char value) + { + auto lock = memspace.getPrefix().mapRange(address.getOffset(), page_size, { AccessOptions::write }); + std::memset(lock.modify(), value, page_size); + } + + static std::vector readPage(Memspace &memspace, Address address) + { + std::vector result(page_size); + auto lock = memspace.getPrefix().mapRange(address.getOffset(), page_size, { AccessOptions::read }); + std::memcpy(result.data(), static_cast(lock), page_size); + return result; + } + + static bool flushMeta(Memspace &memspace, RandomIO_Stream &io, SparsePair &sparse_pair) + { + auto &prefix = dynamic_cast(memspace.getPrefix()); + if (prefix.getDirtySize() != 0) { + sparse_pair.recordMaxStateNum(prefix.getStateNum(false) + 1); + } + return flush(prefix, io); + } + + static 
bool compactMeta(Memspace &memspace, RandomIO_Stream &io) + { + return compact(dynamic_cast(memspace.getPrefix()), io); + } + + static DRAM_Pair createPairFromMetaSpace(Memspace &memspace) + { + auto prefix = std::dynamic_pointer_cast(memspace.getPrefixPtr()); + auto meta_prefix = std::dynamic_pointer_cast(prefix); + std::unordered_set allocated_addresses; + meta_prefix->forAllocatedAddresses([&](std::size_t address) { + if (address != 0) { + allocated_addresses.insert(address); + } + }); + auto allocator = std::make_shared(allocated_addresses, memspace.getPageSize()); + return { prefix, allocator }; + } + + static std::optional findDiffStoragePage(const DI_Item &item, std::uint32_t state_num) + { + if (item.m_state_num == state_num) { + return item.m_storage_page_num; + } + + std::uint32_t next_state_num = 0; + std::uint64_t next_storage_page_num = 0; + auto it = item.beginDiff(); + while (it.next(next_state_num, next_storage_page_num)) { + if (next_state_num == state_num) { + return next_storage_page_num; + } + } + return std::nullopt; + } + + static std::vector readStoragePage(RandomIO_Stream &io, std::uint64_t storage_page_num) + { + std::vector result(page_size); + io.readRandom(storage_page_num, result.data()); + return result; + } + + static void patchExpectedPageRandom(Memspace &memspace, Address address, + std::vector &expected_page, std::mt19937 &rng, std::uint32_t write_count) + { + auto lock = memspace.getPrefix().mapRange(address.getOffset(), page_size, { AccessOptions::write }); + auto *page = static_cast(lock.modify()); + std::uniform_int_distribution offset_dist(0, page_size - 1); + std::uniform_int_distribution value_dist(0, 255); + for (std::uint32_t i = 0; i < write_count; ++i) { + auto offset = offset_dist(rng); + auto value = static_cast(value_dist(rng)); + page[offset] = value; + expected_page[offset] = value; + } + } + }; + + TEST_F( MetaSpaceTest, testMetaSpacePersistsFullDPAndReopens ) + { + CFile::create(file_name, {}); + CFile 
file(file_name, AccessType::READ_WRITE); + auto mapping_pair = createMappingPair(); + SparsePair sparse_pair(SparsePair::tag_create(), mapping_pair); + + auto io = createIO(file); + auto stream = createStream(io); + auto memspace = MetaSpace::create(page_size, sparse_pair, stream); + auto address = memspace.alloc(page_size); + fillPage(memspace, address, 0x42); + + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + + auto reopened = MetaSpace::create(page_size, sparse_pair, stream); + auto data = readPage(reopened, address); + ASSERT_EQ(data, std::vector(page_size, 0x42)); + } + + TEST_F( MetaSpaceTest, testMetaSpacePersistsDiffAndReopens ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto mapping_pair = createMappingPair(); + SparsePair sparse_pair(SparsePair::tag_create(), mapping_pair); + + auto io = createIO(file); + auto stream = createStream(io); + auto memspace = MetaSpace::create(page_size, sparse_pair, stream); + auto address = memspace.alloc(page_size); + fillPage(memspace, address, 0x11); + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + + { + auto lock = memspace.getPrefix().mapRange(address.getOffset(), page_size, { AccessOptions::write }); + auto *data = static_cast(lock.modify()); + data[17] = 0x22; + data[1234] = 0x33; + } + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + ASSERT_GT(sparse_pair.getDiffIndex().size(), 0u); + + auto reopened = MetaSpace::create(page_size, sparse_pair, stream); + auto data = readPage(reopened, address); + ASSERT_EQ(data[0], 0x11); + ASSERT_EQ(data[17], 0x22); + ASSERT_EQ(data[1234], 0x33); + } + + TEST_F( MetaSpaceTest, testMetaSpaceLoadReportsLoadedPages ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto mapping_pair = createMappingPair(); + SparsePair sparse_pair(SparsePair::tag_create(), mapping_pair); + + auto io = createIO(file); + auto stream = createStream(io); + auto memspace = 
MetaSpace::create(page_size, sparse_pair, stream); + auto first = memspace.alloc(page_size); + auto second = memspace.alloc(page_size); + fillPage(memspace, first, 0x11); + fillPage(memspace, second, 0x22); + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + + MetaPrefix prefix(page_size, sparse_pair); + std::vector loaded_pages; + load(prefix, stream); + prefix.forAllocatedAddresses([&](std::uint64_t address) { + loaded_pages.push_back(address / page_size); + }); + + std::sort(loaded_pages.begin(), loaded_pages.end()); + ASSERT_EQ(loaded_pages.size(), 2u); + ASSERT_EQ(loaded_pages[0], first.getOffset() / page_size); + ASSERT_EQ(loaded_pages[1], second.getOffset() / page_size); + } + + TEST_F( MetaSpaceTest, testMetaSpaceCapturesPreviousPageOnlyOnFirstDirtyMap ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto mapping_pair = createMappingPair(); + SparsePair sparse_pair(SparsePair::tag_create(), mapping_pair); + + auto io = createIO(file); + auto stream = createStream(io); + auto memspace = MetaSpace::create(page_size, sparse_pair, stream); + auto address = memspace.alloc(page_size); + fillPage(memspace, address, 0x11); + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + + { + auto lock = memspace.getPrefix().mapRange(address.getOffset(), page_size, { AccessOptions::write }); + auto *data = static_cast(lock.modify()); + data[17] = 0x22; + } + { + auto lock = memspace.getPrefix().mapRange(address.getOffset(), page_size, { AccessOptions::write }); + auto *data = static_cast(lock.modify()); + data[1234] = 0x33; + } + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + + auto reopened = MetaSpace::create(page_size, sparse_pair, stream); + auto data = readPage(reopened, address); + ASSERT_EQ(data[0], 0x11); + ASSERT_EQ(data[17], 0x22); + ASSERT_EQ(data[1234], 0x33); + } + + TEST_F( MetaSpaceTest, testMetaSpaceNoopCommitDoesNotAdvanceState ) + { + CFile::create(file_name, {}); + CFile file(file_name, 
AccessType::READ_WRITE); + auto mapping_pair = createMappingPair(); + SparsePair sparse_pair(SparsePair::tag_create(), mapping_pair); + + auto io = createIO(file); + auto stream = createStream(io); + auto memspace = MetaSpace::create(page_size, sparse_pair, stream); + auto address = memspace.alloc(page_size); + fillPage(memspace, address, 0x7f); + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + auto state_num = memspace.getStateNum(); + + ASSERT_FALSE(flushMeta(memspace, stream, sparse_pair)); + ASSERT_EQ(memspace.getStateNum(), state_num); + } + + TEST_F( MetaSpaceTest, testMetaSpaceReopenAllocatorGrowsFromLoadedHighWater ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto mapping_pair = createMappingPair(); + SparsePair sparse_pair(SparsePair::tag_create(), mapping_pair); + + auto io = createIO(file); + auto stream = createStream(io); + auto memspace = MetaSpace::create(page_size, sparse_pair, stream); + auto first = memspace.alloc(page_size); + auto second = memspace.alloc(page_size); + fillPage(memspace, first, 0x01); + fillPage(memspace, second, 0x02); + memspace.free(second); + auto reused = memspace.alloc(page_size); + ASSERT_EQ(reused, second); + fillPage(memspace, reused, 0x03); + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + + auto reopened = MetaSpace::create(page_size, sparse_pair, stream); + auto next = reopened.alloc(page_size); + ASSERT_EQ(next.getOffset(), second.getOffset() + page_size); + } + + TEST_F( MetaSpaceTest, testMetaSpaceReopenAllocatorRestoresSparseHoles ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto mapping_pair = createMappingPair(); + SparsePair sparse_pair(SparsePair::tag_create(), mapping_pair); + + auto io = createIO(file); + auto stream = createStream(io); + auto memspace = MetaSpace::create(page_size, sparse_pair, stream); + auto first = memspace.alloc(page_size); + auto second = memspace.alloc(page_size); + auto third = 
memspace.alloc(page_size); + fillPage(memspace, first, 0x01); + fillPage(memspace, third, 0x03); + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + + auto reopened = MetaSpace::create(page_size, sparse_pair, stream); + auto reused = reopened.alloc(page_size); + ASSERT_EQ(reused, second); + } + + TEST_F( MetaSpaceTest, testMSMetaSpacePersistsSlotZeroAndNonZeroSlot ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto mapping_pair = createMappingPair(); + SparsePair sparse_pair(SparsePair::tag_create(), mapping_pair); + + auto io = createIO(file); + auto stream = createStream(io); + auto memspace = MS_MetaSpace::create(page_size, sparse_pair, stream); + auto slot_0_address = memspace.alloc(page_size, 0); + auto slot_7_address = memspace.alloc(page_size, 7); + fillPage(memspace, slot_0_address, 0x10); + fillPage(memspace, slot_7_address, 0x70); + + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + + ASSERT_EQ(slot_0_address.getOffset() / page_size, 1u); + ASSERT_EQ(slot_7_address.getOffset(), (7ull << 24) + page_size); + ASSERT_TRUE(sparse_pair.getSparseIndex().lookup(slot_7_address.getOffset() / page_size, memspace.getStateNum())); + + auto reopened = MS_MetaSpace::create(page_size, sparse_pair, stream); + SparsePairManager manager(reopened); + ASSERT_EQ(readPage(reopened, slot_0_address), std::vector(page_size, 0x10)); + ASSERT_EQ(readPage(reopened, slot_7_address), std::vector(page_size, 0x70)); + } + + TEST_F( MetaSpaceTest, testMSAddressWrapsEncodedAddress ) + { + auto encoded_address = MS_Address::encode(7, 42); + auto &address = MS_Address::from(encoded_address); + + ASSERT_EQ(address.slot_id(), 7u); + ASSERT_EQ(address.local_address(), 42u); + ASSERT_EQ(encoded_address, (7ull << 24) + 42); + } + + TEST_F( MetaSpaceTest, testMSMetaSpaceReopenRestoresAllocatorHolePerSlot ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto mapping_pair = createMappingPair(); + 
SparsePair sparse_pair(SparsePair::tag_create(), mapping_pair); + + auto io = createIO(file); + auto stream = createStream(io); + auto memspace = MS_MetaSpace::create(page_size, sparse_pair, stream); + auto first = memspace.alloc(page_size, 3); + auto second = memspace.alloc(page_size, 3); + auto third = memspace.alloc(page_size, 3); + fillPage(memspace, first, 0x01); + fillPage(memspace, third, 0x03); + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + + auto reopened = MS_MetaSpace::create(page_size, sparse_pair, stream); + auto reused = reopened.alloc(page_size, 3); + ASSERT_EQ(reused, second); + } + + TEST_F( MetaSpaceTest, testMSMetaSpaceReopenRestoresAllocatorQueriesForNonZeroSlot ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto mapping_pair = createMappingPair(); + SparsePair sparse_pair(SparsePair::tag_create(), mapping_pair); + + auto io = createIO(file); + auto stream = createStream(io); + auto memspace = MS_MetaSpace::create(page_size, sparse_pair, stream); + auto slot_7_address = memspace.alloc(page_size, 7); + fillPage(memspace, slot_7_address, 0x77); + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + + auto reopened = MS_MetaSpace::create(page_size, sparse_pair, stream); + std::size_t alloc_size = 0; + ASSERT_TRUE(reopened.getAllocator().isAllocated(slot_7_address, &alloc_size)); + ASSERT_EQ(alloc_size, page_size); + ASSERT_EQ(reopened.getAllocator().getAllocSize(slot_7_address), page_size); + auto allocation = reopened.getAllocator().findAllocation(slot_7_address + static_cast(17)); + ASSERT_EQ(allocation.address, slot_7_address); + ASSERT_EQ(allocation.size, page_size); + } + + TEST_F( MetaSpaceTest, testMSMetaSpaceFlushesMultipleSlotsAtomically ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto mapping_pair = createMappingPair(); + SparsePair sparse_pair(SparsePair::tag_create(), mapping_pair); + + auto io = createIO(file); + auto stream = 
createStream(io); + auto memspace = MS_MetaSpace::create(page_size, sparse_pair, stream); + auto slot_1_address = memspace.alloc(page_size, 1); + auto slot_2_address = memspace.alloc(page_size, 2); + fillPage(memspace, slot_1_address, 0x11); + fillPage(memspace, slot_2_address, 0x22); + + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + + auto state_num = memspace.getStateNum(); + ASSERT_EQ(state_num, 1u); + ASSERT_EQ(sparse_pair.getMaxStateNum(), state_num); + ASSERT_EQ(sparse_pair.getSparseIndex().lookup(slot_1_address.getOffset() / page_size, state_num).m_state_num, + state_num); + ASSERT_EQ(sparse_pair.getSparseIndex().lookup(slot_2_address.getOffset() / page_size, state_num).m_state_num, + state_num); + } + + TEST_F( MetaSpaceTest, testMSMetaSpacePersistsDiffInNonZeroSlot ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto mapping_pair = createMappingPair(); + SparsePair sparse_pair(SparsePair::tag_create(), mapping_pair); + + auto io = createIO(file); + auto stream = createStream(io); + auto memspace = MS_MetaSpace::create(page_size, sparse_pair, stream); + auto address = memspace.alloc(page_size, 9); + fillPage(memspace, address, 0x19); + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + + { + auto lock = memspace.getPrefix().mapRange(address.getOffset(), page_size, { AccessOptions::write }); + auto *data = static_cast(lock.modify()); + data[17] = 0x91; + data[1024] = 0x92; + } + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + + auto encoded_page_num = address.getOffset() / page_size; + auto diff_item = sparse_pair.getDiffIndex().findUpper(encoded_page_num, memspace.getStateNum()); + ASSERT_TRUE(diff_item); + ASSERT_EQ(diff_item.m_page_num, encoded_page_num); + + auto reopened = MS_MetaSpace::create(page_size, sparse_pair, stream); + SparsePairManager manager(reopened); + auto data = readPage(reopened, address); + ASSERT_EQ(data[0], 0x19); + ASSERT_EQ(data[17], 0x91); + ASSERT_EQ(data[1024], 
0x92); + } + + TEST_F( MetaSpaceTest, testMSMetaSpaceCompactionCoversMultipleSlots ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto mapping_pair = createMappingPair(); + SparsePair sparse_pair(SparsePair::tag_create(), mapping_pair); + + auto io = createIO(file); + auto stream = createStream(io); + auto memspace = MS_MetaSpace::create(page_size, sparse_pair, stream); + auto slot_4_address = memspace.alloc(page_size, 4); + auto slot_5_address = memspace.alloc(page_size, 5); + fillPage(memspace, slot_4_address, 0x44); + fillPage(memspace, slot_5_address, 0x55); + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + + { + auto lock = memspace.getPrefix().mapRange(slot_4_address.getOffset(), page_size, { AccessOptions::write }); + static_cast(lock.modify())[17] = 0x40; + } + { + auto lock = memspace.getPrefix().mapRange(slot_5_address.getOffset(), page_size, { AccessOptions::write }); + static_cast(lock.modify())[17] = 0x50; + } + + ASSERT_TRUE(compact(dynamic_cast(memspace.getPrefix()), stream)); + + auto reopened = MS_MetaSpace::create(page_size, sparse_pair, stream); + SparsePairManager manager(reopened); + auto slot_4_data = readPage(reopened, slot_4_address); + auto slot_5_data = readPage(reopened, slot_5_address); + ASSERT_EQ(slot_4_data[0], 0x44); + ASSERT_EQ(slot_4_data[17], 0x40); + ASSERT_EQ(slot_5_data[0], 0x55); + ASSERT_EQ(slot_5_data[17], 0x50); + } + + TEST_F( MetaSpaceTest, testSparsePairDeploysOnMetaSpaceWith16KBPageSize ) + { + constexpr std::size_t large_page_size = 16 << 10; + + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto mapping_pair = createMappingPair(large_page_size); + SparsePair mapping_sparse_pair(SparsePair::tag_create(), mapping_pair); + + auto io = createIO(file, large_page_size); + auto stream = createStream(io); + auto meta_space = MetaSpace::create(large_page_size, mapping_sparse_pair, stream); + auto meta_pair = 
createPairFromMetaSpace(meta_space); + + using PageModel = std::map; + std::map sparse_model; + std::map diff_model; + std::vector pages_with_sparse_ops; + std::vector pages_with_diff_ops; + + std::mt19937_64 rng(0xdb0016); + std::uniform_int_distribution page_dist(0, 511); + std::bernoulli_distribution sparse_op_dist(0.62); + + std::uint32_t state_num = 1; + std::uint64_t storage_page_num = 101; + constexpr std::size_t op_count = 1000; + + SparsePair cut(SparsePair::tag_create(), meta_pair); + for (std::size_t i = 0; i < op_count; ++i) { + auto page_num = page_dist(rng); + storage_page_num += 1 + (rng() % 7); + + if (sparse_op_dist(rng)) { + cut.getSparseIndex().emplace(page_num, state_num, storage_page_num); + if (sparse_model[page_num].empty()) { + pages_with_sparse_ops.push_back(page_num); + } + sparse_model[page_num][state_num] = storage_page_num; + } else { + cut.getDiffIndex().insert(page_num, state_num, storage_page_num, (rng() % 23) == 0); + if (diff_model[page_num].empty()) { + pages_with_diff_ops.push_back(page_num); + } + diff_model[page_num][state_num] = storage_page_num; + } + + cut.recordMaxStateNum(state_num); + ++state_num; + } + cut.commit(); + + ASSERT_TRUE(flushMeta(meta_space, stream, mapping_sparse_pair)); + + auto reopened_meta_space = MetaSpace::create(large_page_size, mapping_sparse_pair, stream); + auto reopened_meta_pair = createPairFromMetaSpace(reopened_meta_space); + SparsePair reopened(reopened_meta_pair, AccessType::READ_WRITE, reopened_meta_pair.second->firstAlloc()); + + ASSERT_GT(reopened.size(), 500u); + ASSERT_EQ(reopened.getMaxStateNum(), state_num - 1); + + for (const auto &[page_num, states]: sparse_model) { + for (const auto &[expected_state_num, expected_storage_page_num]: states) { + auto sparse_item = reopened.getSparseIndex().lookup(page_num, expected_state_num); + ASSERT_TRUE(sparse_item); + ASSERT_EQ(sparse_item.m_state_num, expected_state_num); + ASSERT_EQ(sparse_item.m_storage_page_num, expected_storage_page_num); + 
} + } + + for (const auto &[page_num, states]: diff_model) { + for (const auto &[expected_state_num, expected_storage_page_num]: states) { + ASSERT_EQ(reopened.getDiffIndex().findLower(page_num, expected_state_num), expected_state_num); + + auto diff_item = reopened.getDiffIndex().findUpper(page_num, expected_state_num); + ASSERT_TRUE(diff_item); + auto actual_storage_page_num = findDiffStoragePage(diff_item, expected_state_num); + ASSERT_TRUE(actual_storage_page_num); + ASSERT_EQ(*actual_storage_page_num, expected_storage_page_num); + } + } + + std::shuffle(pages_with_sparse_ops.begin(), pages_with_sparse_ops.end(), rng); + for (std::size_t i = 0; i < std::min(pages_with_sparse_ops.size(), 256); ++i) { + auto page_num = pages_with_sparse_ops[i]; + auto query_state_num = static_cast(rng() % state_num); + auto expected_it = sparse_model[page_num].upper_bound(query_state_num); + auto sparse_item = reopened.getSparseIndex().lookup(page_num, query_state_num); + if (expected_it == sparse_model[page_num].begin()) { + ASSERT_FALSE(sparse_item); + } else { + --expected_it; + ASSERT_TRUE(sparse_item); + ASSERT_EQ(sparse_item.m_state_num, expected_it->first); + ASSERT_EQ(sparse_item.m_storage_page_num, expected_it->second); + } + } + + std::shuffle(pages_with_diff_ops.begin(), pages_with_diff_ops.end(), rng); + for (std::size_t i = 0; i < std::min(pages_with_diff_ops.size(), 256); ++i) { + auto page_num = pages_with_diff_ops[i]; + auto query_state_num = static_cast(rng() % state_num); + auto expected_lower_it = diff_model[page_num].upper_bound(query_state_num); + auto expected_upper_it = diff_model[page_num].lower_bound(query_state_num); + + if (expected_lower_it == diff_model[page_num].begin()) { + ASSERT_EQ(reopened.getDiffIndex().findLower(page_num, query_state_num), 0u); + } else { + --expected_lower_it; + ASSERT_EQ(reopened.getDiffIndex().findLower(page_num, query_state_num), expected_lower_it->first); + } + + auto diff_item = reopened.getDiffIndex().findUpper(page_num, 
query_state_num); + if (expected_upper_it == diff_model[page_num].end()) { + ASSERT_FALSE(diff_item); + } else { + ASSERT_TRUE(diff_item); + auto actual_storage_page_num = findDiffStoragePage(diff_item, expected_upper_it->first); + ASSERT_TRUE(actual_storage_page_num); + ASSERT_EQ(*actual_storage_page_num, expected_upper_it->second); + } + } + } + + TEST_F( MetaSpaceTest, testMetaSpaceCompactionRewritesDiffBackedPageAndReopens ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto mapping_pair = createMappingPair(); + SparsePair sparse_pair(SparsePair::tag_create(), mapping_pair); + + auto io = createIO(file); + auto stream = createStream(io); + auto memspace = MetaSpace::create(page_size, sparse_pair, stream); + auto address = memspace.alloc(page_size); + fillPage(memspace, address, 0x11); + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + + { + auto lock = memspace.getPrefix().mapRange(address.getOffset(), page_size, { AccessOptions::write }); + auto *data = static_cast(lock.modify()); + data[17] = 0x22; + data[1234] = 0x33; + } + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + ASSERT_GT(sparse_pair.getDiffIndex().size(), 0u); + auto diff_item = sparse_pair.getDiffIndex().findUpper(address.getOffset() / page_size, memspace.getStateNum()); + ASSERT_TRUE(diff_item); + auto stale_diff_storage_page = findDiffStoragePage(diff_item, memspace.getStateNum()); + ASSERT_TRUE(stale_diff_storage_page); + + ASSERT_TRUE(compactMeta(memspace, stream)); + ASSERT_GT(sparse_pair.getDiffIndex().size(), 0u); + + { + auto lock = memspace.getPrefix().mapRange(address.getOffset(), page_size, { AccessOptions::write }); + static_cast(lock.modify())[2048] = 0x44; + } + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + auto next_diff_item = sparse_pair.getDiffIndex().findUpper(address.getOffset() / page_size, memspace.getStateNum()); + ASSERT_TRUE(next_diff_item); + auto next_diff_storage_page = 
findDiffStoragePage(next_diff_item, memspace.getStateNum()); + ASSERT_TRUE(next_diff_storage_page); + ASSERT_NE(*next_diff_storage_page, *stale_diff_storage_page); + + auto reopened = MetaSpace::create(page_size, sparse_pair, stream); + auto data = readPage(reopened, address); + ASSERT_EQ(data[0], 0x11); + ASSERT_EQ(data[17], 0x22); + ASSERT_EQ(data[1234], 0x33); + ASSERT_EQ(data[2048], 0x44); + } + + TEST_F( MetaSpaceTest, testMetaSpaceCompactionReusesStaleFullDP ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto mapping_pair = createMappingPair(); + SparsePair sparse_pair(SparsePair::tag_create(), mapping_pair); + + auto io = createIO(file); + auto stream = createStream(io); + auto memspace = MetaSpace::create(page_size, sparse_pair, stream); + auto address = memspace.alloc(page_size); + fillPage(memspace, address, 0x10); + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + auto initial_item = sparse_pair.getSparseIndex().lookup(address.getOffset() / page_size, memspace.getStateNum()); + ASSERT_TRUE(initial_item); + auto stale_storage_page = initial_item.m_storage_page_num; + ASSERT_NE(stale_storage_page, 0u); + + ASSERT_TRUE(compactMeta(memspace, stream)); + auto first_compact_item = sparse_pair.getSparseIndex().lookup(address.getOffset() / page_size, memspace.getStateNum()); + ASSERT_TRUE(first_compact_item); + ASSERT_NE(first_compact_item.m_storage_page_num, stale_storage_page); + + { + auto lock = memspace.getPrefix().mapRange(address.getOffset(), page_size, { AccessOptions::write }); + static_cast(lock.modify())[0] = 0x20; + } + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + ASSERT_TRUE(compactMeta(memspace, stream)); + + auto second_compact_item = sparse_pair.getSparseIndex().lookup(address.getOffset() / page_size, memspace.getStateNum()); + ASSERT_TRUE(second_compact_item); + ASSERT_NE(second_compact_item.m_storage_page_num, 0u); + + { + auto lock = 
memspace.getPrefix().mapRange(address.getOffset(), page_size, { AccessOptions::write }); + static_cast(lock.modify())[0] = 0x30; + } + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + ASSERT_TRUE(compactMeta(memspace, stream)); + + auto third_compact_item = sparse_pair.getSparseIndex().lookup(address.getOffset() / page_size, memspace.getStateNum()); + ASSERT_TRUE(third_compact_item); + ASSERT_NE(third_compact_item.m_storage_page_num, 0u); + + auto reopened = MetaSpace::create(page_size, sparse_pair, stream); + auto data = readPage(reopened, address); + ASSERT_EQ(data[0], 0x30); + } + + TEST_F( MetaSpaceTest, testMetaSpaceCompactionDoesNotOverwriteCurrentHeadFullDP ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto mapping_pair = createMappingPair(); + SparsePair sparse_pair(SparsePair::tag_create(), mapping_pair); + + auto io = createIO(file); + auto stream = createStream(io); + auto memspace = MetaSpace::create(page_size, sparse_pair, stream); + auto address = memspace.alloc(page_size); + fillPage(memspace, address, 0x10); + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + + auto page_num = address.getOffset() / page_size; + auto head_state_num = memspace.getStateNum(); + auto head_item = sparse_pair.getSparseIndex().lookup(page_num, head_state_num); + ASSERT_TRUE(head_item); + auto head_storage_page_num = head_item.m_storage_page_num; + + { + auto lock = memspace.getPrefix().mapRange(address.getOffset(), page_size, { AccessOptions::write }); + static_cast(lock.modify())[0] = 0x20; + } + + ASSERT_TRUE(compactMeta(memspace, stream)); + auto current_head_data = readStoragePage(stream, head_storage_page_num); + ASSERT_EQ(current_head_data, std::vector(page_size, 0x10)); + } + + TEST_F( MetaSpaceTest, testMetaSpaceCompactionLeavesCurrentHeadDiffReadable ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto mapping_pair = createMappingPair(); + SparsePair 
sparse_pair(SparsePair::tag_create(), mapping_pair); + + auto io = createIO(file); + auto stream = createStream(io); + auto memspace = MetaSpace::create(page_size, sparse_pair, stream); + auto address = memspace.alloc(page_size); + fillPage(memspace, address, 0x11); + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + + { + auto lock = memspace.getPrefix().mapRange(address.getOffset(), page_size, { AccessOptions::write }); + auto *data = static_cast(lock.modify()); + data[17] = 0x22; + data[1234] = 0x33; + } + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + + auto page_num = address.getOffset() / page_size; + auto head_state_num = memspace.getStateNum(); + SparseIndexQuery query(sparse_pair.getSparseIndex(), sparse_pair.getDiffIndex(), page_num, head_state_num); + ASSERT_FALSE(query.empty()); + std::vector current_head_buffer(page_size); + io.read(query.first(), current_head_buffer.data()); + StateNumType diff_state_num = 0; + std::uint64_t diff_storage_page_num = 0; + ASSERT_TRUE(query.next(diff_state_num, diff_storage_page_num)); + ASSERT_EQ(diff_state_num, head_state_num); + + ASSERT_TRUE(compactMeta(memspace, stream)); + + io.applyFrom(diff_storage_page_num, current_head_buffer.data(), { page_num, diff_state_num }); + ASSERT_EQ(current_head_buffer[0], 0x11); + ASSERT_EQ(current_head_buffer[17], 0x22); + ASSERT_EQ(current_head_buffer[1234], 0x33); + } + + TEST_F( MetaSpaceTest, testMetaSpaceCompactionReusesThirdFullDPVersion ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto mapping_pair = createMappingPair(); + SparsePair sparse_pair(SparsePair::tag_create(), mapping_pair); + + auto io = createIO(file); + auto stream = createStream(io); + constexpr std::uint64_t page_num = 1; + bool is_first_page = false; + + std::vector oldest_buffer(page_size, 0x41); + auto oldest_storage_page_num = io.append(oldest_buffer.data(), &is_first_page); + sparse_pair.getSparseIndex().emplace(page_num, 1, 
oldest_storage_page_num); + + std::vector previous_buffer(page_size, 0x42); + auto previous_storage_page_num = io.append(previous_buffer.data(), &is_first_page); + sparse_pair.getSparseIndex().emplace(page_num, 2, previous_storage_page_num); + + std::vector head_buffer(page_size, 0x43); + auto head_storage_page_num = io.append(head_buffer.data(), &is_first_page); + sparse_pair.getSparseIndex().emplace(page_num, 3, head_storage_page_num); + sparse_pair.recordMaxStateNum(3); + sparse_pair.commit(); + + MetaPrefix prefix(page_size, sparse_pair); + ASSERT_EQ(prefix.getStateNum(false), 3u); + + ASSERT_TRUE(compact(prefix, stream)); + + auto compacted_item = sparse_pair.getSparseIndex().lookup(page_num, prefix.getStateNum(false)); + ASSERT_TRUE(compacted_item); + ASSERT_EQ(compacted_item.m_storage_page_num, oldest_storage_page_num); + } + + TEST_F( MetaSpaceTest, testMetaSpaceCompactionPersistsDirtyPageWithoutPriorFlush ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto mapping_pair = createMappingPair(); + SparsePair sparse_pair(SparsePair::tag_create(), mapping_pair); + + auto io = createIO(file); + auto stream = createStream(io); + auto memspace = MetaSpace::create(page_size, sparse_pair, stream); + auto address = memspace.alloc(page_size); + fillPage(memspace, address, 0x55); + + ASSERT_TRUE(compactMeta(memspace, stream)); + auto item = sparse_pair.getSparseIndex().lookup(address.getOffset() / page_size, memspace.getStateNum()); + ASSERT_TRUE(item); + ASSERT_NE(item.m_storage_page_num, 0u); + + auto reopened = MetaSpace::create(page_size, sparse_pair, stream); + auto data = readPage(reopened, address); + ASSERT_EQ(data, std::vector(page_size, 0x55)); + } + + TEST_F( MetaSpaceTest, testMetaSpaceCompactionBiggerSimulatedWorkload ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto mapping_pair = createMappingPair(); + SparsePair sparse_pair(SparsePair::tag_create(), mapping_pair); + + 
auto io = createIO(file); + auto stream = createStream(io); + auto memspace = MetaSpace::create(page_size, sparse_pair, stream); + constexpr std::size_t page_count = 640; + std::vector
addresses; + std::vector > expected_pages; + std::vector dirty_before_second_compact(page_count, false); + addresses.reserve(page_count); + expected_pages.reserve(page_count); + std::mt19937 rng(0xDB005EED); + std::uniform_int_distribution page_dist(0, page_count - 1); + std::uniform_int_distribution sparse_write_count_dist(1, 12); + std::uniform_int_distribution dense_write_count_dist(16, 96); + + for (std::size_t i = 0; i < page_count; ++i) { + auto address = memspace.alloc(page_size); + addresses.push_back(address); + ASSERT_NE(address.getOffset(), 0u) << "page index " << i; + expected_pages.emplace_back(page_size, static_cast((i + 1) & 0xFF)); + fillPage(memspace, address, expected_pages.back()[0]); + } + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + + for (std::uint32_t round = 1; round <= 9; ++round) { + auto operation_count = page_count / 2 + round * 17; + for (std::size_t op = 0; op < operation_count; ++op) { + auto page_index = page_dist(rng); + patchExpectedPageRandom( + memspace, addresses[page_index], expected_pages[page_index], rng, sparse_write_count_dist(rng) + ); + } + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + } + ASSERT_GT(sparse_pair.getDiffIndex().size(), 0u); + + for (std::size_t i = 0; i < 16; ++i) { + auto page_index = page_dist(rng); + ASSERT_EQ(readPage(memspace, addresses[page_index]), expected_pages[page_index]) + << "pre-compact page index " << page_index; + } + ASSERT_TRUE(compactMeta(memspace, stream)); + ASSERT_EQ(sparse_pair.getSparseIndex().size(), page_count); + for (std::size_t i = 0; i < 16; ++i) { + auto page_index = page_dist(rng); + ASSERT_EQ(readPage(memspace, addresses[page_index]), expected_pages[page_index]) + << "post-first-compact page index " << page_index; + } + + for (std::uint32_t round = 10; round <= 12; ++round) { + auto operation_count = page_count / 3 + round * 23; + for (std::size_t op = 0; op < operation_count; ++op) { + auto page_index = page_dist(rng); + patchExpectedPageRandom( + 
memspace, addresses[page_index], expected_pages[page_index], rng, dense_write_count_dist(rng) + ); + if (round == 12) { + dirty_before_second_compact[page_index] = true; + } + } + if (round != 12) { + ASSERT_TRUE(flushMeta(memspace, stream, sparse_pair)); + } + } + + for (std::size_t page_index = 0; page_index < page_count; ++page_index) { + ASSERT_EQ(readPage(memspace, addresses[page_index]), expected_pages[page_index]) + << "pre-second-compact page index " << page_index; + } + ASSERT_TRUE(compactMeta(memspace, stream)); + ASSERT_EQ(sparse_pair.getSparseIndex().size(), page_count); + for (std::size_t i = 0; i < 16; ++i) { + auto page_index = page_dist(rng); + ASSERT_EQ(readPage(memspace, addresses[page_index]), expected_pages[page_index]) + << "post-second-compact page index " << page_index; + } + for (std::size_t page_index = 0; page_index < page_count; ++page_index) { + auto item = sparse_pair.getSparseIndex().lookup( + addresses[page_index].getOffset() / page_size, memspace.getStateNum() + ); + ASSERT_TRUE(item) << "page index " << page_index; + ASSERT_EQ(readStoragePage(stream, item.m_storage_page_num), expected_pages[page_index]) + << "storage page check page index " << page_index + << " dirty before second compact " << dirty_before_second_compact[page_index]; + } + + auto reopened = MetaSpace::create(page_size, sparse_pair, stream); + for (std::size_t i = 0; i < page_count; ++i) { + auto data = readPage(reopened, addresses[i]); + ASSERT_EQ(data, expected_pages[i]) << "page index " << i << " address " << addresses[i].getOffset(); + } + + std::vector allocated_addresses; + dynamic_cast(reopened.getPrefix()).forAllocatedAddresses([&](std::uint64_t address) { + allocated_addresses.push_back(address); + }); + + ASSERT_EQ(allocated_addresses.size(), addresses.size()); + for (std::size_t i = 0; i < addresses.size(); ++i) { + ASSERT_EQ(allocated_addresses[i], addresses[i].getOffset()); + } + } + +} diff --git a/tests/unit_tests/Page_IOTest.cpp 
b/tests/unit_tests/Page_IOTest.cpp index e2a25223..dab5f583 100644 --- a/tests/unit_tests/Page_IOTest.cpp +++ b/tests/unit_tests/Page_IOTest.cpp @@ -5,7 +5,9 @@ #include #include #include +#include #include +#include using namespace std; using namespace db0; @@ -28,6 +30,28 @@ namespace tests virtual void TearDown() override { drop(file_name); } + + static std::vector makePage(std::size_t size, std::byte value) + { + return std::vector(size, value); + } + }; + + class RandomIO_StreamDiffIO: public Diff_IO + { + public: + RandomIO_StreamDiffIO(CFile &file, std::uint32_t page_size, std::uint32_t block_size, + std::function tail_function) + : Diff_IO(0, file, page_size, block_size, 0, 0, 1u, tail_function, 0) + { + } + }; + + class TestRandomIO_Stream: public RandomIO_Stream + { + public: + using RandomIO_Stream::RandomIO_Stream; + using RandomIO_Stream::getPageNum; }; TEST_F( Page_IOTest, testPage_IOAppendMultiple ) @@ -57,9 +81,496 @@ namespace tests ASSERT_EQ(cut.getCurrentStepRemainingPages(), 8); cut.append(buf.data(), 3); ASSERT_EQ(cut.getCurrentStepRemainingPages(), 5); - cut.append(buf.data(), 8); + ASSERT_THROW(cut.append(buf.data(), 6), db0::InternalException); ASSERT_EQ(cut.getCurrentStepRemainingPages(), 5); - ASSERT_EQ(cut.getNextPageNum().first, 11); + cut.append(buf.data(), 5); + ASSERT_EQ(cut.getNextPageNum().first, 8); + } + + TEST_F( Page_IOTest, testPage_IOReserveWithinSingleBlockStep ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto tail_function = [&file]() -> std::uint64_t { + return file.size(); + }; + + auto block_size = page_size * 8; + db0::Page_IO cut(0, file, page_size, block_size, 0, 0, 1u, tail_function, 0); + + ASSERT_EQ(0u, cut.reserve(4)); + ASSERT_EQ(4u, cut.reserve(4)); + ASSERT_EQ(8u, cut.reserve(4)); + } + + TEST_F( Page_IOTest, testReservePoolTracksContiguousStrides ) + { + db0::ReservePool cut; + + cut.add(10, 2); + cut.add(12, 3); + cut.add(20, 1); + + ASSERT_FALSE(cut.empty()); + 
ASSERT_EQ((std::make_pair(10, 5)), cut.next()); + ASSERT_EQ(10u, cut.tryPop(3).value()); + ASSERT_EQ((std::make_pair(13, 2)), cut.next()); + ASSERT_FALSE(cut.tryPop(3).has_value()); + ASSERT_EQ(13u, cut.tryPop(2).value()); + ASSERT_EQ((std::make_pair(20, 1)), cut.next()); + ASSERT_EQ(20u, cut.pop()); + ASSERT_TRUE(cut.empty()); + } + + TEST_F( Page_IOTest, testPage_IOReserveSkippedPagesAreReusedByAppend ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto tail_function = [&file]() -> std::uint64_t { + return file.size(); + }; + + auto block_size = page_size * 2; + db0::Page_IO cut(0, file, page_size, block_size, 0, 0, 2u, tail_function, 0); + + ASSERT_EQ(0u, cut.reserve(1)); + ASSERT_EQ(4u, cut.reserve(4)); + + std::vector write_buf(page_size * 2, 'x'); + ASSERT_EQ(1u, cut.append(write_buf.data(), 2)); + + std::vector read_buf(page_size * 2, 0); + cut.read(1, read_buf.data(), 2); + ASSERT_EQ(write_buf, read_buf); + ASSERT_EQ(3u, cut.append(write_buf.data())); + ASSERT_EQ(8u, cut.append(write_buf.data())); + } + + TEST_F( Page_IOTest, testPage_IOAppendMultipleDoesNotSplitReservePool ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto tail_function = [&file]() -> std::uint64_t { + return file.size(); + }; + + auto block_size = page_size * 2; + db0::Page_IO cut(0, file, page_size, block_size, 0, 0, 2u, tail_function, 0); + + ASSERT_EQ(0u, cut.reserve(1)); + ASSERT_EQ(4u, cut.reserve(4)); + + std::vector write_buf(page_size * 4, 'x'); + ASSERT_EQ(8u, cut.append(write_buf.data(), 4)); + ASSERT_EQ(1u, cut.getNextPageNum().first); + } + + TEST_F( Page_IOTest, testPage_IOPreservesFirstPageFlag ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto tail_function = [&file]() -> std::uint64_t { + return file.size(); + }; + + auto block_size = page_size * 2; + db0::Page_IO cut(0, file, page_size, block_size, 0, 0, 2u, tail_function, 0); + + bool 
is_first_page = false; + ASSERT_EQ(0u, cut.getNextPageNum(&is_first_page).first); + ASSERT_TRUE(is_first_page); + + std::vector write_buf(page_size, 'x'); + ASSERT_EQ(0u, cut.reserve(4)); + + is_first_page = false; + ASSERT_EQ(4u, cut.append(write_buf.data(), &is_first_page)); + ASSERT_TRUE(is_first_page); + } + + TEST_F( Page_IOTest, testPage_IOReserveSkippedPagesAreForgottenAfterReopen ) + { + CFile::create(file_name, {}); + std::uint64_t end_page_num = 0; + { + CFile file(file_name, AccessType::READ_WRITE); + auto tail_function = [&file]() -> std::uint64_t { + return file.size(); + }; + + auto block_size = page_size * 2; + db0::Page_IO cut(0, file, page_size, block_size, 0, 0, 2u, tail_function, 0); + + ASSERT_EQ(0u, cut.reserve(3)); + ASSERT_EQ(4u, cut.reserve(2)); + end_page_num = cut.getEndPageNum(); + } + + CFile file(file_name, AccessType::READ_WRITE); + auto tail_function = [&file]() -> std::uint64_t { + return file.size(); + }; + + auto block_size = page_size * 2; + db0::Page_IO reopened(0, file, page_size, block_size, 4 * page_size, 2, 2u, tail_function, 0); + ASSERT_EQ(6u, end_page_num); + + std::vector write_buf(page_size, 'x'); + ASSERT_EQ(6u, reopened.append(write_buf.data())); + } + + TEST_F( Page_IOTest, testRandomIO_StreamAppendsLargePagesOverSmallPageIO ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto tail_function = [&file]() -> std::uint64_t { + return file.size(); + }; + + RandomIO_StreamDiffIO page_io(file, page_size, page_size * 16, tail_function); + RandomIO_Stream cut(page_io, std::uint32_t(5), page_size * 2); + + auto first = makePage(cut.getPageSize(), std::byte(1)); + auto second = makePage(cut.getPageSize(), std::byte(2)); + auto third = makePage(cut.getPageSize(), std::byte(3)); + + bool is_first_page = false; + ASSERT_EQ(0u, cut.append(first.data(), &is_first_page)); + ASSERT_TRUE(is_first_page); + ASSERT_EQ(2u, cut.append(second.data())); + ASSERT_EQ(5u, cut.append(third.data())); + 
cut.flush(); + + std::vector read_buf(cut.getPageSize()); + cut.readRandom(0, read_buf.data()); + ASSERT_EQ(first, read_buf); + cut.readRandom(2, read_buf.data()); + ASSERT_EQ(second, read_buf); + cut.readRandom(5, read_buf.data()); + ASSERT_EQ(third, read_buf); + + } + + TEST_F( Page_IOTest, testRandomIO_StreamClearReusesLargePageBlocks ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto tail_function = [&file]() -> std::uint64_t { + return file.size(); + }; + + RandomIO_StreamDiffIO page_io(file, page_size, page_size * 16, tail_function); + RandomIO_Stream cut(page_io, std::uint32_t(5), page_size * 2); + + auto first = makePage(cut.getPageSize(), std::byte(1)); + auto second = makePage(cut.getPageSize(), std::byte(2)); + auto replacement = makePage(cut.getPageSize(), std::byte(9)); + + ASSERT_EQ(0u, cut.append(first.data())); + ASSERT_EQ(2u, cut.append(second.data())); + cut.flush(); + auto size_before_clear = file.size(); + + cut.clear(); + ASSERT_EQ(0u, cut.append(replacement.data())); + cut.flush(); + ASSERT_EQ(size_before_clear, file.size()); + + std::vector read_buf(cut.getPageSize()); + cut.readRandom(0, read_buf.data()); + ASSERT_EQ(replacement, read_buf); + + cut.readRandom(0, read_buf.data()); + ASSERT_EQ(replacement, read_buf); + } + + TEST_F( Page_IOTest, testRandomIO_StreamForwardsRandomAccessWithPageSizeTranslation ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto tail_function = [&file]() -> std::uint64_t { + return file.size(); + }; + + RandomIO_StreamDiffIO page_io(file, page_size, page_size * 16, tail_function); + RandomIO_Stream cut(page_io, std::uint32_t(5), page_size * 2); + + auto page = makePage(cut.getPageSize(), std::byte(4)); + auto page_num = cut.append(page.data()); + cut.flush(); + + auto replacement = makePage(cut.getPageSize(), std::byte(8)); + replacement[page_size - 1] = std::byte(0xaa); + replacement[page_size] = std::byte(0xbb); + 
cut.writeRandom(page_num, replacement.data()); + + std::vector read_buf(cut.getPageSize()); + cut.readRandom(page_num, read_buf.data()); + ASSERT_EQ(replacement, read_buf); + } + + TEST_F( Page_IOTest, testRandomIO_StreamRandomAccessIsIndependentOfClear ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto tail_function = [&file]() -> std::uint64_t { + return file.size(); + }; + + RandomIO_StreamDiffIO page_io(file, page_size, page_size * 16, tail_function); + RandomIO_Stream cut(page_io, std::uint32_t(3)); + + auto first = makePage(page_size, std::byte(1)); + auto random_replacement = makePage(page_size, std::byte(7)); + auto stream_replacement = makePage(page_size, std::byte(9)); + + ASSERT_EQ(0u, cut.append(first.data())); + cut.flush(); + + auto random_page_num = page_io.reserve(1); + cut.writeRandom(random_page_num, random_replacement.data()); + + cut.clear(); + + std::vector read_buf(page_size); + cut.readRandom(random_page_num, read_buf.data()); + ASSERT_EQ(random_replacement, read_buf); + + ASSERT_EQ(0u, cut.append(stream_replacement.data())); + cut.flush(); + + cut.readRandom(0, read_buf.data()); + ASSERT_EQ(stream_replacement, read_buf); + + cut.readRandom(random_page_num, read_buf.data()); + ASSERT_EQ(random_replacement, read_buf); + } + + TEST_F( Page_IOTest, testRandomIO_StreamAppendRandomDoesNotAffectManagedStream ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto tail_function = [&file]() -> std::uint64_t { + return file.size(); + }; + + RandomIO_StreamDiffIO page_io(file, page_size, page_size * 16, tail_function); + RandomIO_Stream cut(page_io, std::uint32_t(3)); + + auto stream_first = makePage(page_size, std::byte(1)); + auto random_page = makePage(page_size, std::byte(7)); + auto stream_second = makePage(page_size, std::byte(2)); + + ASSERT_EQ(0u, cut.append(stream_first.data())); + cut.flush(); + + auto random_page_num = cut.appendRandom(random_page.data()); + 
ASSERT_EQ(3u, random_page_num); + + std::vector read_buf(page_size); + cut.readRandom(random_page_num, read_buf.data()); + ASSERT_EQ(random_page, read_buf); + + cut.readRandom(0, read_buf.data()); + ASSERT_EQ(stream_first, read_buf); + + ASSERT_EQ(1u, cut.append(stream_second.data())); + cut.flush(); + + cut.readRandom(0, read_buf.data()); + ASSERT_EQ(stream_first, read_buf); + cut.readRandom(1, read_buf.data()); + ASSERT_EQ(stream_second, read_buf); + + cut.readRandom(random_page_num, read_buf.data()); + ASSERT_EQ(random_page, read_buf); + } + + TEST_F( Page_IOTest, testRandomIO_StreamReadRandomCanAccessStreamAppends ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto tail_function = [&file]() -> std::uint64_t { + return file.size(); + }; + + RandomIO_StreamDiffIO page_io(file, page_size, page_size * 16, tail_function); + RandomIO_Stream cut(page_io, std::uint32_t(13), page_size * 4); + + auto full_page = makePage(cut.getPageSize(), std::byte(4)); + auto full_page_num = cut.append(full_page.data()); + cut.flush(); + + std::vector read_buf(cut.getPageSize()); + cut.readRandom(full_page_num, read_buf.data()); + ASSERT_EQ(full_page, read_buf); + + auto base_page = makePage(cut.getPageSize(), std::byte(0)); + auto changed_page = base_page; + std::memset(changed_page.data() + 17, 0x11, 120); + std::memset(changed_page.data() + page_size * 2 + 31, 0x22, 300); + + std::vector diff_buf; + ASSERT_TRUE(db0::getDiffs(base_page.data(), changed_page.data(), cut.getPageSize(), diff_buf)); + + auto [diff_page_num, overflow] = cut.appendDiff(changed_page.data(), {11, 7}, diff_buf); + ASSERT_FALSE(overflow); + cut.flush(); + + cut.readRandom(diff_page_num, read_buf.data()); + ASSERT_NE(base_page, read_buf); + auto result = base_page; + cut.applyFrom(diff_page_num, result.data(), {11, 7}); + ASSERT_EQ(changed_page, result); + } + + TEST_F( Page_IOTest, testRandomIO_StreamAppendDiffApplies16KBPages ) + { + CFile::create(file_name, {}); + 
CFile file(file_name, AccessType::READ_WRITE); + auto tail_function = [&file]() -> std::uint64_t { + return file.size(); + }; + + RandomIO_StreamDiffIO page_io(file, page_size, page_size * 16, tail_function); + RandomIO_Stream cut(page_io, std::uint32_t(13), page_size * 4); + + auto base_page = makePage(cut.getPageSize(), std::byte(0)); + auto changed_page = base_page; + std::memset(changed_page.data() + 123, 0x11, 120); + std::memset(changed_page.data() + page_size + 31, 0x22, 300); + std::memset(changed_page.data() + page_size * 3 + 17, 0x33, 80); + + std::vector diff_buf; + ASSERT_TRUE(db0::getDiffs(base_page.data(), changed_page.data(), cut.getPageSize(), diff_buf)); + + bool is_first_page = false; + auto [page_num, overflow] = cut.appendDiff(changed_page.data(), {7, 3}, diff_buf, &is_first_page); + ASSERT_EQ(0u, page_num); + ASSERT_FALSE(overflow); + ASSERT_TRUE(is_first_page); + + auto result = base_page; + cut.applyFrom(page_num, result.data(), {7, 3}); + ASSERT_EQ(changed_page, result); + } + + TEST_F( Page_IOTest, testRandomIO_StreamAppendDiffWithOverflowApplies16KBPages ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto tail_function = [&file]() -> std::uint64_t { + return file.size(); + }; + + RandomIO_StreamDiffIO page_io(file, page_size, page_size * 16, tail_function); + RandomIO_Stream cut(page_io, std::uint32_t(9), page_size * 4); + + auto base_page = makePage(cut.getPageSize(), std::byte(0)); + auto changed_page = base_page; + for (std::size_t i = 0; i < changed_page.size(); i += 2) { + changed_page[i] = std::byte(0x7f); + } + + std::vector diff_buf; + ASSERT_TRUE(db0::getDiffs(base_page.data(), changed_page.data(), cut.getPageSize(), diff_buf, + cut.getPageSize() * 2)); + + auto [page_num, overflow] = cut.appendDiff(changed_page.data(), {19, 5}, diff_buf); + ASSERT_EQ(0u, page_num); + ASSERT_TRUE(overflow); + + auto result = base_page; + cut.applyFrom(page_num, result.data(), {19, 5}); + 
ASSERT_EQ(changed_page, result); + } + + TEST_F( Page_IOTest, testRandomIO_StreamOpenReadWritePositionsForAppend ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto tail_function = [&file]() -> std::uint64_t { + return file.size(); + }; + + RandomIO_StreamDiffIO page_io(file, page_size, page_size * 16, tail_function); + auto first = makePage(page_size, std::byte(1)); + auto second = makePage(page_size, std::byte(2)); + auto third = makePage(page_size, std::byte(3)); + auto fourth = makePage(page_size, std::byte(4)); + + TestRandomIO_Stream created(page_io, std::uint32_t(3)); + ASSERT_EQ(0u, created.append(first.data())); + ASSERT_EQ(1u, created.append(second.data())); + ASSERT_EQ(3u, created.append(third.data())); + created.flush(); + + auto stream_page_num = created.getPageNum(); + RandomIO_Stream opened(page_io, stream_page_num, 3, AccessType::READ_WRITE); + ASSERT_EQ(4u, opened.append(fourth.data())); + opened.flush(); + + std::vector read_buf(opened.getPageSize()); + opened.readRandom(0, read_buf.data()); + ASSERT_EQ(first, read_buf); + opened.readRandom(1, read_buf.data()); + ASSERT_EQ(second, read_buf); + opened.readRandom(3, read_buf.data()); + ASSERT_EQ(third, read_buf); + opened.readRandom(4, read_buf.data()); + ASSERT_EQ(fourth, read_buf); + } + + TEST_F( Page_IOTest, testRandomIO_StreamMaintainsIndependentStreamsOverSharedDiffIO ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto tail_function = [&file]() -> std::uint64_t { + return file.size(); + }; + + RandomIO_StreamDiffIO page_io(file, page_size, page_size * 16, tail_function); + auto a1 = makePage(page_size, std::byte(0xa1)); + auto a2 = makePage(page_size, std::byte(0xa2)); + auto a3 = makePage(page_size, std::byte(0xa3)); + auto b1 = makePage(page_size, std::byte(0xb1)); + auto b2 = makePage(page_size, std::byte(0xb2)); + auto b3 = makePage(page_size, std::byte(0xb3)); + + TestRandomIO_Stream stream_a(page_io, 
std::uint32_t(3)); + ASSERT_EQ(0u, stream_a.append(a1.data())); + ASSERT_EQ(1u, stream_a.append(a2.data())); + stream_a.flush(); + + TestRandomIO_Stream stream_b(page_io, std::uint32_t(3)); + ASSERT_EQ(3u, stream_b.append(b1.data())); + ASSERT_EQ(4u, stream_b.append(b2.data())); + stream_b.flush(); + + auto stream_a_page_num = stream_a.getPageNum(); + auto stream_b_page_num = stream_b.getPageNum(); + + RandomIO_Stream opened_a(page_io, stream_a_page_num, 3, AccessType::READ_WRITE); + ASSERT_EQ(6u, opened_a.append(a3.data())); + opened_a.flush(); + + RandomIO_Stream opened_b(page_io, stream_b_page_num, 3, AccessType::READ_WRITE); + ASSERT_EQ(9u, opened_b.append(b3.data())); + opened_b.flush(); + + std::vector read_buf(page_size); + opened_a.readRandom(0, read_buf.data()); + ASSERT_EQ(a1, read_buf); + opened_a.readRandom(1, read_buf.data()); + ASSERT_EQ(a2, read_buf); + opened_a.readRandom(6, read_buf.data()); + ASSERT_EQ(a3, read_buf); + opened_b.readRandom(3, read_buf.data()); + ASSERT_EQ(b1, read_buf); + opened_b.readRandom(4, read_buf.data()); + ASSERT_EQ(b2, read_buf); + opened_b.readRandom(9, read_buf.data()); + ASSERT_EQ(b3, read_buf); } -} \ No newline at end of file +} diff --git a/tests/unit_tests/SGBCompressedLookupTreeTest.cpp b/tests/unit_tests/SGBCompressedLookupTreeTest.cpp index 8e699f51..0b8cc1b0 100644 --- a/tests/unit_tests/SGBCompressedLookupTreeTest.cpp +++ b/tests/unit_tests/SGBCompressedLookupTreeTest.cpp @@ -2,8 +2,10 @@ // Copyright (c) 2025 DBZero Software sp. z o.o. 
#include +#include #include #include +#include #include #include #include @@ -137,7 +139,47 @@ namespace tests } return result; } - + + template + std::vector collectSorted(const TreeT &tree) + { + std::vector result; + for (auto it = tree.sortedBegin(); !it.is_end(); ++it) { + result.push_back(*it); + } + return result; + } + + template + std::vector collectSortedFrom(const TreeT &tree, std::uint64_t first) + { + std::vector result; + for (auto it = tree.sortedBeginFrom(first); !it.is_end(); ++it) { + result.push_back(*it); + } + return result; + } + + template + std::vector collectSortedRange(const TreeT &tree, std::uint64_t first, std::uint64_t end) + { + std::vector result; + for (auto it = tree.sortedBeginFrom(first); !it.is_end() && *it < end; ++it) { + result.push_back(*it); + } + return result; + } + + template + std::vector collectForRange(const TreeT &tree, std::uint64_t first, std::uint64_t end) + { + std::vector result; + tree.forRange(first, end, [&](const auto &item) { + result.push_back(item); + }); + return result; + } + TEST_F( SGB_CompressedLookupTreeTest , testSGBCompressedLookupTreeHeaderIsInitialized ) { // compress uint64 to uint16 @@ -205,6 +247,282 @@ namespace tests } } } + + TEST_F( SGB_CompressedLookupTreeTest , testSGBCompressedLookupTreeCanEraseCompressedKey ) + { + using HeaderT = CompressingTestHeader; + SGB_CompressedLookupTree cut(m_bitspace, + page_size, AccessType::READ_WRITE); + + for (std::uint32_t i = 0; i < 256u; ++i) { + cut.insert(i); + } + cut.insert(1000); + + ASSERT_TRUE(cut.erase_equal(42u)); + ASSERT_FALSE(cut.erase_equal(42u)); + ASSERT_EQ(cut.size(), 256u); + ASSERT_EQ(cut.lower_equal_bound(42u).value(), 41u); + + ASSERT_TRUE(cut.erase_equal(1000u)); + ASSERT_EQ(cut.lower_equal_bound(1000u).value(), 255u); + } + + TEST_F( SGB_CompressedLookupTreeTest , testSGBCompressedLookupTreeCanEraseCompressedRange ) + { + using HeaderT = CompressingTestHeader; + SGB_CompressedLookupTree cut(m_bitspace, + page_size, 
AccessType::READ_WRITE); + + for (std::uint32_t i = 0; i < 256u; ++i) { + cut.insert(i); + } + cut.insert(1000); + cut.insert(1001); + + ASSERT_EQ(cut.erase_range(40u, 200u), 160u); + ASSERT_EQ(cut.size(), 98u); + ASSERT_EQ(cut.lower_equal_bound(39u).value(), 39u); + ASSERT_EQ(cut.lower_equal_bound(199u).value(), 39u); + ASSERT_EQ(cut.lower_equal_bound(200u).value(), 200u); + ASSERT_EQ(cut.lower_equal_bound(1001u).value(), 1001u); + } + + TEST_F( SGB_CompressedLookupTreeTest , testSGBCompressedLookupTreeConstSortedIteratorVisitsAllItems ) + { + using HeaderT = CompressingTestHeader; + SGB_CompressedLookupTree cut(m_bitspace, + page_size, AccessType::READ_WRITE); + + std::vector expected { 3000, 0, 255, 1000, 1005, 40, 41, 2000, 2255, 5 }; + for (auto value : expected) { + cut.insert(value); + } + + std::sort(expected.begin(), expected.end()); + + ASSERT_GT(countNodes(cut), 1); + ASSERT_EQ(collectSorted(cut), expected); + } + + TEST_F( SGB_CompressedLookupTreeTest , testSGBCompressedLookupTreeConstSortedIteratorCanStartFromItem ) + { + using HeaderT = CompressingTestHeader; + SGB_CompressedLookupTree cut(m_bitspace, + page_size, AccessType::READ_WRITE); + + std::vector expected; + for (std::uint64_t base = 0; base <= 3000; base += 1000) { + for (std::uint64_t offset : { 0u, 1u, 40u, 200u, 255u }) { + auto value = base + offset; + cut.insert(value); + expected.push_back(value); + } + } + std::sort(expected.begin(), expected.end()); + + auto expected_begin = std::lower_bound(expected.begin(), expected.end(), 1002u); + ASSERT_GT(countNodes(cut), 1); + ASSERT_EQ(collectSortedFrom(cut, 1002u), std::vector(expected_begin, expected.end())); + } + + TEST_F( SGB_CompressedLookupTreeTest , testSGBCompressedLookupTreeConstSortedIteratorHandlesStartEdges ) + { + using HeaderT = CompressingTestHeader; + SGB_CompressedLookupTree cut(m_bitspace, + page_size, AccessType::READ_WRITE); + + std::vector expected { 100, 101, 102, 1000 }; + for (auto value : expected) { + 
cut.insert(value); + } + std::sort(expected.begin(), expected.end()); + + ASSERT_EQ(collectSortedFrom(cut, 1u), expected); + ASSERT_TRUE(cut.sortedBeginFrom(2000u).is_end()); + } + + TEST_F( SGB_CompressedLookupTreeTest , testSGBCompressedLookupTreeConstSortedIteratorHandlesEmptyTree ) + { + using HeaderT = CompressingTestHeader; + SGB_CompressedLookupTree cut(m_bitspace, + page_size, AccessType::READ_WRITE); + + ASSERT_TRUE(cut.sortedBegin().is_end()); + ASSERT_TRUE(cut.sortedBeginFrom(100u).is_end()); + } + + TEST_F( SGB_CompressedLookupTreeTest , testSGBCompressedLookupTreeConstSortedIteratorStartsWithinSingleNode ) + { + using HeaderT = CompressingTestHeader; + SGB_CompressedLookupTree cut(m_bitspace, + page_size, AccessType::READ_WRITE); + + std::vector expected { 10, 20, 30, 40 }; + for (auto value : expected) { + cut.insert(value); + } + + ASSERT_EQ(countNodes(cut), 1); + ASSERT_EQ(collectSortedFrom(cut, 0u), expected); + ASSERT_EQ(collectSortedFrom(cut, 10u), expected); + ASSERT_EQ(collectSortedFrom(cut, 25u), (std::vector { 30, 40 })); + ASSERT_TRUE(cut.sortedBeginFrom(41u).is_end()); + } + + TEST_F( SGB_CompressedLookupTreeTest , testSGBCompressedLookupTreeConstSortedIteratorStartsAtMultiNodeBoundaries ) + { + using HeaderT = CompressingTestHeader; + SGB_CompressedLookupTree cut(m_bitspace, + page_size, AccessType::READ_WRITE); + + std::vector expected { 0, 1, 255, 1000, 1001, 1255, 2000 }; + for (auto value : expected) { + cut.insert(value); + } + std::sort(expected.begin(), expected.end()); + + ASSERT_GT(countNodes(cut), 1); + ASSERT_EQ(collectSortedFrom(cut, 255u), (std::vector { 255, 1000, 1001, 1255, 2000 })); + ASSERT_EQ(collectSortedFrom(cut, 256u), (std::vector { 1000, 1001, 1255, 2000 })); + ASSERT_EQ(collectSortedFrom(cut, 1000u), (std::vector { 1000, 1001, 1255, 2000 })); + ASSERT_EQ(collectSortedFrom(cut, 1256u), (std::vector { 2000 })); + } + + TEST_F( SGB_CompressedLookupTreeTest , 
testSGBCompressedLookupTreeConstSortedIteratorSupportsBoundedSingleNodeRanges ) + { + using HeaderT = CompressingTestHeader; + SGB_CompressedLookupTree cut(m_bitspace, + page_size, AccessType::READ_WRITE); + + for (auto value : { 10u, 20u, 30u, 40u, 50u }) { + cut.insert(value); + } + + ASSERT_EQ(countNodes(cut), 1); + ASSERT_EQ(collectSortedRange(cut, 15u, 45u), (std::vector { 20, 30, 40 })); + ASSERT_EQ(collectSortedRange(cut, 20u, 20u), (std::vector {})); + ASSERT_EQ(collectSortedRange(cut, 0u, 10u), (std::vector {})); + } + + TEST_F( SGB_CompressedLookupTreeTest , testSGBCompressedLookupTreeConstSortedIteratorSupportsBoundedMultiNodeRanges ) + { + using HeaderT = CompressingTestHeader; + SGB_CompressedLookupTree cut(m_bitspace, + page_size, AccessType::READ_WRITE); + + std::vector expected { 0, 100, 255, 1000, 1001, 1255, 2000, 2001 }; + for (auto value : expected) { + cut.insert(value); + } + + ASSERT_GT(countNodes(cut), 1); + ASSERT_EQ(collectSortedRange(cut, 100u, 1001u), (std::vector { 100, 255, 1000 })); + ASSERT_EQ(collectSortedRange(cut, 256u, 2001u), (std::vector { 1000, 1001, 1255, 2000 })); + ASSERT_EQ(collectSortedRange(cut, 1256u, 1999u), (std::vector {})); + } + + TEST_F( SGB_CompressedLookupTreeTest , testSGBCompressedLookupTreeForRangeVisitsSortedHalfOpenRange ) + { + using HeaderT = CompressingTestHeader; + SGB_CompressedLookupTree cut(m_bitspace, + page_size, AccessType::READ_WRITE); + + std::vector expected { 0, 100, 255, 1000, 1000, 1001, 1255, 2000, 2001 }; + for (auto value : expected) { + cut.insert(value); + } + + ASSERT_GT(countNodes(cut), 1); + ASSERT_EQ(collectForRange(cut, 100u, 1001u), (std::vector { 100, 255, 1000, 1000 })); + ASSERT_EQ(collectForRange(cut, 256u, 2001u), (std::vector { 1000, 1000, 1001, 1255, 2000 })); + ASSERT_EQ(collectForRange(cut, 1256u, 1999u), (std::vector {})); + ASSERT_EQ(collectForRange(cut, 2001u, 2001u), (std::vector {})); + ASSERT_EQ(collectForRange(cut, 2002u, 2001u), (std::vector {})); + } + + TEST_F( 
SGB_CompressedLookupTreeTest , testSGBCompressedLookupTreeConstSortedIteratorKeepsDuplicateItems ) + { + using HeaderT = CompressingTestHeader; + SGB_CompressedLookupTree cut(m_bitspace, + page_size, AccessType::READ_WRITE); + + std::vector expected { 10, 10, 20, 20, 20, 1000, 1000 }; + for (auto value : expected) { + cut.insert(value); + } + std::sort(expected.begin(), expected.end()); + + ASSERT_GT(countNodes(cut), 1); + ASSERT_EQ(collectSorted(cut), expected); + ASSERT_EQ(collectSortedFrom(cut, 20u), (std::vector { 20, 20, 20, 1000, 1000 })); + } + + TEST_F( SGB_CompressedLookupTreeTest , testSGBCompressedLookupTreeEraseRangeEdgeCasesWithSmallNodes ) + { + using HeaderT = CompressingTestHeader; + SGB_CompressedLookupTree cut(m_bitspace, + page_size, AccessType::READ_WRITE); + + ASSERT_EQ(cut.erase_range(0u, 1u), 0u); + ASSERT_TRUE(cut.empty()); + + std::vector expected; + for (std::uint64_t base = 0; base <= 3000; base += 1000) { + for (std::uint64_t offset = 0; offset < 256; ++offset) { + cut.insert(base + offset); + expected.push_back(base + offset); + } + } + ASSERT_GT(countNodes(cut), 1); + ASSERT_EQ(cut.size(), expected.size()); + + auto erase_expected = [&](std::uint64_t first, std::uint64_t last) { + auto first_it = std::lower_bound(expected.begin(), expected.end(), first); + auto last_it = std::lower_bound(expected.begin(), expected.end(), last); + auto count = static_cast(last_it - first_it); + expected.erase(first_it, last_it); + return count; + }; + + ASSERT_EQ(cut.erase_range(40u, 40u), 0u); + ASSERT_EQ(cut.erase_range(41u, 40u), 0u); + ASSERT_EQ(cut.erase_range(260u, 900u), 0u); + ASSERT_EQ(cut.erase_range(4000u, 4100u), 0u); + ASSERT_EQ(cut.size(), expected.size()); + + ASSERT_EQ(cut.erase_range(250u, 1005u), erase_expected(250u, 1005u)); + auto lower_250 = cut.lower_equal_bound(250u); + auto upper_250 = cut.upper_equal_bound(250u); + auto lower_1004 = cut.lower_equal_bound(1004u); + auto lower_1005 = cut.lower_equal_bound(1005u); + 
ASSERT_TRUE(lower_250.has_value()); + ASSERT_TRUE(upper_250.has_value()); + ASSERT_TRUE(lower_1004.has_value()); + ASSERT_TRUE(lower_1005.has_value()); + ASSERT_EQ(lower_250.value(), 249u); + ASSERT_EQ(upper_250.value(), 1005u); + ASSERT_EQ(lower_1004.value(), 249u); + ASSERT_EQ(lower_1005.value(), 1005u); + ASSERT_EQ(cut.size(), expected.size()); + + ASSERT_EQ(cut.erase_range(0u, 3u), erase_expected(0u, 3u)); + ASSERT_FALSE(cut.lower_equal_bound(2u).has_value()); + auto lower_3 = cut.lower_equal_bound(3u); + ASSERT_TRUE(lower_3.has_value()); + ASSERT_EQ(lower_3.value(), 3u); + ASSERT_EQ(cut.size(), expected.size()); + + ASSERT_EQ(cut.erase_range(3250u, 4000u), erase_expected(3250u, 4000u)); + auto lower_4000 = cut.lower_equal_bound(4000u); + ASSERT_TRUE(lower_4000.has_value()); + ASSERT_EQ(lower_4000.value(), 3249u); + ASSERT_EQ(cut.size(), expected.size()); + + ASSERT_EQ(cut.erase_range(0u, 4000u), expected.size()); + ASSERT_TRUE(cut.empty()); + ASSERT_EQ(cut.size(), 0u); + } TEST_F( SGB_CompressedLookupTreeTest , testSGBCompressedLookupTreeFindLowerWhenUnableToFit ) { @@ -244,4 +562,3 @@ namespace tests } } - diff --git a/tests/unit_tests/SlabAllocatorTests.cpp b/tests/unit_tests/SlabAllocatorTests.cpp index 9cc1650a..eb322d4f 100644 --- a/tests/unit_tests/SlabAllocatorTests.cpp +++ b/tests/unit_tests/SlabAllocatorTests.cpp @@ -159,10 +159,12 @@ namespace tests ASSERT_THROW(cut.findAllocation(slotAddress + static_cast(19)), db0::BadAddressException); - auto slot = cut.findAllocation(slotAddress + static_cast(19), static_cast(1)); + auto slot = cut.findAllocation(slotAddress + static_cast(19), + static_cast(1)); ASSERT_EQ(slot.address, slotAddress); ASSERT_EQ(slot.size, 80u); - ASSERT_THROW(cut.findAllocation(slotAddress + static_cast(80), static_cast(1)), db0::BadAddressException); + ASSERT_THROW(cut.findAllocation(slotAddress + static_cast(80), + static_cast(1)), db0::BadAddressException); ASSERT_THROW(cut.findAllocation(Address::fromOffset(32 * 1024 * 1024)), 
db0::BadAddressException); } diff --git a/tests/unit_tests/SparseIndexQueryTest.cpp b/tests/unit_tests/SparseIndexQueryTest.cpp index 4e8525a4..df71dd2e 100644 --- a/tests/unit_tests/SparseIndexQueryTest.cpp +++ b/tests/unit_tests/SparseIndexQueryTest.cpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include @@ -18,13 +20,31 @@ namespace tests { class SparseIndexQueryTest: public testing::Test - { + { + public: + static DRAM_Pair createDramPair(std::size_t page_size) + { + return { + std::make_shared(page_size), + std::make_shared(page_size) + }; + } + + static SparseIndex createSparseIndex(std::size_t page_size) + { + return SparseIndex(SparseIndex::tag_create(), createDramPair(page_size)); + } + + static DiffIndex createDiffIndex(std::size_t page_size) + { + return DiffIndex(DiffIndex::tag_create(), createDramPair(page_size)); + } }; TEST_F( SparseIndexQueryTest , testSparseIndexQueryNoDiffs ) { - SparseIndex sparse_index(16 * 1024); - DiffIndex diff_cut(16 * 1024); + auto sparse_index = createSparseIndex(16 * 1024); + auto diff_cut = createDiffIndex(16 * 1024); // page num, state num, storage page num sparse_index.emplace(1, 1, 1); sparse_index.emplace(1, 3, 17); @@ -45,8 +65,8 @@ namespace tests TEST_F( SparseIndexQueryTest , testSparseIndexQuerySingleDiff ) { - SparseIndex sparse_index(16 * 1024); - DiffIndex diff_cut(16 * 1024); + auto sparse_index = createSparseIndex(16 * 1024); + auto diff_cut = createDiffIndex(16 * 1024); sparse_index.emplace(1, 1, 1); // append diff-mutation for page 1 diff_cut.insert(1, 2, 3); @@ -72,8 +92,8 @@ namespace tests TEST_F( SparseIndexQueryTest , testSparseIndexQueryMultipleDiffs ) { - SparseIndex sparse_index(16 * 1024); - DiffIndex diff_cut(16 * 1024); + auto sparse_index = createSparseIndex(16 * 1024); + auto diff_cut = createDiffIndex(16 * 1024); sparse_index.emplace(1, 1, 1); // append multiple diff-mutations for page 1 diff_cut.insert(1, 2, 3); @@ -117,8 +137,8 @@ namespace tests TEST_F( 
SparseIndexQueryTest , testSparseIndexQueryWithLongDiffsChain ) { - SparseIndex sparse_index(16 * 1024); - DiffIndex diff_index(16 * 1024); + auto sparse_index = createSparseIndex(16 * 1024); + auto diff_index = createDiffIndex(16 * 1024); sparse_index.emplace(1, 1, 1); sparse_index.emplace(4, 7, 2343); // append a long chain of diffs @@ -149,8 +169,8 @@ namespace tests TEST_F( SparseIndexQueryTest , testFindMutationQuery ) { - SparseIndex sparse_index(16 * 1024); - DiffIndex diff_cut(16 * 1024); + auto sparse_index = createSparseIndex(16 * 1024); + auto diff_cut = createDiffIndex(16 * 1024); sparse_index.emplace(1, 1, 1); // append multiple diff-mutations for page 1 diff_cut.insert(1, 2, 3); @@ -191,8 +211,8 @@ namespace tests TEST_F( SparseIndexQueryTest , testSparseIndexQueryIssue1 ) { - SparseIndex sparse_index(16 * 1024); - DiffIndex diff_index(16 * 1024); + auto sparse_index = createSparseIndex(16 * 1024); + auto diff_index = createDiffIndex(16 * 1024); sparse_index.emplace(4, 500, 100); for (auto [page, state, storage]: getDiffIndexData1()) { diff_index.insert(page, state, storage); @@ -209,8 +229,8 @@ namespace tests TEST_F( SparseIndexQueryTest , testSparseIndexQueryLeftLessThan ) { - SparseIndex sparse_index(16 * 1024); - DiffIndex diff_index(16 * 1024); + auto sparse_index = createSparseIndex(16 * 1024); + auto diff_index = createDiffIndex(16 * 1024); sparse_index.emplace(4, 500, 100); sparse_index.emplace(3, 500, 300); for (auto [page, state, storage]: getDiffIndexData1()) { @@ -233,8 +253,8 @@ namespace tests TEST_F( SparseIndexQueryTest , testSparseIndexQueryLessThan ) { - SparseIndex sparse_index(16 * 1024); - DiffIndex diff_index(16 * 1024); + auto sparse_index = createSparseIndex(16 * 1024); + auto diff_index = createDiffIndex(16 * 1024); sparse_index.emplace(4, 500, 100); sparse_index.emplace(3, 500, 300); for (auto [page, state, storage]: getDiffIndexData1()) { @@ -264,8 +284,8 @@ namespace tests TEST_F( SparseIndexQueryTest , 
testSparseIndexQueryStartingFromDiffPage ) { - SparseIndex sparse_index(16 * 1024); - DiffIndex diff_index(16 * 1024); + auto sparse_index = createSparseIndex(16 * 1024); + auto diff_index = createDiffIndex(16 * 1024); // append multiple diff-mutations for page 1 without base page (i.e. 0x0 based) diff_index.insert(1, 2, 3); diff_index.insert(1, 4, 4); @@ -289,8 +309,8 @@ namespace tests TEST_F( SparseIndexQueryTest , testSparseIndexQueryEmpty ) { - SparseIndex sparse_index(16 * 1024); - DiffIndex diff_index(16 * 1024); + auto sparse_index = createSparseIndex(16 * 1024); + auto diff_index = createDiffIndex(16 * 1024); std::vector> diff_data { { 1, 2, 3 }, { 1, 4, 4 }, { 1, 8, 11 }, { 1, 9, 12 }, { 5, 2, 22 }, { 5, 3, 23 }, { 5, 4, 24 }, { 5, 5, 25 }, { 5, 6, 26 }, @@ -322,8 +342,8 @@ namespace tests TEST_F( SparseIndexQueryTest , testSparseIndexQueryZeroBasedChain ) { - SparseIndex sparse_index(16 * 1024); - DiffIndex diff_index(16 * 1024); + auto sparse_index = createSparseIndex(16 * 1024); + auto diff_index = createDiffIndex(16 * 1024); std::vector> diff_data { { 1, 2, 2 }, { 1, 3, 3 }, { 1, 4, 4 }, { 1, 5, 5 }, { 1, 6, 6 }, { 1, 7, 7 }, { 1, 8, 8 }, { 1, 9, 9 }, { 1, 10, 10 }, { 1, 11, 11 }, { 1, 12, 12 }, @@ -351,8 +371,8 @@ namespace tests TEST_F( SparseIndexQueryTest , testSparseIndexQueryZeroBasedDiffChain ) { - SparseIndex sparse_index(16 * 1024); - DiffIndex diff_index(16 * 1024); + auto sparse_index = createSparseIndex(16 * 1024); + auto diff_index = createDiffIndex(16 * 1024); std::vector> diff_data { { 1, 2, 2 }, { 1, 3, 3 }, { 1, 4, 4 }, { 1, 5, 5 }, { 1, 6, 6 }, { 1, 7, 7 }, { 1, 8, 8 }, { 1, 9, 9 }, { 1, 10, 10 }, { 1, 11, 11 }, { 1, 12, 12 }, @@ -385,8 +405,8 @@ namespace tests TEST_F( SparseIndexQueryTest , testSparseIndexQuery_Issue1 ) { - SparseIndex sparse_index(16 * 1024); - DiffIndex diff_index(16 * 1024); + auto sparse_index = createSparseIndex(16 * 1024); + auto diff_index = createDiffIndex(16 * 1024); diff_index.insert(1, 2, 2); 
diff_index.insert(1, 3, 3); sparse_index.emplace(1, 4, 4); @@ -404,8 +424,8 @@ namespace tests TEST_F( SparseIndexQueryTest , testFindMutationOfZeroBasedDPs ) { - SparseIndex sparse_index(16 * 1024); - DiffIndex diff_index(16 * 1024); + auto sparse_index = createSparseIndex(16 * 1024); + auto diff_index = createDiffIndex(16 * 1024); std::vector> diff_data { { 1, 2, 3 }, { 1, 4, 4 }, { 1, 8, 11 }, { 1, 9, 12 }, { 5, 2, 22 }, { 5, 3, 23 }, { 5, 4, 24 }, { 5, 5, 25 }, { 5, 6, 26 }, @@ -439,8 +459,8 @@ namespace tests TEST_F( SparseIndexQueryTest , testSparseIndexStartingFromDiff ) { - SparseIndex sparse_index(16 * 1024); - DiffIndex diff_index(16 * 1024); + auto sparse_index = createSparseIndex(16 * 1024); + auto diff_index = createDiffIndex(16 * 1024); sparse_index.emplace(1, 1, 1); sparse_index.emplace(1, 3, 17); sparse_index.emplace(4, 7, 2343); @@ -453,4 +473,4 @@ namespace tests ASSERT_FALSE(cut.empty()); } -} \ No newline at end of file +} diff --git a/tests/unit_tests/SparseIndexTest.cpp b/tests/unit_tests/SparseIndexTest.cpp index eb47e5a2..001829ff 100644 --- a/tests/unit_tests/SparseIndexTest.cpp +++ b/tests/unit_tests/SparseIndexTest.cpp @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include #include #include @@ -32,21 +34,49 @@ namespace tests void TearDown() override { drop(file_name); } + + template + static SparseIndexT createSparseIndex(std::size_t node_size, + std::vector *change_log = nullptr) + { + DRAM_Pair dram_pair { + std::make_shared(node_size), + std::make_shared(node_size) + }; + return SparseIndexT(typename SparseIndexT::tag_create(), dram_pair, change_log); + } }; - + TEST_F( SparseIndexTest , testSparseIndexCanBeInstantiated ) { - SparseIndex cut(16 * 1024); + auto cut = createSparseIndex(16 * 1024); + } + + TEST_F( SparseIndexTest , testSparseIndexBaseCanUseEmptyHeaderMixin ) + { + using EmptySparseIndexBase = SparseIndexBase; + auto cut = createSparseIndex(16 * 1024); + + cut.emplace(1, 1, 10); + cut.emplace(1, 3, 30); + 
cut.update(1, 4, 40); + cut.modifyMixIn().refresh(); + + ASSERT_FALSE(cut.lookup(1, 1)); + ASSERT_FALSE(cut.lookup(1, 3)); + auto updated = cut.lookup(1, 4); + ASSERT_TRUE(updated); + ASSERT_EQ(updated.m_storage_page_num, 40u); } TEST_F( SparseIndexTest , testSparseIndexCanAppendPageDescriptors ) { - SparseIndex cut(16 * 1024); + auto cut = createSparseIndex(16 * 1024); cut.emplace(0, 0, 0); } void testSparseIndexLookupPageDescriptors(std::size_t node_size) { - SparseIndex cut(node_size); + auto cut = SparseIndexTest::createSparseIndex(node_size); std::vector items { // page number, state number, physical page number, page type { 0, 1, 0 }, { 1, 1, 1 }, { 2, 1, 2 }, { 3, 2, 3 }, { 0, 2, 4 }, { 2, 3, 5 }, { 4, 4, 6 } @@ -92,36 +122,47 @@ namespace tests testSparseIndexLookupPageDescriptors(16 * 1024); } - TEST_F( SparseIndexTest , testSparseIndexCanTrackMaxStoragePageNum ) + TEST_F( SparseIndexTest , testSparseIndexOwnerCanRecordNextStoragePageNum ) { - SparseIndex cut(16 * 1024); - std::vector items { - // page number, state number, physical page number, page type - { 0, 0, 0 }, { 1, 0, 1 }, { 2, 0, 2 }, { 3, 1, 3 }, { 0, 1, 4 }, { 2, 2, 5 }, { 4, 3, 6 } - }; - for (auto &item: items) { - cut.insert(item); - } - ASSERT_EQ(cut.getNextStoragePageNum(), 7); + auto cut = createSparseIndex(16 * 1024); + cut.emplace(4, 3, 6); + ASSERT_EQ(cut.mixIn().getNextStoragePageNum(), std::nullopt); + cut.modifyMixIn().recordNextStoragePageNum(7); + ASSERT_EQ(cut.mixIn().getNextStoragePageNum(), 7); } - TEST_F( SparseIndexTest , testSparseIndexCanTrackMaxStateNum ) + TEST_F( SparseIndexTest , testSparseIndexOwnerCanRecordMaxStateNum ) { - SparseIndex cut(16 * 1024); - std::vector items { - // page number, state number, physical page number, page type - { 0, 0, 0 }, { 1, 0, 1 }, { 2, 0, 2 }, { 3, 1, 3 }, { 0, 1, 4 }, { 2, 2, 5 }, { 4, 3, 6 } - }; - for (auto &item: items) { - cut.insert(item); - } - ASSERT_EQ(cut.getMaxStateNum(), 3); + auto cut = createSparseIndex(16 * 1024); + 
cut.emplace(4, 3, 6); + ASSERT_EQ(cut.mixIn().getMaxStateNum(), 0); + cut.modifyMixIn().recordMaxStateNum(3); + ASSERT_EQ(cut.mixIn().getMaxStateNum(), 3); + } + + TEST_F( SparseIndexTest , testSparseIndexUpdateReplacesOlderPageDescriptors ) + { + auto cut = createSparseIndex(16 * 1024); + cut.emplace(1, 1, 10); + cut.emplace(1, 3, 30); + cut.emplace(2, 2, 20); + + cut.update(1, 4, 40); + + ASSERT_FALSE(cut.lookup(1, 1)); + ASSERT_FALSE(cut.lookup(1, 3)); + auto updated = cut.lookup(1, 4); + ASSERT_TRUE(updated); + ASSERT_EQ(updated.m_storage_page_num, 40u); + ASSERT_TRUE(cut.lookup(2, 2)); + ASSERT_EQ(cut.mixIn().getNextStoragePageNum(), std::nullopt); + ASSERT_EQ(cut.mixIn().getMaxStateNum(), 0); } TEST_F( SparseIndexTest , testSparseIndexCanBeUpdatedByDRAMSpaceSwap ) { std::size_t node_size = 16 * 1024; - SparseIndex sparse_index(node_size); + auto sparse_index = createSparseIndex(node_size); DRAM_Pair dram_pair; auto dram_space = DRAMSpace::create(node_size, [&](DRAM_Pair dp) { dram_pair = dp; @@ -155,15 +196,14 @@ namespace tests (*dram_pair.first) = sparse_index.getDRAMPrefix(); // make sure the contents is in-sync for (unsigned int i = 0; i < 5; ++i) { - auto state_num = sparse_index.getMaxStateNum(); - ASSERT_EQ(cut.lookup(i, state_num), sparse_index.lookup(i, state_num)); + ASSERT_EQ(cut.lookup(i, 3), sparse_index.lookup(i, 3)); } } TEST_F( SparseIndexTest , testSparseIndexMaxStateNumUpdatedAfterRefresh ) { std::size_t node_size = 16 * 1024; - SparseIndex sparse_index(node_size); + auto sparse_index = createSparseIndex(node_size); DRAM_Pair dram_pair; auto dram_space = DRAMSpace::create(node_size, [&](DRAM_Pair dp) { dram_pair = dp; @@ -178,12 +218,13 @@ namespace tests for (auto &item: items_1) { sparse_index.insert(item); } + sparse_index.modifyMixIn().recordMaxStateNum(1); // copy DRAM binary contents between the instances *(dram_pair.first) = sparse_index.getDRAMPrefix(); // make sure max-state-number reported correctly after refresh cut.refresh(); - 
ASSERT_EQ(cut.getMaxStateNum(), 1); + ASSERT_EQ(cut.mixIn().getMaxStateNum(), 1); std::vector items_2 { // page number, state number, physical page number, page type @@ -193,15 +234,16 @@ namespace tests for (auto &item: items_2) { sparse_index.insert(item); } + sparse_index.modifyMixIn().recordMaxStateNum(3); (*dram_pair.first) = sparse_index.getDRAMPrefix(); cut.refresh(); - ASSERT_EQ(cut.getMaxStateNum(), 3); + ASSERT_EQ(cut.mixIn().getMaxStateNum(), 3); } TEST_F( SparseIndexTest , testSparseIndexInsertFailingCase ) { - SparseIndex cut(16 * 1024); + auto cut = createSparseIndex(16 * 1024); std::vector items { // page number, state number, physical page number, page type { 0, 1, 0 } @@ -219,7 +261,7 @@ namespace tests TEST_F( SparseIndexTest , testSparseIndexInsertLookupFailingCase ) { - SparseIndex cut(16 * 1024); + auto cut = createSparseIndex(16 * 1024); std::vector items { // page number, state number, physical page number, page type { 0, 1, 0 } @@ -230,5 +272,326 @@ namespace tests ASSERT_TRUE(cut.lookup(0, 1)); } + + TEST_F( SparseIndexTest , testSparseIndexCanEraseExactPageState ) + { + auto cut = createSparseIndex(16 * 1024); + cut.emplace(1, 1, 10); + cut.emplace(1, 3, 30); + cut.emplace(2, 1, 20); + + ASSERT_TRUE(cut.erase(1, 3)); + ASSERT_FALSE(cut.erase(1, 3)); + ASSERT_EQ(cut.size(), 2u); + ASSERT_EQ(cut.lookup(1, 3).m_storage_page_num, 10u); + ASSERT_EQ(cut.lookup(2, 1).m_storage_page_num, 20u); + } + + TEST_F( SparseIndexTest , testSparseIndexEraseBelowKeepsThresholdState ) + { + auto cut = createSparseIndex(16 * 1024); + cut.emplace(1, 1, 10); + cut.emplace(1, 3, 30); + cut.emplace(1, 5, 50); + cut.emplace(2, 1, 20); + + ASSERT_EQ(cut.eraseBelow(1, 3), 1u); + ASSERT_FALSE(cut.lookup(1, 1)); + ASSERT_EQ(cut.lookup(1, 3).m_storage_page_num, 30u); + ASSERT_EQ(cut.lookup(1, 5).m_storage_page_num, 50u); + ASSERT_EQ(cut.lookup(2, 5).m_storage_page_num, 20u); + } + + TEST_F( SparseIndexTest , testSparseIndexEraseBelowNoOpCases ) + { + auto cut = 
createSparseIndex(16 * 1024); + cut.emplace(1, 1, 10); + cut.emplace(1, 3, 30); + + ASSERT_EQ(cut.eraseBelow(1, 0), 0u); + ASSERT_EQ(cut.eraseBelow(1, 1), 0u); + ASSERT_EQ(cut.eraseBelow(2, 5), 0u); + ASSERT_EQ(cut.size(), 2u); + } + + TEST_F( SparseIndexTest , testSparseIndexEraseBelowCanEraseAcrossNodes ) + { + auto cut = createSparseIndex(256); + for (std::uint32_t state_num = 1; state_num <= 200; ++state_num) { + cut.emplace(1, state_num, state_num); + cut.emplace(2, state_num, 1000 + state_num); + } + + ASSERT_EQ(cut.eraseBelow(1, 150), 149u); + ASSERT_FALSE(cut.lookup(1, 149)); + ASSERT_EQ(cut.lookup(1, 150).m_storage_page_num, 150u); + ASSERT_EQ(cut.lookup(2, 149).m_storage_page_num, 1149u); + } + + TEST_F( SparseIndexTest , testSparseIndexEraseRangeSupportsOptionalBounds ) + { + auto cut = createSparseIndex(256); + for (std::uint32_t state_num = 1; state_num <= 20; ++state_num) { + cut.emplace(1, state_num, state_num); + cut.emplace(2, state_num, 1000 + state_num); + cut.emplace(3, state_num, 2000 + state_num); + } + ASSERT_EQ(cut.size(), 60u); + + ASSERT_EQ(cut.eraseRange(1, 5, 10), 5u); + ASSERT_EQ(cut.size(), 55u); + ASSERT_EQ(cut.lookup(1, 4).m_storage_page_num, 4u); + ASSERT_EQ(cut.lookup(1, 5).m_state_num, 4u); + ASSERT_EQ(cut.lookup(1, 9).m_state_num, 4u); + ASSERT_EQ(cut.lookup(1, 10).m_storage_page_num, 10u); + ASSERT_EQ(cut.lookup(2, 9).m_storage_page_num, 1009u); + + ASSERT_EQ(cut.eraseRange(1, {}, 4), 3u); + ASSERT_EQ(cut.size(), 52u); + ASSERT_EQ(cut.lookup(1, 4).m_storage_page_num, 4u); + + ASSERT_EQ(cut.eraseRange(2, 18, {}), 3u); + ASSERT_EQ(cut.size(), 49u); + ASSERT_EQ(cut.lookup(2, 17).m_storage_page_num, 1017u); + ASSERT_EQ(cut.lookup(2, 18).m_state_num, 17u); + ASSERT_EQ(cut.lookup(2, 20).m_state_num, 17u); + ASSERT_EQ(cut.lookup(3, 20).m_storage_page_num, 2020u); + + ASSERT_EQ(cut.eraseRange(3), 20u); + ASSERT_EQ(cut.size(), 29u); + ASSERT_FALSE(cut.lookup(3, 20)); + ASSERT_EQ(cut.lookup(2, 17).m_storage_page_num, 1017u); + } + + 
TEST_F( SparseIndexTest , testSparseIndexEraseRangeNoOpCases ) + { + auto cut = createSparseIndex(16 * 1024); + cut.emplace(1, 1, 10); + cut.emplace(1, 3, 30); + + ASSERT_EQ(cut.eraseRange(1, 1, 1), 0u); + ASSERT_EQ(cut.eraseRange(1, 3, 1), 0u); + ASSERT_EQ(cut.eraseRange(2), 0u); + ASSERT_EQ(cut.size(), 2u); + } + + TEST_F( SparseIndexTest , testSparseIndexEraseRangeLowerOnlyAtMaxPage ) + { + auto cut = createSparseIndex(16 * 1024); + constexpr auto page_num = (static_cast(std::numeric_limits::max()) << 24) | 0xFFFFFFu; + constexpr auto max_state_num = std::numeric_limits::max(); + cut.emplace(page_num, 1, 10); + cut.emplace(page_num, max_state_num, 20); + cut.emplace(page_num - 1, max_state_num, 30); + + ASSERT_EQ(cut.eraseRange(page_num, max_state_num, {}), 1u); + ASSERT_EQ(cut.lookup(page_num, 1).m_storage_page_num, 10u); + ASSERT_EQ(cut.lookup(page_num, max_state_num).m_state_num, 1u); + ASSERT_EQ(cut.lookup(page_num - 1, max_state_num).m_storage_page_num, 30u); + } + + TEST_F( SparseIndexTest , testSparseIndexEraseBelowEdgeCasesWithSmallNodes ) + { + auto cut = createSparseIndex(192); + for (std::uint32_t state_num = 1; state_num <= 80; ++state_num) { + cut.emplace(1, state_num, state_num); + cut.emplace(2, state_num, 1000 + state_num); + } + ASSERT_EQ(cut.size(), 160u); + + ASSERT_EQ(cut.eraseBelow(1, 1), 0u); + ASSERT_EQ(cut.eraseBelow(1, 0), 0u); + ASSERT_EQ(cut.eraseBelow(99, 50), 0u); + ASSERT_EQ(cut.size(), 160u); + + ASSERT_EQ(cut.eraseBelow(1, 41), 40u); + ASSERT_FALSE(cut.lookup(1, 40)); + ASSERT_EQ(cut.lookup(1, 41).m_storage_page_num, 41u); + ASSERT_EQ(cut.lookup(2, 40).m_storage_page_num, 1040u); + ASSERT_EQ(cut.size(), 120u); + + ASSERT_EQ(cut.eraseBelow(1, 41), 0u); + ASSERT_TRUE(cut.erase(1, 41)); + ASSERT_FALSE(cut.lookup(1, 41)); + ASSERT_EQ(cut.lookup(1, 42).m_storage_page_num, 42u); + ASSERT_EQ(cut.size(), 119u); + + ASSERT_EQ(cut.eraseBelow(1, std::numeric_limits::max()), 39u); + ASSERT_FALSE(cut.lookup(1, 100)); + ASSERT_EQ(cut.lookup(2, 
80).m_storage_page_num, 1080u); + ASSERT_EQ(cut.size(), 80u); + + ASSERT_EQ(cut.eraseBelow(2, std::numeric_limits::max()), 80u); + ASSERT_TRUE(cut.empty()); + } + + TEST_F( SparseIndexTest , testSparseIndexEraseDoesNotRecordChangeLog ) + { + std::vector change_log; + auto cut = createSparseIndex(16 * 1024, &change_log); + cut.emplace(1, 1, 10); + cut.emplace(1, 2, 20); + cut.emplace(1, 3, 30); + change_log.clear(); + + ASSERT_EQ(cut.eraseBelow(1, 3), 2u); + ASSERT_TRUE(change_log.empty()); + + change_log.clear(); + ASSERT_TRUE(cut.erase(1, 3)); + ASSERT_TRUE(change_log.empty()); + + change_log.clear(); + ASSERT_FALSE(cut.erase(1, 3)); + ASSERT_TRUE(change_log.empty()); + + ASSERT_EQ(cut.eraseRange(1), 0u); + ASSERT_TRUE(change_log.empty()); + } + + TEST_F( SparseIndexTest , testSparseIndexClearRemovesAllDescriptorsAndPreservesCounters ) + { + auto cut = createSparseIndex(192); + for (std::uint32_t state_num = 1; state_num <= 80; ++state_num) { + cut.emplace(1, state_num, state_num); + cut.emplace(2, state_num, 1000 + state_num); + } + cut.modifyMixIn().recordNextStoragePageNum(1081); + cut.modifyMixIn().recordMaxStateNum(80); + ASSERT_GT(cut.size(), 2u); + ASSERT_EQ(cut.mixIn().getNextStoragePageNum(), 1081u); + ASSERT_EQ(cut.mixIn().getMaxStateNum(), 80u); + + cut.clear(); + + ASSERT_TRUE(cut.empty()); + ASSERT_EQ(cut.size(), 0u); + ASSERT_EQ(cut.mixIn().getNextStoragePageNum(), 1081u); + ASSERT_EQ(cut.mixIn().getMaxStateNum(), 80u); + ASSERT_FALSE(cut.lookup(1, 80)); + ASSERT_FALSE(cut.lookup(2, 80)); + + cut.emplace(3, 81, 0); + ASSERT_EQ(cut.size(), 1u); + ASSERT_EQ(cut.lookup(3, 81).m_storage_page_num, 0u); + ASSERT_EQ(cut.mixIn().getNextStoragePageNum(), 1081u); + ASSERT_EQ(cut.mixIn().getMaxStateNum(), 80u); + } + + TEST_F( SparseIndexTest , testSparseIndexClearEmptyAndChangeLogNoOp ) + { + std::vector change_log; + auto cut = createSparseIndex(16 * 1024, &change_log); + + cut.clear(); + ASSERT_TRUE(cut.empty()); + ASSERT_EQ(cut.size(), 0u); + 
ASSERT_TRUE(change_log.empty()); + + cut.emplace(1, 1, 10); + ASSERT_FALSE(change_log.empty()); + change_log.clear(); + + cut.clear(); + ASSERT_TRUE(cut.empty()); + ASSERT_EQ(cut.size(), 0u); + ASSERT_TRUE(change_log.empty()); + ASSERT_EQ(cut.mixIn().getNextStoragePageNum(), std::nullopt); + ASSERT_EQ(cut.mixIn().getMaxStateNum(), 0u); + + cut.emplace(2, 2, 0); + ASSERT_EQ(cut.mixIn().getNextStoragePageNum(), std::nullopt); + ASSERT_EQ(cut.mixIn().getMaxStateNum(), 0u); + } + + TEST_F( SparseIndexTest , testSparseIndexForPageRangeUsesHalfOpenBounds ) + { + auto cut = createSparseIndex(16 * 1024); + constexpr std::uint64_t slot_size = 1ull << 24; + constexpr std::uint64_t slot_1_first = slot_size; + constexpr std::uint64_t slot_2_first = slot_size * 2; + + cut.emplace(slot_1_first - 1, 1, 10); + cut.emplace(slot_1_first, 1, 20); + cut.emplace(slot_1_first + 7, 2, 21); + cut.emplace(slot_2_first, 1, 30); + + std::vector page_nums; + cut.forPageRange(slot_1_first, slot_2_first, [&](const SI_Item &item) { + page_nums.push_back(item.m_page_num); + }); + + ASSERT_EQ(page_nums, (std::vector { slot_1_first, slot_1_first + 7 })); + } + + TEST_F( SparseIndexTest , testSparseIndexForPageRangeHandlesEmptyAndOutOfRangeScans ) + { + auto empty_cut = createSparseIndex(16 * 1024); + std::size_t callback_count = 0; + empty_cut.forPageRange(1, 10, [&](const SI_Item &) { + ++callback_count; + }); + ASSERT_EQ(callback_count, 0u); + + auto cut = createSparseIndex(16 * 1024); + cut.emplace(100, 1, 10); + cut.emplace(200, 1, 20); + + cut.forPageRange(10, 10, [&](const SI_Item &) { + ++callback_count; + }); + cut.forPageRange(10, 20, [&](const SI_Item &) { + ++callback_count; + }); + ASSERT_EQ(callback_count, 0u); + } + + TEST_F( SparseIndexTest , testSparseIndexForPageRangeScansAcrossMultipleNodes ) + { + auto cut = createSparseIndex(512); + for (std::uint64_t page_num = 0; page_num < 200; ++page_num) { + cut.emplace(page_num, 1, page_num + 1000); + } + + std::vector page_nums; + 
cut.forPageRange(40, 75, [&](const SI_Item &item) { + page_nums.push_back(item.m_page_num); + }); + + ASSERT_EQ(page_nums.size(), 35u); + ASSERT_EQ(page_nums.front(), 40u); + ASSERT_EQ(page_nums.back(), 74u); + } + + TEST_F( SparseIndexTest , testSparseIndexForUniquePageRangeDeduplicatesMultipleStates ) + { + auto cut = createSparseIndex(512); + constexpr std::uint64_t page_count = 300; + constexpr std::uint32_t high_state_count = 20; + + for (std::uint64_t page_num = 0; page_num < page_count; ++page_num) { + cut.emplace(page_num, 1, page_num + 1000); + if (page_num % 13 == 0) { + for (std::uint32_t state_num = 2; state_num <= high_state_count; ++state_num) { + cut.emplace(page_num, state_num, page_num + (state_num * 1000)); + } + } + } + + std::vector items; + cut.forUniquePageRange(40, 260, [&](const SI_Item &item) { + items.push_back(item); + }); + + ASSERT_EQ(items.size(), 220u); + for (std::size_t i = 0; i < items.size(); ++i) { + ASSERT_EQ(items[i].m_page_num, i + 40); + ASSERT_EQ(items[i].m_state_num, 1u); + if (i > 0) { + ASSERT_NE(items[i - 1].m_page_num, items[i].m_page_num); + ASSERT_TRUE(items[i - 1].m_page_num < items[i].m_page_num); + } + } + } } diff --git a/tests/unit_tests/SparsePairQueryTest.cpp b/tests/unit_tests/SparsePairQueryTest.cpp new file mode 100644 index 00000000..244cb94a --- /dev/null +++ b/tests/unit_tests/SparsePairQueryTest.cpp @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +// Copyright (c) 2025 DBZero Software sp. z o.o. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace db0; +using namespace db0::tests; + +namespace tests + +{ + + class SparsePairQueryTest: public testing::Test + { + public: + static constexpr const char *file_name = "sparse-pair-query-test.db0"; + static constexpr std::size_t page_size = 4096; + + void SetUp() override + { + drop(file_name); + CFile::create(file_name, {}); + } + + void TearDown() override + { + drop(file_name); + } + + static DRAM_Pair createMappingPair() + { + return { + std::make_shared(page_size), + std::make_shared(page_size) + }; + } + + static Diff_IO createIO(CFile &file) + { + auto tail_function = [&file]() -> std::uint64_t { + return file.size(); + }; + return Diff_IO(0, file, page_size, page_size * 16, page_size, 0, 1, tail_function, 0); + } + + static RandomIO_Stream createStream(Diff_IO &io) + { + return RandomIO_Stream(io, 2); + } + }; + + TEST_F( SparsePairQueryTest , testSinglePageUsesSinglePageMapping ) + { + unsigned int single_page_mapping_calls = 0; + unsigned int bucket_mapping_calls = 0; + StorageOptions options; + options.m_storage_slab_bucketing = [&](std::uint64_t address) { + ++single_page_mapping_calls; + return static_cast(address / page_size); + }; + options.m_storage_slab_bucket = [&](std::uint64_t) { + ++bucket_mapping_calls; + return StorageOptions::StorageSlabBucket { 9u, 0u, 10u }; + }; + + CFile file(file_name, AccessType::READ_WRITE); + auto io = createIO(file); + auto stream = createStream(io); + SparsePair root_pair(SparsePair::tag_create(), createMappingPair()); + auto meta_space = MS_MetaSpace::create(page_size, root_pair, stream); + SparsePairManager manager(meta_space, AccessType::READ_WRITE, {}); + + SparsePairQuery query(options, page_size, 3, 4, manager); + + ASSERT_TRUE(query.hasNext()); + ASSERT_EQ(query.pageNum(), 3u); + ASSERT_EQ(query.currentSparsePair(), nullptr); + ASSERT_EQ(query.slotId(), 3u); + ASSERT_EQ(single_page_mapping_calls, 
1u); + ASSERT_EQ(bucket_mapping_calls, 0u); + } + + TEST_F( SparsePairQueryTest , testMultiPageCachesSparsePairWithinBucket ) + { + unsigned int bucket_mapping_calls = 0; + StorageOptions options; + options.m_storage_slab_bucketing = [](std::uint64_t) { + return 0u; + }; + options.m_storage_slab_bucket = [&](std::uint64_t) { + ++bucket_mapping_calls; + return StorageOptions::StorageSlabBucket { 7u, 0u, 16u }; + }; + + CFile file(file_name, AccessType::READ_WRITE); + auto io = createIO(file); + auto stream = createStream(io); + SparsePair root_pair(SparsePair::tag_create(), createMappingPair()); + auto meta_space = MS_MetaSpace::create(page_size, root_pair, stream); + SparsePairManager manager(meta_space, AccessType::READ_WRITE, {}); + SparsePairQuery query(options, page_size, 4, 6, manager); + + ASSERT_EQ(bucket_mapping_calls, 1u); + ASSERT_EQ(query.slotId(), 7u); + ASSERT_EQ(query.currentSparsePair(), nullptr); + ++query; + ASSERT_EQ(query.slotId(), 7u); + ASSERT_EQ(query.currentSparsePair(), nullptr); + ASSERT_EQ(bucket_mapping_calls, 1u); + } + + TEST_F( SparsePairQueryTest , testMultiPageRefreshesAtBucketBoundary ) + { + unsigned int bucket_mapping_calls = 0; + StorageOptions options; + options.m_storage_slab_bucketing = [](std::uint64_t) { + return 0u; + }; + options.m_storage_slab_bucket = [&](std::uint64_t address) { + ++bucket_mapping_calls; + auto page_num = address / page_size; + return StorageOptions::StorageSlabBucket { + static_cast(page_num), page_num, page_num + 1 + }; + }; + + CFile file(file_name, AccessType::READ_WRITE); + auto io = createIO(file); + auto stream = createStream(io); + SparsePair root_pair(SparsePair::tag_create(), createMappingPair()); + auto meta_space = MS_MetaSpace::create(page_size, root_pair, stream); + SparsePairManager manager(meta_space, AccessType::READ_WRITE, {}); + SparsePairQuery query(options, page_size, 0, 2, manager); + + ASSERT_EQ(query.currentSparsePair(), nullptr); + ++query; + ASSERT_EQ(query.currentSparsePair(), 
nullptr); + ASSERT_EQ(bucket_mapping_calls, 2u); + } + + TEST_F( SparsePairQueryTest , testWriteQueryCreatesSparsePair ) + { + StorageOptions options; + options.m_storage_slab_bucketing = [](std::uint64_t) { + return 0u; + }; + + CFile file(file_name, AccessType::READ_WRITE); + auto io = createIO(file); + auto stream = createStream(io); + SparsePair root_pair(SparsePair::tag_create(), createMappingPair()); + auto meta_space = MS_MetaSpace::create(page_size, root_pair, stream); + SparsePairManager manager(meta_space, AccessType::READ_WRITE, {}); + + SparsePairQuery query(options, page_size, 0, 1, manager); + + auto &sparse_pair = query.currentOrCreateSparsePair(); + ASSERT_EQ(manager.tryGetExisting(0), &sparse_pair); + } + +} diff --git a/tests/unit_tests/SparsePairTest.cpp b/tests/unit_tests/SparsePairTest.cpp index 88294e25..479f065b 100644 --- a/tests/unit_tests/SparsePairTest.cpp +++ b/tests/unit_tests/SparsePairTest.cpp @@ -2,12 +2,21 @@ // Copyright (c) 2025 DBZero Software sp. z o.o. 
#include +#include #include #include +#include #include #include +#include +#include +#include #include #include +#include +#include +#include +#include #include #include @@ -23,7 +32,7 @@ namespace tests { public: static constexpr const char *file_name = "my-test-prefix_1.db0"; - using DP_ChangeLogStreamT = SparsePair::DP_ChangeLogStreamT; + static constexpr std::size_t page_size = 4096; SparsePairTest() = default; @@ -34,12 +43,495 @@ namespace tests void TearDown() override { drop(file_name); } + + static DRAM_Pair createMappingPair() + { + return { + std::make_shared(page_size), + std::make_shared(page_size) + }; + } + + static Diff_IO createIO(CFile &file) + { + auto tail_function = [&file]() -> std::uint64_t { + return file.size(); + }; + return Diff_IO(0, file, page_size, page_size * 16, page_size, 0, 1, tail_function, 0); + } + + static RandomIO_Stream createStream(Diff_IO &io) + { + return RandomIO_Stream(io, 2); + } + + static bool flushMeta(Memspace &memspace, RandomIO_Stream &io, SparsePair &sparse_pair) + { + auto &prefix = dynamic_cast(memspace.getPrefix()); + if (prefix.getDirtySize() != 0) { + sparse_pair.recordMaxStateNum(prefix.getStateNum(false) + 1); + } + return flush(prefix, io); + } + + static Allocator::SlotId addressSlotId(Address address) + { + return MS_Address::from(address.getOffset()).slot_id(); + } + }; + + class SlotRecordingDRAMAllocator: public db0::DRAM_Allocator + { + public: + explicit SlotRecordingDRAMAllocator(std::size_t page_size) + : db0::DRAM_Allocator(page_size) + { + } + + std::optional
tryAlloc(std::size_t size, SlotId slot_num, + bool aligned = false, unsigned char realm_id = 0, unsigned char locality = 0) override + { + m_slot_records.push_back(slot_num); + return DRAM_Allocator::tryAlloc(size, 0, aligned, realm_id, locality); + } + + const std::vector &slotRecords() const { + return m_slot_records; + } + + private: + std::vector m_slot_records; + }; + + struct TestPageItem + { + std::uint64_t m_page_num = 0; }; + + class TestPageIterator + { + public: + explicit TestPageIterator(std::vector page_nums) + : m_page_nums(std::move(page_nums)) + { + } + + bool is_end() const { + return m_pos >= m_page_nums.size(); + } + + TestPageItem operator*() const { + ++m_deref_count; + return { m_page_nums[m_pos] }; + } + + TestPageIterator &operator++() + { + ++m_pos; + return *this; + } + + std::size_t derefCount() const { + return m_deref_count; + } + + private: + std::vector m_page_nums; + std::size_t m_pos = 0; + mutable std::size_t m_deref_count = 0; + }; + + TEST_F( SparsePairTest , testSparsePairAllocatesInternalStorageFromRequestedSlot ) + { + constexpr std::size_t node_size = 4096; + constexpr Allocator::SlotId slot_num = 7; + auto prefix = std::make_shared(node_size); + auto allocator = std::make_shared(node_size); + DRAM_Pair dram_pair { prefix, allocator }; + + SparsePair cut(SparsePair::tag_create(), dram_pair, slot_num); + ASSERT_GE(allocator->slotRecords().size(), 2u); + ASSERT_TRUE(std::all_of(allocator->slotRecords().begin(), allocator->slotRecords().end(), + [](Allocator::SlotId recorded_slot_num) { + return recorded_slot_num == slot_num; + })); + + for (std::uint64_t i = 1; i <= 300; ++i) { + cut.getSparseIndex().emplace(i << 24, static_cast(i), i + 1000); + cut.getDiffIndex().insert((i + 1000) << 24, static_cast(i), i + 2000); + } + + auto allocation_count_after_growth = allocator->slotRecords().size(); + ASSERT_GT(allocation_count_after_growth, 2u); + ASSERT_TRUE(std::all_of(allocator->slotRecords().begin(), 
allocator->slotRecords().end(), + [](Allocator::SlotId recorded_slot_num) { + return recorded_slot_num == slot_num; + })); + + SparsePair reopened(dram_pair, AccessType::READ_WRITE, {}, {}, slot_num); + for (std::uint64_t i = 301; i <= 600; ++i) { + reopened.getSparseIndex().emplace(i << 24, static_cast(i), i + 1000); + reopened.getDiffIndex().insert((i + 1000) << 24, static_cast(i), i + 2000); + } + + ASSERT_GT(allocator->slotRecords().size(), allocation_count_after_growth); + ASSERT_TRUE(std::all_of(allocator->slotRecords().begin(), allocator->slotRecords().end(), + [](Allocator::SlotId recorded_slot_num) { + return recorded_slot_num == slot_num; + })); + } + + TEST_F( SparsePairTest , testSparsePairManagerCachesPairsBySparseSlotId ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto io = createIO(file); + auto stream = createStream(io); + auto mapping_pair = createMappingPair(); + SparsePair meta_pair(SparsePair::tag_create(), mapping_pair); + auto meta_space = MS_MetaSpace::create(page_size, meta_pair, stream); + SparsePairManager manager(meta_space); + + auto &slot_7_first = manager.getOrCreate(7); + auto &slot_7_second = manager.getOrCreate(7); + auto &slot_19 = manager.getOrCreate(19); + + ASSERT_EQ(&slot_7_first, &slot_7_second); + ASSERT_EQ(&slot_7_first, manager.tryGetCached(7)); + ASSERT_NE(&slot_7_first, &slot_19); + ASSERT_EQ(addressSlotId(slot_7_first.getSparseIndex().getIndexAddress()), 7u); + ASSERT_EQ(addressSlotId(slot_7_first.getDiffIndex().getIndexAddress()), 7u); + ASSERT_EQ(addressSlotId(slot_19.getSparseIndex().getIndexAddress()), 19u); + ASSERT_EQ(addressSlotId(slot_19.getDiffIndex().getIndexAddress()), 19u); + } + + TEST_F( SparsePairTest , testSparsePairCanUseExternalChangeLog ) + { + SparsePair::ChangeLogT change_log; + auto dram_pair = createMappingPair(); + SparsePair cut(SparsePair::tag_create(), dram_pair, 0, &change_log); + + cut.getSparseIndex().emplace(11, 1, 100); + 
cut.getDiffIndex().insert(12, 2, 101); + + ASSERT_EQ(cut.getChangeLogSize(), 2u); + ASSERT_EQ(change_log, (SparsePair::ChangeLogT { 11, 12 })); + } + + TEST_F( SparsePairTest , testSparsePairManagerUsesSharedChangeLog ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto io = createIO(file); + auto stream = createStream(io); + auto mapping_pair = createMappingPair(); + SparsePair meta_pair(SparsePair::tag_create(), mapping_pair); + auto meta_space = MS_MetaSpace::create(page_size, meta_pair, stream); + SparsePairManager manager(meta_space); + + auto &slot_7 = manager.getOrCreate(7); + auto &slot_19 = manager.getOrCreate(19); + slot_7.getSparseIndex().emplace(11, 1, 100); + slot_19.getDiffIndex().insert(12, 2, 101); + + ASSERT_EQ(manager.getChangeLogSize(), 2u); + auto page_nums = manager.extractChangeLogPages(); + ASSERT_EQ(page_nums, (std::vector { + MS_Address::encode(7, 11), + MS_Address::encode(19, 12) + })); + ASSERT_EQ(manager.getChangeLogSize(), 0u); + } + + TEST_F( SparsePairTest , testSparsePairManagerCommitOnlyUsesDirtyCachedPairs ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto io = createIO(file); + auto stream = createStream(io); + auto mapping_pair = createMappingPair(); + SparsePair meta_pair(SparsePair::tag_create(), mapping_pair); + auto meta_space = MS_MetaSpace::create(page_size, meta_pair, stream); + SparsePairManager manager(meta_space); + + auto &dirty_slot = manager.getOrCreate(7); + auto &other_dirty_slot = manager.getOrCreate(19); + auto &clean_slot = manager.getOrCreate(31); + dirty_slot.getSparseIndex().emplace(11, 1, 100); + other_dirty_slot.getSparseIndex().emplace(13, 1, 102); + dirty_slot.getDiffIndex().insert(12, 2, 101); + + manager.commit(); + + ASSERT_EQ(manager.getChangeLogSize(), 3u); + auto page_nums = manager.extractChangeLogPages(); + ASSERT_EQ(page_nums, (std::vector { + MS_Address::encode(7, 11), + MS_Address::encode(19, 13), + 
MS_Address::encode(7, 12) + })); + ASSERT_TRUE(!!dirty_slot.getSparseIndex().lookup(11, 1)); + ASSERT_TRUE(!!other_dirty_slot.getSparseIndex().lookup(13, 1)); + ASSERT_TRUE(clean_slot.empty()); + } + + TEST_F( SparsePairTest , testSparsePairManagerRefreshesAffectedSlotInPlace ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto io = createIO(file); + auto stream = createStream(io); + auto mapping_pair = createMappingPair(); + SparsePair meta_pair(SparsePair::tag_create(), mapping_pair); + auto meta_space = MS_MetaSpace::create(page_size, meta_pair, stream); + SparsePairManager manager(meta_space); + + auto &slot_7 = manager.getOrCreate(7); + auto &slot_19 = manager.getOrCreate(19); + slot_7.getSparseIndex().insert({ 11, 1, 100 }); + manager.commit(); + ASSERT_TRUE(flushMeta(meta_space, stream, meta_pair)); + + auto *slot_7_before = &slot_7; + auto *slot_19_before = &slot_19; + manager.refreshPages({ + MS_Address::encode(7, 11), + MS_Address::encode(7, 11) + }); + + ASSERT_EQ(manager.tryGetCached(7), slot_7_before); + ASSERT_EQ(manager.tryGetCached(19), slot_19_before); + ASSERT_EQ(manager.tryGetExisting(7), slot_7_before); + ASSERT_EQ(manager.tryGetCached(7), slot_7_before); + } + + TEST_F( SparsePairTest , testSparsePairManagerEvictsSlot ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto io = createIO(file); + auto stream = createStream(io); + auto mapping_pair = createMappingPair(); + SparsePair meta_pair(SparsePair::tag_create(), mapping_pair); + auto meta_space = MS_MetaSpace::create(page_size, meta_pair, stream); + SparsePairManager manager(meta_space); + + auto &slot_7 = manager.getOrCreate(7); + auto &slot_19 = manager.getOrCreate(19); + slot_7.getSparseIndex().insert({ 11, 1, 100 }); + slot_19.getSparseIndex().insert({ 12, 1, 101 }); + manager.commit(); + ASSERT_TRUE(flushMeta(meta_space, stream, meta_pair)); + + manager.evictSlot(7); + + ASSERT_EQ(manager.tryGetCached(7), 
nullptr); + auto &reopened_slot_7 = manager.getOrCreate(7); + auto *reopened_slot_7_ptr = &reopened_slot_7; + auto *slot_19_ptr = &slot_19; + + manager.refreshPages({ + MS_Address::encode(7, 11), + MS_Address::encode(19, 12) + }); + + ASSERT_EQ(manager.tryGetCached(7), reopened_slot_7_ptr); + ASSERT_EQ(manager.tryGetExisting(7), reopened_slot_7_ptr); + ASSERT_EQ(manager.tryGetCached(7), reopened_slot_7_ptr); + ASSERT_EQ(manager.tryGetCached(19), slot_19_ptr); + ASSERT_EQ(manager.tryGetExisting(19), slot_19_ptr); + ASSERT_EQ(manager.tryGetCached(19), slot_19_ptr); + } + + TEST_F( SparsePairTest , testSparsePairManagerOpensExistingSlotPair ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto io = createIO(file); + auto stream = createStream(io); + auto mapping_pair = createMappingPair(); + SparsePair meta_pair(SparsePair::tag_create(), mapping_pair); + auto meta_space = MS_MetaSpace::create(page_size, meta_pair, stream); + + { + SparsePairManager manager(meta_space); + auto &slot_pair = manager.getOrCreate(17); + slot_pair.getSparseIndex().insert({ 42, 3, 77 }); + slot_pair.getDiffIndex().insert(43, 4, 78); + } + + SparsePairManager reopened_manager(meta_space); + auto &reopened_pair = reopened_manager.getOrCreate(17); + auto sparse_item = reopened_pair.getSparseIndex().lookup(42, 3); + + ASSERT_TRUE(!!sparse_item); + ASSERT_EQ(sparse_item.m_storage_page_num, 77u); + ASSERT_EQ(reopened_pair.getDiffIndex().findLower(43, 4), 4u); + } + + TEST_F( SparsePairTest , testSparsePairManagerOpensSlotPairAfterMetaSpaceFlush ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto io = createIO(file); + auto stream = createStream(io); + auto mapping_pair = createMappingPair(); + SparsePair meta_pair(SparsePair::tag_create(), mapping_pair); + + { + auto meta_space = MS_MetaSpace::create(page_size, meta_pair, stream); + SparsePairManager manager(meta_space); + auto &slot_pair = manager.getOrCreate(23); + 
slot_pair.getSparseIndex().insert({ 100, 5, 700 }); + ASSERT_TRUE(flushMeta(meta_space, stream, meta_pair)); + } + + auto reopened_meta_space = MS_MetaSpace::create(page_size, meta_pair, stream); + SparsePairManager manager(reopened_meta_space); + auto &reopened_pair = manager.getOrCreate(23); + auto sparse_item = reopened_pair.getSparseIndex().lookup(100, 5); + + ASSERT_TRUE(!!sparse_item); + ASSERT_EQ(sparse_item.m_storage_page_num, 700u); + } + + TEST_F( SparsePairTest , testSparsePairPageIteratorAdvanceSkipsStaleLowerPages ) + { + TestPageIterator it({ 7, 10, 10, 12 }); + + auto page_num = detail::advancePageIteratorPast(it, 10u); + + ASSERT_EQ(page_num, 12u); + ASSERT_FALSE(it.is_end()); + ASSERT_EQ((*it).m_page_num, 12u); + } + + TEST_F( SparsePairTest , testSparsePairPageIteratorAdvanceReturnsEmptyAtEnd ) + { + TestPageIterator it({ 7, 10 }); + + auto page_num = detail::advancePageIteratorPast(it, 10u); + + ASSERT_FALSE(page_num); + ASSERT_TRUE(it.is_end()); + } + + TEST_F( SparsePairTest , testSparsePairForUniquePageRangeCombinesSparseAndDiffPages ) + { + auto dram_pair = createMappingPair(); + SparsePair cut(SparsePair::tag_create(), dram_pair); + + cut.getSparseIndex().emplace(10, 1, 100); + cut.getSparseIndex().emplace(10, 3, 101); + cut.getSparseIndex().emplace(12, 1, 102); + cut.getSparseIndex().emplace(15, 1, 103); + cut.getSparseIndex().emplace(21, 1, 104); + + cut.getDiffIndex().insert(11, 2, 200); + cut.getDiffIndex().insert(12, 4, 201); + cut.getDiffIndex().insert(14, 1, 202); + cut.getDiffIndex().insert(14, 3, 203); + cut.getDiffIndex().insert(20, 1, 204); + + std::vector page_nums; + cut.forUniquePageRange(10, 20, [&](SparsePair::PageNumT page_num) { + page_nums.push_back(page_num); + }); + + ASSERT_EQ(page_nums, (std::vector { 10, 11, 12, 14, 15 })); + } + + TEST_F( SparsePairTest , testSparsePairForUniquePageRangeReturnsDiffOnlyPage ) + { + auto dram_pair = createMappingPair(); + SparsePair cut(SparsePair::tag_create(), dram_pair); + + 
cut.getSparseIndex().emplace(4, 1, 100); + cut.getSparseIndex().emplace(9, 1, 101); + cut.getDiffIndex().insert(6, 2, 200); + + std::vector page_nums; + cut.forUniquePageRange(5, 8, [&](SparsePair::PageNumT page_num) { + page_nums.push_back(page_num); + }); + + ASSERT_EQ(page_nums, (std::vector { 6 })); + } + + TEST_F( SparsePairTest , testSparsePairForUniquePageRangeCallbackReceivesPageNum ) + { + auto dram_pair = createMappingPair(); + SparsePair cut(SparsePair::tag_create(), dram_pair); + cut.getDiffIndex().insert(3, 1, 30); + + std::vector page_nums; + cut.forUniquePageRange(0, 10, [&](SparsePair::PageNumT page_num) { + static_assert(std::is_same_v); + page_nums.push_back(page_num); + }); + + ASSERT_EQ(page_nums, (std::vector { 3 })); + } + + TEST_F( SparsePairTest , testSparsePairForUniquePageRangeWithoutBoundsCombinesAllUniquePages ) + { + auto dram_pair = createMappingPair(); + SparsePair cut(SparsePair::tag_create(), dram_pair); + + cut.getSparseIndex().emplace(2, 1, 100); + cut.getSparseIndex().emplace(2, 3, 101); + cut.getSparseIndex().emplace(8, 1, 102); + cut.getSparseIndex().emplace(15, 1, 103); + + cut.getDiffIndex().insert(1, 1, 200); + cut.getDiffIndex().insert(8, 4, 201); + cut.getDiffIndex().insert(11, 1, 202); + cut.getDiffIndex().insert(11, 3, 203); + + std::vector page_nums; + cut.forUniquePageRange([&](SparsePair::PageNumT page_num) { + page_nums.push_back(page_num); + }); + + ASSERT_EQ(page_nums, (std::vector { 1, 2, 8, 11, 15 })); + } + + TEST_F( SparsePairTest , testSparsePairManagerRefreshSeesSlotCreatedAfterMiss ) + { + CFile::create(file_name, {}); + CFile file(file_name, AccessType::READ_WRITE); + auto io = createIO(file); + auto stream = createStream(io); + auto mapping_pair = createMappingPair(); + SparsePair meta_pair(SparsePair::tag_create(), mapping_pair); + + auto writer_meta_space = MS_MetaSpace::create(page_size, meta_pair, stream); + auto reader_meta_space = MS_MetaSpace::create(page_size, meta_pair, stream); + 
SparsePairManager reader_manager(reader_meta_space); + + ASSERT_EQ(reader_manager.tryGetExisting(0), nullptr); + + { + SparsePairManager writer_manager(writer_meta_space); + auto &slot_pair = writer_manager.getOrCreate(0); + slot_pair.getSparseIndex().insert({ 200, 7, 900 }); + writer_manager.commit(); + auto changed_pages = writer_manager.extractChangeLogPages(); + ASSERT_TRUE(flushMeta(writer_meta_space, stream, meta_pair)); + reader_manager.refreshPages(changed_pages); + } + + auto *reopened_pair = reader_manager.tryGetExisting(0); + + ASSERT_NE(reopened_pair, nullptr); + auto sparse_item = reopened_pair->getSparseIndex().lookup(200, 7); + ASSERT_TRUE(!!sparse_item); + ASSERT_EQ(sparse_item.m_storage_page_num, 900u); + } TEST_F( SparsePairTest , testSparsePairCollectsChangeLogOfAddedItems ) { std::size_t node_size = 16 * 1024; - SparsePair sparse_pair(node_size); DRAM_Pair dram_pair; auto dram_space = DRAMSpace::create(node_size, [&](DRAM_Pair dp) { dram_pair = dp; @@ -55,23 +547,15 @@ namespace tests for (auto &item: items_1) { sparse_index.insert(item); } + cut.recordMaxStateNum(1); CFile::create(file_name, {}); CFile file(file_name, AccessType::READ_WRITE); - auto tail_function = [&]() { - return file.size(); - }; { - DP_ChangeLogStreamT io(file, 0, 4096, tail_function); - auto &change_log = cut.extractChangeLog(io, 0); - std::vector data; - for (auto value: change_log) { - data.push_back(value); - } - io.close(); - ASSERT_EQ(data, (std::vector { 0, 1 })); - ASSERT_EQ(change_log.m_state_num, 1u); + auto change_log = cut.extractChangeLogPages(); + ASSERT_EQ(change_log, (std::vector { 1, 0 })); + ASSERT_EQ(cut.getMaxStateNum(), 1u); } std::vector items_2 { @@ -82,19 +566,13 @@ namespace tests for (auto &item: items_2) { sparse_index.insert(item); } + cut.recordMaxStateNum(5); { - DP_ChangeLogStreamT io(file, 0, 4096, tail_function); - while (io.readChangeLogChunk()); - auto &change_log = cut.extractChangeLog(io, 0); - std::vector expected_data { 0, 2, 3, 4 }; - 
std::vector data; - for (auto value: change_log) { - data.push_back(value); - } - io.close(); - ASSERT_EQ(data, expected_data); - ASSERT_EQ(change_log.m_state_num, 5u); + auto change_log = cut.extractChangeLogPages(); + std::vector expected_data { 2, 3, 0, 2, 4 }; + ASSERT_EQ(change_log, expected_data); + ASSERT_EQ(cut.getMaxStateNum(), 5u); } } @@ -107,9 +585,6 @@ namespace tests CFile::create(file_name, {}); db0::CFile file(file_name, AccessType::READ_WRITE); - auto tail_function = [&]() { - return file.size(); - }; { // create an empty instance @@ -118,17 +593,15 @@ namespace tests int count = 10; for (int i = 0; i < count; ++i) { - SparsePair cut({ prefix, allocator}, AccessType::READ_WRITE); + SparsePair cut({ prefix, allocator}, AccessType::READ_WRITE, allocator->firstAlloc()); auto &sparse_index = cut.getSparseIndex(); for (unsigned int page_num = 0; page_num < 1000; ++page_num) { sparse_index.emplace(page_num, i, 999); } + cut.recordMaxStateNum(i); // simulate change log extraction - DP_ChangeLogStreamT io(file, 0, 16 << 10, tail_function, AccessType::READ_WRITE); - while (io.readChangeLogChunk()); - cut.extractChangeLog(io, 0); - io.close(); + cut.extractChangeLogPages(); // refresh updates local cached variables with DRAM prefix cut.refresh(); diff --git a/tests/utils/EmbeddedAllocator.cpp b/tests/utils/EmbeddedAllocator.cpp index 4937ad5b..e1ddef3b 100644 --- a/tests/utils/EmbeddedAllocator.cpp +++ b/tests/utils/EmbeddedAllocator.cpp @@ -8,7 +8,7 @@ namespace db0 { - std::optional
EmbeddedAllocator::tryAlloc(std::size_t size, std::uint32_t slot_num, + std::optional
EmbeddedAllocator::tryAlloc(std::size_t size, SlotId slot_num, bool aligned, unsigned char, unsigned char) { auto new_address = Address::fromOffset(4096 * ++m_count); diff --git a/tests/utils/EmbeddedAllocator.hpp b/tests/utils/EmbeddedAllocator.hpp index 0e756d99..b51d4d3f 100644 --- a/tests/utils/EmbeddedAllocator.hpp +++ b/tests/utils/EmbeddedAllocator.hpp @@ -18,10 +18,10 @@ namespace db0 class EmbeddedAllocator: public Allocator { public: - using AllocCallbackT = std::function)>; + using AllocCallbackT = std::function)>; EmbeddedAllocator() = default; - std::optional
tryAlloc(std::size_t size, std::uint32_t, + std::optional
tryAlloc(std::size_t size, SlotId, bool aligned = false, unsigned char realm_id = 0, unsigned char locality = 0) override; void free(Address) override; diff --git a/tests/utils/ScopedWorkspaceFixture.hpp b/tests/utils/ScopedWorkspaceFixture.hpp new file mode 100644 index 00000000..96a1bbe8 --- /dev/null +++ b/tests/utils/ScopedWorkspaceFixture.hpp @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later +// Copyright (c) 2026 DBZero Software sp. z o.o. + +#pragma once + +#include + +#include +#include +#include +#include +#include + +namespace tests +{ + + class ScopedWorkspaceFixture + { + public: + explicit ScopedWorkspaceFixture(const char *prefix_name) + : m_prefix_name(prefix_name) + , m_workspace("", {}, {}, {}, {}, db0::object_model::initializer()) + { + db0::tests::dropPrefixFiles(m_prefix_name.c_str()); + m_fixture = m_workspace.getFixture(m_prefix_name); + } + + ~ScopedWorkspaceFixture() + { + close(); + db0::tests::dropPrefixFiles(m_prefix_name.c_str()); + } + + ScopedWorkspaceFixture(const ScopedWorkspaceFixture &) = delete; + ScopedWorkspaceFixture &operator=(const ScopedWorkspaceFixture &) = delete; + + db0::swine_ptr &fixture() + { + return m_fixture; + } + + db0::Workspace &workspace() + { + return m_workspace; + } + + void close() + { + if (!m_closed) { + m_fixture = nullptr; + m_workspace.close(); + m_closed = true; + } + } + + private: + std::string m_prefix_name; + db0::Workspace m_workspace; + db0::swine_ptr m_fixture; + bool m_closed = false; + }; + +} diff --git a/tests/utils/utils.cpp b/tests/utils/utils.cpp index 69f49183..f70af768 100644 --- a/tests/utils/utils.cpp +++ b/tests/utils/utils.cpp @@ -24,6 +24,14 @@ namespace db0::tests std::remove(filename); } } + + void dropPrefixFiles(const char *prefix_name) + { + auto data_file_name = std::string(prefix_name) + ".db0"; + drop(data_file_name.c_str()); + auto lock_file_name = data_file_name + ".lock"; + drop(lock_file_name.c_str()); + } std::vector randomPage(std::size_t size) { 
std::vector result(size); @@ -95,4 +103,4 @@ namespace db0::tests return result; } -} \ No newline at end of file +} diff --git a/tests/utils/utils.hpp b/tests/utils/utils.hpp index ef895961..077b3069 100644 --- a/tests/utils/utils.hpp +++ b/tests/utils/utils.hpp @@ -16,6 +16,8 @@ namespace db0::tests void drop(const char *filename); + void dropPrefixFiles(const char *prefix_name); + std::vector randomPage(std::size_t size); bool equal(const std::vector &v1, const std::vector &v2); @@ -40,4 +42,4 @@ namespace db0::tests // Load rows from a comma-separated values (CSV) std::vector > loadArray(const std::string &file_name); -} \ No newline at end of file +}