diff --git a/README.md b/README.md index 02969d0..73218f2 100644 --- a/README.md +++ b/README.md @@ -309,7 +309,7 @@ Each example is self-contained and runnable. See the example source for detailed ## Status -Lode is at **v0.5.0** and under active development. +Lode is at **v0.7.0** and under active development. APIs are stabilizing; some changes are possible before v1.0. If you are evaluating Lode, focus on: diff --git a/docs/IMPLEMENTATION_PLAN.md b/docs/IMPLEMENTATION_PLAN.md index 5106877..d0bfd81 100644 --- a/docs/IMPLEMENTATION_PLAN.md +++ b/docs/IMPLEMENTATION_PLAN.md @@ -203,7 +203,7 @@ Explore new adapters or codecs without expanding the public API. - [x] CONTRACT_STORAGE.md compliance verified - [x] Zstd compressor added as an additive compression option - [x] Parquet codec implemented -- [ ] Manifest stats extensions finalized (additive) +- [x] Manifest stats extensions finalized (additive) ### S3 Adapter @@ -262,7 +262,7 @@ Any change that affects contract behavior must: ### Priority Track B — Format and Ecosystem - [x] Prioritize Parquet codec delivery -- [ ] Define additive manifest stats needed for Parquet-oriented pruning workflows +- [x] Define additive manifest stats needed for Parquet-oriented pruning workflows - [x] Add/refresh examples for columnar and streaming workflows ### Priority Track C — Zarr/Xarray Direction diff --git a/docs/contracts/CONTRACT_CORE.md b/docs/contracts/CONTRACT_CORE.md index 8258684..dcc1ac3 100644 --- a/docs/contracts/CONTRACT_CORE.md +++ b/docs/contracts/CONTRACT_CORE.md @@ -84,10 +84,10 @@ Manifests are immutable once written. ## Metadata Rules - Metadata MUST be explicit on every snapshot. -- `nil` metadata is invalid and MUST error. +- `nil` metadata is coalesced to empty (`{}`) at the API boundary. - Empty metadata (`{}`) is valid and MUST be persisted as an explicit object. - Metadata values MUST be JSON-serializable. -- Metadata MUST never be inferred or defaulted. +- Manifests always contain a non-nil metadata map. --- diff --git a/docs/contracts/CONTRACT_ERRORS.md b/docs/contracts/CONTRACT_ERRORS.md index cf09f0b..b3ee49a 100644 --- a/docs/contracts/CONTRACT_ERRORS.md +++ b/docs/contracts/CONTRACT_ERRORS.md @@ -79,6 +79,7 @@ These indicate a manifest fails structural or semantic validation. - `ParentSnapshotID`: May be empty for first snapshot - `MinTimestamp`, `MaxTimestamp`: May be nil when not applicable - `Checksum` in `FileRef`: May be empty +- `Stats` in `FileRef`: May be nil (omitted when codec does not report statistics) **File Validation**: - Each `FileRef.Path` must be non-empty diff --git a/docs/contracts/CONTRACT_TEST_MATRIX.md b/docs/contracts/CONTRACT_TEST_MATRIX.md index 96050bc..f0e98bd 100644 --- a/docs/contracts/CONTRACT_TEST_MATRIX.md +++ b/docs/contracts/CONTRACT_TEST_MATRIX.md @@ -58,10 +58,13 @@ Gaps are tracked with codes indicating category and priority: | ParentSnapshotID | First snapshot tests (no parent) | | MinTimestamp/MaxTimestamp | `TestDataset_Write_NonTimestampedRecords_OmitsMinMax` | | Checksum | `TestDataset_Write_WithoutChecksum_OmitsChecksum` | +| FileRef.Stats (present) | `TestDataset_Write_ParquetCodec_StatsPopulated` | +| FileRef.Stats (absent) | `TestDataset_Write_JSONLCodec_StatsNil`, `TestDataset_Write_RawBlob_StatsNil` | +| FileRef.Stats (JSON round-trip) | `TestFileRef_Stats_JSONRoundTrip`, `TestFileRef_Stats_BackwardCompat`, `TestFileRef_Stats_OmittedWhenNil` | **Metadata Rules**: All covered ✅ -- nil metadata rejected: `TestDataset_Write_NilMetadata_ReturnsError`, `TestDataset_StreamWrite_NilMetadata_ReturnsError`, `TestDataset_StreamWriteRecords_NilMetadata_ReturnsError` +- nil metadata coalesced: `TestDataset_Write_NilMetadata_CoalescesToEmpty`, `TestDataset_StreamWrite_NilMetadata_CoalescesToEmpty`, `TestDataset_StreamWriteRecords_NilMetadata_CoalescesToEmpty` - Empty metadata valid: `TestDataset_Write_EmptyMetadata_ValidAndPersisted`, etc. **Immutability**: All covered ✅ @@ -78,7 +81,7 @@ Gaps are tracked with codes indicating category and priority: | Requirement | Test | |-------------|------| | Creates snapshot | Multiple write tests | -| nil metadata error | `TestDataset_Write_NilMetadata_ReturnsError` | +| nil metadata coalesced | `TestDataset_Write_NilMetadata_CoalescesToEmpty` | | Parent snapshot linked | `TestDataset_StreamWrite_ParentSnapshotLinked` | | Raw blob RowCount=1 | `TestDataset_StreamWrite_Success` | @@ -86,7 +89,7 @@ Gaps are tracked with codes indicating category and priority: | Requirement | Test | |-------------|------| -| nil metadata error | `TestDataset_StreamWrite_NilMetadata_ReturnsError` | +| nil metadata coalesced | `TestDataset_StreamWrite_NilMetadata_CoalescesToEmpty` | | Commit writes manifest | `TestDataset_StreamWrite_Success` | | Snapshot invisible before Commit | `TestDataset_StreamWrite_NotVisibleBeforeCommit` | | Abort → no manifest | `TestDataset_StreamWrite_Abort_NoManifest` | @@ -103,7 +106,7 @@ Gaps are tracked with codes indicating category and priority: | Requirement | Test | |-------------|------| -| nil metadata error | `TestDataset_StreamWriteRecords_NilMetadata_ReturnsError` | +| nil metadata coalesced | `TestDataset_StreamWriteRecords_NilMetadata_CoalescesToEmpty` | | nil iterator error | `TestDataset_StreamWriteRecords_NilIterator_ReturnsError` | | Non-streaming codec error | `TestDataset_StreamWriteRecords_NonStreamingCodec_ReturnsError` | | Partitioning error | `TestDataset_StreamWriteRecords_WithPartitioner_ReturnsError` | @@ -120,6 +123,22 @@ Gaps are tracked with codes indicating category and priority: | Non-timestamped omits | `TestDataset_Write_NonTimestampedRecords_OmitsMinMax` | | Raw blob omits | `TestDataset_Write_RawBlob_OmitsTimestamps` | +**Per-File Statistics**: All covered ✅ + +| Requirement | Test | +|-------------|------| +| StatisticalCodec populates stats | `TestDataset_Write_ParquetCodec_StatsPopulated` | +| Non-statistical codec → nil stats | `TestDataset_Write_JSONLCodec_StatsNil` | +| Raw blob → nil stats | `TestDataset_Write_RawBlob_StatsNil` | +| StreamWriteRecords → nil stats (JSONL) | `TestDataset_StreamWriteRecords_StatsNil` | +| Parquet basic types stats | `TestParquetCodec_FileStats_BasicTypes` | +| Parquet nullable field stats | `TestParquetCodec_FileStats_NullableFields` | +| Parquet all-null column stats | `TestParquetCodec_FileStats_AllNulls` | +| Parquet single record stats | `TestParquetCodec_FileStats_SingleRecord` | +| Parquet bool/bytes no min/max | `TestParquetCodec_FileStats_BoolAndBytes_NoMinMax` | +| Parquet timestamp stats | `TestParquetCodec_FileStats_Timestamps` | +| Parquet empty records stats | `TestParquetCodec_FileStats_EmptyRecords` | + **Empty Dataset**: All covered ✅ | Requirement | Test | @@ -257,7 +276,7 @@ All error sentinels covered ✅ |-------------|------| | End-to-end round-trip | `TestVolume_StageCommitReadAt_EndToEnd` | | Cumulative manifest | `TestVolume_CumulativeManifest` | -| nil metadata rejected | `TestVolume_Commit_NilMetadata_ReturnsError` | +| nil metadata coalesced | `TestVolume_Commit_NilMetadata_CoalescesToEmpty` | | Empty metadata accepted | `TestVolume_Commit_EmptyMetadata_Succeeds` | | Empty blocks rejected | `TestVolume_Commit_EmptyBlocks_ReturnsError` | | Empty block path rejected | `TestVolume_Commit_EmptyBlockPath_ReturnsError` | diff --git a/docs/contracts/CONTRACT_VOLUME.md b/docs/contracts/CONTRACT_VOLUME.md index b5d41b2..f730fb5 100644 --- a/docs/contracts/CONTRACT_VOLUME.md +++ b/docs/contracts/CONTRACT_VOLUME.md @@ -237,7 +237,7 @@ Volume accepts a minimal set of options: - Missing committed range MUST return `ErrRangeMissing`. - Overlapping blocks at Commit MUST return `ErrOverlappingBlocks`. - Empty block list at Commit MUST return an error. -- Nil metadata at Commit MUST return an error. +- `nil` metadata at Commit MUST be coalesced to empty (`Metadata{}`). - Range reads MUST NOT return partial data without error. - `Latest` on empty volume MUST return `ErrNoSnapshots`. - `Snapshot` for unknown ID MUST return `ErrNotFound`. diff --git a/docs/contracts/CONTRACT_WRITE_API.md b/docs/contracts/CONTRACT_WRITE_API.md index 2b9c3f8..55a9f07 100644 --- a/docs/contracts/CONTRACT_WRITE_API.md +++ b/docs/contracts/CONTRACT_WRITE_API.md @@ -27,7 +27,7 @@ It is authoritative for any `Dataset` implementation. ### Required behavior - `Write(ctx, data, metadata)` MUST create a new snapshot on success. -- `metadata` MUST be non-nil; nil MUST return an error. +- `nil` metadata MUST be coalesced to empty (`Metadata{}`). - Empty metadata is valid and MUST be persisted explicitly. - The new snapshot MUST reference the previous snapshot as its parent (if any). - Writes MUST NOT mutate existing snapshots or manifests. @@ -40,7 +40,7 @@ It is authoritative for any `Dataset` implementation. ### StreamWrite Semantics -- `StreamWrite(ctx, metadata)` MUST return an error if metadata is nil. +- `StreamWrite(ctx, metadata)` MUST coalesce `nil` metadata to empty (`Metadata{}`). - `StreamWrite` MUST return a `StreamWriter` for a single binary data unit. - `StreamWriter.Commit(ctx)` MUST write the manifest and return the new snapshot. - A snapshot MUST NOT be visible before `Commit` writes the manifest. @@ -55,7 +55,7 @@ It is authoritative for any `Dataset` implementation. ### StreamWriteRecords Semantics -- `StreamWriteRecords(ctx, records, metadata)` MUST return an error if metadata is nil. +- `StreamWriteRecords(ctx, records, metadata)` MUST coalesce `nil` metadata to empty (`Metadata{}`). - `StreamWriteRecords` MUST return an error if records iterator is nil. - `StreamWriteRecords` MUST consume records via a pull-based iterator. - `StreamWriteRecords` MUST return an error if the configured codec does not support diff --git a/lode/dataset.go b/lode/dataset.go index 3a81348..abe3bbf 100644 --- a/lode/dataset.go +++ b/lode/dataset.go @@ -253,7 +253,7 @@ func (d *dataset) ID() DatasetID { func (d *dataset) Write(ctx context.Context, data []any, metadata Metadata) (*DatasetSnapshot, error) { if metadata == nil { - return nil, errors.New("lode: metadata must be non-nil (use empty map {} for no metadata)") + metadata = Metadata{} } var parentID DatasetSnapshotID @@ -449,7 +449,7 @@ func (d *dataset) Latest(ctx context.Context) (*DatasetSnapshot, error) { func (d *dataset) StreamWrite(ctx context.Context, metadata Metadata) (StreamWriter, error) { if metadata == nil { - return nil, errors.New("lode: metadata must be non-nil (use empty map {} for no metadata)") + metadata = Metadata{} } if d.codec != nil { return nil, ErrCodecConfigured @@ -513,7 +513,7 @@ func (d *dataset) StreamWrite(ctx context.Context, metadata Metadata) (StreamWri func (d *dataset) StreamWriteRecords(ctx context.Context, records RecordIterator, metadata Metadata) (*DatasetSnapshot, error) { if metadata == nil { - return nil, errors.New("lode: metadata must be non-nil (use empty map {} for no metadata)") + metadata = Metadata{} } if records == nil { return nil, ErrNilIterator diff --git a/lode/dataset_test.go b/lode/dataset_test.go index dbc7253..1861b8d 100644 --- a/lode/dataset_test.go +++ b/lode/dataset_test.go @@ -776,18 +776,21 @@ func TestDataset_Snapshot_EmptyDataset_ReturnsErrNotFound(t *testing.T) { // Write validation tests // ----------------------------------------------------------------------------- -func TestDataset_Write_NilMetadata_ReturnsError(t *testing.T) { +func TestDataset_Write_NilMetadata_CoalescesToEmpty(t *testing.T) { ds, err := NewDataset("test-ds", NewMemoryFactory()) if err != nil { t.Fatal(err) } - _, err = ds.Write(t.Context(), []any{[]byte("data")}, nil) - if err == nil { - t.Fatal("expected error for nil metadata, got nil") + snap, err := ds.Write(t.Context(), []any{[]byte("data")}, nil) + if err != nil { + t.Fatalf("expected nil metadata to succeed, got: %v", err) } - if !strings.Contains(err.Error(), "metadata must be non-nil") { - t.Errorf("expected metadata error, got: %v", err) + if snap.Manifest.Metadata == nil { + t.Fatal("expected non-nil metadata in manifest after nil coalescing") + } + if len(snap.Manifest.Metadata) != 0 { + t.Errorf("expected empty metadata, got %v", snap.Manifest.Metadata) } } @@ -961,18 +964,26 @@ func TestDataset_StreamWrite_CloseWithoutCommit_BehavesAsAbort(t *testing.T) { } } -func TestDataset_StreamWrite_NilMetadata_ReturnsError(t *testing.T) { +func TestDataset_StreamWrite_NilMetadata_CoalescesToEmpty(t *testing.T) { ds, err := NewDataset("test-ds", NewMemoryFactory()) if err != nil { t.Fatal(err) } - _, err = ds.StreamWrite(t.Context(), nil) - if err == nil { - t.Fatal("expected error for nil metadata, got nil") + sw, err := ds.StreamWrite(t.Context(), nil) + if err != nil { + t.Fatalf("expected nil metadata to succeed, got: %v", err) } - if !strings.Contains(err.Error(), "metadata must be non-nil") { - t.Errorf("expected metadata error, got: %v", err) + _, _ = sw.Write([]byte("data")) + snap, err := sw.Commit(t.Context()) + if err != nil { + t.Fatal(err) + } + if snap.Manifest.Metadata == nil { + t.Fatal("expected non-nil metadata in manifest after nil coalescing") + } + if len(snap.Manifest.Metadata) != 0 { + t.Errorf("expected empty metadata, got %v", snap.Manifest.Metadata) } } @@ -1426,19 +1437,22 @@ func TestDataset_StreamWriteRecords_NoCodec_ReturnsError(t *testing.T) { } } -func TestDataset_StreamWriteRecords_NilMetadata_ReturnsError(t *testing.T) { +func TestDataset_StreamWriteRecords_NilMetadata_CoalescesToEmpty(t *testing.T) { ds, err := NewDataset("test-ds", NewMemoryFactory(), WithCodec(NewJSONLCodec())) if err != nil { t.Fatal(err) } iter := &sliceIterator{records: []any{D{"id": "1"}}} - _, err = ds.StreamWriteRecords(t.Context(), iter, nil) - if err == nil { - t.Fatal("expected error for nil metadata, got nil") + snap, err := ds.StreamWriteRecords(t.Context(), iter, nil) + if err != nil { + t.Fatalf("expected nil metadata to succeed, got: %v", err) + } + if snap.Manifest.Metadata == nil { + t.Fatal("expected non-nil metadata in manifest after nil coalescing") } - if !strings.Contains(err.Error(), "metadata must be non-nil") { - t.Errorf("expected metadata error, got: %v", err) + if len(snap.Manifest.Metadata) != 0 { + t.Errorf("expected empty metadata, got %v", snap.Manifest.Metadata) } } diff --git a/lode/volume.go b/lode/volume.go index 4ae5dfc..0a4b6ae 100644 --- a/lode/volume.go +++ b/lode/volume.go @@ -129,7 +129,7 @@ func (v *volume) StageWriteAt(ctx context.Context, offset int64, r io.Reader) (B // Commit records the provided blocks into a new immutable snapshot. func (v *volume) Commit(ctx context.Context, blocks []BlockRef, metadata Metadata) (*VolumeSnapshot, error) { if metadata == nil { - return nil, fmt.Errorf("lode: metadata must not be nil (use empty Metadata{} for no metadata)") + metadata = Metadata{} } if len(blocks) == 0 { return nil, fmt.Errorf("lode: commit must include at least one new block") diff --git a/lode/volume_test.go b/lode/volume_test.go index cc49801..3db5222 100644 --- a/lode/volume_test.go +++ b/lode/volume_test.go @@ -270,7 +270,7 @@ func TestVolume_ReadAt_MissingRange_ReturnsErrRangeMissing(t *testing.T) { } } -func TestVolume_Commit_NilMetadata_ReturnsError(t *testing.T) { +func TestVolume_Commit_NilMetadata_CoalescesToEmpty(t *testing.T) { vol, err := NewVolume("test-vol", NewMemoryFactory(), 100) if err != nil { t.Fatalf("unexpected error: %v", err) @@ -282,9 +282,15 @@ func TestVolume_Commit_NilMetadata_ReturnsError(t *testing.T) { t.Fatalf("StageWriteAt failed: %v", err) } - _, err = vol.Commit(ctx, []BlockRef{blk}, nil) - if err == nil { - t.Fatal("expected error for nil metadata") + snap, err := vol.Commit(ctx, []BlockRef{blk}, nil) + if err != nil { + t.Fatalf("expected nil metadata to succeed, got: %v", err) + } + if snap.Manifest.Metadata == nil { + t.Fatal("expected non-nil metadata in manifest after nil coalescing") + } + if len(snap.Manifest.Metadata) != 0 { + t.Errorf("expected empty metadata, got %v", snap.Manifest.Metadata) } }