From 5689f2225897ad63935408b294376092d0925ab4 Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Thu, 18 Jun 2026 21:20:36 +0200 Subject: [PATCH 1/7] feat(variant): encode constant vortex.variant columns (Layer A) VariantEncodingEncoder previously threw "encode not yet implemented". Implement the container encode for constant variant columns: a single vortex.constant child (core_storage) wrapping the variant scalar, no shredded child, VariantMetadata with absent shredded_dtype. Also add the missing DType.Variant case to VortexWriter.serializeDType so the column's logical type is written into the file's DType blob. Scope is Layer A only (every row holds the same value). Physical variant-bytes storage (Layer B) and shredding (Layer C) are future work. Java-side read of a constant variant is not yet supported; the round-trip is proven against the Rust reference reader via JNI. Co-Authored-By: Claude Opus 4.8 --- ...antJavaWritesRustReadsIntegrationTest.java | 74 ++++++++++ .../dfa1/vortex/writer/VortexWriter.java | 10 ++ .../vortex/writer/encode/VariantData.java | 25 ++++ .../writer/encode/VariantEncodingEncoder.java | 37 ++++- .../encode/VariantEncodingEncoderTest.java | 129 ++++++++++++++++++ 5 files changed, 272 insertions(+), 3 deletions(-) create mode 100644 integration/src/test/java/io/github/dfa1/vortex/integration/VariantJavaWritesRustReadsIntegrationTest.java create mode 100644 writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantData.java create mode 100644 writer/src/test/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoderTest.java diff --git a/integration/src/test/java/io/github/dfa1/vortex/integration/VariantJavaWritesRustReadsIntegrationTest.java b/integration/src/test/java/io/github/dfa1/vortex/integration/VariantJavaWritesRustReadsIntegrationTest.java new file mode 100644 index 00000000..6d2a8e5f --- /dev/null +++ b/integration/src/test/java/io/github/dfa1/vortex/integration/VariantJavaWritesRustReadsIntegrationTest.java @@ -0,0 +1,74 @@ +package io.github.dfa1.vortex.integration; + +import dev.vortex.api.DataSource; +import dev.vortex.api.Session; +import dev.vortex.arrow.ArrowAllocation; +import dev.vortex.jni.NativeLoader; +import io.github.dfa1.vortex.core.DType; +import io.github.dfa1.vortex.proto.Primitive; +import io.github.dfa1.vortex.proto.Scalar; +import io.github.dfa1.vortex.proto.ScalarValue; +import io.github.dfa1.vortex.writer.VortexWriter; +import io.github.dfa1.vortex.writer.WriteOptions; +import io.github.dfa1.vortex.writer.encode.VariantData; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.nio.channels.FileChannel; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.List; +import java.util.Map; +import java.util.OptionalLong; + +import static org.assertj.core.api.Assertions.assertThat; + +/// Cross-compatibility for `vortex.variant` (Layer A): Java writes a constant variant +/// column, the Rust (JNI) reader opens the file and reports the correct row count and +/// schema. Arrow has no native variant type, so this asserts the file parses end-to-end +/// against the reference reader rather than decoding per-row values. +class VariantJavaWritesRustReadsIntegrationTest { + + private static final Session SESSION = Session.create(); + private static final BufferAllocator ALLOCATOR = ArrowAllocation.rootAllocator(); + + private static final DType.Struct VARIANT_SCHEMA = new DType.Struct( + List.of("v"), + List.of(new DType.Variant(false)), + false); + + static { + NativeLoader.loadJni(); + } + + @Test + void javaWriter_jniReader_constantVariantColumn(@TempDir Path tmp) throws IOException { + // Given — a constant variant column: every one of N rows is the i32 variant value 7. + // Built as Rust's Scalar::variant(Scalar::primitive(7i32)): the inner Scalar carries + // its own i32 dtype so the reference reader knows the wrapped value's type. + Path file = tmp.resolve("java_variant.vtx"); + int rows = 5; + Scalar inner = new Scalar( + io.github.dfa1.vortex.proto.DType.ofPrimitive( + new Primitive(io.github.dfa1.vortex.proto.PType.I32, false)), + ScalarValue.ofInt64Value(7L)); + VariantData data = new VariantData(rows, inner); + + try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE); + var sut = VortexWriter.create(ch, VARIANT_SCHEMA, WriteOptions.defaults())) { + // When + sut.writeChunk(Map.of("v", data)); + } + + // Then — the Rust reader parses the variant layout and agrees on row count + schema. + DataSource ds = DataSource.open(SESSION, file.toAbsolutePath().toUri().toString()); + OptionalLong count = ds.rowCount().asOptional(); + assertThat(count).hasValue(rows); + + Schema schema = ds.arrowSchema(ALLOCATOR); + assertThat(schema.getFields()).extracting(f -> f.getName()).contains("v"); + } +} diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java b/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java index 95d0dad4..f7e02fd0 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java @@ -245,6 +245,7 @@ private static long arrayLength(Object data) { case DateTimePartsData d -> d.timestamps().length; case FixedSizeListData d -> d.outerLen(); case io.github.dfa1.vortex.writer.encode.NullableData d -> d.validity().length; + case io.github.dfa1.vortex.writer.encode.VariantData d -> d.length(); default -> throw new UnsupportedOperationException( "unsupported data type: " + data.getClass()); }; @@ -316,6 +317,10 @@ private static int serializeDType(FlatBufferBuilder fbb, DType dtype) { int inner = Extension.createExtension(fbb, idOff, storageDtypeOff, metaOff); yield io.github.dfa1.vortex.fbs.DType.createDType(fbb, Type.Extension, inner); } + case DType.Variant v -> { + int inner = io.github.dfa1.vortex.fbs.Variant.createVariant(fbb, v.nullable()); + yield io.github.dfa1.vortex.fbs.DType.createDType(fbb, Type.Variant, inner); + } default -> throw new UnsupportedOperationException("unsupported DType: " + dtype); }; } @@ -486,6 +491,11 @@ private int writeSegment(DType dtype, Object data, EncodingEncoder encodingOverr && !(dtype instanceof DType.Extension)) { encodingOverride = new MaskedEncodingEncoder(); } + // Variant columns bypass the cascade: the container encoding is structural, not a + // compressible primitive codec, so route straight to the dedicated encoder. + if (encodingOverride == null && dtype instanceof DType.Variant) { + encodingOverride = new io.github.dfa1.vortex.writer.encode.VariantEncodingEncoder(); + } try (Arena arena = Arena.ofConfined()) { EncodeResult result; if (encodingOverride != null) { diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantData.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantData.java new file mode 100644 index 00000000..4bb0584e --- /dev/null +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantData.java @@ -0,0 +1,25 @@ +package io.github.dfa1.vortex.writer.encode; + +import io.github.dfa1.vortex.proto.Scalar; + +/// Input data for encoding a constant `vortex.variant` column (Layer A). +/// +/// Every row holds the same variant value `constantValue` — a typed inner scalar +/// wrapped as a variant, mirroring Rust `Scalar::variant(inner)`. The column is +/// physically stored as a single `vortex.constant` child (`core_storage`) under the +/// variant container, with no shredded child and no buffers beyond the constant scalar. +/// +/// @param length number of rows in the column +/// @param constantValue the inner typed scalar repeated on every row +public record VariantData(long length, Scalar constantValue) { + + /// Validates the constant variant input. + public VariantData { + if (length < 0) { + throw new IllegalArgumentException("length must be non-negative, got " + length); + } + if (constantValue == null) { + throw new IllegalArgumentException("constantValue must not be null"); + } + } +} diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoder.java index 6d1323c7..79b94afc 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoder.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoder.java @@ -3,8 +3,19 @@ import io.github.dfa1.vortex.core.DType; import io.github.dfa1.vortex.core.VortexException; import io.github.dfa1.vortex.encoding.EncodingId; +import io.github.dfa1.vortex.proto.ScalarValue; +import io.github.dfa1.vortex.proto.VariantMetadata; -/// Write-only encoder for `vortex.variant` — currently throws (not implemented). +import java.lang.foreign.MemorySegment; +import java.nio.ByteBuffer; +import java.util.List; + +/// Write-only encoder for `vortex.variant`. +/// +/// Layer A: encodes a constant variant column — every row carries the same value. +/// The container emits a single `vortex.constant` child (`core_storage`) whose scalar +/// is the variant value; there is no shredded child and the container owns no buffers. +/// The `VariantMetadata` proto records no `shredded_dtype`. public final class VariantEncodingEncoder implements EncodingEncoder { /// Public no-arg constructor required by {@link java.util.ServiceLoader}. @@ -18,11 +29,31 @@ public EncodingId encodingId() { @Override public boolean accepts(DType dtype) { - return false; + return dtype instanceof DType.Variant; } @Override public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) { - throw new VortexException(EncodingId.VORTEX_VARIANT, "encode not yet implemented"); + if (!(dtype instanceof DType.Variant)) { + throw new VortexException(EncodingId.VORTEX_VARIANT, "encode requires Variant dtype, got " + dtype); + } + if (!(data instanceof VariantData variantData)) { + throw new VortexException(EncodingId.VORTEX_VARIANT, + "encode requires VariantData, got " + (data == null ? "null" : data.getClass().getName())); + } + + // core_storage: a single vortex.constant child holding the variant scalar for every row. + // The constant encoding stores its scalar in buffer 0 (not metadata), so the child is a + // buffer-backed leaf and the container threads that single buffer up to the segment. + ScalarValue coreScalar = ScalarValue.ofVariantValue(variantData.constantValue()); + MemorySegment coreBuffer = MemorySegment.ofArray(coreScalar.encode()); + EncodeNode coreChild = EncodeNode.leaf(EncodingId.VORTEX_CONSTANT, 0); + + // Container metadata: no shredding in Layer A, so shredded_dtype is absent. + ByteBuffer containerMeta = ByteBuffer.wrap(new VariantMetadata(null).encode()); + EncodeNode root = new EncodeNode(EncodingId.VORTEX_VARIANT, containerMeta, + new EncodeNode[]{coreChild}, new int[0]); + + return new EncodeResult(root, List.of(coreBuffer), null, null); } } diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoderTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoderTest.java new file mode 100644 index 00000000..9bede9a8 --- /dev/null +++ b/writer/src/test/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoderTest.java @@ -0,0 +1,129 @@ +package io.github.dfa1.vortex.writer.encode; + +import io.github.dfa1.vortex.core.DType; +import io.github.dfa1.vortex.core.VortexException; +import io.github.dfa1.vortex.encoding.EncodingId; +import io.github.dfa1.vortex.proto.Primitive; +import io.github.dfa1.vortex.proto.Scalar; +import io.github.dfa1.vortex.proto.ScalarValue; +import io.github.dfa1.vortex.proto.VariantMetadata; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; + +import java.lang.foreign.MemorySegment; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +class VariantEncodingEncoderTest { + + private static final VariantEncodingEncoder SUT = new VariantEncodingEncoder(); + + private static Scalar i32Scalar(long value) { + // Inner typed scalar carrying its own i32 dtype, wrapped as a variant value + // (mirrors Rust Scalar::variant(Scalar::primitive(value))). + return new Scalar( + io.github.dfa1.vortex.proto.DType.ofPrimitive( + new Primitive(io.github.dfa1.vortex.proto.PType.I32, false)), + ScalarValue.ofInt64Value(value)); + } + + @Nested + class Accepts { + + @Test + void trueForVariant_falseForPrimitive() { + assertThat(SUT.accepts(new DType.Variant(false))).isTrue(); + assertThat(SUT.accepts(new DType.Primitive(io.github.dfa1.vortex.core.PType.I64, false))).isFalse(); + } + } + + @Nested + class Encode { + + @Test + void constantColumn_emitsVariantContainerOverConstantChild() throws Exception { + // Given + DType.Variant dtype = new DType.Variant(false); + VariantData data = new VariantData(5, i32Scalar(7L)); + + // When + EncodeResult result = SUT.encode(dtype, data, EncodeTestHelper.testCtx()); + + // Then — container node holds exactly one buffer-backed constant child, no buffers of its own. + EncodeNode root = result.rootNode(); + assertThat(root.encodingId()).isEqualTo(EncodingId.VORTEX_VARIANT); + assertThat(root.bufferIndices()).isEmpty(); + assertThat(root.children()).hasSize(1); + + EncodeNode child = root.children()[0]; + assertThat(child.encodingId()).isEqualTo(EncodingId.VORTEX_CONSTANT); + assertThat(child.bufferIndices()).containsExactly(0); + assertThat(result.buffers()).hasSize(1); + } + + @Test + void constantColumn_metadataHasNoShreddedDtype() throws Exception { + // Given a Layer-A constant column (no shredding) + VariantData data = new VariantData(3, i32Scalar(7L)); + + // When + EncodeResult result = SUT.encode(new DType.Variant(false), data, EncodeTestHelper.testCtx()); + + // Then the container metadata decodes to VariantMetadata with absent shredded_dtype. + MemorySegment meta = MemorySegment.ofBuffer(result.rootNode().metadata().duplicate()); + VariantMetadata decoded = VariantMetadata.decode(meta, 0, meta.byteSize()); + assertThat(decoded.shredded_dtype()).isNull(); + } + + @Test + void constantColumn_childBufferIsVariantScalar() throws Exception { + // Given + VariantData data = new VariantData(3, i32Scalar(42L)); + + // When + EncodeResult result = SUT.encode(new DType.Variant(false), data, EncodeTestHelper.testCtx()); + + // Then the constant child's buffer is a ScalarValue wrapping the inner i32 variant value. + MemorySegment buf = result.buffers().get(0); + ScalarValue scalar = ScalarValue.decode(buf, 0, buf.byteSize()); + assertThat(scalar.variant_value()).isNotNull(); + assertThat(scalar.variant_value().value().int64_value()).isEqualTo(42L); + } + + @Test + void wrongDtype_throws() { + VariantData data = new VariantData(1, i32Scalar(1L)); + assertThatThrownBy(() -> SUT.encode( + new DType.Primitive(io.github.dfa1.vortex.core.PType.I64, false), data, EncodeTestHelper.testCtx())) + .isInstanceOf(VortexException.class) + .hasMessageContaining("Variant dtype"); + } + + @Test + void wrongDataType_throws() { + assertThatThrownBy(() -> SUT.encode( + new DType.Variant(false), new long[]{1L}, EncodeTestHelper.testCtx())) + .isInstanceOf(VortexException.class) + .hasMessageContaining("VariantData"); + } + } + + @Nested + class Validation { + + @Test + void negativeLength_throws() { + assertThatThrownBy(() -> new VariantData(-1, i32Scalar(1L))) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("length"); + } + + @Test + void nullConstantValue_throws() { + assertThatThrownBy(() -> new VariantData(1, null)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("constantValue"); + } + } +} From 2abbcd748a835c357caa92b13f943d4480a52dce Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Thu, 18 Jun 2026 21:30:06 +0200 Subject: [PATCH 2/7] feat(variant): encode row-varying variant columns via chunked constants Extend VariantData to carry one inner scalar per row instead of a single constant. The encoder coalesces adjacent equal values into runs: - all rows equal -> a single vortex.constant child (Layer A, unchanged on the wire); - varying values -> core_storage = vortex.chunked, child 0 the cumulative u64 run offsets, then one vortex.constant per run. This mirrors the Rust reference, which represents a row-varying variant column as a chunked array of constant variant scalars under the canonical variant array (vortex.variant), with no new physical encoding. Efficient Apache Variant binary storage (vortex.parquet.variant) and shredding remain future work. VariantData.constant(length, value) preserves the constant-column ergonomics. Verified end-to-end: the Rust (JNI) reader round-trips both a constant and a row-varying Java-written variant column. Co-Authored-By: Claude Opus 4.8 --- ...antJavaWritesRustReadsIntegrationTest.java | 36 ++++- .../vortex/writer/encode/VariantData.java | 50 +++++-- .../writer/encode/VariantEncodingEncoder.java | 90 +++++++++++-- .../encode/VariantEncodingEncoderTest.java | 123 +++++++++++++----- 4 files changed, 236 insertions(+), 63 deletions(-) diff --git a/integration/src/test/java/io/github/dfa1/vortex/integration/VariantJavaWritesRustReadsIntegrationTest.java b/integration/src/test/java/io/github/dfa1/vortex/integration/VariantJavaWritesRustReadsIntegrationTest.java index 6d2a8e5f..f5a60cce 100644 --- a/integration/src/test/java/io/github/dfa1/vortex/integration/VariantJavaWritesRustReadsIntegrationTest.java +++ b/integration/src/test/java/io/github/dfa1/vortex/integration/VariantJavaWritesRustReadsIntegrationTest.java @@ -51,11 +51,7 @@ void javaWriter_jniReader_constantVariantColumn(@TempDir Path tmp) throws IOExce // its own i32 dtype so the reference reader knows the wrapped value's type. Path file = tmp.resolve("java_variant.vtx"); int rows = 5; - Scalar inner = new Scalar( - io.github.dfa1.vortex.proto.DType.ofPrimitive( - new Primitive(io.github.dfa1.vortex.proto.PType.I32, false)), - ScalarValue.ofInt64Value(7L)); - VariantData data = new VariantData(rows, inner); + VariantData data = VariantData.constant(rows, i32Variant(7L)); try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE); var sut = VortexWriter.create(ch, VARIANT_SCHEMA, WriteOptions.defaults())) { @@ -71,4 +67,34 @@ void javaWriter_jniReader_constantVariantColumn(@TempDir Path tmp) throws IOExce Schema schema = ds.arrowSchema(ALLOCATOR); assertThat(schema.getFields()).extracting(f -> f.getName()).contains("v"); } + + @Test + void javaWriter_jniReader_varyingVariantColumn(@TempDir Path tmp) throws IOException { + // Given — a non-constant variant column: distinct per-row i32 values. The encoder + // lays this out as core_storage = vortex.chunked of one vortex.constant per row, + // exactly the representation the Rust reference uses for a row-varying variant array. + Path file = tmp.resolve("java_variant_varying.vtx"); + List values = List.of(i32Variant(10L), i32Variant(20L), i32Variant(30L), i32Variant(40L)); + VariantData data = new VariantData(values); + + try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE); + var sut = VortexWriter.create(ch, VARIANT_SCHEMA, WriteOptions.defaults())) { + // When + sut.writeChunk(Map.of("v", data)); + } + + // Then — the Rust reader parses the chunked variant layout and agrees on row count + schema. + DataSource ds = DataSource.open(SESSION, file.toAbsolutePath().toUri().toString()); + assertThat(ds.rowCount().asOptional()).hasValue(values.size()); + assertThat(ds.arrowSchema(ALLOCATOR).getFields()).extracting(f -> f.getName()).contains("v"); + } + + private static Scalar i32Variant(long value) { + // Inner typed scalar carrying its own i32 dtype, wrapped as a variant value + // (mirrors Rust Scalar::variant(Scalar::primitive(value))). + return new Scalar( + io.github.dfa1.vortex.proto.DType.ofPrimitive( + new Primitive(io.github.dfa1.vortex.proto.PType.I32, false)), + ScalarValue.ofInt64Value(value)); + } } diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantData.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantData.java index 4bb0584e..50b9cdde 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantData.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantData.java @@ -2,24 +2,48 @@ import io.github.dfa1.vortex.proto.Scalar; -/// Input data for encoding a constant `vortex.variant` column (Layer A). +import java.util.Collections; +import java.util.List; + +/// Input data for encoding a `vortex.variant` column. /// -/// Every row holds the same variant value `constantValue` — a typed inner scalar -/// wrapped as a variant, mirroring Rust `Scalar::variant(inner)`. The column is -/// physically stored as a single `vortex.constant` child (`core_storage`) under the -/// variant container, with no shredded child and no buffers beyond the constant scalar. +/// Holds one inner typed scalar per row, each wrapped as a variant value (mirroring +/// Rust `Scalar::variant(inner)`). The encoder coalesces adjacent equal values into +/// constant runs: an all-equal column becomes a single `vortex.constant` child, while +/// a column with varying values becomes a `vortex.chunked` of per-run constants. There +/// is no shredded child. /// -/// @param length number of rows in the column -/// @param constantValue the inner typed scalar repeated on every row -public record VariantData(long length, Scalar constantValue) { +/// @param values one inner scalar per row, in row order +public record VariantData(List values) { - /// Validates the constant variant input. + /// Validates and defensively copies the per-row values. Rejects empty input and + /// `null` elements. public VariantData { - if (length < 0) { - throw new IllegalArgumentException("length must be non-negative, got " + length); + values = List.copyOf(values); + if (values.isEmpty()) { + throw new IllegalArgumentException("values must not be empty"); + } + } + + /// Creates input for a constant variant column: `length` rows all holding `value`. + /// + /// @param length number of rows; must be positive + /// @param value the inner scalar repeated on every row + /// @return variant input describing a constant column + public static VariantData constant(int length, Scalar value) { + if (length <= 0) { + throw new IllegalArgumentException("length must be positive, got " + length); } - if (constantValue == null) { - throw new IllegalArgumentException("constantValue must not be null"); + if (value == null) { + throw new IllegalArgumentException("value must not be null"); } + return new VariantData(Collections.nCopies(length, value)); + } + + /// Returns the number of rows in the column. + /// + /// @return row count + public long length() { + return values.size(); } } diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoder.java index 79b94afc..374c7aa2 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoder.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoder.java @@ -1,23 +1,36 @@ package io.github.dfa1.vortex.writer.encode; import io.github.dfa1.vortex.core.DType; +import io.github.dfa1.vortex.core.PType; import io.github.dfa1.vortex.core.VortexException; import io.github.dfa1.vortex.encoding.EncodingId; +import io.github.dfa1.vortex.proto.Scalar; import io.github.dfa1.vortex.proto.ScalarValue; import io.github.dfa1.vortex.proto.VariantMetadata; import java.lang.foreign.MemorySegment; import java.nio.ByteBuffer; +import java.util.ArrayList; import java.util.List; /// Write-only encoder for `vortex.variant`. /// -/// Layer A: encodes a constant variant column — every row carries the same value. -/// The container emits a single `vortex.constant` child (`core_storage`) whose scalar -/// is the variant value; there is no shredded child and the container owns no buffers. -/// The `VariantMetadata` proto records no `shredded_dtype`. +/// Emits the canonical variant container: a single `core_storage` child holding the +/// full value per row, no shredded child, and `VariantMetadata` with no `shredded_dtype`. +/// The container itself owns no buffers. +/// +/// `core_storage` is built from the per-row scalars in [VariantData], coalescing adjacent +/// equal values into constant runs: +/// - all rows equal → one `vortex.constant` child (the constant broadcasts to every row); +/// - otherwise → a `vortex.chunked` whose first child is the cumulative `u64` run offsets +/// and whose remaining children are one `vortex.constant` per run. +/// +/// This mirrors the Rust reference, where a non-constant variant column is a chunked +/// array of constant variant scalars under the canonical variant array. public final class VariantEncodingEncoder implements EncodingEncoder { + private static final DType U64 = new DType.Primitive(PType.U64, false); + /// Public no-arg constructor required by {@link java.util.ServiceLoader}. public VariantEncodingEncoder() { } @@ -42,18 +55,69 @@ public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) { "encode requires VariantData, got " + (data == null ? "null" : data.getClass().getName())); } - // core_storage: a single vortex.constant child holding the variant scalar for every row. - // The constant encoding stores its scalar in buffer 0 (not metadata), so the child is a - // buffer-backed leaf and the container threads that single buffer up to the segment. - ScalarValue coreScalar = ScalarValue.ofVariantValue(variantData.constantValue()); - MemorySegment coreBuffer = MemorySegment.ofArray(coreScalar.encode()); - EncodeNode coreChild = EncodeNode.leaf(EncodingId.VORTEX_CONSTANT, 0); + List values = variantData.values(); + List runValues = new ArrayList<>(); + List runLengths = new ArrayList<>(); + coalesceRuns(values, runValues, runLengths); + + List buffers = new ArrayList<>(); + EncodeNode coreStorage = runValues.size() == 1 + ? constantChild(runValues.get(0), buffers) + : chunkedConstants(runValues, runLengths, ctx, buffers); - // Container metadata: no shredding in Layer A, so shredded_dtype is absent. ByteBuffer containerMeta = ByteBuffer.wrap(new VariantMetadata(null).encode()); EncodeNode root = new EncodeNode(EncodingId.VORTEX_VARIANT, containerMeta, - new EncodeNode[]{coreChild}, new int[0]); + new EncodeNode[]{coreStorage}, new int[0]); + return new EncodeResult(root, List.copyOf(buffers), null, null); + } + + /// Groups adjacent equal scalars into runs, appending each run's value and length. + private static void coalesceRuns(List values, List runValues, List runLengths) { + Scalar prev = null; + long runLen = 0; + for (Scalar s : values) { + if (prev != null && prev.equals(s)) { + runLen++; + } else { + if (prev != null) { + runValues.add(prev); + runLengths.add(runLen); + } + prev = s; + runLen = 1; + } + } + runValues.add(prev); + runLengths.add(runLen); + } + + /// Builds a buffer-backed `vortex.constant` child for one variant scalar, appending + /// its serialized scalar to `buffers`. + private static EncodeNode constantChild(Scalar value, List buffers) { + ScalarValue scalar = ScalarValue.ofVariantValue(value); + int bufIdx = buffers.size(); + buffers.add(MemorySegment.ofArray(scalar.encode())); + return EncodeNode.leaf(EncodingId.VORTEX_CONSTANT, bufIdx); + } - return new EncodeResult(root, List.of(coreBuffer), null, null); + /// Builds a `vortex.chunked` node: child 0 is the cumulative `u64` run offsets, the + /// rest are one constant child per run. Appends all buffers to `buffers`. + private static EncodeNode chunkedConstants(List runValues, List runLengths, + EncodeContext ctx, List buffers) { + int nruns = runValues.size(); + long[] offsets = new long[nruns + 1]; + for (int i = 0; i < nruns; i++) { + offsets[i + 1] = offsets[i] + runLengths.get(i); + } + + EncodeResult offsetsResult = ctx.lookupEncoder(EncodingId.VORTEX_PRIMITIVE).encode(U64, offsets, ctx); + buffers.addAll(offsetsResult.buffers()); + + EncodeNode[] children = new EncodeNode[nruns + 1]; + children[0] = offsetsResult.rootNode(); + for (int i = 0; i < nruns; i++) { + children[i + 1] = constantChild(runValues.get(i), buffers); + } + return new EncodeNode(EncodingId.VORTEX_CHUNKED, ByteBuffer.wrap(new byte[0]), children, new int[0]); } } diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoderTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoderTest.java index 9bede9a8..f3af7c6a 100644 --- a/writer/src/test/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoderTest.java +++ b/writer/src/test/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoderTest.java @@ -11,6 +11,8 @@ import org.junit.jupiter.api.Test; import java.lang.foreign.MemorySegment; +import java.nio.ByteOrder; +import java.util.List; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; @@ -18,6 +20,7 @@ class VariantEncodingEncoderTest { private static final VariantEncodingEncoder SUT = new VariantEncodingEncoder(); + private static final DType.Variant VARIANT = new DType.Variant(false); private static Scalar i32Scalar(long value) { // Inner typed scalar carrying its own i32 dtype, wrapped as a variant value @@ -28,29 +31,34 @@ private static Scalar i32Scalar(long value) { ScalarValue.ofInt64Value(value)); } + private static long innerInt(MemorySegment buf) throws Exception { + ScalarValue scalar = ScalarValue.decode(buf, 0, buf.byteSize()); + assertThat(scalar.variant_value()).isNotNull(); + return scalar.variant_value().value().int64_value(); + } + @Nested class Accepts { @Test void trueForVariant_falseForPrimitive() { - assertThat(SUT.accepts(new DType.Variant(false))).isTrue(); + assertThat(SUT.accepts(VARIANT)).isTrue(); assertThat(SUT.accepts(new DType.Primitive(io.github.dfa1.vortex.core.PType.I64, false))).isFalse(); } } @Nested - class Encode { + class ConstantColumn { @Test - void constantColumn_emitsVariantContainerOverConstantChild() throws Exception { - // Given - DType.Variant dtype = new DType.Variant(false); - VariantData data = new VariantData(5, i32Scalar(7L)); + void allEqual_emitsSingleConstantChild() throws Exception { + // Given a column whose rows are all the same value + VariantData data = VariantData.constant(5, i32Scalar(7L)); // When - EncodeResult result = SUT.encode(dtype, data, EncodeTestHelper.testCtx()); + EncodeResult result = SUT.encode(VARIANT, data, EncodeTestHelper.testCtx()); - // Then — container node holds exactly one buffer-backed constant child, no buffers of its own. + // Then — container holds exactly one buffer-backed constant child, no chunked layer. EncodeNode root = result.rootNode(); assertThat(root.encodingId()).isEqualTo(EncodingId.VORTEX_VARIANT); assertThat(root.bufferIndices()).isEmpty(); @@ -60,40 +68,85 @@ void constantColumn_emitsVariantContainerOverConstantChild() throws Exception { assertThat(child.encodingId()).isEqualTo(EncodingId.VORTEX_CONSTANT); assertThat(child.bufferIndices()).containsExactly(0); assertThat(result.buffers()).hasSize(1); + assertThat(innerInt(result.buffers().get(0))).isEqualTo(7L); } @Test - void constantColumn_metadataHasNoShreddedDtype() throws Exception { - // Given a Layer-A constant column (no shredding) - VariantData data = new VariantData(3, i32Scalar(7L)); + void metadataHasNoShreddedDtype() throws Exception { + EncodeResult result = SUT.encode(VARIANT, VariantData.constant(3, i32Scalar(7L)), EncodeTestHelper.testCtx()); - // When - EncodeResult result = SUT.encode(new DType.Variant(false), data, EncodeTestHelper.testCtx()); - - // Then the container metadata decodes to VariantMetadata with absent shredded_dtype. MemorySegment meta = MemorySegment.ofBuffer(result.rootNode().metadata().duplicate()); VariantMetadata decoded = VariantMetadata.decode(meta, 0, meta.byteSize()); assertThat(decoded.shredded_dtype()).isNull(); } + } + + @Nested + class VaryingColumn { @Test - void constantColumn_childBufferIsVariantScalar() throws Exception { - // Given - VariantData data = new VariantData(3, i32Scalar(42L)); + void distinctValues_emitChunkedOfConstants() throws Exception { + // Given three distinct per-row values + VariantData data = new VariantData(List.of(i32Scalar(7L), i32Scalar(8L), i32Scalar(9L))); // When - EncodeResult result = SUT.encode(new DType.Variant(false), data, EncodeTestHelper.testCtx()); + EncodeResult result = SUT.encode(VARIANT, data, EncodeTestHelper.testCtx()); + + // Then — container wraps a chunked node: child 0 is the offsets, then one constant per run. + EncodeNode chunked = result.rootNode().children()[0]; + assertThat(chunked.encodingId()).isEqualTo(EncodingId.VORTEX_CHUNKED); + assertThat(chunked.children()).hasSize(4); + assertThat(chunked.children()[0].encodingId()).isEqualTo(EncodingId.VORTEX_PRIMITIVE); + for (int i = 1; i <= 3; i++) { + assertThat(chunked.children()[i].encodingId()).isEqualTo(EncodingId.VORTEX_CONSTANT); + } + // offsets buffer (index 0) + one buffer per constant = 4 total + assertThat(result.buffers()).hasSize(4); + } + + @Test + void distinctValues_offsetsAreCumulativeRunLengths() { + // Given one row per distinct value: run offsets must be 0,1,2,3 + VariantData data = new VariantData(List.of(i32Scalar(7L), i32Scalar(8L), i32Scalar(9L))); + + EncodeResult result = SUT.encode(VARIANT, data, EncodeTestHelper.testCtx()); + + MemorySegment offsets = result.buffers().get(0); + var bb = offsets.asByteBuffer().order(ByteOrder.LITTLE_ENDIAN); + assertThat(bb.getLong(0)).isZero(); + assertThat(bb.getLong(8)).isEqualTo(1L); + assertThat(bb.getLong(16)).isEqualTo(2L); + assertThat(bb.getLong(24)).isEqualTo(3L); + } - // Then the constant child's buffer is a ScalarValue wrapping the inner i32 variant value. - MemorySegment buf = result.buffers().get(0); - ScalarValue scalar = ScalarValue.decode(buf, 0, buf.byteSize()); - assertThat(scalar.variant_value()).isNotNull(); - assertThat(scalar.variant_value().value().int64_value()).isEqualTo(42L); + @Test + void adjacentEqualValues_coalesceIntoOneRun() throws Exception { + // Given [7,7,8]: two runs (7 length 2, 8 length 1) → offsets 0,2,3 + VariantData data = new VariantData(List.of(i32Scalar(7L), i32Scalar(7L), i32Scalar(8L))); + + EncodeResult result = SUT.encode(VARIANT, data, EncodeTestHelper.testCtx()); + + EncodeNode chunked = result.rootNode().children()[0]; + assertThat(chunked.children()).hasSize(3); // offsets + 2 constants + assertThat(result.buffers()).hasSize(3); + + MemorySegment offsets = result.buffers().get(0); + var bb = offsets.asByteBuffer().order(ByteOrder.LITTLE_ENDIAN); + assertThat(bb.getLong(0)).isZero(); + assertThat(bb.getLong(8)).isEqualTo(2L); + assertThat(bb.getLong(16)).isEqualTo(3L); + // run values preserved in order + assertThat(innerInt(result.buffers().get(1))).isEqualTo(7L); + assertThat(innerInt(result.buffers().get(2))).isEqualTo(8L); } + } + + @Nested + class Errors { @Test void wrongDtype_throws() { - VariantData data = new VariantData(1, i32Scalar(1L)); + VariantData data = VariantData.constant(1, i32Scalar(1L)); assertThatThrownBy(() -> SUT.encode( new DType.Primitive(io.github.dfa1.vortex.core.PType.I64, false), data, EncodeTestHelper.testCtx())) .isInstanceOf(VortexException.class) @@ -102,8 +155,7 @@ void wrongDtype_throws() { @Test void wrongDataType_throws() { - assertThatThrownBy(() -> SUT.encode( - new DType.Variant(false), new long[]{1L}, EncodeTestHelper.testCtx())) + assertThatThrownBy(() -> SUT.encode(VARIANT, new long[]{1L}, EncodeTestHelper.testCtx())) .isInstanceOf(VortexException.class) .hasMessageContaining("VariantData"); } @@ -113,17 +165,24 @@ void wrongDataType_throws() { class Validation { @Test - void negativeLength_throws() { - assertThatThrownBy(() -> new VariantData(-1, i32Scalar(1L))) + void emptyValues_throws() { + assertThatThrownBy(() -> new VariantData(List.of())) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("empty"); + } + + @Test + void constant_nonPositiveLength_throws() { + assertThatThrownBy(() -> VariantData.constant(0, i32Scalar(1L))) .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("length"); } @Test - void nullConstantValue_throws() { - assertThatThrownBy(() -> new VariantData(1, null)) + void constant_nullValue_throws() { + assertThatThrownBy(() -> VariantData.constant(1, null)) .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("constantValue"); + .hasMessageContaining("value"); } } } From c5ade5bbadb183663582ae7cb7f67d34677a6e89 Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Thu, 18 Jun 2026 21:41:00 +0200 Subject: [PATCH 3/7] =?UTF-8?q?docs(adr):=200014=20variant=20encoding=20st?= =?UTF-8?q?rategy=20=E2=80=94=20chunked=20constants=20now,=20parquet.varia?= =?UTF-8?q?nt=20later?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Record the decision to encode variant columns via the canonical vortex.variant container over existing encodings (constant / chunked-of-constants), and to defer the vortex.parquet.variant physical encoding (and shredding of arbitrary objects) until a real JSON-shaped object-column need arrives. Captures the Rust wire-format references gathered while implementing the encoder. Co-Authored-By: Claude Opus 4.8 --- docs/adr/0014-variant-encoding-strategy.md | 90 ++++++++++++++++++++++ docs/adr/ADR.md | 1 + 2 files changed, 91 insertions(+) create mode 100644 docs/adr/0014-variant-encoding-strategy.md diff --git a/docs/adr/0014-variant-encoding-strategy.md b/docs/adr/0014-variant-encoding-strategy.md new file mode 100644 index 00000000..6102e0d5 --- /dev/null +++ b/docs/adr/0014-variant-encoding-strategy.md @@ -0,0 +1,90 @@ +# ADR 0014: Variant encoding strategy — chunked constants now, parquet.variant later + +- **Status:** Accepted +- **Date:** 2026-06-18 +- **Deciders:** project maintainer +- **Supersedes:** — +- **Superseded by:** — + +## Context + +Java could already *decode* the canonical variant container (`vortex.variant`) but +`VariantEncodingEncoder` threw `"encode not yet implemented"`, so no Java-written file +could carry a variant column. Variant is the biggest external-facing gap: every +JSON-shaped ingest pipeline maps to it. + +The Rust reference has two distinct encodings: + +- **`vortex.variant`** — the canonical container. Purely structural: `nbuffers = 0`, + slots `[core_storage, shredded]`. It stores no bytes itself. `core_storage` must be a + `DType::Variant` array but "may be chunked, constant, or otherwise encoded". `shredded` + is an optional row-aligned typed tree for selected paths, its dtype recorded in + `VariantMetadata.shredded_dtype`. +- **`vortex.parquet.variant`** — a separate *physical* encoding (id literally + `"vortex.parquet.variant"`), slots `[validity, metadata, value, typed_value]`, where + `metadata` and `value` are opaque Binary columns holding the Apache Variant binary + (metadata dictionary + value bytes) per row. This is what Rust writes for real, + arbitrary JSON-shaped columns. + +A row-varying variant column therefore has (at least) two valid physical layouts. The +Rust test suite itself builds a non-constant column *without* `parquet.variant`: a +`ChunkedArray` of one-row `ConstantArray`s of variant scalars, wrapped in the canonical +`VariantArray`. + +## Decision + +Encode variant columns using **only the canonical `vortex.variant` container over +existing encodings** — no new physical encoding for now. + +`core_storage` is built from per-row inner scalars (`Scalar::variant(inner)`): + +- all rows equal → a single `vortex.constant` child (the constant broadcasts); +- varying rows → `vortex.chunked`, child 0 = cumulative `u64` run offsets, then one + `vortex.constant` per run of equal adjacent values. + +Adjacent-equal values are coalesced into runs so an all-equal column collapses to one +constant. Shredding is expressed through the container's `shredded` child plus +`VariantMetadata.shredded_dtype`. + +**Defer `vortex.parquet.variant`** (the efficient Apache-Variant-binary physical +encoding) until a concrete need for arbitrary JSON-shaped object columns arrives. + +## Consequences + +### Positive + +- No new `EncodingId`, decoder, encoder, proto message, or reader array type. Reuses + `vortex.variant` + `vortex.chunked` + `vortex.constant`, all already round-tripped. +- Matches a layout the Rust reference produces and reads, so Java↔Rust interop holds + (verified via JNI: rowCount + arrow schema for constant and row-varying columns). +- Keeps the variant surface small and reviewable; ships value now. + +### Negative + +- Per-row values are limited to **typed scalars** wrapped as variants, not arbitrary + Apache Variant binary objects (nested JSON). Real object columns need + `parquet.variant`. +- One `vortex.constant` per distinct run is space-inefficient for high-cardinality + columns versus two packed Binary columns. + +### Risks to manage + +- Heterogeneous inner dtypes across rows: the chunked-of-constants read path assumes a + consistent inner dtype. Mixed-type variant columns are out of scope until + `parquet.variant`. +- When `parquet.variant` lands it becomes a second physical layout for the same logical + column; readers must handle both (the canonical container already abstracts this). + +## Alternatives considered + +- **Implement `vortex.parquet.variant` now.** Rejected for the first milestone: new + encoding id + decoder + encoder + `ParquetVariantMetadata{has_value, typed_value_dtype, + value_nullable}` proto (regeneration) + reader array type — a much larger surface, not + needed until arbitrary object columns are required. Tracked as the next step. + +## References + +- Rust: `vortex-array/src/arrays/variant/vtable/mod.rs` (`vortex.variant`), + `encodings/parquet-variant/src/vtable.rs` (`vortex.parquet.variant`), + `vortex-array/src/arrays/chunked/array.rs` (cumulative `chunk_offsets`). +- Commits `35438c72` (Layer A constant), `3b1be436` (Layer B chunked constants). diff --git a/docs/adr/ADR.md b/docs/adr/ADR.md index ec4f7c84..d356f702 100644 --- a/docs/adr/ADR.md +++ b/docs/adr/ADR.md @@ -26,3 +26,4 @@ Each ADR is a Markdown file named `NNNN-short-title.md`. Use `template.md` as th | 0011 | Writer zero-copy MemorySegment overload | Deferred | | 0012 | Zero-copy layout decoding: lazy Chunked/Dict | Implemented | | 0013 | Compute primitives: masks, kernels, no-materialise | Proposed | +| 0014 | Variant encoding: chunked constants now, parquet.variant later | Accepted | From df6fa2cc777a41999532ae824c0eb3afc05d0670 Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Thu, 18 Jun 2026 21:41:00 +0200 Subject: [PATCH 4/7] feat(variant): decode variant columns Java-side (constant + chunked) ConstantEncodingDecoder now handles DType.Variant by unwrapping the variant scalar to its typed inner scalar and materialising the inner-typed constant array; the dispatch was refactored to thread the dtype explicitly so Extension and Variant can recurse without re-reading the buffer. ChunkedEncodingDecoder wraps Variant chunks under their inner dtype. VariantEncodingDecoder.accepts now reports Variant. Together with VariantEncodingDecoder this makes the Java reader symmetric with the writer: a Java-encoded variant column round-trips through Java decode, with core storage exposing each row's inner value (verified for constant and row-varying columns). Co-Authored-By: Claude Opus 4.8 --- .../reader/decode/ChunkedEncodingDecoder.java | 17 ++++ .../decode/ConstantEncodingDecoder.java | 86 ++++++++++--------- .../reader/decode/VariantEncodingDecoder.java | 2 +- .../encode/VariantEncodingEncoderTest.java | 57 ++++++++++++ 4 files changed, 122 insertions(+), 40 deletions(-) diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/ChunkedEncodingDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/ChunkedEncodingDecoder.java index 920beb00..633b7977 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/ChunkedEncodingDecoder.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/ChunkedEncodingDecoder.java @@ -86,6 +86,23 @@ private static Array wrap(List chunks, DType dtype, long totalRows) { if (dtype instanceof DType.Struct struct) { return wrapStruct(chunks, struct, totalRows); } + if (dtype instanceof DType.Variant) { + // Each chunk decoded as Variant materialises to its inner-typed constant array + // (see ConstantEncodingDecoder). Wrap the chunks under that inner dtype; the + // VariantArray container re-applies the logical Variant dtype. + if (chunks.isEmpty()) { + throw new VortexException(EncodingId.VORTEX_CHUNKED, "chunked variant has no chunks"); + } + DType innerDtype = chunks.get(0).dtype(); + if (innerDtype instanceof DType.Primitive innerPt) { + return wrapPrimitive(chunks, innerPt, innerDtype, totalRows); + } + if (innerDtype instanceof DType.Bool) { + return ChunkedBoolArray.of(innerDtype, totalRows, chunks); + } + throw new VortexException(EncodingId.VORTEX_CHUNKED, + "chunked variant inner dtype not supported: " + innerDtype); + } throw new VortexException(EncodingId.VORTEX_CHUNKED, "chunked not supported for dtype: " + dtype); } diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/ConstantEncodingDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/ConstantEncodingDecoder.java index 5147e11a..ba07421d 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/ConstantEncodingDecoder.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/ConstantEncodingDecoder.java @@ -52,69 +52,77 @@ public Array decode(DecodeContext ctx) { throw new VortexException(EncodingId.VORTEX_CONSTANT, "invalid scalar value", e); } - long n = ctx.rowCount(); + return arrayFromScalar(ctx, scalar, ctx.dtype(), ctx.rowCount()); + } - if (ctx.dtype() instanceof DType.Null) { - return new NullArray(ctx.dtype(), n); + /// Builds the constant array for `scalar` interpreted as `dtype`, broadcast to `n` rows. + /// Recurses for Extension (its storage dtype) and Variant (the wrapped inner scalar). + private static Array arrayFromScalar(DecodeContext ctx, ScalarValue scalar, DType dtype, long n) { + if (dtype instanceof DType.Null) { + return new NullArray(dtype, n); } - - if (ctx.dtype() instanceof DType.Utf8 || ctx.dtype() instanceof DType.Binary) { - return decodeString(ctx, scalar, n); + if (dtype instanceof DType.Variant) { + // A constant variant wraps a typed inner scalar (Scalar::variant(inner)); the + // physical storage is the inner-typed constant array. The VariantArray wrapper + // re-applies the logical Variant dtype. + io.github.dfa1.vortex.proto.Scalar inner = scalar.variant_value(); + if (inner == null || inner.value() == null) { + throw new VortexException(EncodingId.VORTEX_CONSTANT, "constant variant missing variant_value"); + } + DType innerDtype = VariantEncodingDecoder.dtypeFromProto(inner.dtype()); + return arrayFromScalar(ctx, inner.value(), innerDtype, n); } - - if (ctx.dtype() instanceof DType.Bool) { - return decodeBool(ctx, scalar, n); + if (dtype instanceof DType.Utf8 || dtype instanceof DType.Binary) { + return decodeString(ctx, scalar, dtype, n); } - - if (ctx.dtype() instanceof DType.Decimal) { - return decodeDecimal(ctx, scalar, n); + if (dtype instanceof DType.Bool) { + return decodeBool(dtype, scalar, n); } - - if (ctx.dtype() instanceof DType.Extension ext) { - var storageCtx = new DecodeContext(ctx.node(), ext.storageDType(), ctx.rowCount(), - ctx.segmentBuffers(), ctx.registry(), ctx.arena()); - Array storage = decode(storageCtx); - // GenericArray needs a backing buffer; the recursive call now returns a - // metadata-only LazyConstantXxxArray. Materialise once into the chunk arena - // so downstream extension consumers that read via ArraySegments.of(arr) still - // find a segment. Extension-on-constant is rare enough that the small alloc - // doesn't matter — the bare primitive path stays buffer-free. - return new GenericArray(ctx.dtype(), n, ArraySegments.of(storage, ctx.arena())); + if (dtype instanceof DType.Decimal) { + return decodeDecimal(dtype, scalar, n); } - - if (!(ctx.dtype() instanceof DType.Primitive p)) { - throw new VortexException(EncodingId.VORTEX_CONSTANT, "unsupported dtype " + ctx.dtype()); + if (dtype instanceof DType.Extension ext) { + Array storage = arrayFromScalar(ctx, scalar, ext.storageDType(), n); + // GenericArray needs a backing buffer; the recursive call returns a metadata-only + // LazyConstantXxxArray. Materialise once into the chunk arena so downstream + // extension consumers that read via ArraySegments.of(arr) still find a segment. + // Extension-on-constant is rare enough that the small alloc doesn't matter — the + // bare primitive path stays buffer-free. + return new GenericArray(dtype, n, ArraySegments.of(storage, ctx.arena())); + } + if (!(dtype instanceof DType.Primitive p)) { + throw new VortexException(EncodingId.VORTEX_CONSTANT, "unsupported dtype " + dtype); } PType ptype = p.ptype(); long rawBits = scalarToRawBits(scalar, ptype); return switch (ptype) { - case I64, U64 -> new LazyConstantLongArray(ctx.dtype(), n, rawBits); - case I32, U32 -> new LazyConstantIntArray(ctx.dtype(), n, (int) rawBits); - case F64 -> new LazyConstantDoubleArray(ctx.dtype(), n, Double.longBitsToDouble(rawBits)); - case F32 -> new LazyConstantFloatArray(ctx.dtype(), n, Float.intBitsToFloat((int) rawBits)); - case I16, U16 -> new LazyConstantShortArray(ctx.dtype(), n, (short) rawBits); - case I8, U8 -> new LazyConstantByteArray(ctx.dtype(), n, (byte) rawBits); + case I64, U64 -> new LazyConstantLongArray(dtype, n, rawBits); + case I32, U32 -> new LazyConstantIntArray(dtype, n, (int) rawBits); + case F64 -> new LazyConstantDoubleArray(dtype, n, Double.longBitsToDouble(rawBits)); + case F32 -> new LazyConstantFloatArray(dtype, n, Float.intBitsToFloat((int) rawBits)); + case I16, U16 -> new LazyConstantShortArray(dtype, n, (short) rawBits); + case I8, U8 -> new LazyConstantByteArray(dtype, n, (byte) rawBits); default -> throw new VortexException(EncodingId.VORTEX_CONSTANT, "unsupported ptype " + ptype); }; } - private static Array decodeDecimal(DecodeContext ctx, ScalarValue scalar, long n) { + private static Array decodeDecimal(DType dtype, ScalarValue scalar, long n) { byte[] elemBytes = scalar.bytes_value(); int elemLen = elemBytes.length; // Decode the single scalar value via LazyDecimalArray (reuses its LE byte-order logic), // then wrap in a constant array — O(1) allocation regardless of row count. - var value = new LazyDecimalArray(ctx.dtype(), 1, MemorySegment.ofArray(elemBytes), elemLen).getDecimal(0); - return new LazyConstantDecimalArray(ctx.dtype(), n, value, elemLen); + var value = new LazyDecimalArray(dtype, 1, MemorySegment.ofArray(elemBytes), elemLen).getDecimal(0); + return new LazyConstantDecimalArray(dtype, n, value, elemLen); } - private static Array decodeBool(DecodeContext ctx, ScalarValue scalar, long n) { + private static Array decodeBool(DType dtype, ScalarValue scalar, long n) { boolean value = scalar.bool_value() != null && scalar.bool_value(); - return new LazyConstantBoolArray(ctx.dtype(), n, value); + return new LazyConstantBoolArray(dtype, n, value); } - private static Array decodeString(DecodeContext ctx, ScalarValue scalar, long n) { + private static Array decodeString(DecodeContext ctx, ScalarValue scalar, DType dtype, long n) { byte[] strBytes = scalar.string_value() != null ? scalar.string_value().getBytes(StandardCharsets.UTF_8) : (scalar.bytes_value() != null ? scalar.bytes_value() : new byte[0]); @@ -131,7 +139,7 @@ private static Array decodeString(DecodeContext ctx, ScalarValue scalar, long n) offsetsSeg.setAtIndex(PTypeIO.LE_INT, i, (int) (i * strLen)); } - return new VarBinArray.OffsetMode(ctx.dtype(), n, bytesSeg.asReadOnly(), offsetsSeg.asReadOnly(), PType.I32); + return new VarBinArray.OffsetMode(dtype, n, bytesSeg.asReadOnly(), offsetsSeg.asReadOnly(), PType.I32); } private static long scalarToRawBits(ScalarValue scalar, PType ptype) { diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/VariantEncodingDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/VariantEncodingDecoder.java index 1942c2cb..54b6f27e 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/VariantEncodingDecoder.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/VariantEncodingDecoder.java @@ -28,7 +28,7 @@ public EncodingId encodingId() { @Override public boolean accepts(DType dtype) { - return false; + return dtype instanceof DType.Variant; } @Override diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoderTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoderTest.java index f3af7c6a..cb4c778a 100644 --- a/writer/src/test/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoderTest.java +++ b/writer/src/test/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoderTest.java @@ -141,6 +141,63 @@ void adjacentEqualValues_coalesceIntoOneRun() throws Exception { } } + @Nested + class RoundTrip { + + private static final io.github.dfa1.vortex.reader.ReadRegistry REGISTRY = + io.github.dfa1.vortex.reader.decode.TestRegistry.ofDecoders( + new io.github.dfa1.vortex.reader.decode.VariantEncodingDecoder(), + new io.github.dfa1.vortex.reader.decode.ConstantEncodingDecoder(), + new io.github.dfa1.vortex.reader.decode.ChunkedEncodingDecoder(), + new io.github.dfa1.vortex.reader.decode.PrimitiveEncodingDecoder()); + + private static io.github.dfa1.vortex.reader.decode.ArrayNode toArrayNode(EncodeNode node) { + io.github.dfa1.vortex.reader.decode.ArrayNode[] children = + new io.github.dfa1.vortex.reader.decode.ArrayNode[node.children().length]; + for (int i = 0; i < children.length; i++) { + children[i] = toArrayNode(node.children()[i]); + } + return io.github.dfa1.vortex.reader.decode.ArrayNode.of( + node.encodingId(), node.metadata(), children, node.bufferIndices()); + } + + private static io.github.dfa1.vortex.reader.array.VariantArray decode(EncodeResult result, long rows) { + MemorySegment[] bufs = result.buffers().toArray(MemorySegment[]::new); + var ctx = new io.github.dfa1.vortex.reader.decode.DecodeContext( + toArrayNode(result.rootNode()), VARIANT, rows, bufs, REGISTRY, java.lang.foreign.Arena.global()); + return (io.github.dfa1.vortex.reader.array.VariantArray) new io.github.dfa1.vortex.reader.decode.VariantEncodingDecoder().decode(ctx); + } + + @Test + void constantColumn_decodesToBroadcastInnerValues() { + // Given/When a constant column is encoded then decoded back + var result = SUT.encode(VARIANT, VariantData.constant(4, i32Scalar(7L)), EncodeTestHelper.testCtx()); + var variant = decode(result, 4); + + // Then core storage is the inner i32 value broadcast to every row + assertThat(variant.length()).isEqualTo(4); + assertThat(variant.shredded()).isNull(); + var core = (io.github.dfa1.vortex.reader.array.IntArray) variant.coreStorage(); + assertThat(core.dtype()).isEqualTo(new DType.Primitive(io.github.dfa1.vortex.core.PType.I32, false)); + for (long i = 0; i < 4; i++) { + assertThat(core.getInt(i)).isEqualTo(7); + } + } + + @Test + void varyingColumn_decodesPerRowValuesInOrder() { + // Given/When distinct per-row values are encoded (chunked) then decoded back + var data = new VariantData(List.of(i32Scalar(10L), i32Scalar(20L), i32Scalar(30L))); + var variant = decode(SUT.encode(VARIANT, data, EncodeTestHelper.testCtx()), 3); + + // Then the chunked core storage yields each row's inner value in order + var core = (io.github.dfa1.vortex.reader.array.IntArray) variant.coreStorage(); + assertThat(core.getInt(0)).isEqualTo(10); + assertThat(core.getInt(1)).isEqualTo(20); + assertThat(core.getInt(2)).isEqualTo(30); + } + } + @Nested class Errors { From 12d78fe1c158941dfbcf7f99204007a37917278d Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Thu, 18 Jun 2026 21:47:33 +0200 Subject: [PATCH 5/7] =?UTF-8?q?feat(variant):=20shredding=20=E2=80=94=20op?= =?UTF-8?q?tional=20typed=20shredded=20child=20(Layer=20C)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VariantData can now carry a row-aligned shredded typed column (shreddedData + shreddedDtype). When present, the encoder emits it as the container's second child (encoded via the matching primitive/varbin/bool encoder) and records its dtype in VariantMetadata.shredded_dtype; otherwise the container stays single-child as before. The existing VariantEncodingDecoder already reads the shredded child, so this round-trips Java-side (decode exposes VariantArray.shredded()) and through the Rust reference reader (JNI). NOTE: with the current scalar-only variant model there are no object paths, so a shredded child can only re-express the values as a typed column. Real path-based shredding of nested objects needs vortex.parquet.variant (deferred, ADR 0014). Co-Authored-By: Claude Opus 4.8 --- ...antJavaWritesRustReadsIntegrationTest.java | 22 ++++++++ .../vortex/writer/encode/VariantData.java | 50 ++++++++++++++--- .../writer/encode/VariantEncodingEncoder.java | 56 ++++++++++++++++++- .../encode/VariantEncodingEncoderTest.java | 53 ++++++++++++++++++ 4 files changed, 169 insertions(+), 12 deletions(-) diff --git a/integration/src/test/java/io/github/dfa1/vortex/integration/VariantJavaWritesRustReadsIntegrationTest.java b/integration/src/test/java/io/github/dfa1/vortex/integration/VariantJavaWritesRustReadsIntegrationTest.java index f5a60cce..11e70666 100644 --- a/integration/src/test/java/io/github/dfa1/vortex/integration/VariantJavaWritesRustReadsIntegrationTest.java +++ b/integration/src/test/java/io/github/dfa1/vortex/integration/VariantJavaWritesRustReadsIntegrationTest.java @@ -5,6 +5,7 @@ import dev.vortex.arrow.ArrowAllocation; import dev.vortex.jni.NativeLoader; import io.github.dfa1.vortex.core.DType; +import io.github.dfa1.vortex.core.PType; import io.github.dfa1.vortex.proto.Primitive; import io.github.dfa1.vortex.proto.Scalar; import io.github.dfa1.vortex.proto.ScalarValue; @@ -89,6 +90,27 @@ void javaWriter_jniReader_varyingVariantColumn(@TempDir Path tmp) throws IOExcep assertThat(ds.arrowSchema(ALLOCATOR).getFields()).extracting(f -> f.getName()).contains("v"); } + @Test + void javaWriter_jniReader_shreddedVariantColumn(@TempDir Path tmp) throws IOException { + // Given — a variant column with a row-aligned shredded i32 projection. The container + // gains a second (shredded) typed child plus shredded_dtype in its metadata. + Path file = tmp.resolve("java_variant_shredded.vtx"); + List values = List.of(i32Variant(10L), i32Variant(20L), i32Variant(30L)); + VariantData data = VariantData.shredded( + values, new int[]{10, 20, 30}, new DType.Primitive(PType.I32, false)); + + try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE); + var sut = VortexWriter.create(ch, VARIANT_SCHEMA, WriteOptions.defaults())) { + // When + sut.writeChunk(Map.of("v", data)); + } + + // Then — the Rust reader parses the shredded variant layout and agrees on row count + schema. + DataSource ds = DataSource.open(SESSION, file.toAbsolutePath().toUri().toString()); + assertThat(ds.rowCount().asOptional()).hasValue(values.size()); + assertThat(ds.arrowSchema(ALLOCATOR).getFields()).extracting(f -> f.getName()).contains("v"); + } + private static Scalar i32Variant(long value) { // Inner typed scalar carrying its own i32 dtype, wrapped as a variant value // (mirrors Rust Scalar::variant(Scalar::primitive(value))). diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantData.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantData.java index 50b9cdde..585cb7aa 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantData.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantData.java @@ -1,5 +1,6 @@ package io.github.dfa1.vortex.writer.encode; +import io.github.dfa1.vortex.core.DType; import io.github.dfa1.vortex.proto.Scalar; import java.util.Collections; @@ -7,22 +8,40 @@ /// Input data for encoding a `vortex.variant` column. /// -/// Holds one inner typed scalar per row, each wrapped as a variant value (mirroring -/// Rust `Scalar::variant(inner)`). The encoder coalesces adjacent equal values into -/// constant runs: an all-equal column becomes a single `vortex.constant` child, while -/// a column with varying values becomes a `vortex.chunked` of per-run constants. There -/// is no shredded child. +/// `values` holds one inner typed scalar per row, each wrapped as a variant value +/// (mirroring Rust `Scalar::variant(inner)`). The encoder coalesces adjacent equal +/// values into constant runs: an all-equal column becomes a single `vortex.constant` +/// child, while a varying column becomes a `vortex.chunked` of per-run constants. /// -/// @param values one inner scalar per row, in row order -public record VariantData(List values) { +/// Optionally, `shreddedData` carries a row-aligned typed column pulled out for fast +/// typed access; when present it is encoded as the container's `shredded` child and its +/// `shreddedDtype` is recorded in `VariantMetadata.shredded_dtype`. `shreddedData` uses +/// the same encoder-input shape as the matching encoding (e.g. `int[]` for I32, +/// `String[]` for Utf8). Both `shreddedData` and `shreddedDtype` must be set together or +/// both left `null`. +/// +/// @param values one inner scalar per row, in row order +/// @param shreddedData optional typed column data for the shredded child, or `null` +/// @param shreddedDtype dtype of `shreddedData`, or `null` +public record VariantData(List values, Object shreddedData, DType shreddedDtype) { - /// Validates and defensively copies the per-row values. Rejects empty input and - /// `null` elements. + /// Validates and defensively copies the per-row values; rejects empty input and a + /// half-specified shredded column. public VariantData { values = List.copyOf(values); if (values.isEmpty()) { throw new IllegalArgumentException("values must not be empty"); } + if ((shreddedData == null) != (shreddedDtype == null)) { + throw new IllegalArgumentException("shreddedData and shreddedDtype must be both set or both null"); + } + } + + /// Creates unshredded input from per-row scalars. + /// + /// @param values one inner scalar per row, in row order + public VariantData(List values) { + this(values, null, null); } /// Creates input for a constant variant column: `length` rows all holding `value`. @@ -40,6 +59,19 @@ public static VariantData constant(int length, Scalar value) { return new VariantData(Collections.nCopies(length, value)); } + /// Creates input with a row-aligned shredded typed column. + /// + /// @param values one inner scalar per row, in row order + /// @param shreddedData typed column data (encoder-input shape for `shreddedDtype`) + /// @param shreddedDtype dtype of `shreddedData` + /// @return variant input carrying a shredded child + public static VariantData shredded(List values, Object shreddedData, DType shreddedDtype) { + if (shreddedData == null || shreddedDtype == null) { + throw new IllegalArgumentException("shreddedData and shreddedDtype are required"); + } + return new VariantData(values, shreddedData, shreddedDtype); + } + /// Returns the number of rows in the column. /// /// @return row count diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoder.java b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoder.java index 374c7aa2..34f92a82 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoder.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoder.java @@ -65,12 +65,62 @@ public EncodeResult encode(DType dtype, Object data, EncodeContext ctx) { ? constantChild(runValues.get(0), buffers) : chunkedConstants(runValues, runLengths, ctx, buffers); - ByteBuffer containerMeta = ByteBuffer.wrap(new VariantMetadata(null).encode()); - EncodeNode root = new EncodeNode(EncodingId.VORTEX_VARIANT, containerMeta, - new EncodeNode[]{coreStorage}, new int[0]); + EncodeNode[] children; + io.github.dfa1.vortex.proto.DType shreddedProto = null; + if (variantData.shreddedData() != null) { + children = new EncodeNode[]{coreStorage, encodeShredded(variantData, ctx, buffers)}; + shreddedProto = toProtoDtype(variantData.shreddedDtype()); + } else { + children = new EncodeNode[]{coreStorage}; + } + + ByteBuffer containerMeta = ByteBuffer.wrap(new VariantMetadata(shreddedProto).encode()); + EncodeNode root = new EncodeNode(EncodingId.VORTEX_VARIANT, containerMeta, children, new int[0]); return new EncodeResult(root, List.copyOf(buffers), null, null); } + /// Encoders eligible to back the shredded child, tried in order by dtype. + private static final List SHREDDED_FALLBACK = List.of( + new PrimitiveEncodingEncoder(), new VarBinEncodingEncoder(), + new BoolEncodingEncoder(), new NullEncodingEncoder()); + + /// Encodes the shredded typed column as a child node, appending its buffers (remapped + /// to follow the core-storage buffers already in `buffers`). + private static EncodeNode encodeShredded(VariantData data, EncodeContext ctx, List buffers) { + EncodingEncoder enc = null; + for (EncodingEncoder e : SHREDDED_FALLBACK) { + if (e.accepts(data.shreddedDtype())) { + enc = e; + break; + } + } + if (enc == null) { + throw new VortexException(EncodingId.VORTEX_VARIANT, + "no encoder for shredded dtype: " + data.shreddedDtype()); + } + EncodeResult shredded = enc.encode(data.shreddedDtype(), data.shreddedData(), ctx); + EncodeNode child = EncodeNode.remapBufferIndices(shredded.rootNode(), buffers.size()); + buffers.addAll(shredded.buffers()); + return child; + } + + /// Converts a shreddable scalar dtype to its protobuf form for `VariantMetadata`. + private static io.github.dfa1.vortex.proto.DType toProtoDtype(DType dtype) { + return switch (dtype) { + case DType.Primitive p -> io.github.dfa1.vortex.proto.DType.ofPrimitive( + new io.github.dfa1.vortex.proto.Primitive( + io.github.dfa1.vortex.proto.PType.fromValue(p.ptype().ordinal()), p.nullable())); + case DType.Bool b -> io.github.dfa1.vortex.proto.DType.ofBool( + new io.github.dfa1.vortex.proto.Bool(b.nullable())); + case DType.Utf8 u -> io.github.dfa1.vortex.proto.DType.ofUtf8( + new io.github.dfa1.vortex.proto.Utf8(u.nullable())); + case DType.Binary bin -> io.github.dfa1.vortex.proto.DType.ofBinary( + new io.github.dfa1.vortex.proto.Binary(bin.nullable())); + default -> throw new VortexException(EncodingId.VORTEX_VARIANT, + "shredded dtype not supported: " + dtype); + }; + } + /// Groups adjacent equal scalars into runs, appending each run's value and length. private static void coalesceRuns(List values, List runValues, List runLengths) { Scalar prev = null; diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoderTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoderTest.java index cb4c778a..34748826 100644 --- a/writer/src/test/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoderTest.java +++ b/writer/src/test/java/io/github/dfa1/vortex/writer/encode/VariantEncodingEncoderTest.java @@ -196,6 +196,59 @@ void varyingColumn_decodesPerRowValuesInOrder() { assertThat(core.getInt(1)).isEqualTo(20); assertThat(core.getInt(2)).isEqualTo(30); } + + @Test + void shreddedColumn_decodesShreddedTypedChild() { + // Given/When a column with a shredded i32 projection is encoded then decoded + DType i32 = new DType.Primitive(io.github.dfa1.vortex.core.PType.I32, false); + var data = VariantData.shredded( + List.of(i32Scalar(10L), i32Scalar(20L), i32Scalar(30L)), new int[]{10, 20, 30}, i32); + var variant = decode(SUT.encode(VARIANT, data, EncodeTestHelper.testCtx()), 3); + + // Then the shredded child decodes as the typed column + assertThat(variant.shredded()).isNotNull(); + var shredded = (io.github.dfa1.vortex.reader.array.IntArray) variant.shredded(); + assertThat(shredded.dtype()).isEqualTo(i32); + assertThat(shredded.getInt(0)).isEqualTo(10); + assertThat(shredded.getInt(1)).isEqualTo(20); + assertThat(shredded.getInt(2)).isEqualTo(30); + } + } + + @Nested + class Shredded { + + @Test + void emitsSecondChildAndRecordsShreddedDtype() throws Exception { + // Given a column with a shredded i32 projection + DType i32 = new DType.Primitive(io.github.dfa1.vortex.core.PType.I32, false); + var data = VariantData.shredded( + List.of(i32Scalar(10L), i32Scalar(20L), i32Scalar(30L)), new int[]{10, 20, 30}, i32); + + // When + EncodeResult result = SUT.encode(VARIANT, data, EncodeTestHelper.testCtx()); + + // Then the container has a second (shredded) child encoded as a primitive array... + EncodeNode root = result.rootNode(); + assertThat(root.children()).hasSize(2); + assertThat(root.children()[1].encodingId()).isEqualTo(EncodingId.VORTEX_PRIMITIVE); + + // ...and the metadata records shredded_dtype = i32. + MemorySegment meta = MemorySegment.ofBuffer(root.metadata().duplicate()); + VariantMetadata vm = VariantMetadata.decode(meta, 0, meta.byteSize()); + assertThat(vm.shredded_dtype()).isNotNull(); + assertThat(vm.shredded_dtype().primitive()).isNotNull(); + assertThat(vm.shredded_dtype().primitive().type()).isEqualTo(io.github.dfa1.vortex.proto.PType.I32); + } + + @Test + void halfSpecifiedShredded_throws() { + assertThatThrownBy(() -> VariantData.shredded(List.of(i32Scalar(1L)), null, null)) + .isInstanceOf(IllegalArgumentException.class); + assertThatThrownBy(() -> new VariantData(List.of(i32Scalar(1L)), new int[]{1}, null)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("both set or both null"); + } } @Nested From d17a89095863bf7faa18369beae121ddfb7372f2 Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Thu, 18 Jun 2026 21:52:50 +0200 Subject: [PATCH 6/7] docs(variant): mark vortex.variant encode + Java-side read as shipped MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the compatibility matrix (encode now ✅; reframe the wire-format gap as "arbitrary nested objects need parquet.variant"), the decode-shape table, the EncodingId doc comment, and add a CHANGELOG entry under [Unreleased]. Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 5 +++++ .../java/io/github/dfa1/vortex/encoding/EncodingId.java | 2 +- docs/compatibility.md | 6 +++--- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5bc1a251..e00a1563 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Writer: `vortex.variant` encoder. Encodes a variant column as the canonical `vortex.variant` container over `core_storage` — an all-equal column becomes a single `vortex.constant`, a row-varying column a `vortex.chunked` of per-run constants — with an optional row-aligned typed `shredded` child recorded in `VariantMetadata.shredded_dtype`. Input is `VariantData(List)` with `.constant(n, v)` / `.shredded(...)` factories. Java↔Rust (JNI) round-trip verified for constant, row-varying, and shredded columns. Scalar values only — arbitrary nested objects need `vortex.parquet.variant` (deferred, [ADR 0014](docs/adr/0014-variant-encoding-strategy.md)). +- Reader: variant columns now decode Java-side. `ConstantEncodingDecoder` and `ChunkedEncodingDecoder` handle `DType.Variant` (materialising the inner-typed array); `VariantEncodingDecoder` wraps the result as `VariantArray`, exposing `coreStorage()` and `shredded()`. + ## [0.7.3] — 2026-06-17 Parquet ZSTD support, `vortex.patched` encoder, constant-encoding selection fix, Windows TUI raw-mode fix. diff --git a/core/src/main/java/io/github/dfa1/vortex/encoding/EncodingId.java b/core/src/main/java/io/github/dfa1/vortex/encoding/EncodingId.java index 207e7c18..9264042d 100644 --- a/core/src/main/java/io/github/dfa1/vortex/encoding/EncodingId.java +++ b/core/src/main/java/io/github/dfa1/vortex/encoding/EncodingId.java @@ -77,7 +77,7 @@ public enum EncodingId { VORTEX_MASKED("vortex.masked"), /// Patched encoding (not yet implemented; registered to prevent parse errors). VORTEX_PATCHED("vortex.patched"), - /// Variant encoding (not yet implemented; registered to prevent parse errors). + /// Variant logical encoding: canonical container over `core_storage` plus an optional shredded child. VORTEX_VARIANT("vortex.variant"), ; diff --git a/docs/compatibility.md b/docs/compatibility.md index c215252f..12eb97d9 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -33,7 +33,7 @@ resolves only the standalone decoders in `reader`; no encoder class is loaded. |------|------------|-------------| | `DType::Union` (`fbs.DType.Type.Union = 12`) | Rust 0.71.0 | ❌ Decode throws `VortexException("unsupported DType typeType=12")`. No `DType.Union` variant in Java's sealed type. | | `vortex.onpair` experimental string encoding | Rust 0.74.0 | ❌ Not registered. Files using it fail to decode unless `Registry.allowUnknown()` is enabled. | -| `vortex.variant` write path | Rust 0.73.0 (`Allow writing Variant to files`, #7945) | ❌ Java decode works; Java encode throws `"encode not yet implemented"`. Java→Rust round-trip not possible for Variant columns. | +| `vortex.variant` arbitrary nested objects | Rust (`vortex.parquet.variant`) | ⚠️ Java encodes/decodes variant columns of **typed scalar** values (constant / chunked-of-constants core, optional shredded child); Java↔Rust round-trip verified. Arbitrary nested JSON objects and real path-based shredding need the `vortex.parquet.variant` physical encoding — deferred ([ADR 0014](adr/0014-variant-encoding-strategy.md)). | | Arrow extension array import affecting Variant shape | Rust 0.74.0 (#8125) | Untested. Re-run integration fixtures against v0.74.0 once published. | ## Encodings @@ -72,7 +72,7 @@ resolves only the standalone decoders in `reader`; no encoder class is loaded. | `fastlanes.for` | `FrameOfReferenceEncodingDecoder`| `FrameOfReferenceEncodingEncoder`| ✅ | ✅ | Integer PTypes | | `fastlanes.rle` | `RleEncodingDecoder` | `RleEncodingEncoder` | ✅ | ✅ | Chunk-based RLE | | `vortex.patched` | `PatchedEncodingDecoder` | `PatchedEncodingEncoder` | ✅ | ❌ | Primitive PTypes; encode not yet implemented | -| `vortex.variant` | `VariantEncodingDecoder` | `VariantEncodingEncoder` | ✅ | ❌ | Decode (incl. shredded child); encode not yet implemented (Rust 0.73+) | +| `vortex.variant` | `VariantEncodingDecoder` | `VariantEncodingEncoder` | ✅ | ✅ | Canonical container; constant / chunked-of-constants core + optional shredded child. Typed-scalar values only — nested objects need `parquet.variant` (ADR 0014) | | `vortex.onpair` | _none_ | _none_ | ❌ | ❌ | Experimental in Rust 0.74.0; not yet ported | ### Decode shape @@ -124,7 +124,7 @@ decoder falls into one of three shapes: | `fastlanes.for` | Lazy | Lazy | `LazyForXxxArray` (I8/U8/I16/U16/I32/U32/I64/U64), ADR 0010 + 0013 | | `fastlanes.rle` | Lazy | Lazy | `LazyRleXxxArray`; validity → `OffsetBoolArray`; empty → `LazyConstantXxxArray`, ADR 0013 | | `vortex.patched` | Materialized | Materialized | inner is full base + chunked patches (1024-elem blocks, lane-window-sorted); per-row access requires 2 laneOffsets reads + binary search inside the chunk window, so eager scatter wins for full scans | -| `vortex.variant` | Materialized | TBD | shredded child reassembly | +| `vortex.variant` | Lazy | Lazy | container wraps constant/chunked core (inner-typed) + optional shredded child | | `vortex.onpair` | n/a | n/a | not ported | Decompression-style encodings (Bitpacked / Pco / Zstd / Fsst / Delta) stay Materialized by design From f2e34dbb80cf3077a238b35e3b855f2b10432932 Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Thu, 18 Jun 2026 21:55:52 +0200 Subject: [PATCH 7/7] docs(adr): mark 0014 Implemented Variant encode + Java-side read + shredding are realized and verified (unit + Rust JNI). parquet.variant remains deferred by design. Co-Authored-By: Claude Opus 4.8 --- docs/adr/0014-variant-encoding-strategy.md | 2 +- docs/adr/ADR.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/adr/0014-variant-encoding-strategy.md b/docs/adr/0014-variant-encoding-strategy.md index 6102e0d5..a07d46ae 100644 --- a/docs/adr/0014-variant-encoding-strategy.md +++ b/docs/adr/0014-variant-encoding-strategy.md @@ -1,6 +1,6 @@ # ADR 0014: Variant encoding strategy — chunked constants now, parquet.variant later -- **Status:** Accepted +- **Status:** Implemented - **Date:** 2026-06-18 - **Deciders:** project maintainer - **Supersedes:** — diff --git a/docs/adr/ADR.md b/docs/adr/ADR.md index d356f702..315c5403 100644 --- a/docs/adr/ADR.md +++ b/docs/adr/ADR.md @@ -26,4 +26,4 @@ Each ADR is a Markdown file named `NNNN-short-title.md`. Use `template.md` as th | 0011 | Writer zero-copy MemorySegment overload | Deferred | | 0012 | Zero-copy layout decoding: lazy Chunked/Dict | Implemented | | 0013 | Compute primitives: masks, kernels, no-materialise | Proposed | -| 0014 | Variant encoding: chunked constants now, parquet.variant later | Accepted | +| 0014 | Variant encoding: chunked constants now, parquet.variant later | Implemented |