From ef32f00f42bb24b10f12f7ee5fff13ee2675fb62 Mon Sep 17 00:00:00 2001
From: Davide Angelocola
Date: Fri, 19 Jun 2026 18:54:17 +0200
Subject: [PATCH 1/2] test(reader): cover DictEncodingDecoder decode paths
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The encoding-level vortex.dict decoder had almost no unit coverage (3 of
223 lines) — it only ran transitively when a parent decoder called
decodeChild on a dict segment. Add a focused test exercising:

- proto primitive layout: every code type (U8/U16/U32) × value width
  (I8/I16/I32/I64), both the fast and broadcast (single-value) expand paths
- legacy primitive layout: every code type (U8/U16/U32) with I64 values
- F32/F64 round-trips (typedArray float branches)
- UTF8 dict, both legacy (3-buffer) and proto (varbin child) layouts
- error paths: missing metadata (primitive + both utf8 variants),
  malformed proto metadata, unexpected/unsupported ptypes

Covers 195/223 lines; the rest are unreachable defensive branches (the
expand helpers' default copy branch for element widths other than 1/2/4/8 —
no PType has such a width — and the dead readCode arms).

Co-Authored-By: Claude Opus 4.8 --- .../decode/DictEncodingDecoderTest.java | 467 ++++++++++++++++++ 1 file changed, 467 insertions(+) create mode 100644 reader/src/test/java/io/github/dfa1/vortex/reader/decode/DictEncodingDecoderTest.java diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/decode/DictEncodingDecoderTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/decode/DictEncodingDecoderTest.java new file mode 100644 index 00000000..101e0977 --- /dev/null +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/decode/DictEncodingDecoderTest.java @@ -0,0 +1,467 @@ +package io.github.dfa1.vortex.reader.decode; + +import io.github.dfa1.vortex.core.DType; +import io.github.dfa1.vortex.core.PType; +import io.github.dfa1.vortex.core.VortexException; +import io.github.dfa1.vortex.encoding.EncodingId; +import io.github.dfa1.vortex.proto.DictMetadata; +import io.github.dfa1.vortex.proto.VarBinMetadata; +import io.github.dfa1.vortex.reader.ReadRegistry; +import io.github.dfa1.vortex.reader.array.Array; +import io.github.dfa1.vortex.reader.array.ByteArray; +import io.github.dfa1.vortex.reader.array.DoubleArray; +import io.github.dfa1.vortex.reader.array.FloatArray; +import io.github.dfa1.vortex.reader.array.IntArray; +import io.github.dfa1.vortex.reader.array.LongArray; +import io.github.dfa1.vortex.reader.array.ShortArray; +import io.github.dfa1.vortex.reader.array.VarBinArray; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.lang.foreign.Arena; +import java.lang.foreign.MemorySegment; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.charset.StandardCharsets; +import java.util.stream.Stream; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +class 
DictEncodingDecoderTest { + + private static final DictEncodingDecoder SUT = new DictEncodingDecoder(); + private static final ReadRegistry REGISTRY = TestRegistry.ofDecoders( + SUT, new PrimitiveEncodingDecoder(), new VarBinEncodingDecoder()); + + @Test + void acceptsPrimitiveAndUtf8_rejectsOthers() { + // Given / When / Then + assertThat(SUT.accepts(new DType.Primitive(PType.I32, false))).isTrue(); + assertThat(SUT.accepts(new DType.Utf8(false))).isTrue(); + assertThat(SUT.accepts(new DType.Bool(false))).isFalse(); + } + + @Nested + class PrimitiveProto { + + @ParameterizedTest(name = "codes={0} values={1}") + @MethodSource("io.github.dfa1.vortex.reader.decode.DictEncodingDecoderTest#codeAndValueTypes") + void fastPath_indexesDictionary(PType codePType, PType valPType) { + // Given — 4 codes pointing at 4 distinct dict values (codesCap == rowCount, valuesCap > 1) + long[] dict = {10, 20, 30, 40}; + long[] codes = {0, 1, 2, 3}; + DType dtype = new DType.Primitive(valPType, false); + + // When + Array result = decodeProto(dtype, codePType, codes, dict); + + // Then + assertLongValues(result, valPType, new long[]{10, 20, 30, 40}); + } + + @ParameterizedTest(name = "codes={0} values={1}") + @MethodSource("io.github.dfa1.vortex.reader.decode.DictEncodingDecoderTest#codeAndValueTypes") + void slowPath_broadcastsSingleValue(PType codePType, PType valPType) { + // Given — a single dict value (valuesCap == 1) forces the broadcast/modulo path + long[] dict = {77}; + long[] codes = {0, 0, 0, 0}; + DType dtype = new DType.Primitive(valPType, false); + + // When + Array result = decodeProto(dtype, codePType, codes, dict); + + // Then — every row resolves to the lone value + assertLongValues(result, valPType, new long[]{77, 77, 77, 77}); + } + + @Test + void f64Values_roundTripThroughDoubleArray() { + // Given — covers typedArray's F64 branch and the 8-byte expand path + MemorySegment codes = u8Codes(0, 1, 0); + MemorySegment values = doubleSegment(1.5, -2.25); + DType dtype = 
new DType.Primitive(PType.F64, false); + + // When + DoubleArray result = (DoubleArray) decodeProtoSegments(dtype, PType.U8, codes, values, 2, 3); + + // Then + assertThat(result.getDouble(0)).isEqualTo(1.5); + assertThat(result.getDouble(1)).isEqualTo(-2.25); + assertThat(result.getDouble(2)).isEqualTo(1.5); + } + + @Test + void f32Values_roundTripThroughFloatArray() { + // Given — covers typedArray's F32 branch and the 4-byte expand path + MemorySegment codes = u8Codes(1, 0); + MemorySegment values = floatSegment(3.5f, 4.75f); + DType dtype = new DType.Primitive(PType.F32, false); + + // When + FloatArray result = (FloatArray) decodeProtoSegments(dtype, PType.U8, codes, values, 2, 2); + + // Then + assertThat(result.getFloat(0)).isEqualTo(4.75f); + assertThat(result.getFloat(1)).isEqualTo(3.5f); + } + + @Test + void unexpectedCodeType_throws() { + // Given — proto declares an I64 code type, which decodeRustProto rejects + MemorySegment codes = i64Segment(0, 1); + MemorySegment values = i64Segment(5, 6); + DType dtype = new DType.Primitive(PType.I64, false); + + // When / Then + assertThatThrownBy(() -> decodeProtoSegments(dtype, PType.I64, codes, values, 2, 2)) + .isInstanceOf(VortexException.class) + .hasMessageContaining("unexpected code type"); + } + + @Test + void unsupportedValuePType_throws() { + // Given — F16 expands fine (2 bytes) but typedArray has no F16 mapping + MemorySegment codes = u8Codes(0, 1); + MemorySegment values = shortSegment((short) 1, (short) 2); + DType dtype = new DType.Primitive(PType.F16, false); + + // When / Then + assertThatThrownBy(() -> decodeProtoSegments(dtype, PType.U8, codes, values, 2, 2)) + .isInstanceOf(VortexException.class) + .hasMessageContaining("unsupported ptype"); + } + + @Test + void missingMetadata_throws() { + // Given — primitive dict with no metadata + ArrayNode node = ArrayNode.of(EncodingId.VORTEX_DICT, null, new ArrayNode[0], new int[]{}); + DecodeContext ctx = new DecodeContext(node, new 
DType.Primitive(PType.I32, false), + 1, new MemorySegment[0], REGISTRY, Arena.ofAuto()); + + // When / Then + assertThatThrownBy(() -> SUT.decode(ctx)) + .isInstanceOf(VortexException.class) + .hasMessageContaining("missing metadata"); + } + + @Test + void malformedProtoMetadata_throws() { + // Given — >1 byte (routes to proto path) but a truncated varint that proto decode rejects + ByteBuffer meta = ByteBuffer.wrap(new byte[]{0x08, (byte) 0x80}); + ArrayNode node = ArrayNode.of(EncodingId.VORTEX_DICT, meta, + new ArrayNode[]{primitiveNode(0), primitiveNode(1)}, new int[]{}); + DecodeContext ctx = new DecodeContext(node, new DType.Primitive(PType.I32, false), + 1, new MemorySegment[]{u8Codes(0), i64Segment(0)}, REGISTRY, Arena.ofAuto()); + + // When / Then + assertThatThrownBy(() -> SUT.decode(ctx)) + .isInstanceOf(VortexException.class) + .hasMessageContaining("invalid proto metadata"); + } + } + + @Nested + class PrimitiveLegacy { + + @ParameterizedTest(name = "codes={0}") + @org.junit.jupiter.params.provider.EnumSource(value = PType.class, names = {"U8", "U16", "U32"}) + void singleByteMetadata_decodesViaLegacyPath(PType codePType) { + // Given — legacy layout: 1-byte metadata (code ptype), child[0]=values, child[1]=codes + long[] dict = {100, 200, 300}; + MemorySegment values = i64Segment(dict); + MemorySegment codes = codeSegment(codePType, new long[]{2, 0, 1, 2}); + + // When + Array result = decodeLegacy(new DType.Primitive(PType.I64, false), codePType, values, codes, 4); + + // Then + assertLongValues(result, PType.I64, new long[]{300, 100, 200, 300}); + } + + @Test + void nonStandardCodeType_hitsReadCodeAndThrows() { + // Given — code ptype I8 falls into the scalar default branch, where readCode rejects it + MemorySegment values = i64Segment(1, 2); + MemorySegment codes = MemorySegment.ofArray(new byte[]{0, 0}); + + // When / Then + assertThatThrownBy(() -> + decodeLegacy(new DType.Primitive(PType.I64, false), PType.I8, values, codes, 2)) + 
.isInstanceOf(VortexException.class) + .hasMessageContaining("unexpected code type"); + } + } + + @Nested + class Utf8 { + + @Test + void legacyLayout_decodesStringsByCode() { + // Given — no children, 3 buffers (dict bytes, I64 offsets, codes), 1-byte metadata + byte[] dictBytes = "abcde".getBytes(StandardCharsets.UTF_8); // "ab","cde" + MemorySegment bytes = MemorySegment.ofArray(dictBytes); + MemorySegment offsets = i64Segment(0, 2, 5); + MemorySegment codes = u8Codes(1, 0, 1); + + ByteBuffer meta = ByteBuffer.wrap(new byte[]{(byte) PType.U8.ordinal()}); + ArrayNode node = ArrayNode.of(EncodingId.VORTEX_DICT, meta, new ArrayNode[0], new int[]{0, 1, 2}); + DecodeContext ctx = new DecodeContext(node, new DType.Utf8(false), 3, + new MemorySegment[]{bytes, offsets, codes}, REGISTRY, Arena.ofAuto()); + + // When + VarBinArray result = (VarBinArray) SUT.decode(ctx); + + // Then + assertThat(result.getString(0)).isEqualTo("cde"); + assertThat(result.getString(1)).isEqualTo("ab"); + assertThat(result.getString(2)).isEqualTo("cde"); + } + + @Test + void protoLayout_decodesStringsByCode() { + // Given — children present: child[0]=codes, child[1]=varbin dictionary values + byte[] dictBytes = "fizzbuzz".getBytes(StandardCharsets.UTF_8); // "fizz","buzz" + MemorySegment bytes = MemorySegment.ofArray(dictBytes); + MemorySegment offsets = i64Segment(0, 4, 8); + MemorySegment codes = u8Codes(0, 1, 0); + MemorySegment[] segs = {codes, bytes, offsets}; + + ByteBuffer dictMeta = ByteBuffer.wrap( + new DictMetadata(2, protoPType(PType.U8), null, null).encode()); + ByteBuffer varBinMeta = ByteBuffer.wrap(new VarBinMetadata(protoPType(PType.I64)).encode()); + + ArrayNode codesNode = primitiveNode(0); + ArrayNode offsetsNode = primitiveNode(2); + ArrayNode valuesNode = ArrayNode.of(EncodingId.VORTEX_VARBIN, varBinMeta, + new ArrayNode[]{offsetsNode}, new int[]{1}); + ArrayNode dictNode = ArrayNode.of(EncodingId.VORTEX_DICT, dictMeta, + new ArrayNode[]{codesNode, valuesNode}, new 
int[]{}); + + DecodeContext ctx = new DecodeContext(dictNode, new DType.Utf8(false), 3, + segs, REGISTRY, Arena.ofAuto()); + + // When + VarBinArray result = (VarBinArray) SUT.decode(ctx); + + // Then + assertThat(result.getString(0)).isEqualTo("fizz"); + assertThat(result.getString(1)).isEqualTo("buzz"); + assertThat(result.getString(2)).isEqualTo("fizz"); + } + + @Test + void legacyLayout_missingMetadata_throws() { + // Given — no children and no metadata + ArrayNode node = ArrayNode.of(EncodingId.VORTEX_DICT, null, new ArrayNode[0], new int[]{}); + DecodeContext ctx = new DecodeContext(node, new DType.Utf8(false), 0, + new MemorySegment[0], REGISTRY, Arena.ofAuto()); + + // When / Then + assertThatThrownBy(() -> SUT.decode(ctx)) + .isInstanceOf(VortexException.class) + .hasMessageContaining("legacy utf8 dict"); + } + + @Test + void protoLayout_malformedMetadata_throws() { + // Given — children present, metadata is an invalid (truncated varint) proto blob + ByteBuffer meta = ByteBuffer.wrap(new byte[]{0x08, (byte) 0x80}); + ArrayNode child = primitiveNode(0); + ArrayNode node = ArrayNode.of(EncodingId.VORTEX_DICT, meta, + new ArrayNode[]{child, child}, new int[]{}); + DecodeContext ctx = new DecodeContext(node, new DType.Utf8(false), 1, + new MemorySegment[]{u8Codes(0)}, REGISTRY, Arena.ofAuto()); + + // When / Then + assertThatThrownBy(() -> SUT.decode(ctx)) + .isInstanceOf(VortexException.class) + .hasMessageContaining("invalid utf8 dict proto metadata"); + } + + @Test + void protoLayout_missingMetadata_throws() { + // Given — children present but metadata absent + ArrayNode child = primitiveNode(0); + ArrayNode node = ArrayNode.of(EncodingId.VORTEX_DICT, null, new ArrayNode[]{child, child}, new int[]{}); + DecodeContext ctx = new DecodeContext(node, new DType.Utf8(false), 1, + new MemorySegment[]{u8Codes(0)}, REGISTRY, Arena.ofAuto()); + + // When / Then + assertThatThrownBy(() -> SUT.decode(ctx)) + .isInstanceOf(VortexException.class) + 
.hasMessageContaining("missing metadata for utf8 dict"); + } + } + + // ── parameter sources ────────────────────────────────────────────────────── + + static Stream codeAndValueTypes() { + PType[] codeTypes = {PType.U8, PType.U16, PType.U32}; + PType[] valueTypes = {PType.I8, PType.I16, PType.I32, PType.I64}; + Stream.Builder b = Stream.builder(); + for (PType code : codeTypes) { + for (PType val : valueTypes) { + b.add(Arguments.of(code, val)); + } + } + return b.build(); + } + + // ── decode harnesses ─────────────────────────────────────────────────────── + + private static Array decodeProto(DType dtype, PType codePType, long[] codes, long[] dict) { + MemorySegment codesSeg = codeSegment(codePType, codes); + MemorySegment valuesSeg = valueSegment(((DType.Primitive) dtype).ptype(), dict); + return decodeProtoSegments(dtype, codePType, codesSeg, valuesSeg, dict.length, codes.length); + } + + private static Array decodeProtoSegments(DType dtype, PType codePType, MemorySegment codes, + MemorySegment values, int valuesLen, int rowCount) { + ByteBuffer meta = ByteBuffer.wrap(new DictMetadata(valuesLen, protoPType(codePType), null, null).encode()); + MemorySegment[] segs = {codes, values}; + ArrayNode dictNode = ArrayNode.of(EncodingId.VORTEX_DICT, meta, + new ArrayNode[]{primitiveNode(0), primitiveNode(1)}, new int[]{}); + DecodeContext ctx = new DecodeContext(dictNode, dtype, rowCount, segs, REGISTRY, Arena.ofAuto()); + return SUT.decode(ctx); + } + + private static Array decodeLegacy(DType dtype, PType codePType, MemorySegment values, + MemorySegment codes, int rowCount) { + ByteBuffer meta = ByteBuffer.wrap(new byte[]{(byte) codePType.ordinal()}); + MemorySegment[] segs = {values, codes}; + ArrayNode dictNode = ArrayNode.of(EncodingId.VORTEX_DICT, meta, + new ArrayNode[]{primitiveNode(0), primitiveNode(1)}, new int[]{}); + DecodeContext ctx = new DecodeContext(dictNode, dtype, rowCount, segs, REGISTRY, Arena.ofAuto()); + return SUT.decode(ctx); + } + + private 
static ArrayNode primitiveNode(int bufferIndex) { + return ArrayNode.of(EncodingId.VORTEX_PRIMITIVE, null, new ArrayNode[0], new int[]{bufferIndex}); + } + + private static io.github.dfa1.vortex.proto.PType protoPType(PType core) { + return io.github.dfa1.vortex.proto.PType.valueOf(core.name()); + } + + // ── segment builders (little-endian) ─────────────────────────────────────── + + private static MemorySegment codeSegment(PType codePType, long[] codes) { + return switch (codePType) { + case U8 -> { + byte[] a = new byte[codes.length]; + for (int i = 0; i < codes.length; i++) { + a[i] = (byte) codes[i]; + } + yield MemorySegment.ofArray(a); + } + case U16 -> { + short[] s = new short[codes.length]; + for (int i = 0; i < codes.length; i++) { + s[i] = (short) codes[i]; + } + yield shortSegment(s); + } + case U32 -> { + int[] in = new int[codes.length]; + for (int i = 0; i < codes.length; i++) { + in[i] = (int) codes[i]; + } + yield intSegment(in); + } + default -> throw new IllegalArgumentException("unsupported code ptype: " + codePType); + }; + } + + private static MemorySegment valueSegment(PType valPType, long[] values) { + return switch (valPType) { + case I8, U8 -> { + byte[] a = new byte[values.length]; + for (int i = 0; i < values.length; i++) { + a[i] = (byte) values[i]; + } + yield MemorySegment.ofArray(a); + } + case I16, U16 -> { + short[] s = new short[values.length]; + for (int i = 0; i < values.length; i++) { + s[i] = (short) values[i]; + } + yield shortSegment(s); + } + case I32, U32 -> { + int[] in = new int[values.length]; + for (int i = 0; i < values.length; i++) { + in[i] = (int) values[i]; + } + yield intSegment(in); + } + case I64, U64 -> i64Segment(values); + default -> throw new IllegalArgumentException("unsupported value ptype: " + valPType); + }; + } + + private static MemorySegment u8Codes(int... 
codes) { + byte[] a = new byte[codes.length]; + for (int i = 0; i < codes.length; i++) { + a[i] = (byte) codes[i]; + } + return MemorySegment.ofArray(a); + } + + private static MemorySegment shortSegment(short... values) { + ByteBuffer bb = ByteBuffer.allocate(values.length * 2).order(ByteOrder.LITTLE_ENDIAN); + for (short v : values) { + bb.putShort(v); + } + return MemorySegment.ofArray(bb.array()); + } + + private static MemorySegment intSegment(int... values) { + ByteBuffer bb = ByteBuffer.allocate(values.length * 4).order(ByteOrder.LITTLE_ENDIAN); + for (int v : values) { + bb.putInt(v); + } + return MemorySegment.ofArray(bb.array()); + } + + private static MemorySegment i64Segment(long... values) { + ByteBuffer bb = ByteBuffer.allocate(values.length * 8).order(ByteOrder.LITTLE_ENDIAN); + for (long v : values) { + bb.putLong(v); + } + return MemorySegment.ofArray(bb.array()); + } + + private static MemorySegment doubleSegment(double... values) { + ByteBuffer bb = ByteBuffer.allocate(values.length * 8).order(ByteOrder.LITTLE_ENDIAN); + for (double v : values) { + bb.putDouble(v); + } + return MemorySegment.ofArray(bb.array()); + } + + private static MemorySegment floatSegment(float... 
values) { + ByteBuffer bb = ByteBuffer.allocate(values.length * 4).order(ByteOrder.LITTLE_ENDIAN); + for (float v : values) { + bb.putFloat(v); + } + return MemorySegment.ofArray(bb.array()); + } + + private static void assertLongValues(Array array, PType valPType, long[] expected) { + for (int i = 0; i < expected.length; i++) { + long actual = switch (valPType) { + case I8, U8 -> ((ByteArray) array).getByte(i); + case I16, U16 -> ((ShortArray) array).getShort(i); + case I32, U32 -> ((IntArray) array).getInt(i); + case I64, U64 -> ((LongArray) array).getLong(i); + default -> throw new IllegalArgumentException("unsupported: " + valPType); + }; + assertThat(actual).as("index %d", i).isEqualTo(expected[i]); + } + } +} From 3c6c48ad3d8625aeb18891a861613e6e5c8ca53d Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Fri, 19 Jun 2026 19:18:01 +0200 Subject: [PATCH 2/2] test(reader): reach 100% line+branch coverage of DictEncodingDecoder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to the initial coverage pass: - Drop the dead readCode helper and collapse decodeLegacyJava's default branch to a direct throw — it only ever ran for non-U8/U16/U32 codes, where readCode threw anyway. Removes ~13 lines of unreachable code. - Make expandU8/U16/U32 package-private and unit-test their generic copy fallback (element width != 1/2/4/8) plus the codes-broadcast branch directly. - Cover the empty-but-non-null metadata guard on all three decode entry points. DictEncodingDecoder now at 100% lines and 100% branches. 
Co-Authored-By: Claude Opus 4.8 --- .../reader/decode/DictEncodingDecoder.java | 24 +--- .../decode/DictEncodingDecoderTest.java | 126 +++++++++++++++++- 2 files changed, 128 insertions(+), 22 deletions(-) diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/DictEncodingDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/DictEncodingDecoder.java index 9d416feb..78710467 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/DictEncodingDecoder.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/DictEncodingDecoder.java @@ -91,12 +91,7 @@ private static Array decodeLegacyJava(DecodeContext ctx, byte codeTypeByte) { case U8 -> expandU8(codesBuf, valuesBuf, out, rowCount, elemSize); case U16 -> expandU16(codesBuf, valuesBuf, out, rowCount, elemSize); case U32 -> expandU32(codesBuf, valuesBuf, out, rowCount, elemSize); - default -> { - for (long i = 0; i < rowCount; i++) { - long code = readCode(codesBuf, codePType, i); - MemorySegment.copy(valuesBuf, code * elemSize, out, i * elemSize, elemSize); - } - } + default -> throw new VortexException(EncodingId.VORTEX_DICT, "unexpected code type: " + codePType); } return typedArray(ctx.dtype(), valPType, rowCount, out.asReadOnly()); } @@ -166,18 +161,7 @@ private static Array decodeUtf8DictProto(DecodeContext ctx, ByteBuffer metaBuf) codesBuf, codePType); } - private static long readCode(MemorySegment buf, PType codePType, long i) { - long cap = SegmentBroadcast.capacity(buf, codePType.byteSize()); - long idx = i % cap; - return switch (codePType) { - case U8 -> Byte.toUnsignedLong(buf.get(ValueLayout.JAVA_BYTE, idx)); - case U16 -> Short.toUnsignedLong(buf.get(PTypeIO.LE_SHORT, idx * 2)); - case U32 -> Integer.toUnsignedLong(buf.get(PTypeIO.LE_INT, idx * 4)); - default -> throw new VortexException(EncodingId.VORTEX_DICT, "unexpected code type: " + codePType); - }; - } - - private static void expandU8(MemorySegment codes, MemorySegment values, MemorySegment 
out, long rowCount, int elemSize) { + static void expandU8(MemorySegment codes, MemorySegment values, MemorySegment out, long rowCount, int elemSize) { long codesCap = SegmentBroadcast.capacity(codes, 1); long valuesCap = SegmentBroadcast.capacity(values, elemSize); boolean fast = codesCap >= rowCount && valuesCap > 1; @@ -250,7 +234,7 @@ private static void expandU8(MemorySegment codes, MemorySegment values, MemorySe } } - private static void expandU16(MemorySegment codes, MemorySegment values, MemorySegment out, long rowCount, int elemSize) { + static void expandU16(MemorySegment codes, MemorySegment values, MemorySegment out, long rowCount, int elemSize) { long codesCap = SegmentBroadcast.capacity(codes, 2); long valuesCap = SegmentBroadcast.capacity(values, elemSize); boolean fast = codesCap >= rowCount && valuesCap > 1; @@ -323,7 +307,7 @@ private static void expandU16(MemorySegment codes, MemorySegment values, MemoryS } } - private static void expandU32(MemorySegment codes, MemorySegment values, MemorySegment out, long rowCount, int elemSize) { + static void expandU32(MemorySegment codes, MemorySegment values, MemorySegment out, long rowCount, int elemSize) { long codesCap = SegmentBroadcast.capacity(codes, 4); long valuesCap = SegmentBroadcast.capacity(values, elemSize); boolean fast = codesCap >= rowCount && valuesCap > 1; diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/decode/DictEncodingDecoderTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/decode/DictEncodingDecoderTest.java index 101e0977..112a09da 100644 --- a/reader/src/test/java/io/github/dfa1/vortex/reader/decode/DictEncodingDecoderTest.java +++ b/reader/src/test/java/io/github/dfa1/vortex/reader/decode/DictEncodingDecoderTest.java @@ -23,6 +23,7 @@ import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; +import java.lang.foreign.ValueLayout; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.charset.StandardCharsets; @@ -148,6 +149,20 
@@ void missingMetadata_throws() { .hasMessageContaining("missing metadata"); } + @Test + void emptyMetadata_throws() { + // Given — metadata present but with zero remaining bytes (exercises !hasRemaining) + ArrayNode node = ArrayNode.of(EncodingId.VORTEX_DICT, ByteBuffer.allocate(0), + new ArrayNode[0], new int[]{}); + DecodeContext ctx = new DecodeContext(node, new DType.Primitive(PType.I32, false), + 1, new MemorySegment[0], REGISTRY, Arena.ofAuto()); + + // When / Then + assertThatThrownBy(() -> SUT.decode(ctx)) + .isInstanceOf(VortexException.class) + .hasMessageContaining("missing metadata"); + } + @Test void malformedProtoMetadata_throws() { // Given — >1 byte (routes to proto path) but a truncated varint that proto decode rejects @@ -183,8 +198,8 @@ void singleByteMetadata_decodesViaLegacyPath(PType codePType) { } @Test - void nonStandardCodeType_hitsReadCodeAndThrows() { - // Given — code ptype I8 falls into the scalar default branch, where readCode rejects it + void nonStandardCodeType_throws() { + // Given — code ptype I8 is not U8/U16/U32, so the legacy switch rejects it MemorySegment values = i64Segment(1, 2); MemorySegment codes = MemorySegment.ofArray(new byte[]{0, 0}); @@ -196,6 +211,84 @@ void nonStandardCodeType_hitsReadCodeAndThrows() { } } + /// Directly exercises the generic copy fallback (element width not 1/2/4/8) in each + /// expand* helper — unreachable through decode() since primitive widths are always + /// 1/2/4/8, but kept as a defensive bulk-copy path. 
+ @Nested + class ExpandGenericElemSize { + + @ParameterizedTest(name = "codes={0}") + @org.junit.jupiter.params.provider.EnumSource(value = PType.class, names = {"U8", "U16", "U32"}) + void fastPath_copiesPerElement(PType codePType) { + // Given — two 3-byte elements, codes [0, 1] (codesCap == rowCount, valuesCap > 1) + MemorySegment values = bytes(1, 2, 3, 4, 5, 6); + MemorySegment codes = codeSegment(codePType, new long[]{0, 1}); + MemorySegment out = Arena.ofAuto().allocate(6); + + // When + expand(codePType, codes, values, out, 2, 3); + + // Then + assertThat(toByteArray(out, 6)).containsExactly(1, 2, 3, 4, 5, 6); + } + + @ParameterizedTest(name = "codes={0}") + @org.junit.jupiter.params.provider.EnumSource(value = PType.class, names = {"U8", "U16", "U32"}) + void slowPath_broadcastsSingleElement(PType codePType) { + // Given — one 3-byte element (valuesCap == 1) forces the broadcast branch + MemorySegment values = bytes(7, 8, 9); + MemorySegment codes = codeSegment(codePType, new long[]{0, 0}); + MemorySegment out = Arena.ofAuto().allocate(6); + + // When + expand(codePType, codes, values, out, 2, 3); + + // Then + assertThat(toByteArray(out, 6)).containsExactly(7, 8, 9, 7, 8, 9); + } + + @ParameterizedTest(name = "codes={0}") + @org.junit.jupiter.params.provider.EnumSource(value = PType.class, names = {"U8", "U16", "U32"}) + void broadcastCodes_whenCodesShorterThanRowCount(PType codePType) { + // Given — a single code element (codesCap < rowCount) takes the codes-broadcast branch + MemorySegment values = bytes(10, 11, 12, 20, 21, 22); + MemorySegment codes = codeSegment(codePType, new long[]{1}); + MemorySegment out = Arena.ofAuto().allocate(6); + + // When + expand(codePType, codes, values, out, 2, 3); + + // Then — code 1 resolves to the second element for every row + assertThat(toByteArray(out, 6)).containsExactly(20, 21, 22, 20, 21, 22); + } + + private void expand(PType codePType, MemorySegment codes, MemorySegment values, + MemorySegment out, long 
rowCount, int elemSize) { + switch (codePType) { + case U8 -> DictEncodingDecoder.expandU8(codes, values, out, rowCount, elemSize); + case U16 -> DictEncodingDecoder.expandU16(codes, values, out, rowCount, elemSize); + case U32 -> DictEncodingDecoder.expandU32(codes, values, out, rowCount, elemSize); + default -> throw new IllegalArgumentException("unsupported: " + codePType); + } + } + + private MemorySegment bytes(int... values) { + byte[] a = new byte[values.length]; + for (int i = 0; i < values.length; i++) { + a[i] = (byte) values[i]; + } + return MemorySegment.ofArray(a); + } + + private byte[] toByteArray(MemorySegment seg, int n) { + byte[] a = new byte[n]; + for (int i = 0; i < n; i++) { + a[i] = seg.get(ValueLayout.JAVA_BYTE, i); + } + return a; + } + } + @Nested class Utf8 { @@ -295,6 +388,35 @@ void protoLayout_missingMetadata_throws() { .isInstanceOf(VortexException.class) .hasMessageContaining("missing metadata for utf8 dict"); } + + @Test + void legacyLayout_emptyMetadata_throws() { + // Given — no children and zero-remaining metadata (exercises !hasRemaining) + ArrayNode node = ArrayNode.of(EncodingId.VORTEX_DICT, ByteBuffer.allocate(0), + new ArrayNode[0], new int[]{}); + DecodeContext ctx = new DecodeContext(node, new DType.Utf8(false), 0, + new MemorySegment[0], REGISTRY, Arena.ofAuto()); + + // When / Then + assertThatThrownBy(() -> SUT.decode(ctx)) + .isInstanceOf(VortexException.class) + .hasMessageContaining("legacy utf8 dict"); + } + + @Test + void protoLayout_emptyMetadata_throws() { + // Given — children present, zero-remaining metadata + ArrayNode child = primitiveNode(0); + ArrayNode node = ArrayNode.of(EncodingId.VORTEX_DICT, ByteBuffer.allocate(0), + new ArrayNode[]{child, child}, new int[]{}); + DecodeContext ctx = new DecodeContext(node, new DType.Utf8(false), 1, + new MemorySegment[]{u8Codes(0)}, REGISTRY, Arena.ofAuto()); + + // When / Then + assertThatThrownBy(() -> SUT.decode(ctx)) + .isInstanceOf(VortexException.class) + 
.hasMessageContaining("missing metadata for utf8 dict"); + } } // ── parameter sources ──────────────────────────────────────────────────────