-
Notifications
You must be signed in to change notification settings - Fork 3.4k
API: Harden variant binary parsing against malformed input #16568
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,29 +35,46 @@ static SerializedArray from(VariantMetadata metadata, byte[] bytes) { | |
| return from(metadata, ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN), bytes[0]); | ||
| } | ||
|
|
||
| @VisibleForTesting | ||
| static SerializedArray from(VariantMetadata metadata, ByteBuffer value, int header) { | ||
| return from(metadata, value, header, 0); | ||
| } | ||
|
|
||
| static SerializedArray from(VariantMetadata metadata, ByteBuffer value, int header, int depth) { | ||
| Preconditions.checkArgument( | ||
| value.order() == ByteOrder.LITTLE_ENDIAN, "Unsupported byte order: big endian"); | ||
| BasicType basicType = VariantUtil.basicType(header); | ||
| Preconditions.checkArgument( | ||
| basicType == BasicType.ARRAY, "Invalid array, basic type: " + basicType); | ||
| return new SerializedArray(metadata, value, header); | ||
| return new SerializedArray(metadata, value, header, depth); | ||
| } | ||
|
|
||
| private final VariantMetadata metadata; | ||
| private final ByteBuffer value; | ||
| private final int offsetSize; | ||
| private final int offsetListOffset; | ||
| private final int dataOffset; | ||
| private final int depth; | ||
| private final VariantValue[] array; | ||
|
|
||
| private SerializedArray(VariantMetadata metadata, ByteBuffer value, int header) { | ||
| private SerializedArray(VariantMetadata metadata, ByteBuffer value, int header, int depth) { | ||
| this.metadata = metadata; | ||
| this.value = value; | ||
| this.depth = depth; | ||
| this.offsetSize = 1 + ((header & OFFSET_SIZE_MASK) >> OFFSET_SIZE_SHIFT); | ||
| int numElementsSize = ((header & IS_LARGE) == IS_LARGE) ? 4 : 1; | ||
| Preconditions.checkArgument( | ||
| value.remaining() >= HEADER_SIZE + numElementsSize, | ||
| "Invalid variant array: buffer too small for element count field"); | ||
| int numElements = ByteBuffers.readLittleEndianUnsigned(value, HEADER_SIZE, numElementsSize); | ||
| Preconditions.checkArgument( | ||
| numElements >= 0, "Invalid variant array: negative element count %s", numElements); | ||
| this.offsetListOffset = HEADER_SIZE + numElementsSize; | ||
| long offsetTableEnd = (long) offsetListOffset + ((long) numElements + 1L) * offsetSize; | ||
| Preconditions.checkArgument( | ||
| offsetTableEnd <= value.remaining(), | ||
| "Invalid variant array: element count %s exceeds buffer", | ||
| numElements); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've just realised that this and the parquet-java hardening don't worry about leftover data. "don't do that" is implicit the policy there, being as it is useless. I wonder what the rust parquet reader does. |
||
| this.dataOffset = offsetListOffset + ((1 + numElements) * offsetSize); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
For I'd just reuse the value we already have: |
||
| this.array = new VariantValue[numElements]; | ||
| } | ||
|
|
@@ -76,8 +93,16 @@ public VariantValue get(int index) { | |
| int next = | ||
| ByteBuffers.readLittleEndianUnsigned( | ||
| value, offsetListOffset + (offsetSize * (1 + index)), offsetSize); | ||
| long dataLen = value.remaining() - (long) dataOffset; | ||
| Preconditions.checkArgument( | ||
| offset >= 0 && next >= offset && next <= dataLen, | ||
| "Invalid variant array: offset range [%s, %s] out of data region [0, %s]", | ||
| offset, | ||
| next, | ||
| dataLen); | ||
| array[index] = | ||
| VariantValue.from(metadata, VariantUtil.slice(value, dataOffset + offset, next - offset)); | ||
| VariantUtil.fromBuffer( | ||
| metadata, VariantUtil.slice(value, dataOffset + offset, next - offset), depth + 1); | ||
| } | ||
| return array[index]; | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -42,12 +42,16 @@ static SerializedObject from(VariantMetadata metadata, byte[] bytes) { | |
| } | ||
|
|
||
| static SerializedObject from(VariantMetadata metadata, ByteBuffer value, int header) { | ||
| return from(metadata, value, header, 0); | ||
| } | ||
|
|
||
| static SerializedObject from(VariantMetadata metadata, ByteBuffer value, int header, int depth) { | ||
| Preconditions.checkArgument( | ||
| value.order() == ByteOrder.LITTLE_ENDIAN, "Unsupported byte order: big endian"); | ||
| BasicType basicType = VariantUtil.basicType(header); | ||
| Preconditions.checkArgument( | ||
| basicType == BasicType.OBJECT, "Invalid object, basic type: " + basicType); | ||
| return new SerializedObject(metadata, value, header); | ||
| return new SerializedObject(metadata, value, header, depth); | ||
| } | ||
|
|
||
| private final VariantMetadata metadata; | ||
|
|
@@ -60,18 +64,33 @@ static SerializedObject from(VariantMetadata metadata, ByteBuffer value, int hea | |
| private final int[] offsets; | ||
| private final int[] lengths; | ||
| private final int dataOffset; | ||
| private final int depth; | ||
| private final VariantValue[] values; | ||
|
|
||
| private SerializedObject(VariantMetadata metadata, ByteBuffer value, int header) { | ||
| private SerializedObject(VariantMetadata metadata, ByteBuffer value, int header, int depth) { | ||
| this.metadata = metadata; | ||
| this.value = value; | ||
| this.depth = depth; | ||
| this.offsetSize = 1 + ((header & OFFSET_SIZE_MASK) >> OFFSET_SIZE_SHIFT); | ||
| this.fieldIdSize = 1 + ((header & FIELD_ID_SIZE_MASK) >> FIELD_ID_SIZE_SHIFT); | ||
| int numElementsSize = ((header & IS_LARGE) == IS_LARGE) ? 4 : 1; | ||
| Preconditions.checkArgument( | ||
| value.remaining() >= HEADER_SIZE + numElementsSize, | ||
| "Invalid variant object: buffer too small for element count field"); | ||
| int numElements = ByteBuffers.readLittleEndianUnsigned(value, HEADER_SIZE, numElementsSize); | ||
| Preconditions.checkArgument( | ||
| numElements >= 0, "Invalid variant object: negative element count %s", numElements); | ||
| this.fieldIdListOffset = HEADER_SIZE + numElementsSize; | ||
| this.fieldIds = new Integer[numElements]; | ||
| long dataStart = | ||
| (long) fieldIdListOffset | ||
| + (long) numElements * fieldIdSize | ||
| + ((long) numElements + 1L) * offsetSize; | ||
| Preconditions.checkArgument( | ||
| dataStart <= value.remaining(), | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same int-overflow as the array side: I'd derive both offsets from the |
||
| "Invalid variant object: element count %s exceeds buffer", | ||
| numElements); | ||
| this.offsetListOffset = fieldIdListOffset + (numElements * fieldIdSize); | ||
| this.fieldIds = new Integer[numElements]; | ||
| this.offsets = new int[numElements]; | ||
| this.lengths = new int[numElements]; | ||
| this.dataOffset = offsetListOffset + ((1 + numElements) * offsetSize); | ||
|
|
@@ -96,11 +115,26 @@ private void initOffsetsAndLengths(int numElements) { | |
| int dataLength = | ||
| ByteBuffers.readLittleEndianUnsigned( | ||
| value, offsetListOffset + (numElements * offsetSize), offsetSize); | ||
| long dataLen = value.remaining() - (long) dataOffset; | ||
| Preconditions.checkArgument( | ||
| dataLength >= 0 && dataLength <= dataLen, | ||
| "Invalid variant object: data length %s out of data region [0, %s]", | ||
| dataLength, | ||
| dataLen); | ||
| for (int index = 0; index < numElements; index += 1) { | ||
| Preconditions.checkArgument( | ||
| offsets[index] >= 0 && offsets[index] <= dataLength, | ||
| "Invalid variant object: offset %s out of declared data length %s", | ||
| offsets[index], | ||
| dataLength); | ||
| } | ||
| offsetToLength.put(dataLength, 0); | ||
|
|
||
| // populate lengths list by sorting offsets | ||
| List<Integer> sortedOffsets = | ||
| offsetToLength.keySet().stream().sorted().collect(Collectors.toList()); | ||
| Preconditions.checkArgument( | ||
| sortedOffsets.size() == numElements + 1, "Invalid variant object: duplicate field offsets"); | ||
| for (int index = 0; index < numElements; index += 1) { | ||
| int offset = sortedOffsets.get(index); | ||
| int length = sortedOffsets.get(index + 1) - offset; | ||
|
|
@@ -163,9 +197,16 @@ public String next() { | |
|
|
||
| private int id(int index) { | ||
| if (null == fieldIds[index]) { | ||
| fieldIds[index] = | ||
| int dictSize = metadata.dictionarySize(); | ||
| int id = | ||
| ByteBuffers.readLittleEndianUnsigned( | ||
| value, fieldIdListOffset + (index * fieldIdSize), fieldIdSize); | ||
| Preconditions.checkArgument( | ||
| id >= 0 && id < dictSize, | ||
| "Invalid variant object: field id %s out of range [0, %s)", | ||
| id, | ||
| dictSize); | ||
| fieldIds[index] = id; | ||
| } | ||
|
|
||
| return fieldIds[index]; | ||
|
|
@@ -182,8 +223,10 @@ public VariantValue get(String name) { | |
|
|
||
| if (null == values[index]) { | ||
| values[index] = | ||
| VariantValue.from( | ||
| metadata, VariantUtil.slice(value, dataOffset + offsets[index], lengths[index])); | ||
| VariantUtil.fromBuffer( | ||
| metadata, | ||
| VariantUtil.slice(value, dataOffset + offsets[index], lengths[index]), | ||
| depth + 1); | ||
| } | ||
|
|
||
| return values[index]; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,6 +22,8 @@ | |
| import java.nio.ByteOrder; | ||
| import java.nio.charset.StandardCharsets; | ||
| import java.util.function.Function; | ||
| import org.apache.iceberg.relocated.com.google.common.base.Preconditions; | ||
| import org.apache.iceberg.util.ByteBuffers; | ||
|
|
||
| class VariantUtil { | ||
| private static final int BASIC_TYPE_MASK = 0b11; | ||
|
|
@@ -30,8 +32,38 @@ class VariantUtil { | |
| private static final int BASIC_TYPE_OBJECT = 2; | ||
| private static final int BASIC_TYPE_ARRAY = 3; | ||
|
|
||
| /** | ||
| * Maximum permitted nesting depth of a Variant value. The top-level value is depth 0, so a | ||
| * Variant may contain up to {@code MAX_VARIANT_DEPTH} nested levels. | ||
| */ | ||
| static final int MAX_VARIANT_DEPTH = 500; | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a non-spec limit — the Parquet VariantEncoding spec doesn't cap nesting depth and Iceberg defers to it. A valid variant with >500 levels written by Spark or another client would read fine elsewhere but throw here at scan time, which reads as silent data loss rather than malformed-input rejection. What did parquet-java#3562 settle on? If it picked a different value (or no cap), we'd diverge on file interop. I'd at minimum cross-reference that decision in a comment here and confirm 500 is high enough to never reject real data (it's below Jackson's default of 2000).
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no real reason, except it was what was in org.apache.parquet.variant.VariantJsonParser how about we discuss on parquet dev list? I don't really care, just that there's some limit and that it's broadly consistent.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've started a discussion on the parquet dev list, please get involved. Looking at jackson, seems like 1000 is their limit right now, so copying that would be consistent. Let's see what the mailing list discussion outcome is though and we can all go with that, and add to the test dataset. |
||
| private VariantUtil() {} | ||
|
|
||
| /** Parses a variant value from {@code value} using {@code metadata} for field-name resolution. */ | ||
| static VariantValue fromBuffer(VariantMetadata metadata, ByteBuffer value, int depth) { | ||
|
nssalian marked this conversation as resolved.
|
||
| Preconditions.checkArgument( | ||
| depth <= MAX_VARIANT_DEPTH, | ||
| "Invalid variant: nesting depth %s exceeds maximum %s", | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The Javadoc on I'd either flip this to |
||
| depth, | ||
| MAX_VARIANT_DEPTH); | ||
| Preconditions.checkArgument(value.remaining() >= 1, "Invalid variant: empty value buffer"); | ||
| int header = ByteBuffers.readByte(value, 0); | ||
| BasicType basicType = basicType(header); | ||
| switch (basicType) { | ||
| case PRIMITIVE: | ||
| return SerializedPrimitive.from(value, header); | ||
| case SHORT_STRING: | ||
| return SerializedShortString.from(value, header); | ||
| case OBJECT: | ||
| return SerializedObject.from(metadata, value, header, depth); | ||
| case ARRAY: | ||
| return SerializedArray.from(metadata, value, header, depth); | ||
| } | ||
|
|
||
| throw new UnsupportedOperationException("Unsupported basic type: " + basicType); | ||
| } | ||
|
|
||
| static float readFloat(ByteBuffer buffer, int offset) { | ||
| return buffer.getFloat(buffer.position() + offset); | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.