From 80ccf33570398b013dc03fa0ca629021aba5c80e Mon Sep 17 00:00:00 2001 From: Grant Nicholas Date: Tue, 23 Jun 2026 13:39:20 -0500 Subject: [PATCH] Core: Fail scans when position deletes/DVs don't match data file partition Position deletes and deletion vectors that reference a data file by path must carry the same partition spec and partition tuple as that data file. When they differ the metadata is corrupt. --- .../org/apache/iceberg/DeleteFileIndex.java | 46 +++++++++++++- .../apache/iceberg/DataTableScanTestBase.java | 61 +++++++++++++++++++ 2 files changed, 104 insertions(+), 3 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/DeleteFileIndex.java b/core/src/main/java/org/apache/iceberg/DeleteFileIndex.java index 872fcd212b8a..b15114b1f066 100644 --- a/core/src/main/java/org/apache/iceberg/DeleteFileIndex.java +++ b/core/src/main/java/org/apache/iceberg/DeleteFileIndex.java @@ -75,6 +75,7 @@ class DeleteFileIndex { private final PartitionMap posDeletesByPartition; private final Map posDeletesByPath; private final Map dvByPath; + private final LoadingCache> partitionComparatorsBySpecId; private final boolean hasEqDeletes; private final boolean hasPosDeletes; private final boolean isEmpty; @@ -84,12 +85,18 @@ private DeleteFileIndex( PartitionMap eqDeletesByPartition, PartitionMap posDeletesByPartition, Map posDeletesByPath, - Map dvByPath) { + Map dvByPath, + Map specsById) { this.globalDeletes = globalDeletes; this.eqDeletesByPartition = eqDeletesByPartition; this.posDeletesByPartition = posDeletesByPartition; this.posDeletesByPath = posDeletesByPath; this.dvByPath = dvByPath; + this.partitionComparatorsBySpecId = + specsById == null + ? null + : Caffeine.newBuilder() + .build(specId -> Comparators.forType(specsById.get(specId).partitionType())); this.hasEqDeletes = globalDeletes != null || eqDeletesByPartition != null; this.hasPosDeletes = posDeletesByPartition != null || posDeletesByPath != null || dvByPath != null; @@ -196,7 +203,15 @@ private DeleteFile[] findPathDeletes(long seq, DataFile dataFile) { } PositionDeletes deletes = posDeletesByPath.get(dataFile.location()); - return deletes == null ? EMPTY_DELETES : deletes.filter(seq); + if (deletes == null) { + return EMPTY_DELETES; + } + + DeleteFile[] matchingDeletes = deletes.filter(seq); + for (DeleteFile deleteFile : matchingDeletes) { + validatePartitionMatch(deleteFile, dataFile); + } + return matchingDeletes; } private DeleteFile findDV(long seq, DataFile dataFile) { @@ -211,10 +226,34 @@ private DeleteFile findDV(long seq, DataFile dataFile) { "DV data sequence number (%s) must be greater than or equal to data file sequence number (%s)", dv.dataSequenceNumber(), seq); + validatePartitionMatch(dv, dataFile); } return dv; } + private void validatePartitionMatch(DeleteFile deleteFile, DataFile dataFile) { + ValidationException.check( + deleteFile.specId() == dataFile.specId(), + "Mismatched partition specs (%s, %s) for delete file %s and data file %s:" + + " metadata is corrupted", + deleteFile.specId(), + dataFile.specId(), + deleteFile.location(), + dataFile.location()); + if (partitionComparatorsBySpecId != null) { + Comparator partitionComparator = + partitionComparatorsBySpecId.get(deleteFile.specId()); + ValidationException.check( + partitionComparator.compare(deleteFile.partition(), dataFile.partition()) == 0, + "Mismatched partition tuples (%s, %s) for delete file %s and data file %s:" + + " metadata is corrupted", + deleteFile.partition(), + dataFile.partition(), + deleteFile.location(), + dataFile.location()); + } + } + @SuppressWarnings("checkstyle:CyclomaticComplexity") private static boolean canContainEqDeletesForFile( DataFile dataFile, EqualityDeleteFile deleteFile) { @@ -522,7 +561,8 @@ DeleteFileIndex build() { eqDeletesByPartition.isEmpty() ? null : eqDeletesByPartition, posDeletesByPartition.isEmpty() ? null : posDeletesByPartition, posDeletesByPath.isEmpty() ? null : posDeletesByPath, - dvByPath.isEmpty() ? null : dvByPath); + dvByPath.isEmpty() ? null : dvByPath, + specsById.isEmpty() ? null : specsById); } private void add(Map dvByPath, DeleteFile dv) { diff --git a/core/src/test/java/org/apache/iceberg/DataTableScanTestBase.java b/core/src/test/java/org/apache/iceberg/DataTableScanTestBase.java index b8433ae0856c..f0b7a4daef0a 100644 --- a/core/src/test/java/org/apache/iceberg/DataTableScanTestBase.java +++ b/core/src/test/java/org/apache/iceberg/DataTableScanTestBase.java @@ -27,6 +27,7 @@ import java.util.List; import java.util.UUID; import java.util.stream.Collectors; +import org.apache.iceberg.exceptions.ValidationException; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; @@ -310,4 +311,64 @@ public void testManifestLocationsInScanWithDeleteFiles() throws IOException { .get(0); assertThat(deletes.get(0).manifestLocation()).isEqualTo(deleteManifest.path()); } + + @TestTemplate + public void testPlanFilesPositionDeletePathReferenceWithPartitionMismatchFails() { + assumeThat(formatVersion).as("Requires V2 position deletes").isEqualTo(2); + + table.newAppend().appendFile(FILE_A).commit(); + + // Position delete whose metadata partition (data_bucket=1) does not match FILE_A's + // partition (data_bucket=0), but whose path reference points to FILE_A. + DeleteFile posDeleteWrongPartition = + FileMetadata.deleteFileBuilder(table.spec()) + .ofPositionDeletes() + .withPath("/path/to/pos-delete-wrong-partition-" + UUID.randomUUID() + ".parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=1") + .withRecordCount(1) + .withReferencedDataFile(FILE_A.location()) + .build(); + + table.newRowDelta().addDeletes(posDeleteWrongPartition).commit(); + + assertThatThrownBy(() -> Lists.newArrayList(newScan().planFiles())) + .isInstanceOf(ValidationException.class) + .hasMessageContaining( + "Mismatched partition tuples (PartitionData{data_bucket=1}," + + " PartitionData{data_bucket=0})") + .hasMessageContaining(FILE_A.location()) + .hasMessageContaining("metadata is corrupted"); + } + + @TestTemplate + public void testPlanFilesDeletionVectorPathReferenceWithPartitionMismatchFails() { + assumeThat(formatVersion).as("Requires V3 deletion vectors").isGreaterThanOrEqualTo(3); + + table.newAppend().appendFile(FILE_A).commit(); + + // DV whose metadata partition (data_bucket=1) does not match FILE_A's partition + // (data_bucket=0), but whose path reference points to FILE_A. + DeleteFile dvWrongPartition = + FileMetadata.deleteFileBuilder(table.spec()) + .ofPositionDeletes() + .withPath("/path/to/dv-wrong-partition-" + UUID.randomUUID() + ".puffin") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=1") + .withRecordCount(1) + .withReferencedDataFile(FILE_A.location()) + .withContentOffset(4) + .withContentSizeInBytes(6) + .build(); + + table.newRowDelta().addDeletes(dvWrongPartition).commit(); + + assertThatThrownBy(() -> Lists.newArrayList(newScan().planFiles())) + .isInstanceOf(ValidationException.class) + .hasMessageContaining( + "Mismatched partition tuples (PartitionData{data_bucket=1}," + + " PartitionData{data_bucket=0})") + .hasMessageContaining(FILE_A.location()) + .hasMessageContaining("metadata is corrupted"); + } }