From 075ea946ebaef2a3c406443d25fb3b2deff7f3d3 Mon Sep 17 00:00:00 2001 From: Grant Nicholas Date: Tue, 23 Jun 2026 13:39:20 -0500 Subject: [PATCH 1/2] Reproduce possible spec inconsistency --- .../iceberg/DeleteFileIndexTestBase.java | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/core/src/test/java/org/apache/iceberg/DeleteFileIndexTestBase.java b/core/src/test/java/org/apache/iceberg/DeleteFileIndexTestBase.java index 08849fe35124..69cf3105e1db 100644 --- a/core/src/test/java/org/apache/iceberg/DeleteFileIndexTestBase.java +++ b/core/src/test/java/org/apache/iceberg/DeleteFileIndexTestBase.java @@ -365,6 +365,80 @@ public void testPartitionedTableWithPartitionPosDeletes() { .isEqualTo(fileADeletes().location()); } + @TestTemplate + public void testPositionDeletePathReferenceOverridesPartitionMismatch() { + assumeThat(formatVersion).as("Requires V2 position deletes").isEqualTo(2); + + table.newAppend().appendFile(FILE_A).commit(); + + // Position delete whose metadata partition (data_bucket=1) does not match FILE_A's + // partition (data_bucket=0), but whose path reference points to FILE_A. + // Should this commit be disallowed? + DeleteFile posDeleteWrongPartition = + FileMetadata.deleteFileBuilder(SPEC) + .ofPositionDeletes() + .withPath("/path/to/pos-delete-wrong-partition-" + UUID.randomUUID() + ".parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=1") + .withRecordCount(1) + .withReferencedDataFile(FILE_A.location()) + .build(); + + table.newRowDelta().addDeletes(posDeleteWrongPartition).commit(); + + List tasks = Lists.newArrayList(newScan(table).planFiles().iterator()); + assertThat(tasks).as("Should have one task").hasSize(1); + + FileScanTask task = (FileScanTask) tasks.get(0); + assertThat(task.file().location()) + .as("Should have the correct data file path") + .isEqualTo(FILE_A.location()); + assertThat(task.deletes()) + .as("Path reference should override partition mismatch, the delete must be included") + .hasSize(1); + assertThat(task.deletes().get(0).location()) + .as("Should have the path-scoped position delete file") + .isEqualTo(posDeleteWrongPartition.location()); + } + + @TestTemplate + public void testDeletionVectorPathReferenceOverridesPartitionMismatch() { + assumeThat(formatVersion).as("Requires V3 deletion vectors").isGreaterThanOrEqualTo(3); + + table.newAppend().appendFile(FILE_A).commit(); + + // DV whose metadata partition (data_bucket=1) does not match FILE_A's partition + // (data_bucket=0), but whose path reference points to FILE_A. + // Should this commit be disallowed? + DeleteFile dvWrongPartition = + FileMetadata.deleteFileBuilder(SPEC) + .ofPositionDeletes() + .withPath("/path/to/dv-wrong-partition-" + UUID.randomUUID() + ".puffin") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=1") + .withRecordCount(1) + .withReferencedDataFile(FILE_A.location()) + .withContentOffset(4) + .withContentSizeInBytes(6) + .build(); + + table.newRowDelta().addDeletes(dvWrongPartition).commit(); + + List tasks = Lists.newArrayList(newScan(table).planFiles().iterator()); + assertThat(tasks).as("Should have one task").hasSize(1); + + FileScanTask task = (FileScanTask) tasks.get(0); + assertThat(task.file().location()) + .as("Should have the correct data file path") + .isEqualTo(FILE_A.location()); + assertThat(task.deletes()) + .as("Path reference should override partition mismatch, the DV must be included") + .hasSize(1); + assertThat(task.deletes().get(0).location()) + .as("Should have the deletion vector") + .isEqualTo(dvWrongPartition.location()); + } + @TestTemplate public void testPartitionedTableWithPartitionEqDeletes() { table.newAppend().appendFile(FILE_A).commit(); From e5535414672df38665b9ff44caf84e61af27f48f Mon Sep 17 00:00:00 2001 From: Grant Nicholas Date: Wed, 24 Jun 2026 10:10:50 -0500 Subject: [PATCH 2/2] Add another test case showing impact of spec inconsistency Partition filter expressions over incorrect delete file partition values results in incorrect queries. --- .../apache/iceberg/DataTableScanTestBase.java | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/core/src/test/java/org/apache/iceberg/DataTableScanTestBase.java b/core/src/test/java/org/apache/iceberg/DataTableScanTestBase.java index b8433ae0856c..820c348440a4 100644 --- a/core/src/test/java/org/apache/iceberg/DataTableScanTestBase.java +++ b/core/src/test/java/org/apache/iceberg/DataTableScanTestBase.java @@ -27,6 +27,7 @@ import java.util.List; import java.util.UUID; import java.util.stream.Collectors; +import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; @@ -310,4 +311,94 @@ public void testManifestLocationsInScanWithDeleteFiles() throws IOException { .get(0); assertThat(deletes.get(0).manifestLocation()).isEqualTo(deleteManifest.path()); } + + @TestTemplate + public void testPlanFilesPositionDeletePathReferenceWithPartitionMismatch() { + assumeThat(formatVersion).as("Requires V2 position deletes").isEqualTo(2); + + table.newAppend().appendFile(FILE_A).commit(); + + // Position delete whose metadata partition (data_bucket=1) does not match FILE_A's + // partition (data_bucket=0), but whose path reference points to FILE_A. + DeleteFile posDeleteWrongPartition = + FileMetadata.deleteFileBuilder(table.spec()) + .ofPositionDeletes() + .withPath("/path/to/pos-delete-wrong-partition-" + UUID.randomUUID() + ".parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=1") + .withRecordCount(1) + .withReferencedDataFile(FILE_A.location()) + .build(); + + table.newRowDelta().addDeletes(posDeleteWrongPartition).commit(); + + // Without a partition filter: delete is associated via path reference despite partition + // mismatch. + List unfiltered = Lists.newArrayList(newScan().planFiles()); + assertThat(unfiltered).as("Should have one task").hasSize(1); + FileScanTask unfilteredTask = (FileScanTask) unfiltered.get(0); + assertThat(unfilteredTask.file().location()).isEqualTo(FILE_A.location()); + assertThat(unfilteredTask.deletes()) + .as("Delete is associated via path reference without a partition filter") + .hasSize(1); + assertThat(unfilteredTask.deletes().get(0).location()) + .isEqualTo(posDeleteWrongPartition.location()); + + // With a partition filter targeting data_bucket=0: the delete manifest contains only + // data_bucket=1 entries and is pruned, so the path-referenced delete is lost. + ScanT filtered = + newScan().filter(Expressions.equal(Expressions.bucket("data", BUCKETS_NUMBER), 0)); + List filteredTasks = Lists.newArrayList(filtered.planFiles()); + assertThat(filteredTasks).as("Should have one task").hasSize(1); + FileScanTask filteredTask = (FileScanTask) filteredTasks.get(0); + assertThat(filteredTask.file().location()).isEqualTo(FILE_A.location()); + assertThat(filteredTask.deletes()) + .as("Delete is dropped when delete manifest is pruned by partition filter") + .hasSize(0); + } + + @TestTemplate + public void testPlanFilesDeletionVectorPathReferenceWithPartitionMismatch() { + assumeThat(formatVersion).as("Requires V3 deletion vectors").isGreaterThanOrEqualTo(3); + + table.newAppend().appendFile(FILE_A).commit(); + + // DV whose metadata partition (data_bucket=1) does not match FILE_A's partition + // (data_bucket=0), but whose path reference points to FILE_A. + DeleteFile dvWrongPartition = + FileMetadata.deleteFileBuilder(table.spec()) + .ofPositionDeletes() + .withPath("/path/to/dv-wrong-partition-" + UUID.randomUUID() + ".puffin") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=1") + .withRecordCount(1) + .withReferencedDataFile(FILE_A.location()) + .withContentOffset(4) + .withContentSizeInBytes(6) + .build(); + + table.newRowDelta().addDeletes(dvWrongPartition).commit(); + + // Without a partition filter: DV is associated via path reference despite partition mismatch. + List unfiltered = Lists.newArrayList(newScan().planFiles()); + assertThat(unfiltered).as("Should have one task").hasSize(1); + FileScanTask unfilteredTask = (FileScanTask) unfiltered.get(0); + assertThat(unfilteredTask.file().location()).isEqualTo(FILE_A.location()); + assertThat(unfilteredTask.deletes()) + .as("DV is associated via path reference without a partition filter") + .hasSize(1); + assertThat(unfilteredTask.deletes().get(0).location()).isEqualTo(dvWrongPartition.location()); + + // With a partition filter targeting data_bucket=0: the delete manifest contains only + // data_bucket=1 entries and is pruned, so the path-referenced DV is lost. + ScanT filtered = + newScan().filter(Expressions.equal(Expressions.bucket("data", BUCKETS_NUMBER), 0)); + List filteredTasks = Lists.newArrayList(filtered.planFiles()); + assertThat(filteredTasks).as("Should have one task").hasSize(1); + FileScanTask filteredTask = (FileScanTask) filteredTasks.get(0); + assertThat(filteredTask.file().location()).isEqualTo(FILE_A.location()); + assertThat(filteredTask.deletes()) + .as("DV is dropped when delete manifest is pruned by partition filter") + .hasSize(0); + } }