Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 0 additions & 21 deletions core/src/main/java/org/apache/iceberg/DeletionVectorStruct.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
package org.apache.iceberg;

import java.io.Serializable;
import java.util.Objects;
import org.apache.iceberg.avro.SupportsIndexProjection;
import org.apache.iceberg.relocated.com.google.common.base.MoreObjects;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
Expand Down Expand Up @@ -129,26 +128,6 @@ static Builder builder() {
return new Builder();
}

@Override
public boolean equals(Object other) {
if (this == other) {
return true;
} else if (!(other instanceof DeletionVectorStruct)) {
return false;
}

DeletionVectorStruct that = (DeletionVectorStruct) other;
return Objects.equals(location, that.location)
&& offset == that.offset
&& sizeInBytes == that.sizeInBytes
&& cardinality == that.cardinality;
}

@Override
public int hashCode() {
return Objects.hash(location, offset, sizeInBytes, cardinality);
}

@Override
public String toString() {
return MoreObjects.toStringHelper(this)
Expand Down
100 changes: 39 additions & 61 deletions core/src/main/java/org/apache/iceberg/TrackedFileBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,19 @@
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;

class TrackedFileBuilder {
private final long snapshotId;
private final FileContent contentType;
private final TrackingBuilder trackingBuilder;

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can align my PR with this new shape to avoid the individual setters that you commented.
#16936

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

technically, nothing changes from the user's perspective, it's just how we create Tracking internally.


// Required fields
private Integer formatVersion = null;
private String location = null;
private FileFormat fileFormat = null;
private Long recordCount = null;
private Long fileSizeInBytes = null;
private PartitionData partitionData = null;

// optional fields
private Integer specId = null;
private PartitionData partition = null;

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

carry-over comment: "PartitionData should be StructLike. We don't want to rely on concrete classes here."

TrackedFileStruct constructor seem to accept PartitionData so we have to do the conversion at some point. Seemed straightforward to keep the same type in the builder as expected by the constructed class.

private ContentStats contentStats = null;
private Integer sortOrderId = null;
private DeletionVector deletionVector = null;
Expand All @@ -44,12 +44,6 @@ class TrackedFileBuilder {
private List<Long> splitOffsets = null;
private List<Integer> equalityIds = null;

// tracking-related fields
private Tracking sourceTracking = null;
private boolean dvUpdated = false;
private ByteBuffer deletedPositions = null;
private ByteBuffer replacedPositions = null;

/**
* Creates a builder for a newly added data file entry.
*
Expand Down Expand Up @@ -111,17 +105,15 @@ static TrackedFile deleted(TrackedFile source, long newSnapshotId) {
/**
* Returns a REPLACED tracked file derived from {@code source}.
*
* <p>Manifest entries cannot transition to REPLACED.
* <p>Manifest files cannot transition to REPLACED.
*
* @param source source tracked file
* @param newSnapshotId the snapshot ID in which the new tracked file will be committed
*/
static TrackedFile replaced(TrackedFile source, long newSnapshotId) {
Preconditions.checkArgument(source != null, "Invalid source: null");
Preconditions.checkArgument(
!isLeafManifest(source.contentType()),
"Manifest entries cannot transition to REPLACED, but entry type is: %s",
source.contentType());
!isLeafManifest(source.contentType()), "Cannot transition manifest files to REPLACED");

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
!isLeafManifest(source.contentType()), "Cannot transition manifest files to REPLACED");
!isLeafManifest(source.contentType()), "Cannot transition manifest entries to REPLACED status");

return terminal(source, TrackingBuilder.replaced(source.tracking(), newSnapshotId));
}

Expand All @@ -147,18 +139,17 @@ private static TrackedFile terminal(TrackedFile source, Tracking tracking) {

private TrackedFileBuilder(FileContent contentType, long snapshotId) {
this.contentType = contentType;
this.snapshotId = snapshotId;
this.trackingBuilder = TrackingBuilder.added(snapshotId);
}

private TrackedFileBuilder(TrackedFile source, long snapshotId) {
this.contentType = source.contentType();
this.snapshotId = snapshotId;
this.formatVersion = source.formatVersion();
this.location = source.location();
this.fileFormat = source.fileFormat();
this.recordCount = source.recordCount();
this.fileSizeInBytes = source.fileSizeInBytes();
this.partitionData = (PartitionData) source.partition();
this.partition = (PartitionData) source.partition();

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

carry-over comment: "We should not need unchecked casts like this."

carry-over answer: "
The difficulty here is the TrackedFile.partition() return StructLike while the TrackedFileStruct constructor accepts PartitionData. Since internally in TrackedFile we also have PartitionData this seemed safe"

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Casting partition() to PartitionData where the concrete type is actually needed is already the established pattern in core. For example, ManifestFilterManager:

PartitionData partition = (PartitionData) file.partition();

We can't narrow TrackedFile.partition() to return PartitionData to avoid the cast either. Every file and scan type in the codebase exposes partition() as StructLikeContentFile, BaseFile, V1Metadata/V2Metadata, PartitionScanTask, ContentScanTask, etc.

this.specId = source.specId();
this.contentStats = source.contentStats();
this.sortOrderId = source.sortOrderId();
Expand All @@ -167,7 +158,7 @@ private TrackedFileBuilder(TrackedFile source, long snapshotId) {
this.keyMetadata = source.keyMetadata();
this.splitOffsets = source.splitOffsets();
this.equalityIds = source.equalityIds();
this.sourceTracking = source.tracking();
this.trackingBuilder = TrackingBuilder.from(source.tracking(), snapshotId);
}

TrackedFileBuilder formatVersion(int newFormatVersion) {
Expand Down Expand Up @@ -205,15 +196,16 @@ TrackedFileBuilder fileSizeInBytes(long newFileSizeInBytes) {
return this;
}

TrackedFileBuilder specId(int newSpecId) {
Preconditions.checkArgument(newSpecId >= 0, "Invalid spec ID: %s (must be >= 0)", newSpecId);
this.specId = newSpecId;
return this;
}

TrackedFileBuilder partition(PartitionData newPartitionData) {
Preconditions.checkArgument(newPartitionData != null, "Invalid partition: null");
this.partitionData = newPartitionData;
TrackedFileBuilder partition(PartitionSpec spec, StructLike newPartition) {

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

carry-over comment: "I think that this should be combined with specId: partition(int newSpecId, StructLike partitionTuple). If the tuple is set, then spec ID is required so that we know how to interpret it. I suspect that this should also be responsible for wrapping the partition tuple with a projection to the union type."

carry-over answer: "I probably miss something here, but I see that TrackedFile has a PartitionData member. If we want to produce that, is specId and partitionTuple enough? Don't we need a PartitionSpec to produce PartitionData?"

Preconditions.checkArgument(spec != null, "Invalid spec: null");
if (spec.isUnpartitioned()) {
Preconditions.checkArgument(
newPartition == null, "Invalid partition: must be null for unpartitioned spec");
} else {
Preconditions.checkArgument(newPartition != null, "Invalid partition: null");
this.partition = DataFiles.copyPartitionData(spec, newPartition, null);
}
this.specId = spec.specId();
return this;
}

Expand All @@ -225,9 +217,7 @@ TrackedFileBuilder contentStats(ContentStats newContentStats) {

TrackedFileBuilder sortOrderId(int newSortOrderId) {
Preconditions.checkArgument(
!isLeafManifest(contentType),
"Sort order ID cannot be added to manifest entries, but entry type is: %s",
contentType);
!isLeafManifest(contentType), "Cannot set sort order for manifest files");
Preconditions.checkArgument(
newSortOrderId >= 0, "Invalid sort order ID: %s (must be >= 0)", newSortOrderId);
this.sortOrderId = newSortOrderId;
Expand All @@ -238,21 +228,29 @@ TrackedFileBuilder deletionVector(DeletionVector newDeletionVector) {
Preconditions.checkArgument(newDeletionVector != null, "Invalid deletion vector: null");
Preconditions.checkArgument(
contentType == FileContent.DATA,
"Deletion vector can only be added to DATA entries, but entry type is: %s",
"Cannot add deletion vector for file with content: %s",
contentType);
Preconditions.checkArgument(
this.deletionVector == null || !this.deletionVector.equals(newDeletionVector),
"The same deletion vector already added");
if (deletionVector != null) {
Preconditions.checkArgument(
deletionVector.cardinality() < newDeletionVector.cardinality(),
"Invalid DV update, cardinality must increase: existing=%s, new=%s",
deletionVector == null ? null : deletionVector.cardinality(),
newDeletionVector.cardinality());
Preconditions.checkArgument(
!deletionVector.location().equals(newDeletionVector.location())
|| deletionVector.offset() != newDeletionVector.offset(),
"Invalid DV update: same location and offset");
}
this.deletionVector = newDeletionVector;
this.dvUpdated = true;
trackingBuilder.dvUpdated();
return this;
}

TrackedFileBuilder manifestInfo(ManifestInfo newManifestInfo) {
Preconditions.checkArgument(newManifestInfo != null, "Invalid manifest info: null");
Preconditions.checkArgument(
isLeafManifest(contentType),
"Manifest info can only be added to manifests, but entry type is: %s",
"Cannot add manifest info for file with content: %s",
contentType);
this.manifestInfo = newManifestInfo;
return this;
Expand All @@ -267,9 +265,7 @@ TrackedFileBuilder keyMetadata(ByteBuffer newKeyMetadata) {
TrackedFileBuilder splitOffsets(List<Long> newSplitOffsets) {
Preconditions.checkArgument(newSplitOffsets != null, "Invalid split offsets: null");
Preconditions.checkArgument(
!isLeafManifest(contentType),
"Split offsets cannot be added to manifest entries, but entry type is: %s",
contentType);
!isLeafManifest(contentType), "Cannot set split offsets for manifest files");
this.splitOffsets = newSplitOffsets;
return this;
}
Expand All @@ -278,7 +274,7 @@ TrackedFileBuilder equalityIds(List<Integer> newEqualityIds) {
Preconditions.checkArgument(newEqualityIds != null, "Invalid equality IDs: null");
Preconditions.checkArgument(
contentType == FileContent.EQUALITY_DELETES,
"Equality IDs can only be added to EQUALITY_DELETES entries, but entry type is: %s",
"Cannot add equality IDs for file with content: %s",
contentType);
this.equalityIds = newEqualityIds;
return this;
Expand All @@ -288,19 +284,19 @@ TrackedFileBuilder deletedPositions(ByteBuffer newDeletedPositions) {
Preconditions.checkArgument(newDeletedPositions != null, "Invalid deleted positions: null");
Preconditions.checkArgument(
isLeafManifest(contentType),
"Deleted positions can only be added to manifest entries, but entry type is: %s",
"Cannot add deleted positions for file with content: %s",
contentType);
this.deletedPositions = newDeletedPositions;
trackingBuilder.deletedPositions(newDeletedPositions);
return this;
}

TrackedFileBuilder replacedPositions(ByteBuffer newReplacedPositions) {
Preconditions.checkArgument(newReplacedPositions != null, "Invalid replaced positions: null");
Preconditions.checkArgument(
isLeafManifest(contentType),
"Replaced positions can only be added to manifest entries, but entry type is: %s",
"Cannot add replaced positions for file with content: %s",
contentType);
this.replacedPositions = newReplacedPositions;
trackingBuilder.replacedPositions(newReplacedPositions);
return this;
}

Expand All @@ -315,38 +311,20 @@ TrackedFile build() {
Preconditions.checkArgument(recordCount != null, "Missing required field: record count");
Preconditions.checkArgument(
fileSizeInBytes != null, "Missing required field: file size in bytes");
Preconditions.checkArgument(partitionData != null, "Missing required field: partition data");
Preconditions.checkArgument(
!isLeafManifest(contentType) || manifestInfo != null,
"Missing required field: manifest info");
Preconditions.checkArgument(
contentType != FileContent.EQUALITY_DELETES || equalityIds != null,
"Missing required field: equality IDs");

TrackingBuilder trackingBuilder =
sourceTracking == null
? TrackingBuilder.added(snapshotId)
: TrackingBuilder.from(sourceTracking, snapshotId);

if (dvUpdated) {
trackingBuilder.dvUpdated();
}

if (deletedPositions != null) {
trackingBuilder.deletedPositions(deletedPositions);
}

if (replacedPositions != null) {
trackingBuilder.replacedPositions(replacedPositions);
}

return new TrackedFileStruct(
trackingBuilder.build(),
contentType,
formatVersion,
location,
fileFormat,
partitionData,
partition,
recordCount,
fileSizeInBytes,
specId,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -163,64 +163,6 @@ void testBuilderMissingRequiredFields() {
.hasMessage("Missing required value: cardinality");
}

@Test
void testDvEquality() {
DeletionVectorStruct dv =
DeletionVectorStruct.builder()
.location("s3://bucket/data/dv.puffin")
.offset(256L)
.sizeInBytes(128L)
.cardinality(42L)
.build();

DeletionVectorStruct sameDv =
DeletionVectorStruct.builder()
.location("s3://bucket/data/dv.puffin")
.offset(256L)
.sizeInBytes(128L)
.cardinality(42L)
.build();

DeletionVectorStruct dvWithDifferentLocation =
DeletionVectorStruct.builder()
.location("s3://bucket/data/dv2.puffin")
.offset(256L)
.sizeInBytes(128L)
.cardinality(42L)
.build();

DeletionVectorStruct dvWithDifferentOffset =
DeletionVectorStruct.builder()
.location("s3://bucket/data/dv.puffin")
.offset(1L)
.sizeInBytes(128L)
.cardinality(42L)
.build();

DeletionVectorStruct dvWithDifferentSize =
DeletionVectorStruct.builder()
.location("s3://bucket/data/dv.puffin")
.offset(256L)
.sizeInBytes(8L)
.cardinality(42L)
.build();

DeletionVectorStruct dvWithDifferentCardinality =
DeletionVectorStruct.builder()
.location("s3://bucket/data/dv.puffin")
.offset(256L)
.sizeInBytes(128L)
.cardinality(2L)
.build();

assertThat(dv).isEqualTo(dv);
assertThat(dv).isEqualTo(sameDv);
assertThat(dv).isNotEqualTo(dvWithDifferentLocation);
assertThat(dv).isNotEqualTo(dvWithDifferentOffset);
assertThat(dv).isNotEqualTo(dvWithDifferentSize);
assertThat(dv).isNotEqualTo(dvWithDifferentCardinality);
}

@Test
void testBuilderRejectsInvalidValuesAtSetter() {
assertThatThrownBy(() -> DeletionVectorStruct.builder().location(null))
Expand Down
Loading