Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion table/orphan_cleanup.go
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ func (t Table) getReferencedFiles(ctx context.Context, fs iceio.IO, maxConcurren

// Add version hint file (for Hadoop-style tables)
// Following Java's ReachableFileUtil.versionHintLocation() logic:
versionHintPath := filepath.Join(metadata.Location(), "metadata", "version-hint.text")
versionHintPath := versionHintLocation(metadata.Location())
referenced[normalizeFilePath(versionHintPath)] = false

for sf := range metadata.Statistics() {
Expand Down Expand Up @@ -597,6 +597,16 @@ func normalizeFilePathWithConfig(path string, cfg *orphanCleanupConfig) string {
return normalizeNonURLPath(path)
}

func versionHintLocation(tableLocation string) string {
if strings.Contains(tableLocation, "://") || strings.HasPrefix(tableLocation, "file:") {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice catch adding || strings.HasPrefix(tableLocation, "file:") — since file:///x and file://host/x already contain ://, this branch is really only here to cover the single-slash file:/x opaque form. Just confirming that's the intent.

if joined, err := url.JoinPath(tableLocation, "metadata", "version-hint.text"); err == nil {
return joined
}
}

return filepath.Join(tableLocation, "metadata", "version-hint.text")
}

// normalizeURLPath normalizes URL-based file paths with scheme/authority equivalence.
//
// This function handles the complexities of cloud storage URIs where the same file
Expand Down
38 changes: 38 additions & 0 deletions table/orphan_cleanup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"errors"
"fmt"
stdfs "io/fs"
"path/filepath"
"strings"
"testing"
"time"
Expand Down Expand Up @@ -157,6 +158,41 @@ func TestNormalizeNonURLPath(t *testing.T) {
}
}

func TestVersionHintLocation(t *testing.T) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The three cases here all either contain :// (s3, file:///) or fall through to filepath.Join (local), so none of them actually exercise the file:-prefix-only branch above. A file:/tmp/tablefile:/tmp/table/metadata/version-hint.text case would lock in the one input that branch uniquely guards.

tests := []struct {
name string
location string
expected string
}{
{
name: "s3_uri",
location: "s3://bucket/table",
expected: "s3://bucket/table/metadata/version-hint.text",
},
{
name: "file_uri",
location: "file:///tmp/table",
expected: "file:///tmp/table/metadata/version-hint.text",
},
{
name: "file_uri_opaque",
location: "file:/tmp/table",
expected: "file:/tmp/table/metadata/version-hint.text",
},
{
name: "local_path",
location: filepath.Join("local", "table"),
expected: filepath.Join("local", "table", "metadata", "version-hint.text"),
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
assert.Equal(t, tt.expected, versionHintLocation(tt.location))
})
}
}

func TestApplySchemeEquivalence(t *testing.T) {
equalSchemes := map[string]string{
"s3,s3a,s3n": "s3",
Expand Down Expand Up @@ -532,6 +568,8 @@ func TestGetReferencedFiles_IncludesStatisticsFiles(t *testing.T) {
assert.Contains(t, refs, normalizeFilePath("s3://bucket/stats/table-stats.puffin"))
assert.Contains(t, refs, normalizeFilePath("s3://bucket/stats/part-stats.puffin"))
assert.Contains(t, refs, normalizeFilePath(tbl.metadataLocation))
assert.Contains(t, refs, normalizeFilePath("s3://bucket/test/location/metadata/version-hint.text"))
assert.NotContains(t, refs, normalizeFilePath("s3:/bucket/test/location/metadata/version-hint.text"))
assert.NotContains(t, refs, normalizeFilePath("s3://bucket/stats/not-referenced.puffin"))
assert.NotContains(t, refs, "")
}
Expand Down
Loading